def find_profiles_google(company, role="software engineer", count=10):
    """Use Google dorking to find LinkedIn profiles."""
    query = f'site:linkedin.com/in "{role}" "{company}"'
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "num": count},
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"}
    )
    # Parse Google results to extract LinkedIn URLs
    import re
    urls = re.findall(r'https://www\.linkedin\.com/in/[\w-]+', response.text)
    return list(set(urls)) </code></pre> <br>
<h2>Step 3: Scrape with Selenium (Small Scale)</h2> <br> <p>For a handful of profiles, Selenium with anti-detection measures works:</p> <br> <pre><code>
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

def create_stealth_browser():
    """Create a Selenium browser with anti-detection flags."""
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})
return driver
def scrape_linkedin_profile(driver, url): """Scrape public data from a LinkedIn profile.""" driver.get(url) time.sleep(3) # Wait for JS to render
soup = BeautifulSoup(driver.page_source, "html.parser")
profile = {
"name": None,
"headline": None,
"location": None,
"company": None,
"about": None,
"experience": [],
}
# Extract name
name_el = soup.select_one("h1.text-heading-xlarge")
if name_el:
profile["name"] = name_el.get_text(strip=True)
# Extract headline
headline_el = soup.select_one("div.text-body-medium")
if headline_el:
profile["headline"] = headline_el.get_text(strip=True)
# Extract location
loc_el = soup.select_one("span.text-body-small")
if loc_el:
profile["location"] = loc_el.get_text(strip=True)
# Extract about section
about_el = soup.select_one("div#about, section[id='about'] div")
if about_el:
profile["about"] = about_el.get_text(strip=True)[:500] # Truncate long text
return profile
</code></pre> <br>
<h2>Step 4: Add Proxy Rotation</h2> <br> <p>For more than a few profiles, proxy rotation is essential:</p> <br> <pre><code>
import random

def scrape_with_proxy(urls, proxies):
    """Scrape multiple profiles with proxy rotation."""
    results = []
    driver = create_stealth_browser()
for i, url in enumerate(urls):
# Rotate proxy every 3 requests
if i % 3 == 0 and proxies:
proxy = random.choice(proxies)
# Configure Selenium to use proxy
from selenium.webdriver.chrome.service import Service
capabilities = driver.capabilities
# Note: Selenium proxy setup varies by browser
try:
profile = scrape_linkedin_profile(driver, url)
profile["url"] = url
if profile["name"]:
results.append(profile)
print(f"[{i+1}/{len(urls)}] {profile['name']}")
except Exception as e:
print(f"[{i+1}/{len(urls)}] Error: {e}")
time.sleep(random.uniform(5, 10)) # Random delay 5-10 seconds
driver.quit()
return results
</code></pre> <br>
<h2>Step 5: The SearchHive Approach (Production-Ready)</h2> <br> <p>SearchHive eliminates the complexity. JS rendering, proxy rotation, and CAPTCHA solving are built into every request:</p> <br> <pre><code>
from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")

# Scrape a single LinkedIn profile
result = client.scrape(
    url="https://www.linkedin.com/in/some-profile/",
    selector="h1, .text-body-medium, .text-body-small, #about"
)
profile = result.get("data", {})
print(f"Name: {profile.get('h1', ['N/A'])[0] if profile.get('h1') else 'N/A'}")
print(f"Headline: {profile.get('.text-body-medium', ['N/A'])[0] if profile.get('.text-body-medium') else 'N/A'}") </code></pre> <br>
<h3>Batch Scrape Multiple Profiles</h3> <br> <pre><code>
from searchhive import SearchHive
import pandas as pd

client = SearchHive(api_key="your-api-key")

profile_urls = [
    "https://www.linkedin.com/in/profile-1/",
    "https://www.linkedin.com/in/profile-2/",
    "https://www.linkedin.com/in/profile-3/",
]

# Batch scrape — up to 100 URLs per request
results = client.batch(
    profile_urls,
    selector="h1, .text-body-medium, .text-body-small, #about"
)

leads = []
for r in results:
    if r.get("status") == "success":
        data = r.get("data", {})
        leads.append({
            "name": data.get("h1", [None])[0],
            "headline": data.get(".text-body-medium", [None])[0],
            "location": data.get(".text-body-small", [None])[0],
            "about": data.get("#about", [None])[0],
            "url": r.get("url"),
        })

df = pd.DataFrame([l for l in leads if l["name"]])
df.to_csv("linkedin_leads.csv", index=False)
print(f"Exported {len(df)} leads to linkedin_leads.csv") </code></pre> <br>
<h3>SwiftSearch: Discover Profiles by Keyword</h3> <br> <pre><code>
from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")

# Find engineering managers at specific companies
results = client.search("site:linkedin.com/in engineering manager Stripe")
for r in results[:10]:
    print(f"{r['title']} — {r['url']}") </code></pre> <br>
<h2>Step 6: Post-Processing and Lead Scoring</h2> <br> <p>Turn raw profile data into actionable leads:</p> <br> <pre><code>
import pandas as pd

def score_leads(csv_path):
    """Score leads based on profile data."""
    df = pd.read_csv(csv_path)
# Simple scoring: keyword matching in headline
keywords = ["director", "vp", "head", "manager", "lead", "senior"]
df["score"] = df["headline"].apply(
lambda h: sum(1 for kw in keywords if kw in str(h).lower())
)
# Sort by score descending
df = df.sort_values("score", ascending=False)
df.to_csv("linkedin_leads_scored.csv", index=False)
print(f"Top leads:\n{df[['name', 'headline', 'score']].head(10)}")
return df
</code></pre> <br>
<h2>Complete Code Example</h2> <br> <pre><code>
from searchhive import SearchHive
import pandas as pd

client = SearchHive(api_key="your-api-key")

# Step 1: Find profiles
profiles = [
    "https://www.linkedin.com/in/profile-1/",
    "https://www.linkedin.com/in/profile-2/",
    "https://www.linkedin.com/in/profile-3/",
    "https://www.linkedin.com/in/profile-4/",
    "https://www.linkedin.com/in/profile-5/",
]

# Step 2: Batch scrape
results = client.batch(profiles, selector="h1, .text-body-medium, .text-body-small, #about")

# Step 3: Parse and score
leads = [] for r in results: if r.get("status") != "success": continue data = r.get("data", {}) name = data.get("h1", [None])[0] if data.get("h1") else None headline = data.get(".text-body-medium", [None])[0] if data.get(".text-body-medium") else None location = data.get(".text-body-small", [None])[0] if data.get(".text-body-small") else None
if not name:
continue
# Score based on seniority keywords
seniority = ["director", "vp", "head", "manager", "lead", "senior", "principal"]
score = sum(1 for kw in seniority if kw in str(headline or "").lower())
leads.append({
"name": name,
"headline": headline,
"location": location,
"score": score,
"url": r.get("url"),
})
# Step 4: Export
df = pd.DataFrame(leads).sort_values("score", ascending=False)
df.to_csv("leads_scored.csv", index=False)
print(f"Exported {len(df)} scored leads") </code></pre> <br>
<h2>Common Issues</h2> <br> <p><strong>Cloudflare blocking your requests:</strong> LinkedIn's Cloudflare protection detects headless browsers. Use SearchHive's residential proxy rotation to bypass this automatically.</p> <br> <p><strong>Empty data despite successful request:</strong> The profile may be login-gated. Only scrape profiles that are publicly accessible without authentication.</p> <br> <p><strong>Rate limiting (429):</strong> LinkedIn is extremely aggressive with rate limits. Space requests 10+ seconds apart, use proxy rotation, and limit concurrent requests.</p> <br> <p><strong>WebDriver detection:</strong> LinkedIn checks <code>navigator.webdriver</code>. The Selenium stealth flags in Step 3 help, but SearchHive avoids this entirely by using its own browser infrastructure.</p> <br> <p><strong>Profile data structure changed:</strong> LinkedIn frequently updates its HTML classes. SearchHive's CSS selector approach handles missing elements gracefully (returns empty arrays).</p> <br> <p><strong>Legal risks:</strong> Stay within legal boundaries. Only scrape public data, comply with GDPR/CCPA, and have a legitimate business purpose.</p> <br> <h2>Next Steps</h2> <br> <ul> <li><strong>Integrate with CRM</strong> — Pipe scored leads into HubSpot, Salesforce, or Pipedrive</li> <li><strong>Automate</strong> — Schedule weekly scans with <code>cron</code> to track new profiles</li> <li><strong>Enrich data</strong> — Cross-reference with company pages for employee count, funding, and industry</li> <li><strong>Scale carefully</strong> — Start with small batches (10–20 profiles per day) and increase gradually</li> <li><strong>Complement with job data</strong> — See <a href="/blog/how-to-scrape-job-postings-with-python-complete-tutorial">How to Scrape Job Postings with Python</a> for full company data</li> </ul> <br> <hr> <br> <p>Get started with SearchHive's <a href="https://searchhive.dev">free tier</a> — 100 requests per month, no credit card required. Handles JS rendering, proxy rotation, and CAPTCHA solving out of the box. 
<a href="https://searchhive.dev/docs">Read the docs</a>.</p> <br>