How to Build a Lead Generation Tool with Web Scraping
Lead generation is the lifeblood of B2B sales and marketing. Manually researching prospects is slow, expensive, and doesn't scale. Web scraping automates the process — pulling business names, contact info, and company data from directories, social platforms, and industry websites into a structured database you can act on.
This tutorial walks you through building a production-grade lead generation tool with Python, from architecture to deployment. We'll use SearchHive for the heavy lifting — scraping, extraction, and enrichment.
Key Takeaways
- A lead generation scraper combines data collection, enrichment, and validation into a pipeline
- Python's requests + BeautifulSoup handle static directories; Playwright handles JavaScript-heavy sites
- SearchHive's ScrapeForge and SwiftSearch replace fragile custom scrapers with reliable API calls
- Legal compliance (GDPR, CCPA, robots.txt) is non-negotiable — always audit before scraping
- Email verification and deduplication are critical for lead quality
Prerequisites
- Python 3.9+ with pip
- Understanding of HTTP requests and HTML structure
- A SearchHive API key (free tier available)
- For the full pipeline: PostgreSQL or SQLite for storage
# Core dependencies: HTTP client, HTML parser, SearchHive SDK, CSV/analysis helpers
pip install requests beautifulsoup4 searchhive pandas
# Optional: playwright for JS-heavy sites (downloads a Chromium build)
pip install playwright && playwright install chromium
Step 1: Define Your Lead Data Model
Before writing any scraping code, define what a "lead" looks like for your use case:
from dataclasses import dataclass, field, asdict
from typing import Optional
@dataclass
class Lead:
    """A single sales lead collected from a directory or company website.

    Only ``company_name`` is required; every other field is filled in
    opportunistically by scraping and enrichment steps.
    """

    company_name: str
    website: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    address: Optional[str] = None
    industry: Optional[str] = None
    employee_count: Optional[str] = None  # kept as str: sources report ranges like "11-50"
    rating: Optional[float] = None
    source: Optional[str] = None  # where the lead was collected (e.g. "directory")
    verified: bool = False  # flipped to True once the email passes verification
    notes: list = field(default_factory=list)  # free-form enrichment notes

    def to_dict(self):
        """Return the lead as a plain dict of its dataclass fields."""
        return asdict(self)

    def to_csv_row(self):
        """Return company/website/phone/email as one properly quoted CSV row.

        BUG FIX: the previous f-string join produced corrupt rows whenever a
        field contained a comma or quote (common in names like "Acme, Inc.");
        csv.writer applies RFC 4180 quoting only where needed, so clean
        fields render exactly as before.
        """
        import csv
        import io

        buf = io.StringIO()
        csv.writer(buf).writerow(
            [self.company_name, self.website or "", self.phone or "", self.email or ""]
        )
        # csv.writer appends a line terminator; a "row" here is just the text.
        return buf.getvalue().rstrip("\r\n")
Step 2: Build the Base Scraper Class
Create a reusable scraper base that handles common concerns: rate limiting, error handling, and deduplication.
import requests
import time
import hashlib
import json
from bs4 import BeautifulSoup
from typing import List, Optional
class LeadScraper:
    """Base scraper for lead generation.

    Bundles the concerns every concrete scraper shares: a persistent HTTP
    session with browser-like headers, per-request rate limiting, URL-level
    and lead-level deduplication, and JSON/CSV export.
    """

    def __init__(self, delay=1.0, user_agent=None):
        self.delay = delay  # seconds slept before every request
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": user_agent or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        })
        self.seen_urls = set()  # md5 digests of URLs already requested
        self.leads: List[Lead] = []

    def _is_seen(self, url: str) -> bool:
        """Return True if *url* was requested before; otherwise record it.

        NOTE(review): the URL is marked seen before the fetch happens, so a
        page whose request fails is never retried within the same run.
        """
        url_hash = hashlib.md5(url.encode()).hexdigest()
        if url_hash in self.seen_urls:
            return True
        self.seen_urls.add(url_hash)
        return False

    def _fetch(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch a URL with rate limiting; return parsed HTML or None.

        Returns None for repeat URLs and for any request error (logged).
        """
        if self._is_seen(url):
            return None
        time.sleep(self.delay)  # polite delay before every request
        try:
            resp = self.session.get(url, timeout=15)
            resp.raise_for_status()
            return BeautifulSoup(resp.text, "html.parser")
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def deduplicate(self):
        """Remove duplicate leads by company name + phone (case-insensitive)."""
        seen = set()
        unique = []
        for lead in self.leads:
            key = f"{lead.company_name}|{lead.phone}".lower()
            if key not in seen:
                seen.add(key)
                unique.append(lead)
        self.leads = unique
        print(f"Deduplicated: {len(self.leads)} unique leads remaining")

    def save_to_json(self, filename="leads.json"):
        """Write all leads to *filename* as a pretty-printed JSON array."""
        with open(filename, "w") as f:
            json.dump([l.to_dict() for l in self.leads], f, indent=2)
        # BUG FIX: the message printed a literal "(unknown)" placeholder
        # instead of the actual output path.
        print(f"Saved {len(self.leads)} leads to {filename}")

    def save_to_csv(self, filename="leads.csv"):
        """Write a fixed subset of lead fields to *filename* as CSV."""
        import csv
        with open(filename, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=["company_name", "website", "phone", "email", "address", "industry", "rating", "source"])
            writer.writeheader()
            for lead in self.leads:
                writer.writerow({k: lead.to_dict().get(k, "") for k in writer.fieldnames})
        # BUG FIX: same placeholder problem as save_to_json.
        print(f"Saved {len(self.leads)} leads to {filename}")
Step 3: Scrape Business Directories
Here's a concrete example scraping a business directory for leads:
class DirectoryScraper(LeadScraper):
    """Scrape paginated business directories into Lead records."""

    def scrape_directory(self, base_url: str, pages: int = 5):
        """Walk *pages* result pages of a directory and collect leads."""
        for page in range(1, pages + 1):
            page_url = f"{base_url}?page={page}"
            print(f"Scraping page {page}: {page_url}")
            document = self._fetch(page_url)
            if not document:
                continue
            # Listing-card selectors — tune these to the target site's markup.
            listings = document.select(".listing-card, .business-item, .result-item")
            parsed = (self._parse_listing(item) for item in listings)
            self.leads.extend(lead for lead in parsed if lead)
            print(f" Found {len(listings)} listings on page {page}")

    def _parse_listing(self, element) -> Optional[Lead]:
        """Turn one listing card into a Lead, or None if it has no name."""
        try:
            heading = element.select_one(".business-name, h3, h2")
            if heading is None:
                return None

            def text_of(selector):
                # First matching node's stripped text, else None.
                node = element.select_one(selector)
                return node.get_text(strip=True) if node else None

            site_link = element.select_one(".website a, a[href^='http']")
            stars = element.select_one(".rating, .stars")
            return Lead(
                company_name=heading.get_text(strip=True),
                phone=text_of(".phone, [href^='tel:']"),
                website=site_link.get("href") if site_link else None,
                address=text_of(".address, .location"),
                rating=float(stars.get("data-rating", 0)) if stars else None,
                source="directory",
            )
        except Exception as e:
            print(f" Parse error: {e}")
            return None
# Usage: crawl five result pages, drop duplicates, export to leads.csv.
directory_scraper = DirectoryScraper(delay=1.5)
directory_scraper.scrape_directory("https://example-directory.com/search/restaurant", pages=5)
directory_scraper.deduplicate()
directory_scraper.save_to_csv()
Step 4: Enrich Leads with SearchHive
Raw directory data is just the starting point. SearchHive enriches each lead with additional context.
from searchhive import SwiftSearch, ScrapeForge
def enrich_lead_with_searchhive(lead: Lead, api_key: str) -> Lead:
    """Enrich a lead using SearchHive APIs.

    Appends a website summary and key features to ``lead.notes`` when the
    lead has a website; failures are recorded as notes, never raised.
    """
    if not lead.website:
        return lead

    try:
        # DeepDive: analyze the company website for summary + feature list.
        from searchhive import DeepDive

        report = DeepDive(api_key=api_key).analyze(
            url=lead.website,
            extract_features=True,
            summarize=True,
        )
        summary = report.get("summary")
        if summary:
            lead.notes.append(f"Website summary: {summary[:200]}")
        features = report.get("features")
        if features:
            lead.notes.append(f"Key features: {', '.join(features[:5])}")
    except Exception as e:
        lead.notes.append(f"Enrichment failed: {e}")

    return lead
# SwiftSearch: look up extra public data about a company by name.
def find_company_contacts(company_name: str, api_key: str) -> dict:
    """Search for publicly available contact information."""
    client = SwiftSearch(api_key=api_key)
    query = f"{company_name} contact email phone"
    return client.search(
        query=query,
        extract_fields=["title", "description", "url"],
    )
Step 5: Email Verification
Sending emails to invalid addresses hurts your deliverability score. Add verification before exporting:
import re
import smtplib
from typing import Optional
def extract_domain(website: str) -> Optional[str]:
    """Extract the bare domain from a website URL.

    Args:
        website: URL such as "https://www.example.com/about"; may be
            None or empty.

    Returns:
        The host part with scheme and leading "www." removed (path
        stripped), or None for falsy input.

    BUG FIX: the old str.replace calls removed EVERY occurrence of
    "https://", "http://" and "www." anywhere in the string — e.g.
    "https://awww.example.com" became "aexample.com". str.removeprefix
    (Python 3.9+, matching the stated prerequisite) only strips the prefix.
    """
    if not website:
        return None
    stripped = website.removeprefix("https://").removeprefix("http://")
    stripped = stripped.removeprefix("www.")
    return stripped.split("/")[0]
def guess_email_patterns(name: str, domain: str) -> list:
    """Generate common email patterns for a company contact.

    Args:
        name: Contact or company name; first and last whitespace-separated
            tokens are used for the personal patterns.
        domain: Email domain, e.g. "acme.com".

    Returns:
        Candidate addresses: name-based patterns (when a last name exists)
        followed by generic role addresses. Empty list when name or domain
        is missing/blank.

    BUG FIX: single-word names previously returned [] — the generic
    info@/contact@/hello@ patterns don't depend on a last name and are now
    always included (plus a bare "<first>@domain" guess). Output for
    two-word names is unchanged.
    """
    if not name or not domain:
        return []
    parts = name.lower().split()
    if not parts:
        return []
    first = parts[0]
    last = parts[-1] if len(parts) > 1 else ""
    if last:
        patterns = [
            f"{first}@{domain}",
            f"{first}.{last}@{domain}",
            f"{first}{last}@{domain}",
            f"{first[0]}{last}@{domain}",
            f"{first[0]}.{last}@{domain}",
        ]
    else:
        patterns = [f"{first}@{domain}"]
    # Generic role addresses apply regardless of the contact's name.
    patterns += [
        f"info@{domain}",
        f"contact@{domain}",
        f"hello@{domain}",
    ]
    return patterns
def verify_email_smtp(email: str, domain: str) -> bool:
    """Basic SMTP verification (not 100% reliable).

    Resolves the domain's MX records, connects to the preferred mail host,
    and checks whether RCPT TO is accepted (code 250). Many providers
    accept all recipients (catch-all) or block verification probes, so a
    True result is a hint, not proof.

    Args:
        email: Address to probe.
        domain: Domain whose MX records should be queried.

    Returns:
        True if the mail server accepted the recipient, False on any
        failure (DNS error, connection refused, rejection, timeout).
    """
    try:
        import dns.resolver  # third-party: dnspython

        records = dns.resolver.resolve(domain, "MX")
        # BUG FIX: records[0] is an arbitrary answer — dnspython does not
        # sort by priority. Pick the lowest-preference (most preferred) MX,
        # and strip the trailing root dot DNS names carry.
        best = min(records, key=lambda r: r.preference)
        mx_server = str(best.exchange).rstrip(".")
        with smtplib.SMTP(mx_server, timeout=10) as server:
            server.helo("verify.example.com")
            server.mail("verify@example.com")
            code, message = server.rcpt(email)
            return code == 250
    except Exception:
        # Best-effort by design: any failure means "could not verify".
        return False
# For production, use a dedicated verification API
# (Hunter.io, NeverBounce, ZeroBounce)
Step 6: Complete Pipeline
Wire everything together into a production-ready pipeline:
import csv
import os
from datetime import datetime
def run_lead_generation_pipeline(api_key: str, search_query: str, output_file: str):
    """Run the complete lead generation pipeline.

    Collects raw leads from a directory, deduplicates them, enriches each
    lead that has a website via SearchHive, then writes JSON and CSV exports.

    Args:
        api_key: SearchHive API key used for enrichment.
        search_query: Directory search term (interpolated into the URL).
        output_file: CSV output path; the JSON export shares its stem.

    Returns:
        The final list of Lead objects.
    """
    print("=== Lead Generation Pipeline ===")  # was an f-string with no placeholders
    print(f"Query: {search_query}")
    print(f"Started: {datetime.now().isoformat()}")

    # Step 1: Collect raw leads from directories
    scraper = DirectoryScraper(delay=1.5)
    # Add your directory URLs here
    scraper.scrape_directory(f"https://example-directory.com/search/{search_query}", pages=5)
    print(f"Raw leads collected: {len(scraper.leads)}")

    # Step 2: Deduplicate
    scraper.deduplicate()

    # Step 3: Enrich with SearchHive (only leads that have a website)
    print("Enriching leads with SearchHive...")
    for i, lead in enumerate(scraper.leads):
        if lead.website:
            scraper.leads[i] = enrich_lead_with_searchhive(lead, api_key)
        if (i + 1) % 10 == 0:
            print(f" Enriched {i + 1}/{len(scraper.leads)} leads")

    # Step 4: Save results.
    # BUG FIX: output_file.replace(".csv", ".json") rewrote EVERY ".csv"
    # occurrence (mangling names like "my.csvdata.csv"); swap the extension
    # only.
    root, _ext = os.path.splitext(output_file)
    scraper.save_to_json(root + ".json")
    scraper.save_to_csv(output_file)
    print(f"\nPipeline complete: {len(scraper.leads)} leads")
    return scraper.leads
# Run it
# NOTE(review): replace the placeholder API key first — this call performs
# live HTTP requests and writes leads_marketing_agencies.csv/.json to disk.
leads = run_lead_generation_pipeline(
    api_key="your_searchhive_key",
    search_query="marketing agencies",
    output_file="leads_marketing_agencies.csv"
)
Step 7: Use SearchHive for the Entire Pipeline
Instead of building and maintaining custom scrapers, you can use SearchHive to handle the entire workflow:
# End-to-end variant built entirely on SearchHive's hosted APIs.
from searchhive import SwiftSearch, ScrapeForge, DeepDive
api_key = "your_searchhive_key"  # replace with a real key before running

# Step 1: Discover businesses — SwiftSearch queries several directory
# domains at once and returns structured fields instead of raw HTML.
search = SwiftSearch(api_key=api_key)
businesses = search.search(
    query="marketing agencies in New York",
    domains=["clutch.co", "yelp.com", "yellowpages.com"],
    extract_fields=["title", "description", "url", "phone", "address", "rating"]
)

# Step 2: Scrape individual business pages for details — fields are pulled
# by CSS selector after JS rendering.
scraper = ScrapeForge(api_key=api_key)
detailed_leads = scraper.extract(
    urls=[b["url"] for b in businesses if b.get("url")],  # skip hits without a URL
    renderer="playwright",
    extract={
        "email": "a[href^='mailto']",
        "phone": "a[href^='tel:']",
        "services": ".services-list",
        "team_size": ".team-size",
    }
)

# Step 3: Analyze competitors — summarize the first five lead sites;
# [:150] just trims the console output.
analyzer = DeepDive(api_key=api_key)
for lead in detailed_leads[:5]:
    analysis = analyzer.analyze(
        url=lead["url"],
        extract_features=True,
        summarize=True
    )
    print(f"{lead['name']}: {analysis.get('summary', 'N/A')[:150]}")
Legal Considerations
Before scraping any website for leads, understand the legal landscape:
What You Need to Know
- GDPR (EU): Personal data (names, emails, phones) requires a lawful basis. "Legitimate interest" may apply for B2B, but you need a Legitimate Interest Assessment
- CCPA (California): US state privacy laws generally exempt publicly available data, but check specifics
- robots.txt: Not legally binding, but ignoring it may trigger anti-scraping measures
- Terms of Service: If you create an account to access data, you've agreed to their ToS — which may prohibit scraping
- Copyright: Factual data (business names, addresses, phone numbers) is generally not copyrightable
Best Practices
- Respect rate limits — 1-2 requests per second maximum
- Don't scrape behind login walls without explicit permission
- Include an opt-out mechanism in your outreach
- Don't resell scraped data — this significantly increases legal risk
- Document your compliance — keep records of your legal basis for data collection
Common Issues
Getting Blocked by the Target Site
Cause: Too many requests from the same IP, missing or stale headers. Fix: Add delays between requests (1-3 seconds), rotate user agents, use residential proxies for high-volume scraping, or use SearchHive which handles this automatically.
Incomplete or Missing Data
Cause: JavaScript rendering required, dynamic loading, or broken HTML. Fix: Use ScrapeForge with renderer="playwright" for JS-heavy sites. SearchHive renders JavaScript automatically.
Duplicate Leads
Cause: Same business listed under multiple categories or slight name variations. Fix: Implement fuzzy matching (company name + address/phone as composite key), normalize names (strip LLC, Inc., etc.) before deduplication.
Email Bounce Rates Too High
Cause: Guessed email patterns are often wrong. Fix: Use a verification service (NeverBounce, ZeroBounce, Hunter.io) before adding emails to your outreach list. Budget $0.003-0.01 per verification.
Next Steps
- Scale with scheduling — use cron or APScheduler to run the pipeline daily/weekly
- Add CRM integration — auto-import leads to Salesforce, HubSpot, or Pipedrive
- Build a scoring model — rank leads by fit (industry match, company size, website quality)
- Monitor data freshness — leads decay fast; re-scrape quarterly
- Automate outreach — connect your lead database to email sequencing tools
Ready to build a lead generation machine? Start with SearchHive's free tier — get 100 free requests per month with no credit card required. Check the API documentation for integration guides.
See also: How to scrape Google Maps data | SearchHive vs Bright Data | Python web scraping guide