# Default request headers: present the scraper as a desktop Chrome browser.
HEADERS = {
    # The key must be the HTTP header name "User-Agent" for this value to be
    # sent as the client's user agent.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml",
}
def search_businesses(query, location, page=1):
    """Search Yellow Pages for businesses.

    Args:
        query: Search terms, sent as the ``search_terms`` query parameter.
        location: Location text, sent as ``geo_location_terms``.
        page: Result page number to request.

    Returns:
        The response body as text, or ``None`` when the request fails or the
        server answers with an HTTP error status (the error is printed).
    """
    url = "https://www.yellowpages.com/search"
    params = {
        "search_terms": query,
        "geo_location_terms": location,
        "page": page,
    }
    try:
        response = requests.get(url, params=params, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return None
    return response.text
</code></pre> <br>
<h2>Step 3: Parse Business Listings</h2> <br> <p>Extract structured data from each search result:</p> <br> <pre><code> def parse_business_results(html): """Parse business listings from Yellow Pages search results.""" soup = BeautifulSoup(html, "html.parser") businesses = []for result in soup.select(".result"):
# Business name
name_el = result.select_one(".business-name")
name = name_el.get_text(strip=True) if name_el else None
# Rating
rating_el = result.select_one(".ratings")
rating = None
reviews = 0
if rating_el:
rating_text = rating_el.get_text(strip=True)
rating_match = re.search(r"([\d.]+)", rating_text)
if rating_match:
rating = float(rating_match.group(1))
review_match = re.search(r"\((\d+)", rating_text)
if review_match:
reviews = int(review_match.group(1))
# Phone number
phone_el = result.select_one(".phone")
phone = phone_el.get_text(strip=True) if phone_el else None
# Address
street = result.select_one(".street-address")
city = result.select_one(".city")
state = result.select_one(".state")
address = f"{street.get_text(strip=True) if street else ''}, {city.get_text(strip=True) if city else ''} {state.get_text(strip=True) if state else ''}".strip(", ")
# Categories
categories = [cat.get_text(strip=True) for cat in result.select(".categories a")]
# Link to detail page
link = result.select_one("a.business-name")
detail_url = f"https://www.yellowpages.com{link['href']}" if link and link.get("href") else None
if name:
businesses.append({
"name": name,
"rating": rating,
"reviews": reviews,
"phone": phone,
"address": address,
"categories": ", ".join(categories),
"detail_url": detail_url,
})
return businesses
</code></pre> <br>
<h2>Step 4: Handle Pagination</h2> <br> <p>Yellow Pages paginates results. Iterate through all pages:</p> <br> <pre><code> def scrape_yellow_pages(query, location, max_pages=10): """Scrape all pages of Yellow Pages search results.""" all_businesses = []for page in range(1, max_pages + 1):
print(f"Scraping page {page}...")
html = search_businesses(query, location, page=page)
if not html:
break
businesses = parse_business_results(html)
if not businesses:
print("No results found — stopping.")
break
all_businesses.extend(businesses)
print(f" Found {len(businesses)} businesses (total: {len(all_businesses)})")
# Check if there's a next page
soup = BeautifulSoup(html, "html.parser")
next_page = soup.select_one("a.next")
if not next_page:
print("No more pages.")
break
time.sleep(3) # Rate limiting
return all_businesses
</code></pre> <br>
<h2>Step 5: Scrape Business Detail Pages</h2> <br> <p>Each listing links to a detail page with more information:</p> <br> <pre><code> def scrape_business_detail(detail_url): """Get detailed information from a business's Yellow Pages page.""" html = search_businesses_detail(detail_url) if not html: return {}soup = BeautifulSoup(html, "html.parser")
# Website
website_el = soup.select_one(".website-link")
website = website_el.get("href") if website_el else None
# Hours
hours = {}
hours_section = soup.select_one(".hours")
if hours_section:
for row in hours_section.select("tr"):
day = row.select_one("th")
time_range = row.select_one("td")
if day and time_range:
hours[day.get_text(strip=True)] = time_range.get_text(strip=True)
# Additional categories
extra_categories = [cat.get_text(strip=True) for cat in soup.select(".business-categories a")]
return {
"website": website,
"hours": hours,
"extra_categories": ", ".join(extra_categories),
}
</code></pre> <br>
<h2>Step 6: Clean and Export Data</h2> <br> <pre><code> def clean_and_export(businesses, filename="yellow_pages_data.csv"): """Deduplicate and export business data.""" df = pd.DataFrame(businesses)# Remove duplicates by name + address
df.drop_duplicates(subset=["name", "address"], inplace=True)
# Clean phone numbers
df["phone"] = df["phone"].str.replace(r"[^\d+]", "", [regex tester](/tools/regex-tester)=True)
# Filter out businesses without phones (likely incomplete)
df = df[df["phone"].notna() & (df["phone"] != "")]
# Export
df.to_csv(filename, index=False)
df.to_json("yellow_pages_data.json", orient="records", indent=2)
print(f"\nExported {len(df)} businesses to {filename}")
print(f"Average rating: {df['rating'].mean():.1f}")
print(f"Top categories: {df['categories'].str.split(', ').explode().value_counts().head(10).to_dict()}")
return df
</code></pre> <br>
<h2>Step 7: The SearchHive Approach</h2> <br> <p>SearchHive simplifies the entire pipeline — no HTML parsing, no pagination logic, no proxy management:</p> <br> <pre><code> from searchhive import SearchHive import pandas as pdclient = SearchHive(api_key="your-api-key")
Scrape a single Yellow Pages search page
result = client.scrape( url="https://www.yellowpages.com/search?search_terms=plumbing&geo_location_terms=Chicago%2C+IL", selector=".business-name, .ratings, .phone, .street-address, .city, .state, .categories" )
print(f"Businesses found: {result['data']}") </code></pre> <br>
<h3>Batch Scrape Multiple Cities</h3> <br> <pre><code> from searchhive import SearchHiveclient = SearchHive(api_key="your-api-key")
Scrape the same business category across multiple cities
urls = [ "https://www.yellowpages.com/search?search_terms=plumbing&geo_location_terms=Chicago%2C+IL", "https://www.yellowpages.com/search?search_terms=plumbing&geo_location_terms=Dallas%2C+TX", "https://www.yellowpages.com/search?search_terms=plumbing&geo_location_terms=Phoenix%2C+AZ", "https://www.yellowpages.com/search?search_terms=plumbing&geo_location_terms=Denver%2C+CO", "https://www.yellowpages.com/search?search_terms=plumbing&geo_location_terms=Seattle%2C+WA", ]
results = client.batch( urls, selector=".business-name, .ratings, .phone, .street-address, .city, .state, .categories" )
all_businesses = [] for r in results: if r.get("status") == "success": data = r["data"] names = data.get(".business-name", []) phones = data.get(".phone", []) for i, name in enumerate(names): all_businesses.append({ "name": name, "phone": phones[i] if i < len(phones) else None, "city": r["url"].split("geo_location_terms=")[1].split("+")[0] if "geo_location_terms=" in r["url"] else "Unknown", "source_url": r["url"], })
df = pd.DataFrame(all_businesses) df.to_csv("plumbing_multi_city.csv", index=False) print(f"Exported {len(df)} businesses across 5 cities") </code></pre> <br>
<h3>DeepDive: Crawl All Pages</h3> <br> <pre><code> from searchhive import SearchHiveclient = SearchHive(api_key="your-api-key")
Crawl all pagination pages for a search
pages = client.deep_dive( url="https://www.yellowpages.com/search?search_terms=restaurant&geo_location_terms=New+York%2C+NY", max_depth=3, max_pages=30, url_pattern="yellowpages.com/search*" ) print(f"Crawled {len(pages)} pages") </code></pre> <br>
<h3>SwiftSearch: Find Yellow Pages Listings</h3> <br> <pre><code> from searchhive import SearchHiveclient = SearchHive(api_key="your-api-key")
results = client.search("site:yellowpages.com restaurants Chicago IL") for r in results[:10]: print(f"[{r['title']}] {r['url']}") </code></pre> <br>
<h2>Complete Working Example</h2> <br> <pre><code> import requests from bs4 import BeautifulSoup import pandas as pd import time import reHEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", }
def yellow_pages_scraper(query, location, max_pages=5):
    """Complete Yellow Pages scraping pipeline.

    Fetches up to ``max_pages`` search result pages, extracts every listing
    that has a business name, removes duplicates by name + phone and writes
    the result to ``yp_<query>_<location>.csv`` (spaces replaced by
    underscores).

    Args:
        query: Search terms; also stored in each row as ``category``.
        location: Location text; also stored in each row as ``location``.
        max_pages: Maximum number of result pages to request.

    Returns:
        The deduplicated DataFrame that was written to disk. It is empty,
        but still has all columns, when nothing was scraped.
    """
    url = "https://www.yellowpages.com/search"
    columns = ["name", "rating", "reviews", "phone", "address", "category", "location"]
    all_businesses = []

    for page in range(1, max_pages + 1):
        params = {"search_terms": query, "geo_location_terms": location, "page": page}
        try:
            r = requests.get(url, params=params, headers=HEADERS, timeout=30)
            r.raise_for_status()
        except requests.RequestException as e:
            # Keep what was collected so far and go straight to the export.
            print(f"Error on page {page}: {e}")
            break

        soup = BeautifulSoup(r.text, "html.parser")
        for result in soup.select(".result"):
            name_el = result.select_one(".business-name")
            name = name_el.get_text(strip=True) if name_el else None
            if not name:
                continue

            # The first run of digits/dots is read as the rating and the
            # number following "(" as the review count.
            rating_el = result.select_one(".ratings")
            rating_text = rating_el.get_text(strip=True) if rating_el else ""
            rating_match = re.search(r"([\d.]+)", rating_text)
            review_match = re.search(r"\((\d+)", rating_text)

            phone_el = result.select_one(".phone")

            # Join only the address parts that are present.
            street = result.select_one(".street-address")
            city = result.select_one(".city")
            state = result.select_one(".state")
            address = ", ".join(filter(None, [
                street.get_text(strip=True) if street else None,
                city.get_text(strip=True) if city else None,
                state.get_text(strip=True) if state else None,
            ]))

            all_businesses.append({
                "name": name,
                "rating": float(rating_match.group(1)) if rating_match else None,
                "reviews": int(review_match.group(1)) if review_match else 0,
                "phone": phone_el.get_text(strip=True) if phone_el else None,
                "address": address,
                "category": query,
                "location": location,
            })

        print(f"Page {page}: {len(all_businesses)} total businesses")

        # Check for next page
        if not soup.select_one("a.next"):
            break

        # Rate limiting; pointless after the last page we are allowed to fetch.
        if page < max_pages:
            time.sleep(3)

    # Export. Naming the columns keeps them present when nothing was scraped,
    # so the duplicate check below still finds "name" and "phone".
    df = pd.DataFrame(all_businesses, columns=columns)
    df.drop_duplicates(subset=["name", "phone"], inplace=True)
    df.to_csv(f"yp_{query.replace(' ', '_')}_{location.replace(' ', '_')}.csv", index=False)
    print(f"\nSaved {len(df)} businesses")
    return df
if name == "main": yellow_pages_scraper("plumbing", "Chicago, IL", max_pages=5) </code></pre> <br>
<h2>Common Issues</h2> <br> <br> <p><strong>Empty results:</strong> The HTML structure may have changed. Re-inspect the page and update your CSS selectors.</p> <br> <p><strong>Duplicate listings:</strong> Some businesses appear in multiple categories. Deduplicate by the name + phone combination.</p> <br> <p><strong>Missing phone numbers:</strong> Some listings don't display phone numbers. Filter these out or use SearchHive to scrape detail pages.</p> <br> <p><strong>Inconsistent address formats:</strong> Address HTML varies between listings. Use robust parsing that handles missing city/state fields.</p> <br> <p><strong>Rate limiting:</strong> Yellow Pages throttles rapid requests. Keep delays at 3+ seconds and limit concurrent sessions.</p> <br> <h2>Next Steps</h2> <br> <ul> <li><strong>Scale across categories</strong> — Scrape multiple business types for comprehensive local market data</li> <li><strong>Multi-city expansion</strong> — Use <a href="https://searchhive.dev/docs">SearchHive's batch endpoint</a> to scrape 100+ city combinations</li> <li><strong>Data enrichment</strong> — Cross-reference with Google Maps API for additional data (photos, hours, more reviews)</li> <li><strong>Lead generation pipeline</strong> — Filter by rating and review count, then import into CRM (HubSpot, Salesforce)</li> <li><strong>Competitive analysis</strong> — Compare business densities across cities to identify underserved markets</li> <li><strong>Schedule updates</strong> — Run monthly scrapes to track new businesses and rating changes</li> </ul> <br> <hr> <br> <p>Get started with SearchHive's <a href="https://searchhive.dev">free tier</a> — 100 requests per month with proxy rotation and CAPTCHA solving included. No credit card required. <a href="https://searchhive.dev/docs">Read the docs</a>.</p> <br>