Complete Guide to Headless Browser Scraping
Headless browser scraping lets you extract data from JavaScript-heavy websites that don't serve their content in raw HTML. This guide covers Playwright, Puppeteer, Selenium, and when to skip browser automation entirely in favor of API-based extraction.
Key Takeaways
- Playwright is the best choice for new projects -- faster than Selenium, better API
- Puppeteer is ideal if you're already in the Node.js ecosystem
- Browser automation is expensive -- each Chromium instance uses 150-300MB RAM
- API-based scraping (SearchHive ScrapeForge) eliminates infrastructure overhead
- Anti-detection measures are critical: rotating proxies, realistic headers, human-like timing
What Is Headless Browser Scraping?
A headless browser runs without a visible GUI. It loads pages the same way Chrome or Firefox would -- executing JavaScript, rendering CSS, handling redirects -- but everything happens in memory.
This matters because modern websites use client-side rendering. When you fetch a React or Next.js page with requests, you get an empty shell:
<div id="root"></div>
<script src="/static/js/main.a3f8b2c9.js"></script>
A headless browser waits for the JavaScript to execute, then gives you the fully rendered page with all the data.
Playwright: The Recommended Choice
Playwright by Microsoft supports Chromium, Firefox, and WebKit from a single API. It's faster than Selenium, has built-in auto-waiting, and handles modern web features like Shadow DOM natively.
Basic setup
pip install playwright
playwright install chromium
Simple extraction example
from playwright.sync_api import sync_playwright
def scrape_page(url: str) -> dict:
    """Render *url* in headless Chromium and return its title and leading text.

    Args:
        url: Fully qualified URL of the page to render.

    Returns:
        dict with "title" (the page <title>) and "content" (first 500 chars
        of the rendered <body> text).
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # "networkidle" waits for the network to go quiet, so client-side
            # rendered content has a chance to appear before extraction.
            page.goto(url, wait_until="networkidle")
            title = page.title()
            content = page.inner_text("body")
        finally:
            # Close even when navigation or extraction raises, so the
            # Chromium process is never leaked.
            browser.close()
    return {"title": title, "content": content[:500]}
data = scrape_page("https://example.com/products")
print(data)
Waiting for dynamic content
The hardest part of headless scraping is timing. You need to wait for the right elements to appear:
from playwright.sync_api import sync_playwright
def scrape_with_waits(url: str):
    """Scrape ".product-card" entries, demonstrating Playwright wait strategies.

    Args:
        url: Page whose load triggers a request matching "**/api/products".

    Returns:
        list of {"name": ..., "price": ...} dicts, one per product card.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # expect_response must wrap the action that *triggers* the request.
            # (With an empty body it would just sit there and time out.)
            with page.expect_response("**/api/products") as response_info:
                page.goto(url)
            api_response = response_info.value.json()  # captured API payload
            # Wait for a specific element to be attached and visible.
            page.wait_for_selector(".product-grid", timeout=10000)
            # Wait for in-flight network requests to finish.
            page.wait_for_load_state("networkidle")
            # Extract products, skipping malformed cards instead of crashing
            # on a None handle from query_selector.
            results = []
            for product in page.query_selector_all(".product-card"):
                name_el = product.query_selector(".name")
                price_el = product.query_selector(".price")
                if name_el is None or price_el is None:
                    continue
                results.append({
                    "name": name_el.inner_text(),
                    "price": price_el.inner_text(),
                })
        finally:
            browser.close()
    return results
Handling pagination
from playwright.sync_api import sync_playwright
def scrape_all_pages(base_url: str, max_pages: int = 5):
    """Scrape ".item-card" listings across paginated results.

    Args:
        base_url: URL without the ?page= query parameter.
        max_pages: Hard cap on pages visited, even if a "next" link remains.

    Returns:
        list of {"title": ..., "price": ...} dicts for every card found.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            all_items = []
            for page_num in range(1, max_pages + 1):
                page.goto(f"{base_url}?page={page_num}")
                page.wait_for_selector(".item-card")
                for item in page.query_selector_all(".item-card"):
                    title_el = item.query_selector(".title")
                    price_el = item.query_selector(".price")
                    # Skip malformed cards instead of raising AttributeError
                    # on a None handle.
                    if title_el is None or price_el is None:
                        continue
                    all_items.append({
                        "title": title_el.inner_text(),
                        "price": price_el.inner_text(),
                    })
                # Stop early when there is no enabled rel="next" link.
                next_btn = page.query_selector("[rel='next']")
                if not next_btn or next_btn.is_disabled():
                    break
        finally:
            # Reap the Chromium process even if a page load fails mid-loop.
            browser.close()
    return all_items
Selenium: Still Relevant
Selenium has been around since 2004 and still has the largest community. Use it when you need broad browser compatibility or are working with legacy codebases.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Configure a headless Chrome session. "--headless=new" selects Chrome's
# modern headless mode; the sandbox flags are commonly needed in containers.
options = Options()
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.com")
    # Wait up to 10s for the product list to be present; raises
    # TimeoutException otherwise.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "product-list"))
    )
    for card in driver.find_elements(By.CLASS_NAME, "product-card"):
        print(card.find_element(By.CLASS_NAME, "name").text)
finally:
    # Quit in a finally block: without it, a timeout or extraction error
    # leaks the chromedriver and browser processes.
    driver.quit()
Anti-Detection Techniques
Sites actively block headless browsers. Here's how to avoid detection:
1. Realistic browser fingerprints
from playwright.sync_api import sync_playwright
def create_stealth_browser(playwright):
    """Launch Chromium plus a context configured to look like a real user.

    Args:
        playwright: An active Playwright instance (from sync_playwright()).

    Returns:
        (browser, context) tuple; the caller is responsible for closing both.
    """
    launch_args = [
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-setuid-sandbox"
    ]
    browser = playwright.chromium.launch(headless=True, args=launch_args)
    # Present a realistic desktop fingerprint: common UA, full-HD viewport,
    # US English locale.
    desktop_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    context = browser.new_context(
        user_agent=desktop_ua,
        viewport={"width": 1920, "height": 1080},
        locale="en-US"
    )
    # Hide the navigator.webdriver automation flag on every page in
    # this context before any site script runs.
    context.add_init_script("""
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
""")
    return browser, context
2. Human-like timing
import random
import time
def human_delay(min_sec=1.0, max_sec=3.0):
    """Sleep for a random interval to mimic human pacing between actions."""
    pause = random.uniform(min_sec, max_sec)
    time.sleep(pause)
# Use between page interactions
page.click(".next-page")
human_delay()
page.wait_for_selector(".results")
3. Rotate proxies
from playwright.sync_api import sync_playwright
proxies = [
"http://proxy1:8080",
"http://proxy2:8080",
"http://proxy3:8080",
]
def scrape_with_proxy(url: str):
    """Load *url* through a randomly chosen proxy from the pool above.

    Picking a fresh proxy per launch spreads requests across exit IPs,
    reducing per-IP rate-limit and ban pressure.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            proxy={"server": random.choice(proxies)}
        )
        try:
            page = browser.new_page()
            page.goto(url)
            # ... extract data
        finally:
            # Proxies are flaky; guarantee the browser process is reaped
            # even when goto raises a connection error.
            browser.close()
The Cost Problem with Browser Automation
Running headless browsers at scale is expensive:
- Memory: Each Chromium instance uses 150-300MB RAM
- CPU: Rendering JavaScript is CPU-intensive
- Infrastructure: You need servers, proxies, and monitoring
- Maintenance: Sites break your scrapers constantly
Here's what scaling looks like with self-hosted browsers:
| Scale | Instances | RAM Needed | Monthly Cost |
|---|---|---|---|
| 10 concurrent | 10 | 3 GB | ~$20 (VPS) |
| 50 concurrent | 50 | 15 GB | ~$100 (cloud) |
| 100 concurrent | 100 | 30 GB | ~$250 (cloud) |
And that doesn't include proxies (typically $50-200/month for rotating residential IPs).
ScrapeForge: Skip the Browser Infrastructure
SearchHive's ScrapeForge API handles headless rendering, anti-detection, proxy rotation, and CAPTCHA solving for you. One API call replaces all the infrastructure:
import httpx
SEARCHHIVE_API_KEY = "your-api-key-here"
def scrape_any_page(url: str) -> dict:
    """Fetch a fully rendered page through the ScrapeForge API.

    Args:
        url: Page to render; "wait_for" makes the API wait for the
            ".product-grid" selector before returning.

    Returns:
        Parsed JSON payload (title, markdown content, etc.).

    Raises:
        httpx.HTTPStatusError: On a 4xx/5xx API response.
        httpx.TimeoutException: If the request exceeds 30 seconds.
    """
    response = httpx.post(
        "https://api.searchhive.dev/v1/scrape",
        json={
            "url": url,
            "format": "markdown",
            "wait_for": ".product-grid"
        },
        headers={"Authorization": f"Bearer {SEARCHHIVE_API_KEY}"},
        # Rendering can be slow, but never let the call hang forever
        # (httpx's default is only 5s, too tight for headless rendering).
        timeout=30.0,
    )
    # Fail loudly on API errors instead of returning an error body as data.
    response.raise_for_status()
    return response.json()
# Works on JavaScript-heavy sites
result = scrape_any_page("https://example.com/react-app")
print(result["title"])
print(result["content"][:500])
Pricing comparison for 100K pages per month:
| Approach | Monthly Cost | Setup Time | Maintenance |
|---|---|---|---|
| Self-hosted Playwright | $150-400 | 2-5 days | Ongoing |
| Cloud browser service | $200-500 | 1 day | Minimal |
| ScrapeForge (Builder plan) | $49 | 5 minutes | None |
ScrapeForge handles the hardest parts of headless scraping -- JavaScript rendering, bot detection, proxy rotation, and rate limiting -- so you can focus on using the data, not managing infrastructure.
Combining Search with Scraping
The real power comes from combining SwiftSearch (find pages) with ScrapeForge (extract data):
import httpx
import asyncio
SEARCHHIVE_API_KEY = "your-api-key-here"
async def research_topic(query: str, depth: int = 5) -> list:
    """Search for pages matching *query*, then scrape each hit concurrently.

    Args:
        query: Search query passed to SwiftSearch.
        depth: Maximum number of result URLs to scrape.

    Returns:
        list of {"url": ..., "data": ...} dicts; "data" is None for any
        page whose scrape failed.

    Raises:
        httpx.HTTPStatusError: If the initial search request fails.
    """
    # A shared timeout keeps one slow page from hanging the whole batch.
    async with httpx.AsyncClient(timeout=30.0) as client:
        # Find relevant pages.
        search_resp = await client.get(
            "https://api.searchhive.dev/v1/search",
            params={"q": query, "limit": depth},
            headers={"Authorization": f"Bearer {SEARCHHIVE_API_KEY}"}
        )
        # If search itself failed there is nothing to scrape; fail loudly.
        search_resp.raise_for_status()
        urls = [r["url"] for r in search_resp.json().get("results", [])]
        # Scrape each page concurrently; return_exceptions=True means one
        # failed scrape does not abort the others.
        scrape_tasks = [
            client.post(
                "https://api.searchhive.dev/v1/scrape",
                json={"url": url, "format": "markdown"},
                headers={"Authorization": f"Bearer {SEARCHHIVE_API_KEY}"}
            )
            for url in urls
        ]
        responses = await asyncio.gather(*scrape_tasks, return_exceptions=True)
        # Only parse successful 2xx responses; exceptions and HTTP errors
        # both map to data=None so callers can retry those URLs.
        return [
            {
                "url": urls[i],
                "data": r.json()
                if isinstance(r, httpx.Response) and r.is_success
                else None,
            }
            for i, r in enumerate(responses)
        ]
# Example: research competitors
results = asyncio.run(research_topic("best project management tools 2026"))
for r in results:
if r["data"]:
print(f"[{r['data'].get('title')}] {r['data'].get('content', '')[:100]}...")
Best Practices
- Use wait_until="networkidle" for SPAs, but set a timeout -- some pages keep loading forever
- Set a timeout on every operation -- never let a scrape hang indefinitely
- Log failures with context -- save the URL, timestamp, and error for retry
- Respect robots.txt -- check it before scraping, though it's not legally binding
- Cache results -- don't scrape the same page twice in the same session
Get Started
For small projects, Playwright is free and powerful. For production workloads that need reliability, check out SearchHive's free tier -- 500 credits to test SwiftSearch, ScrapeForge, and DeepDive with no credit card required. See the API documentation for details.
Related: /tutorials/data-extraction-python | /compare/firecrawl