# Default request headers: identify the scraper as a desktop Chrome browser
# and ask for English-language responses.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
def fetch_page(url):
    """Fetch a page with a timeout and error handling; return None on failure."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
</code></pre> <br>
<h2>Step 3: Parse Job Listings</h2> <br> <p>Extract structured data from the HTML:</p> <br> <pre><code>def parse_job_cards(html):
    """Extract job listings from a page of results."""
    soup = BeautifulSoup(html, "html.parser")
    jobs = []
    for card in soup.select(".job-card"):
        job = {
            "title": card.select_one(".title").get_text(strip=True) if card.select_one(".title") else None,
            "company": card.select_one(".company").get_text(strip=True) if card.select_one(".company") else None,
            "location": card.select_one(".location").get_text(strip=True) if card.select_one(".location") else None,
            "salary": card.select_one(".salary").get_text(strip=True) if card.select_one(".salary") else "Not listed",
            "link": card.select_one("a")["href"] if card.select_one("a") else None,
        }
        if job["title"]:  # Skip empty entries
            jobs.append(job)
    return jobs
</code></pre> <br>
<h2>Step 4: Handle Pagination</h2> <br> <p>Most job boards paginate with <code>?page=N</code> or <code>&start=N</code>:</p> <br> <pre><code>def scrape_job_board(base_url, max_pages=10):
    """Scrape multiple pages of job listings."""
    all_jobs = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}...")
        html = fetch_page(url)
        if not html:
            break
        jobs = parse_job_cards(html)
        if not jobs:
            print("No jobs found — stopping.")
            break
        all_jobs.extend(jobs)
        time.sleep(2)  # Rate limiting: 2 seconds between requests
        print(f" Found {len(jobs)} jobs (total: {len(all_jobs)})")
    return all_jobs
</code></pre> <br>
<h2>Step 5: Scrape Individual Job Details</h2> <br> <p>Each job card links to a detail page with the full description:</p> <br> <pre><code>def scrape_job_detail(detail_url):
    """Scrape full job description from detail page."""
    html = fetch_page(detail_url)
    if not html:
        return {}
    soup = BeautifulSoup(html, "html.parser")
    return {
        "description": soup.select_one(".job-description").get_text(strip=True) if soup.select_one(".job-description") else None,
        "requirements": [li.get_text(strip=True) for li in soup.select(".requirements li")],
        "posted_date": soup.select_one(".date-posted").get_text(strip=True) if soup.select_one(".date-posted") else None,
    }
</code></pre> <br>
<h2>Step 6: Clean and Export Data</h2> <br> <p>Use pandas for deduplication and export:</p> <br> <pre><code>def export_jobs(jobs, filename="jobs.csv"):
    """Deduplicate and export job data."""
    df = pd.DataFrame(jobs)
    df.drop_duplicates(subset=["title", "company"], inplace=True)
    df.to_csv(filename, index=False)
    df.to_json("jobs.json", orient="records", indent=2)
    print(f"Exported {len(df)} unique jobs to {filename}")
    return df
</code></pre> <br> <h2>The SearchHive Approach</h2> <br> <p>The manual approach works for simple, static job boards. For anything else — JS-rendered listings, anti-bot protection, large-scale scraping — SearchHive handles it in one call:</p> <br> <pre><code>from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")
# Scrape a single job board page
result = client.scrape( url="https://www.indeed.com/jobs?q=python+developer&l=San+Francisco", selector=".job_seen_beacon .jobTitle, .companyName, .companyLocation, .salary-snippet" )
# The response includes rendered HTML, proxy rotation, and CAPTCHA solving
print(result["data"]) </code></pre> <br>
<h3>Batch Scrape Multiple Job Boards</h3> <br> <pre><code>from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")

job_urls = [
    "https://www.indeed.com/jobs?q=python+developer&l=New+York",
    "https://www.indeed.com/jobs?q=python+developer&l=Austin",
    "https://www.indeed.com/jobs?q=python+developer&l=Remote",
]

results = client.batch(job_urls, selector=".jobTitle, .companyName, .companyLocation")
for r in results:
    print(f"{r['url']}: {len(r['data'].get('.jobTitle', []))} jobs found")
</code></pre> <br>
<h3>SwiftSearch: Find Jobs Across the Web</h3> <br> <p>SearchHive's SwiftSearch combines search with scraping:</p> <br> <pre><code>from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")

results = client.search("python developer jobs remote 2025")
for r in results[:10]:
    print(f"[{r['title']}] {r['url']}")
</code></pre> <br>
<h2>Complete Working Example</h2> <br> <pre><code>import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
def fetch_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 30-second
    timeout. Any ``requests.RequestException`` (including the error raised
    by ``raise_for_status()`` for an unsuccessful HTTP status) is printed
    and reported to the caller as ``None``.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        body = response.text
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        body = None
    return body
def _text_or(card, selector, default=""):
    """Return the stripped text of the first ``selector`` match in ``card``, or ``default``."""
    node = card.select_one(selector)
    return node.get_text(strip=True) if node is not None else default


def scrape_jobs(base_url, max_pages=5):
    """Scrape paginated job listings, deduplicate them, and save them as CSV.

    Pages are requested as ``{base_url}?page=N`` for N from 1 to
    ``max_pages``. Scraping stops early when a page cannot be fetched or
    contains no ``.job-card`` elements.

    Args:
        base_url: Search URL without the ``page`` query parameter.
        max_pages: Maximum number of result pages to request.

    Returns:
        A pandas DataFrame with the columns ``title``, ``company``,
        ``location`` and ``salary``, deduplicated on title + company.
        The same data is written to ``jobs_output.csv``. The frame is
        empty (but still has its columns) when nothing was scraped.
    """
    columns = ["title", "company", "location", "salary"]
    all_jobs = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        html = fetch_page(url)
        if not html:
            break

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.select(".job-card")
        if not cards:
            # An empty page means we ran past the last page of results.
            print("No jobs found — stopping.")
            break

        for card in cards:
            all_jobs.append({
                "title": _text_or(card, ".title"),
                "company": _text_or(card, ".company"),
                "location": _text_or(card, ".location"),
                "salary": _text_or(card, ".salary", "Not listed"),
            })
        print(f"Page {page}: {len(all_jobs)} total jobs")
        time.sleep(2)  # Be polite: pause between page requests

    # Deduplicate and export. Passing explicit columns keeps the subset
    # columns present even when no jobs were collected; without them
    # drop_duplicates() raises KeyError on an empty frame.
    df = pd.DataFrame(all_jobs, columns=columns)
    df.drop_duplicates(subset=["title", "company"], inplace=True)
    df.to_csv("jobs_output.csv", index=False)
    print(f"Saved {len(df)} unique jobs to jobs_output.csv")
    return df
if __name__ == "__main__":
    scrape_jobs("https://example-jobs.com/search", max_pages=5)
</code></pre> <br>
<h2>Common Issues</h2> <br> <p><strong>Empty results from JS-rendered pages:</strong> The site loads job listings with JavaScript. Use SearchHive (which renders JavaScript) or Selenium instead of raw <code>requests</code>.</p> <br> <br> <p><strong>Duplicate listings across pages:</strong> Job boards repost the same listings. Always deduplicate by title+company or a unique job ID.</p> <br> <p><strong>Rate limiting (429 responses):</strong> You're requesting too fast. Increase your delay to 5+ seconds, or use a smaller page range.</p> <br> <p><strong>HTML structure changes:</strong> Selectors break when sites update their design. Use robust selectors (classes rather than nested paths) or SearchHive's CSS selectors that return empty arrays instead of crashing.</p> <br> <p><strong>Missing salary data:</strong> Many listings don't show salary. Handle <code>None</code> values gracefully — default to "Not listed" rather than crashing.</p> <br> <h2>Next Steps</h2> <br> <ul> <li><strong>Scale up</strong> — Use <a href="https://searchhive.dev/docs">SearchHive's batch endpoint</a> to scrape hundreds of job pages in parallel</li> <li><strong>Schedule scrapes</strong> — Run your job scraper daily with <code>cron</code> or GitHub Actions to track new postings</li> <li><strong>Build alerts</strong> — Trigger notifications when jobs matching specific criteria appear</li> <li><strong>Analyze trends</strong> — Use pandas to analyze salary distributions, hiring locations, and in-demand skills</li> <li><strong>Cross-reference</strong> — See <a href="/blog/how-to-scrape-linkedin-public-data-for-lead-generation">How to Scrape LinkedIn Public Data</a> for complementary sourcing</li> </ul> <br> <hr> <br> <p>Get started with SearchHive's <a href="https://searchhive.dev">free tier</a> — 100 requests per month, no credit card required. JS rendering, proxy rotation, and CAPTCHA solving included on every plan. <a href="https://searchhive.dev/docs">Read the docs</a>.</p> <br>