Public court records contain valuable data for legal research, due diligence, competitive intelligence, and journalism. Federal courts (PACER), state courts, and county clerks publish millions of filings online -- but they're scattered across dozens of systems with different interfaces, authentication requirements, and data formats.
This tutorial walks through scraping public court records using Python and the SearchHive API, with working code for the most common sources.
Key Takeaways
- PACER (federal courts) requires a paid account but offers a bulk API (PACER Case Locator)
- State court systems vary wildly -- some have search portals, others require in-person requests
- SearchHive's ScrapeForge handles authentication, session management, and JavaScript rendering
- Always respect robots.txt, rate limits, and terms of service when scraping court records
- Structured extraction with SearchHive returns clean JSON from messy court websites
Prerequisites
- Python 3.8+
- SearchHive API key (free tier with 500 credits)
- Basic familiarity with HTTP requests and HTML structure
- A PACER account (for federal court data)
pip install requests searchhive
Step 1: Understand the Data Sources
Court records live in multiple systems:
| Source | Scope | Access | Notes |
|---|---|---|---|
| PACER | Federal courts | Paid ($0.10/page) | Bulk data available via API |
| RECAP | Federal courts | Free (donated) | Via CourtListener/Free Law Project |
| State court portals | Varies by state | Often free | Quality varies greatly |
| County clerk sites | County-level | Often free | Some require in-person visits |
| CourtListener | Federal + some state | Free API | Limited coverage, good for research |
For this tutorial, we'll focus on scraping state court portals and using SearchHive to handle the varied page structures.
Step 2: Scrape a State Court Case Search Portal
Most state court systems have a case search page where you can look up cases by name, case number, or date. Here's how to automate that with SearchHive:
import requests

API_KEY = "your-api-key"
BASE_URL = "https://api.searchhive.dev/v1"


def search_court_cases(state_portal_url, search_params, extract_fields):
    """Submit a search form on a state court portal and return extracted cases.

    Parameters:
        state_portal_url: URL of the portal's case-search page.
        search_params: form fields to POST (case type, date filters, etc.).
        extract_fields: mapping of output field name -> CSS selector.

    Returns:
        A list of extracted case records, or [] on any non-200 response.
    """
    payload = {
        "url": state_portal_url,
        "method": "POST",
        "form_data": search_params,
        "format": "json",
        "extract": extract_fields,
        # Wait for one of the result containers to render (JS-heavy portals).
        "wait_for": ".search-results, table.case-list, #results",
    }
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json=payload,
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return []
    data = response.json()
    return data.get("data", data.get("results", []))
# Example: Search for civil cases in a state portal
civil_search = {
    "case_type": "civil",
    "filed_after": "2024-01-01",
    "county": "all",
}
result_selectors = {
    "case_number": "td.case-number",
    "case_title": "td.case-title a",
    "filing_date": "td.filing-date",
    "status": "td.case-status",
    "court": "td.court-name",
}
results = search_court_cases(
    state_portal_url="https://example-state-court.gov/case-search",
    search_params=civil_search,
    extract_fields=result_selectors,
)

for case in results:
    print(f"{case['case_number']}: {case['case_title']} ({case['status']})")
The wait_for parameter tells SearchHive to wait for the search results to load -- essential for court portals that use JavaScript to render results.
Step 3: Navigate Pagination for Full Results
Court search portals typically show 10-25 results per page. To get all results, loop through pagination:
import requests
import time

API_KEY = "your-api-key"
BASE_URL = "https://api.searchhive.dev/v1"


def scrape_all_pages(search_url, base_params, extract_fields, max_pages=20):
    """Scrape every page of court search results, up to max_pages.

    Parameters:
        search_url: the portal's case-search URL.
        base_params: form fields to POST on every page request.
        extract_fields: mapping of output field name -> CSS selector.
        max_pages: safety cap on the number of pages fetched.

    Returns:
        A list of all case records collected across pages. Stops early on
        an HTTP error or the first empty page.
    """
    all_cases = []
    for page in range(1, max_pages + 1):
        params = {**base_params, "page": page}
        response = requests.post(
            f"{BASE_URL}/scrape",
            headers={"Authorization": f"Bearer {API_KEY}"},
            json={
                "url": search_url,
                "method": "POST",
                "form_data": params,
                "format": "json",
                "extract": extract_fields,
                "wait_for": "table.case-list",
            },
        )
        # Fix: check status before parsing -- error pages are not valid JSON
        # and the original would crash (or silently loop) on them.
        if response.status_code != 200:
            print(f"Error on page {page}: {response.status_code} - {response.text}")
            break
        data = response.json()
        # Same key fallback as search_court_cases for consistency.
        results = data.get("data", data.get("results", []))
        if not results:
            print(f"No results on page {page}, stopping.")
            break
        all_cases.extend(results)
        print(f"Page {page}: scraped {len(results)} cases")
        time.sleep(2)  # Be respectful of rate limits
    return all_cases
# Collect every matching civil case filed this year, up to 50 pages.
fields_to_extract = {
    "case_number": "td.case-number",
    "case_title": "td.case-title",
    "filing_date": "td.filing-date",
    "status": "td.case-status",
}
cases = scrape_all_pages(
    search_url="https://example-state-court.gov/case-search",
    base_params={"case_type": "civil", "filed_after": "2025-01-01"},
    extract_fields=fields_to_extract,
    max_pages=50,
)
print(f"Total cases collected: {len(cases)}")
Step 4: Extract Case Details from Individual Case Pages
Once you have a list of case numbers, scrape each case's detail page for full information:
import requests
import time
import csv

API_KEY = "your-api-key"
BASE_URL = "https://api.searchhive.dev/v1"


def get_case_detail(case_url):
    """Fetch one case detail page and extract its structured fields.

    Returns a dict of case metadata, parties, nested docket entries, and
    orders; {} when the API response carries no "data" key.
    """
    # Selector map for extraction. docket_entries uses a _container
    # selector so every docket-table row becomes one nested record.
    selectors = {
        "case_number": ".case-header .case-number",
        "case_title": ".case-header h1",
        "court": ".court-info",
        "judge": ".judge-name",
        "filing_date": ".filing-date",
        "case_type": ".case-type",
        "status": ".case-status",
        "parties": ".party-list li",
        "docket_entries": {
            "_container": "table.docket tbody tr",
            "date": "td:nth-child(1)",
            "description": "td:nth-child(2)",
            "document": "td:nth-child(3) a::attr(href)",
        },
        "orders": ".orders-section .order-item",
    }
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={
            "url": case_url,
            "format": "json",
            "extract": selectors,
            "wait_for": ".case-header, .docket",
        },
    )
    return response.json().get("data", {})
def batch_scrape_cases(case_urls, output_file="court_cases.csv"):
    """Scrape multiple case detail pages and save the results to CSV.

    Parameters:
        case_urls: iterable of case detail-page URLs.
        output_file: path of the CSV to write (skipped when nothing scraped).

    Returns:
        The list of scraped case dicts (possibly empty).
    """
    all_data = []
    for url in case_urls:
        try:
            detail = get_case_detail(url)
            all_data.append(detail)
            print(f" Scraped: {detail.get('case_number', 'unknown')}")
            time.sleep(3)  # Rate limiting
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f" Failed: {url} - {e}")
    # Save to CSV
    if all_data:
        # Fix: use the union of keys across all rows. Courts vary, so a
        # later row may contain a field the first row lacks -- DictWriter
        # would raise ValueError with first-row-only fieldnames.
        fieldnames = []
        for row in all_data:
            for key in row:
                if key not in fieldnames:
                    fieldnames.append(key)
        with open(output_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
            writer.writeheader()
            writer.writerows(all_data)
        print(f"Saved {len(all_data)} cases to {output_file}")
    return all_data
# Example usage: scrape two specific cases into a CSV
case_urls = [
    "https://example-state-court.gov/case/2025-CV-001234",
    "https://example-state-court.gov/case/2025-CV-001235",
]
batch_scrape_cases(case_urls, output_file="civil_cases_2025.csv")
Step 5: Use SwiftSearch to Find Relevant Court Records
SearchHive's SwiftSearch API can find court records across the web, helping you discover which portals have the data you need:
from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")

# Find court records related to a specific case or party
results = client.swift_search(
    "Smith v. Johnson civil case filing 2025 court records",
    num_results=10,
)

for result in results:
    # Print each field on its own labeled line, then a separator.
    for label, key in (("Title", "title"), ("URL", "url"), ("Snippet", "snippet")):
        print(f"{label}: {result[key]}")
    print("---")

# Find state-specific court portals
portals = client.swift_search(
    "Texas district court case search online portal",
    num_results=10,
)
Step 6: Build a Monitoring Pipeline
Set up automated monitoring for new case filings matching your criteria:
import requests
import json
import time
from datetime import datetime
API_KEY = "your-api-key"
BASE_URL = "https://api.searchhive.dev/v1"
SEEN_FILE = "seen_cases.json"


def load_seen_cases():
    """Return the set of case numbers already reported (empty on first run)."""
    try:
        with open(SEEN_FILE) as fh:
            contents = json.load(fh)
    except FileNotFoundError:
        # No state file yet -- nothing has been seen.
        return set()
    return set(contents)


def save_seen_cases(seen):
    """Persist the seen-case set to disk as a JSON list."""
    with open(SEEN_FILE, "w") as fh:
        json.dump(list(seen), fh)
def monitor_new_filings(search_url, params, extract_fields):
    """Fetch the current search results and return only never-seen cases.

    Side effects: prints each new case and rewrites the seen-cases file
    when at least one new case was found.
    """
    seen = load_seen_cases()
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={
            "url": search_url,
            "method": "POST",
            "form_data": params,
            "format": "json",
            "extract": extract_fields,
        },
    )
    new_cases = []
    for case in response.json().get("data", []):
        case_id = case.get("case_number", "")
        # Skip rows without a case number and rows we already reported.
        if not case_id or case_id in seen:
            continue
        seen.add(case_id)
        new_cases.append(case)
        print(f"NEW: {case_id} - {case.get('case_title', '')}")
    if new_cases:
        save_seen_cases(seen)
    return new_cases
# Run daily monitoring
monitor_fields = {
    "case_number": "td.case-number",
    "case_title": "td.case-title",
    "filing_date": "td.filing-date",
}
new_filings = monitor_new_filings(
    search_url="https://example-state-court.gov/case-search",
    params={"case_type": "civil", "filed_after": "2025-04-01"},
    extract_fields=monitor_fields,
)
print(f"Found {len(new_filings)} new filings")
Common Issues and Solutions
JavaScript-rendered portals: Many court systems use Angular or React. Set "render_js": true in your ScrapeForge request (enabled by default) and use "wait_for" to specify which element signals the page is loaded.
Authentication walls: Some courts require login. SearchHive supports session-based auth -- pass cookies in the request headers, or use the "session" parameter to maintain state across requests.
CAPTCHA challenges: If a court portal serves CAPTCHAs, SearchHive's stealth mode handles most of them automatically. For aggressive protections, reduce your request frequency.
Inconsistent HTML structures: Different courts use different page layouts. Use flexible selectors (class names that appear across pages) and handle missing fields gracefully with .get() in Python.
Rate limiting: Court portals are often under-resourced. Space requests 3-5 seconds apart and respect Retry-After headers.
Legal and Ethical Considerations
- Public court records are, by definition, public -- but scraping terms may vary by jurisdiction
- PACER has specific terms about automated access; use their CM/ECF API for bulk federal data
- Some states restrict commercial use of court records even when access is free
- Always check robots.txt for the specific court portal
- Consider using CourtListener's free API for federal cases instead of scraping PACER directly
Next Steps
- Set up a scheduled cron job to run your monitoring pipeline daily
- Store scraped data in a database (PostgreSQL, SQLite) for querying and analysis
- Add alerting (email, Slack webhook) when new cases matching your criteria appear
- Explore SearchHive's DeepDive API for crawling entire court websites at scale
Get started free with 500 credits at searchhive.dev -- no credit card required. Check the docs for the full API reference and Python SDK documentation.
Related tutorials: /tutorials/how-to-scrape-e-commerce-pricing-data-with-python | /tutorials/how-to-monitor-brand-mentions-across-the-web-with-python
Compare tools: /compare/firecrawl
Advanced: PACER Data via CM/ECF API
For federal court data, PACER's CM/ECF (Case Management/Electronic Case Filing) API provides programmatic access. While PACER charges $0.10 per page, the API lets you retrieve case metadata, party information, and docket entries efficiently.
Here's how to combine PACER data with SearchHive for a comprehensive court data pipeline:
import requests
PACER_URL = "https://pcl.uscourts.gov/pcl"


def get_pacer_case(case_id, pacer_token):
    """Retrieve metadata for one case from the PACER Case Locator API.

    Authenticates with the PACER-Token header and returns the parsed
    JSON response body.
    """
    return requests.get(
        f"{PACER_URL}/public/case/{case_id}",
        headers={"PACER-Token": pacer_token},
    ).json()
def combine_sources(case_id, pacer_token, state_portal_url,
                    searchhive_key="your-searchhive-key"):
    """Merge federal (PACER) and state (SearchHive-scraped) data for a case.

    Parameters:
        case_id: PACER case identifier.
        pacer_token: PACER API token.
        state_portal_url: URL of the matching state-court case page.
        searchhive_key: SearchHive API key. Previously hard-coded inside the
            function (in a placeholder-free f-string); now a parameter with
            the old literal as default for backward compatibility.

    Returns:
        A dict with "federal" (PACER JSON) and "state" (scraped data) keys.
    """
    federal = get_pacer_case(case_id, pacer_token)
    response = requests.post(
        "https://api.searchhive.dev/v1/scrape",
        headers={"Authorization": f"Bearer {searchhive_key}"},
        json={"url": state_portal_url, "format": "json"},
    )
    state = response.json().get("data", {})
    return {"federal": federal, "state": state}
Working with Court Data Formats
Court records come in various formats:
- PDF filings: Full document scans, often OCR'd. SearchHive's DeepDive can follow PDF links and extract text.
- HTML case pages: Varying quality. Some courts have well-structured pages; others paste everything into a single `<pre>` block.
- JSON APIs: A few modern court systems return structured JSON.
- Bulk data downloads: Some federal courts offer bulk data via the PACER archive at no cost.
For mixed-format sources, design your extraction pipeline to handle each format:
def extract_court_data(source_type, url_or_file, api_key):
    """Dispatch extraction to the right handler for a court data source.

    Parameters:
        source_type: one of "pacer_api", "html_portal", or "bulk_pdf".
        url_or_file: case id (for PACER) or URL (for portal/PDF sources).
        api_key: PACER token or SearchHive key, depending on source_type.

    Returns:
        A dict of extracted data; for "bulk_pdf" the markdown text under
        the "text" key.

    Raises:
        ValueError: for an unrecognized source_type. (The original fell
        through and implicitly returned None, hiding typos in the caller.)
    """
    if source_type == "pacer_api":
        return get_pacer_case(url_or_file, api_key)
    elif source_type == "html_portal":
        return search_court_cases(url_or_file, {}, {})
    elif source_type == "bulk_pdf":
        response = requests.post(
            "https://api.searchhive.dev/v1/scrape",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"url": url_or_file, "format": "markdown"},
        )
        return {"text": response.json().get("data", "")}
    else:
        raise ValueError(f"Unknown source_type: {source_type!r}")
Building a Court Data Database
For serious legal research, store your scraped data in a relational database:
import sqlite3


def init_court_db(db_path="court_data.db"):
    """Create the court-records schema if it does not already exist.

    Tables: cases (one row per case), parties (many per case), and
    docket_entries (many per case), linked via case_id. Idempotent --
    safe to call on every run.
    """
    ddl_statements = (
        "CREATE TABLE IF NOT EXISTS cases ("
        "id INTEGER PRIMARY KEY, "
        "case_number TEXT UNIQUE, "
        "case_title TEXT, "
        "court TEXT, "
        "state TEXT, "
        "case_type TEXT, "
        "filing_date TEXT, "
        "status TEXT, "
        "judge TEXT)",
        "CREATE TABLE IF NOT EXISTS parties ("
        "id INTEGER PRIMARY KEY, "
        "case_id INTEGER REFERENCES cases(id), "
        "name TEXT, "
        "role TEXT)",
        "CREATE TABLE IF NOT EXISTS docket_entries ("
        "id INTEGER PRIMARY KEY, "
        "case_id INTEGER REFERENCES cases(id), "
        "date_filed TEXT, "
        "description TEXT, "
        "document_url TEXT)",
    )
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    for statement in ddl_statements:
        cursor.execute(statement)
    conn.commit()
    conn.close()


init_court_db()
With this schema, you can query across cases, filter by jurisdiction and date range, and join related records for comprehensive analysis.