How to Scrape Patent Data for Competitive Intelligence
Patent data is one of the most underutilized sources of competitive intelligence. Companies file patents months or years before launching products, making patent databases a window into future strategy. This tutorial shows you how to scrape patent data systematically using Python and SearchHive's APIs.
Key Takeaways
- Patent databases (USPTO, Google Patents, EPO) contain structured data about filings, claims, and inventors
- SearchHive's ScrapeForge API handles the heavy JavaScript rendering on modern patent search interfaces
- DeepDive extracts structured patent metadata (filing dates, assignees, classifications, abstracts) from raw pages
- Batch scraping with SwiftSearch finds relevant patents by keyword, company, or technology area
- The complete pipeline collects, structures, and exports patent data for analysis
Prerequisites
- Python 3.8 or later
- SearchHive API key (free tier -- 500 credits)
- The requests and searchhive packages (install command below)
pip install requests searchhive
Step 1: Find Relevant Patents with SwiftSearch
Start by identifying patents relevant to your competitive analysis. SearchHive's SwiftSearch API queries Google Patents and other patent databases:
from searchhive import SwiftSearch

client = SwiftSearch(api_key="YOUR_API_KEY")

# Query Google Patents for a competitor's filings in a technology area.
results = client.search(
    "site:patents.google.com Tesla battery technology",
    num_results=20,
)

# Print each hit as a title line followed by an indented URL.
for hit in results:
    print(f"{hit.title}")
    print(f" {hit.url}")
    print()
Google Patents URLs follow a predictable pattern. You can also search USPTO directly or use technology-specific queries:
# Two more ways to scope a patent search with the same client:
# by technology area, and by assignee (the company that owns the patent).
tech_results = client.search(
    "site:patents.google.com large language model inference optimization",
    num_results=15,
)

company_results = client.search(
    "site:patents.google.com assignee:OpenAI",
    num_results=15,
)
Step 2: Scrape Individual Patent Pages
Google Patents uses JavaScript rendering extensively. ScrapeForge handles this automatically:
from searchhive import ScrapeForge
# Reuses the name `client` for a new ScrapeForge instance.
client = ScrapeForge(api_key="YOUR_API_KEY")
# Scrape a Google Patents page
patent_url = "https://patents.google.com/patent/US12345678B2/en"
# render_js=True is required: Google Patents renders content client-side.
result = client.scrape(patent_url, format="markdown", render_js=True)
# Preview the first 1000 characters of the scraped markdown.
print(result.content[:1000])
The markdown format strips away navigation elements and gives you the patent content in readable text. This includes the title, abstract, description, claims, and metadata.
Step 3: Extract Structured Patent Data
Patent pages contain structured metadata -- filing dates, assignees, inventors, classification codes, and citation data. Use DeepDive to extract these fields into clean, structured JSON:
from searchhive import DeepDive

client = DeepDive(api_key="YOUR_API_KEY")

# JSON Schema properties describing the patent fields DeepDive should extract.
_PATENT_PROPERTIES = {
    "patent_id": {"type": "string", "description": "Patent number"},
    "title": {"type": "string", "description": "Patent title"},
    "abstract": {"type": "string", "description": "Brief summary"},
    "assignee": {"type": "string", "description": "Company or entity"},
    "inventors": {
        "type": "array",
        "items": {"type": "string"},
        "description": "List of inventors",
    },
    "filing_date": {"type": "string", "description": "Original filing date"},
    "grant_date": {"type": "string", "description": "Date patent was granted"},
    "status": {"type": "string", "description": "Granted, pending, abandoned"},
    "classification": {
        "type": "array",
        "items": {"type": "string"},
        "description": "IPC or CPC classification codes",
    },
    "citations": {
        "type": "array",
        "items": {"type": "string"},
        "description": "Referenced patent numbers",
    },
    "claims_count": {"type": "string", "description": "Number of claims"},
}


def extract_patent_data(markdown_content):
    """Run DeepDive over scraped patent markdown and return structured fields."""
    response = client.extract(
        content=markdown_content,
        schema={"type": "object", "properties": _PATENT_PROPERTIES},
    )
    return response.data


patent_data = extract_patent_data(result.content)
print(f"Patent: {patent_data.get('title', 'N/A')}")
print(f"Assignee: {patent_data.get('assignee', 'N/A')}")
print(f"Filed: {patent_data.get('filing_date', 'N/A')}")
print(f"Status: {patent_data.get('status', 'N/A')}")
print(f"Inventors: {patent_data.get('inventors', [])}")
DeepDive understands patent page structure and extracts the relevant fields even when the layout varies between patents. This is significantly more reliable than CSS selector-based parsing, which breaks when Google updates their page design.
Step 4: Scrape Patent Search Results Pages
For broader competitive analysis, scrape the search results page to get an overview of a competitor's patent portfolio:
from searchhive import ScrapeForge, DeepDive
from urllib.parse import quote_plus

scrape = ScrapeForge(api_key="YOUR_API_KEY")
deep = DeepDive(api_key="YOUR_API_KEY")

# Schema for one page of Google Patents search results: a list of patent
# summaries plus the total-results count shown on the page.
_RESULTS_SCHEMA = {
    "type": "object",
    "properties": {
        "patents": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "patent_id": {"type": "string"},
                    "assignee": {"type": "string"},
                    "filing_date": {"type": "string"},
                    "snippet": {"type": "string"},
                    "url": {"type": "string"}
                }
            }
        },
        "total_results": {"type": "string"}
    }
}


def scrape_patent_search_results(query, num_pages=3):
    """Scrape several pages of Google Patents search results for `query`.

    Args:
        query: patent search query, e.g. "assignee:Anthropic".
        num_pages: number of result pages to fetch (Google shows ~10/page).

    Returns:
        A flat list of patent-summary dicts across all scraped pages.
        Pages that fail to scrape are reported and skipped.
    """
    # Bug fix: queries contain spaces and colons (e.g. "assignee:Anthropic"),
    # so they must be percent-encoded before being placed in the URL.
    encoded_query = quote_plus(query)
    all_patents = []
    for page in range(num_pages):
        url = f"https://patents.google.com/?q={encoded_query}&oq={encoded_query}&page={page}"
        try:
            result = scrape.scrape(url, format="markdown", render_js=True)
            data = deep.extract(content=result.content, schema=_RESULTS_SCHEMA)
            patents = data.data.get("patents", [])
            all_patents.extend(patents)
            print(f" Page {page + 1}: {len(patents)} patents")
        except Exception as e:
            # Best-effort: report the failed page and continue with the rest.
            print(f" Page {page + 1} failed: {e}")
    return all_patents


# Scrape a competitor's patent portfolio
patents = scrape_patent_search_results("assignee:Anthropic", num_pages=3)
print(f"Total patents found: {len(patents)}")
Step 5: Build a Competitive Intelligence Pipeline
Combine search, scraping, and extraction into a complete competitive intelligence pipeline:
from searchhive import SwiftSearch, ScrapeForge, DeepDive
import json, time, csv

API_KEY = "YOUR_API_KEY"

# Fields DeepDive extracts from each individual patent page.
PATENT_SCHEMA = {
    "type": "object",
    "properties": {
        "patent_id": {"type": "string"},
        "title": {"type": "string"},
        "assignee": {"type": "string"},
        "inventors": {"type": "array", "items": {"type": "string"}},
        "filing_date": {"type": "string"},
        "grant_date": {"type": "string"},
        "status": {"type": "string"},
        "classification": {"type": "array", "items": {"type": "string"}},
        "abstract": {"type": "string"}
    }
}


def competitive_patent_analysis(companies, tech_keywords=None):
    """Search, scrape, and extract patent data for each company.

    Args:
        companies: list of assignee names to analyze.
        tech_keywords: optional keywords OR-ed onto each query to narrow
            results to a technology area.

    Returns:
        List of patent dicts, each tagged with "competitor" and "source_url".
        Individual patent failures are reported and skipped.
    """
    search = SwiftSearch(api_key=API_KEY)
    scrape = ScrapeForge(api_key=API_KEY)
    deep = DeepDive(api_key=API_KEY)
    all_patents = []
    for company in companies:
        query = f"site:patents.google.com assignee:{company}"
        if tech_keywords:
            query += " " + " OR ".join(tech_keywords)
        print(f"Searching patents for {company}...")
        results = search.search(query, num_results=10)
        # Keep only genuine Google Patents URLs; r.url may be None.
        urls = [r.url for r in results if "patents.google.com" in (r.url or "")]
        for i, url in enumerate(urls):
            try:
                raw = scrape.scrape(url, format="markdown", render_js=True)
                data = deep.extract(content=raw.content, schema=PATENT_SCHEMA)
                patent = data.data
                patent["source_url"] = url
                patent["competitor"] = company
                all_patents.append(patent)
                print(f" [{i+1}/{len(urls)}] {patent.get('patent_id', 'N/A')}: {patent.get('title', 'N/A')[:60]}")
                time.sleep(1)  # stay under patent-site rate limits
            except Exception as e:
                print(f" [{i+1}/{len(urls)}] Error: {e}")
                time.sleep(1)
    return all_patents


# Analyze competitors
companies = ["Anthropic", "OpenAI"]
patents = competitive_patent_analysis(
    companies,
    tech_keywords=["reinforcement learning", "alignment", "safety"]
)

# Export results
with open("patent_intelligence.json", "w") as f:
    json.dump(patents, f, indent=2)

with open("patent_intelligence.csv", "w", newline="") as f:
    if patents:
        fields = ["competitor", "patent_id", "title", "assignee", "filing_date", "status", "classification"]
        # extrasaction="ignore" drops fields not in the CSV header.
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(patents)

# Summary
for company in companies:
    company_patents = [p for p in patents if p.get("competitor") == company]
    # Bug fix: the original split this f-string across a literal line break
    # (a syntax error); the blank line belongs inside the string as "\n".
    print(f"\n{company}: {len(company_patents)} patents")
    for p in company_patents[:3]:
        print(f" - {p.get('title', 'N/A')[:70]} ({p.get('filing_date', 'N/A')})")
Step 6: Handle Common Issues
Pagination limits. Google Patents search results typically show 10 patents per page. For large portfolios, scrape multiple result pages or use more specific queries to narrow results.
Rate limiting. Google Patents and USPTO throttle aggressive scraping. SearchHive's proxy rotation helps, but keep request rates at 1-2 per second to avoid blocks.
Patent page complexity. Some patents have hundreds of claims spanning multiple pages. DeepDive extracts the most relevant fields; if you need the full claims text, scrape the individual patent page and extract the claims section separately.
Classification codes. IPC and CPC classification codes use hierarchical notation (e.g., G06F 40/20). Use these codes to find related patents across competitors:
# Look up other patents that share a CPC classification code.
def find_similar_patents(classification_code):
    """Return (title, url) pairs for patents tagged with `classification_code`."""
    client = SwiftSearch(api_key=API_KEY)
    query = f"site:patents.google.com CPC:{classification_code}"
    matches = []
    for hit in client.search(query, num_results=20):
        matches.append((hit.title, hit.url))
    return matches
Complete Code Example
from searchhive import SwiftSearch, ScrapeForge, DeepDive
import json, time

API_KEY = "YOUR_API_KEY"

# Structured fields to pull out of each individual patent page.
PATENT_DETAIL_SCHEMA = {
    "type": "object",
    "properties": {
        "patent_id": {"type": "string"},
        "title": {"type": "string"},
        "abstract": {"type": "string"},
        "assignee": {"type": "string"},
        "inventors": {"type": "array", "items": {"type": "string"}},
        "filing_date": {"type": "string"},
        "grant_date": {"type": "string"},
        "status": {"type": "string"},
        "classification": {"type": "array", "items": {"type": "string"}},
    },
}


def scrape_patent(url):
    """Fetch one Google Patents page and return its extracted fields as a dict."""
    scrape = ScrapeForge(api_key=API_KEY)
    deep = DeepDive(api_key=API_KEY)
    raw = scrape.scrape(url, format="markdown", render_js=True)
    extracted = deep.extract(content=raw.content, schema=PATENT_DETAIL_SCHEMA)
    record = extracted.data
    record["url"] = url
    return record


def scan_competitor_patents(company, limit=10):
    """Search a company's patents and scrape each Google Patents result."""
    searcher = SwiftSearch(api_key=API_KEY)
    hits = searcher.search(f"site:patents.google.com {company}", num_results=limit)
    collected = []
    for hit in hits:
        # Guard clause: skip anything that isn't a Google Patents URL.
        if not (hit.url and "patents.google.com" in hit.url):
            continue
        try:
            record = scrape_patent(hit.url)
            collected.append(record)
            print(f" {record.get('patent_id', 'N/A')}: {record.get('title', 'N/A')[:60]}")
            time.sleep(2)
        except Exception as exc:
            print(f" Error: {exc}")
    return collected


if __name__ == "__main__":
    patents = scan_competitor_patents("Nvidia", limit=5)
    with open("patents.json", "w") as f:
        json.dump(patents, f, indent=2)
    print(f"Saved {len(patents)} patents")
Next Steps
- Track filing trends: Run monthly scans to detect new patent filings from competitors
- Claim analysis: Extract and compare specific claims across competing patents
- Citation networks: Map which patents cite each other to identify technology lineages
- Automated alerts: Schedule the pipeline to run weekly and notify on new filings
Get started with SearchHive's free tier -- 500 credits, no credit card needed. See the API docs for the complete reference.
See also: /blog/how-to-build-a-competitive-intelligence-dashboard and /blog/how-to-automate-market-research-with-python-and-web-data.