How to Automate Market Research with Python and Web Data
Market research traditionally means weeks of manual data collection -- surveys, competitor analysis, trend reports, industry benchmarks. Most of this data is already published online. The problem is collecting and structuring it at scale.
This tutorial shows how to automate market research using Python and SearchHive's APIs to gather competitor data, industry trends, customer sentiment, and market size estimates from public web sources.
Key Takeaways
- 80% of market research data is publicly available online -- the bottleneck is collection, not availability
- SearchHive's three APIs cover the full pipeline: discover (SwiftSearch), collect (ScrapeForge), extract (DeepDive)
- Automate competitor monitoring, industry trend analysis, and customer feedback collection
- Build a structured market research dataset that updates automatically
- Free tier includes 500 credits -- enough to prototype a full research workflow
Prerequisites
- Python 3.8+
- requests, sqlite3 (built-in), json (built-in)
- SearchHive API key (get one free)
- A market or industry to research
Step 1: Define Your Research Framework
Good automated market research covers four dimensions:
# Research framework: everything the pipeline needs to know — the target
# market, which competitors to profile, which queries to search, and which
# sites to mine for reviews and industry reports.
RESEARCH_CONFIG = {
# Market label; also used as the key when storing/reporting data.
"market": "AI-powered developer tools",
# Competitor domains (no scheme) — landing and /pricing pages are scraped.
"competitors": [
"github.com/copilot",
"cursor.sh",
"replit.com",
"tabnine.com"
],
# Search queries used to discover market-size and trend sources.
"keywords": [
"AI code assistant market size",
"developer tools market trends 2026",
"AI coding assistant adoption rate",
"copilot alternatives comparison"
],
# URL prefixes; the competitor domain is appended to build review URLs.
"review_sites": [
"g2.com/products/",
"capterra.com/",
"trustpilot.com/review/"
],
# Only search results from these domains are extracted as industry data.
"industry_reports": [
"statista.com",
"grandviewresearch.com",
"marketsandmarkets.com"
]
}
Step 2: Gather Market Size and Industry Data
import requests
import json
import time
from datetime import datetime
API_KEY = "your_api_key"
BASE = "https://api.searchhive.dev/v1"
def search_industry_data(query):
    """Search the web for industry reports, market-size figures, and trends.

    Args:
        query: The search query string.

    Returns:
        A list of result dicts from SwiftSearch (empty if none found).

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    response = requests.get(
        f"{BASE}/search",
        headers={"Authorization": f"Bearer {API_KEY}"},
        params={
            "q": query,
            "num": 10,
            "search_type": "web",
        },
        timeout=30,  # fix: without a timeout, requests can hang indefinitely
    )
    response.raise_for_status()
    return response.json().get("results", [])
def extract_market_data(url):
    """Scrape an industry-report page and extract structured market data.

    Two API calls: ScrapeForge fetches the page as markdown, then DeepDive
    pulls out market-size, growth, and segmentation fields.

    Args:
        url: The report page to analyze.

    Returns:
        A dict of extracted fields (market_size, growth_rate, ...).

    Raises:
        requests.HTTPError: If either API call returns a non-2xx status.
        requests.Timeout: If a call exceeds its 60-second budget.
    """
    # Scrape the page — JS rendering can be slow, so allow a generous timeout.
    scrape_resp = requests.post(
        f"{BASE}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"url": url, "render_js": True, "format": "markdown"},
        timeout=60,  # fix: no timeout means a stalled scrape blocks the run
    )
    scrape_resp.raise_for_status()
    content = scrape_resp.json()["markdown"]
    # Extract structured data from the scraped markdown.
    extract_resp = requests.post(
        f"{BASE}/deepdive",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={
            "content": content,
            "extract": [
                "market_size",
                "growth_rate",
                "forecast_period",
                "key_players",
                "market_segments",
                "regional_breakdown",
                "key_findings",
            ],
        },
        timeout=60,
    )
    extract_resp.raise_for_status()
    return extract_resp.json()["data"]
# Collect market data: search each configured keyword, then extract
# structured figures from results hosted on known industry-report sites.
print("=== GATHERING INDUSTRY DATA ===")
industry_data = []
report_sites = RESEARCH_CONFIG["industry_reports"]
for query in RESEARCH_CONFIG["keywords"][:2]:
    print(f"\nSearching: {query}")
    for hit in search_industry_data(query)[:3]:
        hit_url = hit.get("url", "")
        # Skip anything that is not a recognized research/report site.
        if not any(site in hit_url for site in report_sites):
            continue
        print(f" Extracting from: {hit_url}")
        try:
            record = extract_market_data(hit_url)
            record["source_url"] = hit_url
            record["source_query"] = query
            industry_data.append(record)
            print(f" Market size: {record.get('market_size', 'N/A')}")
        except Exception as e:
            print(f" Error: {e}")
        time.sleep(2)
Step 3: Analyze Competitor Products and Pricing
def analyze_competitor(competitor_url):
    """Profile a competitor from its public landing and pricing pages.

    Args:
        competitor_url: Bare domain without scheme, e.g. "cursor.sh".

    Returns:
        A dict with "product_info" and "pricing" keys on success; if a
        section fails, the corresponding "product_info_error" /
        "pricing_error" key holds the exception message instead.
    """
    headers = {"Authorization": f"Bearer {API_KEY}"}
    results = {}
    # --- Landing page: value proposition, features, audience, model ---
    try:
        landing = requests.post(
            f"{BASE}/scrape",
            headers=headers,
            json={"url": f"https://{competitor_url}", "render_js": True},
            timeout=60,  # fix: unbounded request could hang the whole run
        )
        # fix: surface HTTP errors directly instead of a confusing
        # KeyError('markdown') from the json lookup below.
        landing.raise_for_status()
        content = landing.json()["markdown"]
        # Extract product info from the scraped page.
        info = requests.post(
            f"{BASE}/deepdive",
            headers=headers,
            json={
                "content": content,
                "extract": [
                    "value_proposition",
                    "key_features",
                    "target_audience",
                    "pricing_model",
                    "integrations",
                    "social_proof",
                ],
            },
            timeout=60,
        )
        info.raise_for_status()
        results["product_info"] = info.json()["data"]
    except Exception as e:
        results["product_info_error"] = str(e)
    # --- Pricing page: plan names, prices, per-plan features ---
    try:
        pricing = requests.post(
            f"{BASE}/scrape",
            headers=headers,
            json={"url": f"https://{competitor_url}/pricing", "render_js": True},
            timeout=60,
        )
        pricing.raise_for_status()
        pricing_content = pricing.json()["markdown"]
        plans = requests.post(
            f"{BASE}/deepdive",
            headers=headers,
            json={
                "content": pricing_content,
                "extract": [
                    "plan_name",
                    "monthly_price",
                    "annual_price",
                    "features_per_plan",
                    "free_tier",
                ],
            },
            timeout=60,
        )
        plans.raise_for_status()
        results["pricing"] = plans.json()["data"]
    except Exception as e:
        results["pricing_error"] = str(e)
    return results
# Profile every configured competitor, pausing between requests to stay
# polite with the API.
print("\n=== COMPETITOR ANALYSIS ===")
competitor_analysis = {}
for competitor in RESEARCH_CONFIG["competitors"]:
    print(f"\nAnalyzing: {competitor}")
    competitor_analysis[competitor] = analyze_competitor(competitor)
    time.sleep(3)
Step 4: Collect Customer Sentiment Data
def gather_customer_sentiment(product_name, review_sites):
    """Gather customer reviews and sentiment from multiple review sites.

    Args:
        product_name: Product slug appended to each review-site prefix.
        review_sites: Iterable of URL prefixes, e.g. "g2.com/products/".

    Returns:
        A list of extracted review-summary dicts, each tagged with its
        "source" site. Sites that fail are skipped (error is printed).
    """
    headers = {"Authorization": f"Bearer {API_KEY}"}
    all_reviews = []
    for site in review_sites:
        url = f"https://{site}{product_name}"
        try:
            scrape_resp = requests.post(
                f"{BASE}/scrape",
                headers=headers,
                json={"url": url, "render_js": True, "format": "markdown"},
                timeout=60,  # fix: avoid hanging on a slow review page
            )
            # fix: fail loudly on HTTP errors rather than KeyError below.
            scrape_resp.raise_for_status()
            content = scrape_resp.json()["markdown"]
            reviews = requests.post(
                f"{BASE}/deepdive",
                headers=headers,
                json={
                    "content": content,
                    "extract": [
                        "overall_rating",
                        "review_count",
                        "common_pros",
                        "common_cons",
                        "top_positive_review",
                        "top_negative_review",
                    ],
                },
                timeout=60,
            )
            reviews.raise_for_status()
            data = reviews.json()["data"]
            data["source"] = site
            all_reviews.append(data)
            print(f" {site}: done")
        except Exception as e:
            print(f" {site}: error - {e}")
        time.sleep(2)
    return all_reviews
print("\n=== CUSTOMER SENTIMENT ===")
# fix: the original overwrote `sentiment` on each iteration and discarded
# every competitor's results; keep them keyed by competitor instead.
sentiment_by_competitor = {}
for comp in RESEARCH_CONFIG["competitors"][:2]:
    print(f"\nReviews for: {comp}")
    sentiment = gather_customer_sentiment(comp, RESEARCH_CONFIG["review_sites"])
    sentiment_by_competitor[comp] = sentiment
Step 5: Store and Structure Your Research
import sqlite3
def init_research_db(db_path="market_research.db"):
    """Create the market-research tables if they do not already exist.

    Args:
        db_path: Path to the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        # Generic metric store: one row per (market, data_type, metric).
        c.execute('''
            CREATE TABLE IF NOT EXISTS market_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                market TEXT,
                data_type TEXT,
                metric TEXT,
                value TEXT,
                source TEXT,
                collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # One profile row per competitor per market.
        c.execute('''
            CREATE TABLE IF NOT EXISTS competitor_profiles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                competitor TEXT,
                market TEXT,
                value_prop TEXT,
                pricing_model TEXT,
                target_audience TEXT,
                key_features TEXT,
                sentiment_rating TEXT,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # fix: close the connection even if table creation raises.
        conn.close()
def save_research_data(market, data_type, findings, db_path="market_research.db"):
    """Save research findings to the market_data table.

    Args:
        market: Market label the findings belong to.
        data_type: Category of finding, e.g. "industry" or "trend".
        findings: Iterable of dicts with optional "metric", "value",
            and "source" keys; missing keys are stored as empty strings.
        db_path: Path to the SQLite database. New parameter, defaults to
            the previously hard-coded path so existing callers and the
            db_path parameter of init_research_db stay consistent.
    """
    conn = sqlite3.connect(db_path)
    try:
        # executemany: single prepared statement instead of one per row.
        conn.executemany(
            "INSERT INTO market_data (market, data_type, metric, value, source) VALUES (?, ?, ?, ?, ?)",
            [
                (market, data_type, item.get("metric", ""),
                 str(item.get("value", "")), item.get("source", ""))
                for item in findings
            ],
        )
        conn.commit()
    finally:
        # fix: close the connection even if an insert raises.
        conn.close()
# Create the database schema before any findings are stored.
init_research_db()
Step 6: Generate a Market Research Report
def generate_market_report(market, db_path="market_research.db"):
    """Assemble a markdown market-research report from stored data.

    Args:
        market: Market label to report on (matches rows' `market` column).
        db_path: Path to the SQLite database.

    Returns:
        The full report as a single markdown string with overview,
        competitive-landscape, and trend sections.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        report = []
        report.append("#" * 60)
        report.append(f"# MARKET RESEARCH REPORT: {market.upper()}")
        report.append(f"# Generated: {datetime.now().strftime('%Y-%m-%d')}")
        report.append("#" * 60)
        # Market overview — newest rows first so fresh data leads.
        report.append("\n## MARKET OVERVIEW")
        c.execute(
            "SELECT metric, value, source FROM market_data WHERE market = ? AND data_type = 'industry' ORDER BY collected_at DESC",
            (market,)
        )
        for row in c.fetchall():
            report.append(f"- **{row[0]}**: {row[1]} (source: {row[2]})")
        # Competitive landscape — one subsection per competitor.
        report.append("\n## COMPETITIVE LANDSCAPE")
        c.execute(
            "SELECT competitor, value_prop, pricing_model, target_audience FROM competitor_profiles WHERE market = ?",
            (market,)
        )
        for row in c.fetchall():
            report.append(f"\n### {row[0]}")
            report.append(f"- Value prop: {row[1]}")
            report.append(f"- Pricing: {row[2]}")
            report.append(f"- Target: {row[3]}")
        # Key trends
        report.append("\n## KEY TRENDS")
        c.execute(
            "SELECT metric, value FROM market_data WHERE market = ? AND data_type = 'trend'",
            (market,)
        )
        for row in c.fetchall():
            report.append(f"- {row[0]}: {row[1]}")
        return "\n".join(report)
    finally:
        # fix: the original skipped conn.close() whenever a query raised.
        conn.close()
# Build the report for the configured market and print it to the console.
report = generate_market_report(RESEARCH_CONFIG["market"])
print(report)
Step 7: Schedule Automated Research Updates
import schedule
def run_research_update():
    """Run a full market-research collection cycle and write a report.

    Searches every configured keyword, re-analyzes every competitor, then
    renders the report to reports/market_research_YYYYMMDD.md.
    """
    import os  # local import keeps the tutorial's top-of-file imports unchanged

    print(f"[{datetime.now()}] Starting market research update...")
    # Gather industry data
    for keyword in RESEARCH_CONFIG["keywords"]:
        results = search_industry_data(keyword)
        # Process and store results...
    # Update competitor profiles
    for comp in RESEARCH_CONFIG["competitors"]:
        analysis = analyze_competitor(comp)
        # Store updated profiles...
    # Generate report
    report = generate_market_report(RESEARCH_CONFIG["market"])
    timestamp = datetime.now().strftime("%Y%m%d")
    # fix: open() fails with FileNotFoundError if reports/ does not exist.
    os.makedirs("reports", exist_ok=True)
    with open(f"reports/market_research_{timestamp}.md", "w", encoding="utf-8") as f:
        f.write(report)
    print("Research update complete.")
# Run weekly: register the job with the scheduler.
schedule.every().monday.at("09:00").do(run_research_update)
# NOTE(review): schedule only *registers* the job — a driver loop such as
# `while True: schedule.run_pending(); time.sleep(60)` must run somewhere
# for the weekly job to actually fire; confirm where that loop lives.
# Or run once
run_research_update()
Common Issues and Solutions
Issue: Paywalled industry reports
Sites like Statista and Gartner put reports behind paywalls. ScrapeForge can extract preview data, abstracts, and key statistics that are often visible without subscription. For full reports, look for free press releases or summaries.
Issue: Inconsistent data across sources
Different sources report different market sizes. Always note the source and methodology. Store all values with source attribution so you can identify outliers.
Issue: Competitor pages change structure frequently
DeepDive's AI extraction is resilient to layout changes since it understands content semantically, not via CSS selectors. This is a major advantage over fragile BeautifulSoup scrapers.
Next Steps
- Combine with /blog/build-competitive-intelligence-dashboard for ongoing competitor monitoring
- Export research data to visualization tools (Tableau, Google Data Studio)
- Add automated alerts when market conditions change significantly
- See /compare/serpapi for more search API options
Automate your market research with 500 free credits. No credit card required -- sign up and start collecting data in minutes.