Scraping real estate listings gives you access to housing market data that APIs rarely provide -- pricing trends, property features, neighborhood details, and time-on-market metrics. Whether you're building a price estimator, market analysis tool, or investment tracker, Python makes it straightforward to collect this data at scale.
This tutorial covers scraping real estate listings using Python and SearchHive's ScrapeForge API, including handling JavaScript-rendered listings pages, extracting structured property data, and storing results for analysis.
Key Takeaways
- Most real estate sites use JavaScript rendering -- a simple `requests.get()` won't work
- SearchHive's ScrapeForge handles JS rendering and proxy rotation automatically
- Use DeepDive for AI-powered extraction of property details from listing pages
- Respect `robots.txt` and rate limits to avoid getting blocked
- Store scraped data in structured formats (CSV, SQLite) for analysis
Prerequisites
- Python 3.10+
- A SearchHive API key (free tier available)
- Basic Python knowledge
Install dependencies:
pip install requests pandas
Step 1: Scrape a Single Listing Page
Most real estate sites like Zillow, Redfin, and Realtor.com render listings with JavaScript. ScrapeForge handles this transparently.
import requests
import json
API_KEY = "your_searchhive_api_key"
def scrape_listing(url: str) -> dict:
    """Fetch a fully rendered listing page through ScrapeForge.

    Args:
        url: The listing page URL to scrape.

    Returns:
        Parsed JSON response containing the rendered page content.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    response = requests.get(
        "https://api.searchhive.dev/v1/scrapeforge",
        headers={"Authorization": f"Bearer {API_KEY}"},
        params={"url": url, "format": "json"},
        # JS rendering can be slow; without a timeout this call can hang forever.
        timeout=60,
    )
    response.raise_for_status()
    return response.json()
# Demo: scrape one listing and preview the first 500 characters of the JSON.
listing = scrape_listing("https://www.realtor.com/realestateandhomes-detail/example")
print(json.dumps(listing, indent=2)[:500])
ScrapeForge returns the full page content as structured JSON, including data from dynamically loaded elements that a basic HTTP client would miss.
Step 2: Extract Property Details with DeepDive
Raw page content needs parsing. SearchHive's DeepDive endpoint uses AI to extract specific fields from any page.
def extract_property_data(url: str) -> dict:
    """Extract structured property fields from a listing page via DeepDive.

    Args:
        url: The listing page URL to extract data from.

    Returns:
        Parsed JSON containing the requested property fields.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    response = requests.post(
        "https://api.searchhive.dev/v1/deepdive",
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "url": url,
            "prompt": (
                "Extract: property address, price, bedrooms, bathrooms, "
                "square footage, lot size, year built, property type, "
                "days on market, description, and neighborhood name. "
                "Return as JSON."
            )
        },
        # AI extraction on large pages can take a while; still bound it.
        timeout=120,
    )
    response.raise_for_status()
    return response.json()
# Demo: extract structured fields from a single listing page.
property_data = extract_property_data(
    "https://www.realtor.com/realestateandhomes-detail/example"
)
print(json.dumps(property_data, indent=2))
DeepDive returns structured JSON with the specific fields you requested, regardless of how the original page formats them. This handles differences between Zillow, Redfin, Realtor.com, and regional listing sites.
Step 3: Search for Listings in a Specific Area
Use SwiftSearch to find listing pages in a target area, then scrape each one.
def find_listings(city: str, state: str, num_results: int = 10) -> list:
    """Search for listing-page URLs in a target area via SwiftSearch.

    Args:
        city: Target city name.
        state: Target state abbreviation.
        num_results: Maximum number of search results to request.

    Returns:
        A list of listing-page URLs (possibly fewer than num_results).

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    query = f"houses for sale in {city} {state} site:realtor.com"
    response = requests.get(
        "https://api.searchhive.dev/v1/swiftsearch",
        headers={"Authorization": f"Bearer {API_KEY}"},
        params={"q": query, "num": num_results},
        # Search is fast; a short timeout prevents indefinite hangs.
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()
    return [r["url"] for r in data.get("results", [])]
# Demo: search for listing URLs in Austin, TX and show the first few.
listing_urls = find_listings("Austin", "TX", num_results=10)
print(f"Found {len(listing_urls)} listings")
for url in listing_urls[:3]:
    print(url)
Step 4: Build a Batch Scraping Pipeline
Combine search and extraction into a pipeline that processes multiple listings.
import time
import pandas as pd
def scrape_area(city: str, state: str, max_listings: int = 20) -> list:
    """Search for listings in a city and extract structured data from each.

    Args:
        city: Target city name.
        state: Target state abbreviation.
        max_listings: Maximum number of listing URLs to process.

    Returns:
        A list of property dicts, each tagged with its "source_url".
        Listings that fail to scrape are logged and skipped rather than
        aborting the whole run.
    """
    urls = find_listings(city, state, num_results=max_listings)
    properties = []
    for i, url in enumerate(urls):
        try:
            data = extract_property_data(url)
            data["source_url"] = url
            properties.append(data)
            print(f"[{i+1}/{len(urls)}] Scraped: {url}")
        except Exception as e:
            print(f"Error scraping {url}: {e}")
        finally:
            # Bug fix: the polite delay previously ran only on success, so a
            # string of failures would hit the API at full speed.
            time.sleep(1)
    return properties
# Demo: scrape up to 10 Austin, TX listings into a list of dicts.
properties = scrape_area("Austin", "TX", max_listings=10)
print(f"Collected {len(properties)} properties")
Step 5: Save Results to CSV
Convert the scraped properties into a DataFrame for easy analysis.
def save_to_csv(properties: list, filename: str = "listings.csv"):
    """Flatten scraped property dicts and write them to a CSV file.

    Args:
        properties: List of (possibly nested) property dicts.
        filename: Output CSV path; defaults to "listings.csv".
    """
    df = pd.json_normalize(properties)
    df.to_csv(filename, index=False)
    # Bug fix: the message previously printed the literal text "(unknown)"
    # instead of the actual filename (broken format string).
    print(f"Saved {len(df)} listings to {filename}")
    # Preview only the summary columns that are actually present, so a
    # batch missing e.g. "bathrooms" doesn't raise a KeyError.
    preview_cols = [c for c in ("address", "price", "bedrooms", "bathrooms")
                    if c in df.columns]
    if preview_cols:
        print(df[preview_cols].head())
save_to_csv(properties)
Step 6: Calculate Market Statistics
Once you have the data, compute basic market metrics.
def analyze_market(properties: list):
    """Compute and print basic price statistics for scraped properties.

    Args:
        properties: List of property dicts; each must have a "price" field
            (string or number, e.g. "$500,000" or 500000).

    Returns:
        Dict with total_listings, avg_price, median_price, min_price,
        max_price (prices as floats).
    """
    df = pd.DataFrame(properties)
    # Prices arrive in mixed formats ("$500,000", "500000", 500000) --
    # strip everything except digits and the decimal point before converting.
    df["price_num"] = df["price"].astype(str).str.replace(
        r"[^0-9.]", "", regex=True
    ).astype(float)
    stats = {
        "total_listings": len(df),
        "avg_price": df["price_num"].mean(),
        "median_price": df["price_num"].median(),
        "min_price": df["price_num"].min(),
        "max_price": df["price_num"].max(),
    }
    # Idiom fix: plain string (no placeholders) instead of a pointless f-string.
    print("Market Analysis:")
    print(f" Listings: {stats['total_listings']}")
    print(f" Average: ${stats['avg_price']:,.0f}")
    print(f" Median: ${stats['median_price']:,.0f}")
    print(f" Range: ${stats['min_price']:,.0f} - ${stats['max_price']:,.0f}")
    return stats
analyze_market(properties)
Step 7: Monitor New Listings Over Time
Set up periodic scraping to track new listings in your target areas.
import sqlite3
from datetime import datetime
def init_db(db_path: str = "listings.db"):
    """Open the tracking database, creating the listings table if needed.

    Args:
        db_path: Path to the SQLite database file. Defaults to
            "listings.db" so existing callers are unaffected; pass
            ":memory:" for an ephemeral database (e.g. in tests).

    Returns:
        An open sqlite3.Connection with the listings table ensured.
    """
    conn = sqlite3.connect(db_path)
    # URL is the natural key: one row per listing, refreshed on re-scrape.
    conn.execute("""
        CREATE TABLE IF NOT EXISTS listings (
            url TEXT PRIMARY KEY,
            address TEXT,
            price TEXT,
            bedrooms TEXT,
            bathrooms TEXT,
            sqft TEXT,
            scraped_at TEXT
        )
    """)
    conn.commit()
    return conn
def upsert_listing(conn, data: dict):
    """Insert or refresh a single listing row, keyed by its URL.

    Args:
        conn: Open sqlite3.Connection with the listings table present.
        data: Property dict; missing fields are stored as NULL.

    Bug fix: timestamps are now recorded in UTC using SQLite's own
    "YYYY-MM-DD HH:MM:SS" text format, so comparisons against
    datetime('now', ...) in queries (which is UTC) are correct. The old
    code stored naive local time via datetime.now().isoformat(), which
    skewed the "new listings" query by the local UTC offset.
    """
    from datetime import timezone  # local import: file only imports datetime
    scraped_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    conn.execute("""
        INSERT OR REPLACE INTO listings
        (url, address, price, bedrooms, bathrooms, sqft, scraped_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """, (
        data.get("source_url"),
        data.get("address"),
        data.get("price"),
        data.get("bedrooms"),
        data.get("bathrooms"),
        data.get("square_footage"),
        scraped_at
    ))
    conn.commit()
# Demo: persist the scraped properties and report recent additions.
conn = init_db()
for prop in properties:
    upsert_listing(conn, prop)

# Find new listings since last run
# NOTE(review): this relies on lexicographic comparison of timestamp text
# and on scraped_at being UTC like datetime('now') -- verify the stored
# format matches before depending on this window query.
new_listings = conn.execute("""
    SELECT address, price FROM listings
    WHERE scraped_at > datetime('now', '-1 hour')
""").fetchall()
print(f"New listings in last hour: {len(new_listings)}")
conn.close()
Complete Code Example
import requests
import time
import pandas as pd
import sqlite3
from datetime import datetime
API_KEY = "your_searchhive_api_key"
def extract_property_data(url: str) -> dict:
    """Extract structured property fields from a listing page via DeepDive.

    Args:
        url: The listing page URL to extract data from.

    Returns:
        Parsed JSON containing the requested property fields.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    response = requests.post(
        "https://api.searchhive.dev/v1/deepdive",
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "url": url,
            "prompt": (
                "Extract: address, price, bedrooms, bathrooms, "
                "sqft, year_built, property_type. Return as JSON."
            )
        },
        # AI extraction on large pages can take a while; still bound it.
        timeout=120,
    )
    response.raise_for_status()
    return response.json()
def find_listings(city: str, state: str) -> list:
    """Search for listing-page URLs in a target area via SwiftSearch.

    Args:
        city: Target city name.
        state: Target state abbreviation.

    Returns:
        A list of listing-page URLs (up to 10).

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    response = requests.get(
        "https://api.searchhive.dev/v1/swiftsearch",
        headers={"Authorization": f"Bearer {API_KEY}"},
        params={
            "q": f"houses for sale in {city} {state}",
            "num": 10
        },
        timeout=30,
    )
    # Consistency/bug fix: the Step 3 version checks the HTTP status, but
    # this one silently parsed error bodies. Fail loudly on non-2xx.
    response.raise_for_status()
    data = response.json()
    return [r["url"] for r in data.get("results", [])]
def run_scraper(city: str, state: str):
    """Find listings in the given area and print a summary line per property.

    Args:
        city: Target city name.
        state: Target state abbreviation.

    Failures on individual listings are logged and skipped so one bad page
    doesn't abort the run.
    """
    urls = find_listings(city, state)
    print(f"Found {len(urls)} listing URLs")
    for url in urls:
        try:
            data = extract_property_data(url)
            print(f" {data.get('address', 'N/A')}: "
                  f"{data.get('price', 'N/A')}")
        except Exception as e:
            print(f" Failed: {e}")
        finally:
            # Bug fix: the polite delay previously ran only on success, so
            # repeated failures would hit the API at full speed.
            time.sleep(1)
# Script entry point: scrape Austin, TX when run directly.
if __name__ == "__main__":
    run_scraper("Austin", "TX")
Common Issues
Getting blocked by real estate sites: Major sites like Zillow aggressively block scrapers. SearchHive's proxy rotation handles most of this, but if you're still getting blocked, try using ScrapeForge with geotargeting or reducing your request rate.
Inconsistent data formats: Different listing sites format prices differently ($500K vs $500,000 vs 500000). DeepDive normalizes these, but add your own parsing logic for edge cases.
Missing fields: Not every listing has all fields. Always check for None values and handle missing data gracefully in your analysis code.
Stale listings: Use the SQLite tracking approach to avoid re-scraping listings you've already seen. Check by URL before calling the API.
Next Steps
- Add map coordinates using a geocoding API for spatial analysis
- Build alerts for new listings that match specific criteria
- Create a Streamlit dashboard for interactive market exploration
- Schedule daily runs with cron or GitHub Actions
Start building your real estate data pipeline with 500 free credits from SearchHive. The free tier gives you full access to SwiftSearch, ScrapeForge, and DeepDive -- enough to scrape hundreds of listings before you need a paid plan.
See also: /blog/how-to-scrape-google-maps-data-with-python, /compare/firecrawl, /compare/scrapingbee