Website change monitoring is essential for competitive intelligence, price tracking, content auditing, and security alerting. Whether you want to know when a competitor updates their pricing, when a documentation page changes, or when a government regulation site posts new content, a Python-based monitoring system gives you automated, real-time visibility. This tutorial builds a complete website change monitor using Python and SearchHive.
Prerequisites
- Python 3.8+ with `requests`, `hashlib`, and `sqlite3` (built-in) installed
- A SearchHive API key (free tier includes 500 credits)
pip install requests
Key Takeaways
- Effective website monitoring needs content hashing to detect actual changes, not noise from timestamps or ad slots
- SearchHive's ScrapeForge with `render_js: True` captures the full rendered page content, including dynamic elements
- SQLite provides a simple, zero-dependency database for tracking change history
- The complete monitor handles multiple URLs, configurable check intervals, and notification alerts
Step 1: Choose Your Detection Strategy
There are three main approaches to detecting website changes:
- Content hashing: Hash the full page HTML or extracted text. Fast, but sensitive to noise (timestamps, ads, analytics scripts).
- Text diffing: Extract just the meaningful text content, hash that. More reliable than full HTML hashing.
- Element extraction: Target specific CSS selectors (prices, headings, paragraphs). Most precise, best for tracking specific data points.
This tutorial uses a combination -- extract meaningful content, hash it, and compare against the previous hash.
Step 2: Set Up the Database
Use SQLite to store URLs, their last-known content hashes, and change history:
import sqlite3
import hashlib
import json
import requests
import time
from datetime import datetime
def init_db(db_path="monitor.db"):
    """Create the monitoring tables if they do not already exist.

    Two tables are used: ``monitored_urls`` holds one row per tracked
    page (URL, optional CSS selector, last hash, check interval), and
    ``changes`` is an append-only log of detected content changes.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS monitored_urls (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            selector TEXT,
            last_hash TEXT,
            last_check TEXT,
            check_interval INTEGER DEFAULT 3600,
            name TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS changes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url_id INTEGER,
            detected_at TEXT,
            old_hash TEXT,
            new_hash TEXT,
            snippet TEXT,
            FOREIGN KEY (url_id) REFERENCES monitored_urls(id)
        )
        """,
    )
    conn = sqlite3.connect(db_path)
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
    conn.close()

init_db()
Step 3: Fetch and Hash Page Content
Use SearchHive's ScrapeForge to fetch rendered content and compute a content hash:
# SearchHive credentials and endpoint configuration.
API_KEY = "your_searchhive_key"  # replace with your own API key
BASE_URL = "https://api.searchhive.dev/v1"
# Bearer-token auth header sent with every ScrapeForge request.
sh_headers = {"Authorization": f"Bearer {API_KEY}"}
def get_content_hash(url, selector=None):
    """Fetch a page via SearchHive's ScrapeForge and hash its content.

    Parameters
    ----------
    url : str
        The page to fetch.
    selector : str | None
        Optional CSS selector; when given, only the matching elements'
        text is extracted, which reduces false positives from page noise.

    Returns
    -------
    tuple
        ``(sha256_hex, response_json)`` on success,
        ``(None, None)`` on any HTTP or network failure.
    """
    payload = {
        "url": url,
        "render_js": True  # render JavaScript so SPA content is captured
    }
    if selector:
        payload["extract"] = {
            "content": {
                "selector": selector,
                "fields": ["text"]
            }
        }
    try:
        response = requests.post(
            f"{BASE_URL}/scrape",
            headers=sh_headers,
            json=payload,
            # BUGFIX: the original call had no timeout, so a stalled
            # request would hang the monitor loop forever.
            timeout=60,
        )
    except requests.RequestException as e:
        # Network failures (DNS, connection reset, timeout) previously
        # propagated and crashed the caller; treat them like HTTP errors.
        print(f"Error fetching {url}: {e}")
        return None, None
    if response.status_code == 200:
        data = response.json()
        # Serialize deterministically (sorted keys) so identical content
        # always produces an identical hash.
        content_str = json.dumps(data, sort_keys=True)
        content_hash = hashlib.sha256(content_str.encode()).hexdigest()
        return content_hash, data
    else:
        print(f"Error fetching {url}: {response.status_code}")
        return None, None
Step 4: Check for Changes
Compare the current content hash against the stored hash and log any changes:
def check_url(url_id, url, selector=None):
    """Fetch one monitored URL, compare against the stored hash, and
    record any detected change in the ``changes`` table.

    Returns True when the content hash differs from the previous check,
    False otherwise (including the very first check and fetch failures).
    """
    new_hash, payload = get_content_hash(url, selector)
    if new_hash is None:
        return False

    db = sqlite3.connect("monitor.db")
    cur = db.cursor()

    # Load the hash recorded on the previous check, if any.
    cur.execute(
        "SELECT last_hash FROM monitored_urls WHERE id = ?",
        (url_id,)
    )
    record = cur.fetchone()
    previous_hash = record[0] if record else None

    has_changed = previous_hash is not None and new_hash != previous_hash
    if previous_hash is None:
        # Nothing to compare against yet -- this run records the baseline.
        print(f"[FIRST] {url}: initial hash recorded")
    elif has_changed:
        excerpt = json.dumps(payload, ensure_ascii=False)[:500]
        print(f"[CHANGED] {url}: hash changed from {previous_hash[:12]}... to {new_hash[:12]}...")
        # Append the change to the history log.
        cur.execute(
            """INSERT INTO changes (url_id, detected_at, old_hash, new_hash, snippet)
            VALUES (?, ?, ?, ?, ?)""",
            (url_id, datetime.now().isoformat(), previous_hash, new_hash, excerpt)
        )
    else:
        print(f"[SAME] {url}: no changes detected")

    # Record the latest hash and check time regardless of outcome.
    cur.execute(
        """UPDATE monitored_urls SET last_hash = ?, last_check = ? WHERE id = ?""",
        (new_hash, datetime.now().isoformat(), url_id)
    )
    db.commit()
    db.close()
    return has_changed
Step 5: Add and Manage Monitored URLs
Build functions to add new URLs and list monitored sites:
def add_url(url, name=None, selector=None, interval=3600):
    """Register a URL for monitoring; duplicates are reported, not added."""
    db = sqlite3.connect("monitor.db")
    try:
        db.execute(
            """INSERT INTO monitored_urls (url, name, selector, check_interval)
            VALUES (?, ?, ?, ?)""",
            (url, name, selector, interval)
        )
        db.commit()
        print(f"Added: {name or url}")
    except sqlite3.IntegrityError:
        # The UNIQUE constraint on url fired -- it is already tracked.
        print(f"Already monitoring: {url}")
    finally:
        db.close()
def list_monitored():
    """Print a summary of every monitored URL and return the rows.

    Returns
    -------
    list[tuple]
        ``(id, name, url, last_check, check_interval)`` per monitored URL.
    """
    conn = sqlite3.connect("monitor.db")
    cursor = conn.cursor()
    cursor.execute("SELECT id, name, url, last_check, check_interval FROM monitored_urls")
    rows = cursor.fetchall()
    conn.close()
    # BUGFIX: the original had a literal line break inside this f-string,
    # which is a SyntaxError; use the \n escape instead.
    print(f"\nMonitored URLs ({len(rows)}):")
    print("-" * 80)
    for row in rows:
        uid, name, url, last_check, interval = row
        print(f"  [{uid}] {name or url}")
        print(f"      {url}")
        print(f"      Last check: {last_check or 'Never'} | Interval: {interval}s")
    return rows
def get_change_history(url_id, limit=10):
    """Print and return the most recent changes for a monitored URL.

    Parameters
    ----------
    url_id : int
        Row id in ``monitored_urls``.
    limit : int
        Maximum number of history entries to show (newest first).

    Returns
    -------
    list[tuple]
        ``(detected_at, snippet)`` pairs, newest first.
    """
    conn = sqlite3.connect("monitor.db")
    cursor = conn.cursor()
    cursor.execute(
        """SELECT detected_at, snippet FROM changes
        WHERE url_id = ? ORDER BY detected_at DESC LIMIT ?""",
        (url_id, limit)
    )
    rows = cursor.fetchall()
    conn.close()
    for ts, snippet in rows:
        # BUGFIX: the original had a literal line break inside this
        # f-string, which is a SyntaxError; use the \n escape instead.
        print(f"\n[{ts}]")
        print(f"  {snippet[:200]}...")
    # Returning the rows (originally None) lets callers post-process
    # history without re-querying; existing callers are unaffected.
    return rows
Step 6: Add Notifications
Get alerted when changes are detected -- via email, Slack, or any webhook:
import json as json_mod
def notify_change(url, name, old_hash, new_hash, snippet):
    """Send a change notification via webhook and append to a log file.

    Parameters
    ----------
    url, name : str
        The monitored URL and its display name (name may be None).
    old_hash, new_hash : str
        Previous and current SHA-256 content hashes.
    snippet : str
        A short excerpt of the changed content.
    """
    # Option 1: Webhook (Slack, Discord, etc.)
    webhook_url = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
    # BUGFIX: skip the HTTP request while the placeholder URL is still
    # configured -- the original fired a real POST at it on every change.
    if webhook_url and "YOUR/WEBHOOK" not in webhook_url:
        payload = {
            "text": f"Website Change Detected: {name or url}",
            "blocks": [
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
                        # BUGFIX: the original embedded literal line breaks
                        # in this f-string (a SyntaxError); use \n escapes.
                        "text": (
                            f"*Change detected: {name or url}*\n"
                            f"URL: {url}\n"
                            f"Old hash: {old_hash[:16]}\n"
                            f"New hash: {new_hash[:16]}\n"
                            f"Snippet: {snippet[:300]}"
                        )
                    }
                }
            ]
        }
        try:
            requests.post(webhook_url, json=payload, timeout=10)
            print("  Notification sent via webhook")
        except Exception as e:
            # Best-effort: a failed webhook must not abort monitoring.
            print(f"  Webhook failed: {e}")
    # Option 2: Email via SearchHive (if configured)
    # Option 3: Write to a log file
    with open("change_log.txt", "a") as f:
        f.write(f"[{datetime.now().isoformat()}] CHANGE: {name or url}\n")
        f.write(f"  Snippet: {snippet[:200]}\n")
Step 7: Run the Monitor Continuously
Put it all together in a monitoring loop:
def run_monitor():
    """Main monitoring loop: poll each URL on its own interval, forever.

    Every 60 seconds the loop re-reads the URL list from the database,
    checks any URL whose interval has elapsed, logs detected changes,
    and sends notifications. Blocks indefinitely; stop with Ctrl+C.
    """
    print("Starting website change monitor...")
    while True:
        conn = sqlite3.connect("monitor.db")
        cursor = conn.cursor()
        cursor.execute(
            """SELECT id, url, name, selector, check_interval, last_check
            FROM monitored_urls"""
        )
        urls = cursor.fetchall()
        conn.close()
        now = datetime.now()
        for uid, url, name, selector, interval, last_check in urls:
            # Skip URLs whose check interval has not yet elapsed.
            if last_check:
                last_dt = datetime.fromisoformat(last_check)
                elapsed = (now - last_dt).total_seconds()
                if elapsed < interval:
                    continue  # Not time yet
            # Check for changes
            current_hash, content = get_content_hash(url, selector)
            if current_hash is None:
                continue  # fetch failed; retry on the next cycle
            conn2 = sqlite3.connect("monitor.db")
            cur2 = conn2.cursor()
            cur2.execute("SELECT last_hash FROM monitored_urls WHERE id = ?", (uid,))
            row = cur2.fetchone()
            old_hash = row[0] if row else None
            if old_hash and current_hash != old_hash:
                snippet = json_mod.dumps(content, ensure_ascii=False)[:500]
                cur2.execute(
                    """INSERT INTO changes (url_id, detected_at, old_hash, new_hash, snippet)
                    VALUES (?, ?, ?, ?, ?)""",
                    (uid, now.isoformat(), old_hash, current_hash, snippet)
                )
                notify_change(url, name, old_hash, current_hash, snippet)
            cur2.execute(
                "UPDATE monitored_urls SET last_hash = ?, last_check = ? WHERE id = ?",
                (current_hash, now.isoformat(), uid)
            )
            conn2.commit()
            conn2.close()
        # BUGFIX: report cycle completion BEFORE sleeping -- the original
        # printed it only after the 60-second sleep, with a stale message.
        print(f"Cycle complete at {datetime.now().strftime('%H:%M:%S')}")
        time.sleep(60)
if __name__ == "__main__":
    # Add some URLs to monitor (no-ops with a message if already registered)
    add_url("https://competitor.com/pricing", "Competitor Pricing", interval=3600)
    add_url("https://docs.example.com/changelog", "Docs Changelog", interval=1800)
    # Show what we are monitoring
    list_monitored()
    # Start monitoring (blocks forever; stop with Ctrl+C)
    run_monitor()
Common Issues
- False positives from dynamic content: Ads, social widgets, and analytics scripts change on every page load. Use a specific CSS selector to target only the content you care about.
- Rate limiting: Do not check the same URL too frequently. Respect the target site's server capacity and set appropriate intervals (minimum 5-10 minutes).
- Login-protected pages: Some pages require authentication. See how to scrape behind login for techniques to handle authenticated content.
- JavaScript-heavy pages: Single-page applications may not render content without JavaScript. SearchHive's `render_js: True` handles this automatically.
- Database growth: The changes table grows over time. Add a cleanup function to prune old entries (e.g., keep only the last 90 days).
Next Steps
- Deploy the monitor as a systemd service or Docker container for always-on monitoring
- Add diff comparison to show exactly what text changed between checks
- Integrate with DeepDive to get AI summaries of changes instead of raw snippets
- Build a simple web dashboard to view change history visually
Start Monitoring for Free
SearchHive's free tier gives you 500 credits to monitor up to 500 page loads with full JavaScript rendering. That covers dozens of URLs with hourly checks. Get your free API key and start monitoring today.
/tutorials/web-scraping-python-beginners-guide | /tutorials/how-to-scrape-github-data-for-developer-research | /compare/firecrawl