How to Build a Sentiment Analyzer with Web Data and Python
Sentiment analysis transforms raw text from reviews, social media, and news articles into actionable data about public opinion. This tutorial shows you how to build a complete sentiment analyzer that collects web data with SearchHive and classifies sentiment using Python's NLP libraries.
Key Takeaways
- A practical sentiment analyzer needs two parts: data collection (web scraping) and sentiment classification (NLP)
- SearchHive's ScrapeForge API collects review text from any website, handling JavaScript rendering automatically
- TextBlob provides fast, lightweight sentiment scoring without GPU requirements
- VADER is optimized for social media text with emoticons, slang, and shorthand
- The complete pipeline runs in under 60 lines of Python and works on SearchHive's free tier
Prerequisites
- Python 3.8 or later
- SearchHive API key (sign up free)
- The textblob, vaderSentiment, and searchhive packages, installed with:
pip install textblob vaderSentiment searchhive
Step 1: Collect Review Data with ScrapeForge
The first step is gathering text to analyze. SearchHive's ScrapeForge API extracts content from review sites, forums, and product pages -- including JavaScript-rendered content that basic scrapers miss:
from searchhive import ScrapeForge

# One client per API key; render_js=True executes client-side JavaScript so
# dynamically loaded review widgets are present in the scraped content.
client = ScrapeForge(api_key="YOUR_API_KEY")

# Scrape a product review page
result = client.scrape(
    "https://www.example.com/product/reviews",
    format="markdown",
    render_js=True
)

# Extract individual reviews using DeepDive
from searchhive import DeepDive

deep = DeepDive(api_key="YOUR_API_KEY")

# JSON-Schema-style spec describing the structure DeepDive should pull out
# of the scraped markdown. "rating" stays a string because review sites
# format ratings inconsistently ("4.5", "4/5", "****").
review_data = deep.extract(
    content=result.content,
    schema={
        "type": "object",
        "properties": {
            "product_name": {"type": "string"},
            "reviews": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "author": {"type": "string"},
                        "rating": {"type": "string"},
                        "text": {"type": "string"},
                        "date": {"type": "string"}
                    }
                }
            }
        }
    }
)

# NOTE(review): .data is presumably a dict mirroring the schema — confirm
# against the SearchHive SDK reference.
reviews = review_data.data.get("reviews", [])
print(f"Collected {len(reviews)} reviews")
ScrapeForge handles the hard parts: JavaScript rendering for dynamic review widgets, proxy rotation to avoid IP blocks, and automatic retries on transient failures.
Step 2: Set Up Sentiment Classification
Two popular libraries handle sentiment classification in Python:
TextBlob -- rule-based, fast, good for product reviews and formal text.
VADER -- optimized for social media; handles emoticons, slang, and capitalization.
Install both and choose based on your data source:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def textblob_sentiment(text):
    """Classify *text* with TextBlob and return a (label, polarity) pair.

    Polarity runs from -1 (very negative) to 1 (very positive); anything
    within +/-0.1 of zero is treated as neutral.
    """
    polarity = TextBlob(text).sentiment.polarity  # -1 to 1
    if polarity > 0.1:
        label = "positive"
    elif polarity < -0.1:
        label = "negative"
    else:
        label = "neutral"
    return label, polarity
def vader_sentiment(text):
    """Classify *text* with VADER and return a (label, compound) pair.

    The compound score runs from -1 (most negative) to 1 (most positive);
    the +/-0.05 cutoffs match the values used throughout this tutorial.

    The analyzer is built once and cached on the function: the original
    constructed a new SentimentIntensityAnalyzer (which reloads its
    lexicon) on every call -- needlessly expensive inside a loop.
    """
    analyzer = getattr(vader_sentiment, "_analyzer", None)
    if analyzer is None:
        analyzer = vader_sentiment._analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    compound = scores["compound"]  # -1 to 1
    if compound >= 0.05:
        return "positive", compound
    elif compound <= -0.05:
        return "negative", compound
    else:
        return "neutral", compound
TextBlob returns a polarity score from -1 (very negative) to 1 (very positive). VADER returns a compound score in the same range but with additional sub-scores for positive, negative, and neutral proportions.
Step 3: Analyze Collected Reviews
Combine data collection with sentiment analysis:
from searchhive import ScrapeForge, DeepDive
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Build the analyzer and both SearchHive clients once and reuse them:
# VADER loads its lexicon at construction time, and the clients carry the key.
analyzer = SentimentIntensityAnalyzer()
scrape = ScrapeForge(api_key="YOUR_API_KEY")
deep = DeepDive(api_key="YOUR_API_KEY")
def analyze_reviews(url):
    """Scrape *url*, extract its reviews, and score each one with VADER.

    Returns a list of dicts carrying the truncated review text, the raw
    rating string, a sentiment label, and the VADER compound score.
    Relies on the module-level ``scrape``, ``deep``, and ``analyzer``.
    """
    # Scrape the page (JS rendering on, markdown out).
    page = scrape.scrape(url, format="markdown", render_js=True)

    # Extract structured reviews from the scraped markdown.
    extraction_schema = {
        "type": "object",
        "properties": {
            "reviews": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "text": {"type": "string"},
                        "rating": {"type": "string"}
                    }
                }
            }
        }
    }
    extracted = deep.extract(content=page.content, schema=extraction_schema)

    def label_for(compound):
        # Standard cutoffs used throughout this tutorial: +/-0.05 is neutral.
        if compound >= 0.05:
            return "positive"
        if compound <= -0.05:
            return "negative"
        return "neutral"

    # Score every extracted review.
    scored = []
    for entry in extracted.data.get("reviews", []):
        body = entry.get("text", "")
        compound = analyzer.polarity_scores(body)["compound"]
        scored.append({
            "text": body[:200],
            "rating": entry.get("rating"),
            "sentiment": label_for(compound),
            "score": compound
        })
    return scored
# Run analysis
results = analyze_reviews("https://www.example.com/product/reviews")

# Preview the first five scored reviews; {:+.2f} keeps the sign visible.
for r in results[:5]:
    label = r["sentiment"].upper()
    print(f"[{label}] ({r['score']:+.2f}) {r['text'][:100]}...")
Step 4: Aggregate and Visualize Results
For competitive analysis or brand monitoring, aggregate sentiment across multiple pages and visualize the distribution:
import json
from collections import Counter
def aggregate_sentiment(results):
    """Summarize a list of scored reviews into aggregate counts.

    Each item in *results* must carry a "sentiment" label and a numeric
    "score". Returns the total, per-label counts, the mean score rounded
    to 3 places (0 for an empty input), and a label->count distribution
    that, like collections.Counter, omits labels that never occur.
    """
    # A single Counter pass replaces the original's intermediate labels
    # list, three O(n) list.count() scans, and a separate Counter build.
    counts = Counter(r["sentiment"] for r in results)
    avg_score = sum(r["score"] for r in results) / len(results) if results else 0
    return {
        "total_reviews": len(results),
        "positive": counts["positive"],
        "negative": counts["negative"],
        "neutral": counts["neutral"],
        "average_score": round(avg_score, 3),
        "distribution": dict(counts)
    }
# Aggregate the per-review results from Step 3 into one summary dict.
summary = aggregate_sentiment(results)
print(json.dumps(summary, indent=2))
Output:
{
"total_reviews": 47,
"positive": 31,
"negative": 8,
"neutral": 8,
"average_score": 0.342,
"distribution": {
"positive": 31,
"negative": 8,
"neutral": 8
}
}
Step 5: Build a Multi-Source Pipeline
For brand monitoring, you'll want to collect data from multiple sources -- review sites, social media, news articles, and forums. Here's how to build a multi-source pipeline:
from searchhive import SwiftSearch, ScrapeForge, DeepDive
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json, time
API_KEY = "YOUR_API_KEY"

# Build the analyzer and all three clients once; they are reused for
# every URL the pipeline visits.
analyzer = SentimentIntensityAnalyzer()
search = SwiftSearch(api_key=API_KEY)
scrape = ScrapeForge(api_key=API_KEY)
deep = DeepDive(api_key=API_KEY)
def find_and_analyze(brand, num_sources=10):
    """Search the web for *brand* mentions and score every review found.

    Scrapes up to *num_sources* search-result pages, extracts review text
    from each, and returns a list of scored mentions. Pages that fail to
    scrape or parse are reported and skipped, so one dead source never
    aborts the whole run. Uses the module-level clients and analyzer.
    """
    review_schema = {
        "type": "object",
        "properties": {
            "reviews": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "text": {"type": "string"}
                    }
                }
            }
        }
    }

    # Find mentions across the web.
    hits = search.search(f"{brand} reviews 2024", num_results=num_sources)
    source_urls = [hit.url for hit in hits if hit.url]

    collected = []
    for source_url in source_urls:
        try:
            page = scrape.scrape(source_url, format="markdown", render_js=True)
            parsed = deep.extract(content=page.content, schema=review_schema)
            for item in parsed.data.get("reviews", []):
                body = item.get("text", "")
                if len(body) > 20:  # very short snippets score unreliably
                    compound = analyzer.polarity_scores(body)["compound"]
                    if compound >= 0.05:
                        tone = "positive"
                    elif compound <= -0.05:
                        tone = "negative"
                    else:
                        tone = "neutral"
                    collected.append({
                        "source": source_url,
                        "text": body[:300],
                        "score": compound,
                        "sentiment": tone
                    })
            time.sleep(1)  # polite pacing between sources
        except Exception as e:
            print(f"Failed {source_url}: {e}")
    return collected
# Analyze a brand
reviews = find_and_analyze("Acme Corp", num_sources=8)

# Summary
pos = sum(1 for r in reviews if r["sentiment"] == "positive")
neg = sum(1 for r in reviews if r["sentiment"] == "negative")
neu = sum(1 for r in reviews if r["sentiment"] == "neutral")
# Guard against division by zero when nothing could be scraped.
avg = sum(r["score"] for r in reviews) / len(reviews) if reviews else 0
print(f"Brand sentiment: {pos} positive, {neg} negative, {neu} neutral (avg: {avg:+.3f})")

# Persist the raw per-review results for dashboards or later comparison.
with open("sentiment_results.json", "w") as f:
    json.dump(reviews, f, indent=2)
Step 6: Handle Common Issues
Short or empty reviews. Filter out reviews under 20 characters -- they produce unreliable sentiment scores. VADER handles short text better than TextBlob for this reason.
Mixed sentiment. Reviews like "Great food but terrible service" score near zero. For detailed analysis, split text into sentences and score each one individually:
import re
def sentence_level_sentiment(text):
    """Score each sentence of *text* separately; return the compound scores.

    Splitting on ., !, ? surfaces mixed opinions ("great food but terrible
    service") that would average out near zero when scored as one blob.
    Fragments of 10 or fewer non-whitespace-trimmed characters are skipped.
    Uses the module-level ``analyzer``.
    """
    scores = []
    for sentence in re.split(r'[.!?]+', text):
        # The strip() is only for the length filter; the unstripped
        # sentence is what gets scored, matching the original behavior.
        if len(sentence.strip()) > 10:
            scores.append(analyzer.polarity_scores(sentence)["compound"])
    return scores
Domain-specific language. Generic sentiment models may misinterpret domain terms. For specialized use cases (medical reviews, financial sentiment), consider fine-tuning a model or using a domain-specific lexicon.
Non-English text. TextBlob's built-in translation and language-detection helpers (including detect_language) were removed in recent releases because they relied on a deprecated Google Translate endpoint. Use the langdetect package to filter non-English text or route it to language-specific models.
Complete Code Example
from searchhive import SwiftSearch, ScrapeForge, DeepDive
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json, time
API_KEY = "YOUR_API_KEY"

# DeepDive extraction schema: an array of reviews, each with its text and
# raw rating string. Rating stays a string because sites format ratings
# inconsistently ("4.5", "4/5", "****").
REVIEW_SCHEMA = {
    "type": "object",
    "properties": {
        "reviews": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "text": {"type": "string"},
                    "rating": {"type": "string"}
                }
            }
        }
    }
}
def build_sentiment_pipeline(query, num_sources=10):
    """Search for *query*, scrape each result, and score every review found.

    Args:
        query: Web search query, e.g. "Samsung Galaxy S24 reviews".
        num_sources: Maximum number of search results to scrape.

    Returns:
        A list of dicts with the source URL, truncated review text, the
        VADER compound score, and a positive/negative/neutral label.
        Failing sources are reported and skipped (best-effort pipeline).
    """
    search = SwiftSearch(api_key=API_KEY)
    scrape = ScrapeForge(api_key=API_KEY)
    deep = DeepDive(api_key=API_KEY)
    analyzer = SentimentIntensityAnalyzer()

    results = search.search(query, num_results=num_sources)
    all_reviews = []
    for result in results:
        if not result.url:
            continue
        try:
            raw = scrape.scrape(result.url, format="markdown", render_js=True)
            data = deep.extract(content=raw.content, schema=REVIEW_SCHEMA)
            for r in data.data.get("reviews", []):
                # Reviews under 20 chars produce unreliable scores (Step 6).
                if len(r.get("text", "")) > 20:
                    score = analyzer.polarity_scores(r["text"])["compound"]
                    all_reviews.append({
                        "source": result.url,
                        "text": r["text"][:300],
                        "score": score,
                        "label": "positive" if score >= 0.05 else "negative" if score <= -0.05 else "neutral"
                    })
            time.sleep(1)  # polite pacing between sources
        except Exception as e:
            # Was a bare `except Exception: continue` that silently swallowed
            # every failure; report it (consistent with the Step 5 pipeline)
            # so dead sources are visible, then keep going.
            print(f"Failed {result.url}: {e}")
            continue
    return all_reviews
if __name__ == "__main__":
    # End-to-end demo: search, scrape, score, then persist and summarize.
    reviews = build_sentiment_pipeline("Samsung Galaxy S24 reviews", num_sources=8)
    with open("sentiment_output.json", "w") as f:
        json.dump(reviews, f, indent=2)
    pos = sum(1 for r in reviews if r["label"] == "positive")
    neg = sum(1 for r in reviews if r["label"] == "negative")
    print(f"Results: {pos} positive, {neg} negative, {len(reviews)} total")
Next Steps
- Schedule regular runs: Automate the pipeline to track sentiment trends over time
- Compare competitors: Run the pipeline on competitor brands and benchmark scores side by side
- Alert on negative spikes: Set thresholds to trigger alerts when negative sentiment increases
- Integrate with dashboards: Feed results into Grafana or a custom dashboard for real-time monitoring
Get started with SearchHive's free tier -- 500 credits, no credit card required. See the API docs for the full SDK reference.
See also: /blog/how-to-scrape-trustpilot-reviews-for-brand-monitoring and /blog/how-to-build-a-competitive-intelligence-dashboard.