def scrape_reddit_subreddit(subreddit, pages=5):
    """Scrape posts from a Reddit subreddit's hot listing.

    Args:
        subreddit: Subreddit name without the "r/" prefix.
        pages: Number of listing pages to fetch.

    Returns:
        pandas.DataFrame of posts, deduplicated by title.
    """
    posts = []
    # Reddit blocks the default requests User-Agent, so send a browser-like one.
    # (Hoisted out of the loop — it never changes between pages.)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"}
    for page in range(pages):
        url = f"https://www.reddit.com/r/{subreddit}/hot/?page={page}"
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 429:
            # Back off on rate limiting, then retry the next iteration.
            print("Rate limited — waiting 60s")
            time.sleep(60)
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        # Reddit's redesign exposes post data as attributes on <shreddit-post>.
        for post in soup.select("shreddit-post"):
            posts.append({
                "title": post.get("post-title", ""),
                "score": post.get("score", 0),
                "comments": post.get("comment-count", 0),
                "author": post.get("author", ""),
                "url": f"https://reddit.com{post.get('content-href', '')}",
            })
        time.sleep(2)  # polite delay between pages
    df = pd.DataFrame(posts)
    df.drop_duplicates(subset=["title"], inplace=True)
    print(f"Scraped {len(df)} posts from r/{subreddit}")
    return df
</code></pre> <br>
<h2>Step 3: Scrape YouTube (Data API + HTML)</h2> <br> <p>YouTube's Data API v3 provides structured data. Register at <code>console.cloud.google.com</code>:</p> <br> <pre><code> import requests import pandas as pdAPI_KEY = "your_youtube_api_key"
def search_youtube(query, max_results=50): """Search YouTube for videos matching a query.""" videos = [] url = "https://www.googleapis.com/youtube/v3/search" next_token = None
while len(videos) < max_results:
params = {
"part": "snippet",
"q": query,
"type": "video",
"maxResults": min(50, max_results - len(videos)),
"key": API_KEY,
"order": "relevance",
}
if next_token:
params["pageToken"] = next_token
response = requests.get(url, params=params).json()
for item in response.get("items", []):
videos.append({
"title": item["snippet"]["title"],
"channel": item["snippet"]["channelTitle"],
"published": item["snippet"]["publishedAt"],
"video_id": item["id"]["videoId"],
"url": f"https://youtube.com/watch?v={item['id']['videoId']}",
"description": item["snippet"]["description"][:200],
})
next_token = response.get("nextPageToken")
if not next_token:
break
df = pd.DataFrame(videos)
print(f"Found {len(df)} YouTube videos for '{query}'")
return df
def get_video_stats(df):
    """Get view counts and engagement for YouTube videos.

    Args:
        df: DataFrame with a "video_id" column (as returned by search_youtube).

    Returns:
        The same DataFrame with "views", "likes", and "engagement_rate"
        columns added, sorted by views descending.
    """
    stats_url = "https://www.googleapis.com/youtube/v3/videos"
    video_ids = df["video_id"].tolist()
    stats_map = {}
    # The videos endpoint accepts at most 50 IDs per request, so request
    # in batches — a single joined string silently drops stats past 50.
    for start in range(0, len(video_ids), 50):
        batch = ",".join(video_ids[start:start + 50])
        params = {"part": "statistics", "id": batch, "key": API_KEY}
        response = requests.get(stats_url, params=params, timeout=30).json()
        for item in response.get("items", []):
            stats = item["statistics"]
            stats_map[item["id"]] = {
                "views": int(stats.get("viewCount", 0)),
                "likes": int(stats.get("likeCount", 0)),
                "comments": int(stats.get("commentCount", 0)),
            }
    df["views"] = df["video_id"].map(lambda x: stats_map.get(x, {}).get("views", 0))
    df["likes"] = df["video_id"].map(lambda x: stats_map.get(x, {}).get("likes", 0))
    # replace(0, 1) avoids division by zero for videos with no views.
    df["engagement_rate"] = (df["likes"] / df["views"].replace(0, 1) * 100).round(2)
    return df.sort_values("views", ascending=False)
</code></pre> <br>
<h2>Step 4: Scrape LinkedIn (Public Company Pages)</h2> <br> <p>Company pages on LinkedIn are often publicly accessible:</p> <br> <pre><code> import requests from bs4 import BeautifulSoupdef scrape_linkedin_company(company_slug): """Scrape public data from a LinkedIn company page.""" url = f"https://www.linkedin.com/company/{company_slug}/" headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"}
response = requests.get(url, headers=headers, timeout=30)
if response.status_code != 200:
print(f"Failed: {response.status_code}")
return None
soup = BeautifulSoup(response.text, "html.parser")
return {
"name": soup.select_one("h1") and soup.select_one("h1").get_text(strip=True),
"industry": None, # Often JS-rendered — need SearchHive
"employees": None,
"url": url,
}
</code></pre> <br>
<h2>Step 5: Use SearchHive for JS-Heavy Platforms</h2> <br> <p>Most social media platforms render content with JavaScript. SearchHive handles this automatically:</p> <br> <pre><code> from searchhive import SearchHive import pandas as pdclient = SearchHive(api_key="your-api-key")
Scrape a LinkedIn company page (JS-rendered content)
result = client.scrape( url="https://www.linkedin.com/company/google/", selector="h1, .org-top-card-summary__tagline, dd, .company-size" ) print(f"Company: {result['data']}")
Scrape a Reddit subreddit
result = client.scrape( url="https://www.reddit.com/r/SaaS/hot/", selector="shreddit-post" )
Batch scrape multiple social pages
social_urls = [ "https://www.reddit.com/r/startups/hot/", "https://www.reddit.com/r/SaaS/hot/", "https://www.reddit.com/r/Entrepreneur/hot/", ]
results = client.batch(social_urls, selector="shreddit-post") for r in results: print(f"{r['url']}: {len(r['data'].get('shreddit-post', []))} posts") </code></pre> <br>
<h3>SwiftSearch: Find Social Media Mentions</h3> <br> <pre><code> from searchhive import SearchHiveclient = SearchHive(api_key="your-api-key")
Track brand mentions across platforms
queries = [ "site:reddit.com "your-brand-name"", "site:youtube.com "your-product" review", "site:linkedin.com "your-competitor" hiring", ]
for query in queries: results = client.search(query) print(f"\nQuery: {query}") for r in results[:5]: print(f" [{r['title']}] {r['url']}") </code></pre> <br>
<h2>Step 6: Cross-Platform Data Analysis</h2> <br> <p>Combine data from multiple platforms for a comprehensive view:</p> <br> <pre><code> import pandas as pddef merge_social_data(reddit_df, youtube_df=None): """Merge data from multiple social platforms.""" reddit_df["platform"] = "reddit" reddit_df["engagement"] = reddit_df["score"] + reddit_df["comments"]
if youtube_df is not None:
youtube_df["platform"] = "youtube"
youtube_df["engagement"] = youtube_df["views"]
combined = pd.concat([reddit_df], ignore_index=True)
if youtube_df is not None:
combined = pd.concat([combined, youtube_df], ignore_index=True)
combined = combined.sort_values("engagement", ascending=False)
combined.to_csv("social_market_research.csv", index=False)
return combined
def keyword_analysis(df, text_column="title"): """Extract trending keywords across platforms.""" from collections import Counter stop_words = {"the", "a", "an", "is", "are", "for", "in", "on", "to", "and", "of", "i", "it", "my", "with", "this", "that"} words = [] for text in df[text_column].dropna(): words.extend(w.lower() for w in str(text).split() if w.lower() not in stop_words and len(w) > 2) return Counter(words).most_common(20) </code></pre> <br>
<h2>Complete Working Example</h2> <br> <pre><code> import requests import pandas as pd from collections import Counterdef social_media_market_research(brand, subreddits): """Multi-platform market research pipeline.""" headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"}
# 1. Scrape Reddit discussions
all_posts = []
for sub in subreddits:
for page in range(3):
url = f"https://www.reddit.com/r/{sub}/search/?q={brand}&sort=relevance&page={page}"
try:
r = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(r.text, "html.parser")
for post in soup.select("shreddit-post"):
all_posts.append({
"title": post.get("post-title", ""),
"score": post.get("score", 0),
"platform": "reddit",
"subreddit": sub,
})
except Exception as e:
print(f"Error on r/{sub}: {e}")
import time
time.sleep(2)
# 2. Analyze
df = pd.DataFrame(all_posts)
if len(df) == 0:
print("No results found")
return df
df.drop_duplicates(subset=["title"], inplace=True)
df.to_csv(f"social_research_{brand}.csv", index=False)
print(f"\n{'='*50}")
print(f"Market Research: '{brand}'")
print(f"Total mentions: {len(df)}")
print(f"Subreddits: {df['subreddit'].value_counts().to_dict()}")
print(f"\nTop keywords:")
stop = {"the","a","an","is","are","for","in","on","to","and","of","i","it","my","with"}
words = []
for t in df["title"].dropna():
words.extend(w.lower() for w in str(t).split() if w.lower() not in stop and len(w) > 2)
for word, count in Counter(words).most_common(15):
print(f" {word}: {count}")
return df
if name == "main": social_media_market_research( brand="Notion", subreddits=["SaaS", "productivity", "startups", "Entrepreneur"] ) </code></pre> <br>
<h2>Common Issues</h2> <br> <p><strong>Login walls on Twitter/X and Instagram:</strong> These platforms require authentication for most content. Use official APIs (Twitter API at $100/mo) or SearchHive's browser rendering.</p> <br> <p><strong>CAPTCHA challenges:</strong> LinkedIn and Facebook frequently show CAPTCHAs. SearchHive solves these automatically.</p> <br> <p><strong>Rate limiting:</strong> Every platform has limits. Add delays (2–5 seconds minimum), rotate proxies, and respect 429 responses.</p> <br> <p><strong>Platform API changes:</strong> Twitter/X has severely restricted their API. Facebook's Graph API requires app review. Always have fallback strategies.</p> <br> <p><strong>JavaScript rendering:</strong> Most social media content loads dynamically. Use SearchHive instead of raw <code>requests</code> for reliable extraction.</p> <br> <p><strong>Data quality:</strong> Social media data is noisy. Filter out bots, spam, and low-quality posts before analysis.</p> <br> <h2>Next Steps</h2> <br> <li><strong>Deepen with Reddit data</strong> — See <a href="/blog/how-to-scrape-reddit-data-for-market-research">How to Scrape Reddit Data for Market Research</a> for advanced Reddit techniques</li> <li><strong>Sentiment analysis</strong> — Use TextBlob or NLTK for automated sentiment scoring</li> <li><strong>Automate reports</strong> — Schedule weekly data collection and generate trend reports</li> <li><strong>Competitor tracking</strong> — Set up searches for competitor names across platforms</li> <li><strong>Visualize trends</strong> — Plot engagement over time with matplotlib to spot emerging patterns</li> <br> <hr> <br> <p>Get started with SearchHive's <a href="https://searchhive.dev">free tier</a> — 100 requests per month with JS rendering, proxy rotation, and CAPTCHA solving included on every plan. No credit card required. <a href="https://searchhive.dev/docs">Read the docs</a>.</p> <br>