# Authenticate with Reddit's API via PRAW using script-app credentials
# (create these at reddit.com/prefs/apps). A descriptive user_agent is
# required by Reddit's API rules.
reddit = praw.Reddit( client_id="your_client_id", client_secret="your_client_secret", user_agent="market-research-scraper/1.0" )
def scrape_subreddit_posts(subreddit_name, limit=500):
    """Get posts from a subreddit using PRAW.

    Args:
        subreddit_name: Name of the subreddit (without the "r/" prefix).
        limit: Maximum number of hot posts to fetch.

    Returns:
        pandas.DataFrame with one row per post.
    """
    posts = []
    subreddit = reddit.subreddit(subreddit_name)
    for post in subreddit.hot(limit=limit):
        posts.append({
            "title": post.title,
            "score": post.score,
            "upvote_ratio": post.upvote_ratio,
            "num_comments": post.num_comments,
            # Deleted accounts come back as None from PRAW.
            "author": str(post.author) if post.author else "[deleted]",
            "created_utc": post.created_utc,
            "url": post.url,
            "selftext": post.selftext[:500],  # Truncate long posts
            "flair": post.link_flair_text,
        })
    df = pd.DataFrame(posts)
    print(f"Scraped {len(posts)} posts from r/{subreddit_name}")
    return df
</code></pre> <br>
<h2>Step 3: Scrape with HTML Parsing</h2> <br> <p>No authentication required. Good for public subreddit pages:</p> <br> <pre><code>
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
def scrape_reddit_html(subreddit, sort="hot", pages=5):
    """Scrape Reddit posts via HTML parsing.

    Fetches `pages` listing pages and parses the <shreddit-post> custom
    elements from Reddit's current frontend markup.

    Args:
        subreddit: Subreddit name (without "r/").
        sort: Listing sort order ("hot", "new", "top", ...).
        pages: Number of listing pages to request.

    Returns:
        pandas.DataFrame of posts, de-duplicated by title.
    """
    all_posts = []
    for page in range(pages):
        url = f"https://www.reddit.com/r/{subreddit}/{sort}/?page={page}"
        # Retry the SAME page on a 429 instead of skipping it — otherwise
        # a rate-limit response silently loses that page's posts.
        for _ in range(3):
            response = requests.get(url, headers=HEADERS, timeout=30)
            if response.status_code != 429:
                break
            print("Rate limited — waiting 60 seconds...")
            time.sleep(60)
        else:
            # Still rate-limited after retries; give up on this page.
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        for post in soup.select("shreddit-post"):
            try:
                all_posts.append({
                    "title": post.get("post-title", ""),
                    # NOTE(review): attribute values are strings — "score"
                    # arrives as text, not int; convert downstream if needed.
                    "score": post.get("score", 0),
                    "author": post.get("author", ""),
                    "url": f"https://reddit.com{post.get('content-href', '')}",
                    "subreddit": subreddit,
                })
            except (KeyError, TypeError):
                continue
        time.sleep(2)  # Rate limiting
    df = pd.DataFrame(all_posts)
    df.drop_duplicates(subset=["title"], inplace=True)
    print(f"Scraped {len(df)} posts from r/{subreddit}")
    return df
</code></pre> <br>
<h2>Step 4: Scrape Comments for Sentiment Analysis</h2> <br> <p>Comments are where the real insights live:</p> <br> <pre><code> def scrape_post_comments(subreddit, post_id): """Get comments from a specific Reddit post via PRAW.""" submission = reddit.submission(id=post_id) submission.comments.replace_more(limit=0) # Skip "load more" commentscomments = []
for comment in submission.comments.list():
comments.append({
"body": comment.body[:1000],
"score": comment.score,
"author": str(comment.author) if comment.author else "[deleted]",
"created_utc": comment.created_utc,
})
df = pd.DataFrame(comments)
print(f"Scraped {len(comments)} comments")
return df
</code></pre> <br>
<h2>Step 5: Search Reddit for Topic Research</h2> <br> <p>Reddit's search is powerful for finding discussions about specific topics:</p> <br> <pre><code> def search_reddit(query, subreddit="all", limit=200): """Search Reddit for posts matching a query.""" posts = [] for post in reddit.subreddit(subreddit).search(query, limit=limit, sort="relevance"): posts.append({ "title": post.title, "score": post.score, "num_comments": post.num_comments, "subreddit": post.subreddit.display_name, "url": f"https://reddit.com{post.permalink}", "selftext": post.selftext[:300], })df = pd.DataFrame(posts)
df.to_csv(f"reddit_search_{query.replace(' ', '_')}.csv", index=False)
print(f"Found {len(posts)} posts about '{query}'")
return df
</code></pre> <br>
<h2>Step 6: The SearchHive Approach</h2> <br> <p>SearchHive provides more flexibility — scrape Reddit without API credentials, handle rate limits automatically, and batch-process multiple subreddits:</p> <br> <pre><code>
from searchhive import SearchHive
import pandas as pd

client = SearchHive(api_key="your-api-key")
# Scrape a subreddit page
result = client.scrape( url="https://www.reddit.com/r/Python/hot/", selector="shreddit-post, h1" )
# Batch scrape multiple subreddits
subreddits = [ "https://www.reddit.com/r/python/hot/", "https://www.reddit.com/r/webdev/hot/", "https://www.reddit.com/r/datascience/hot/", ]
results = client.batch(subreddits, selector="shreddit-post")
for r in results:
    print(f"{r['url']}: {len(r['data'].get('shreddit-post', []))} posts")
</code></pre> <br>
<h3>SwiftSearch: Find Reddit Discussions by Topic</h3> <br> <pre><code>
from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")
# Find Reddit discussions about a product or topic
results = client.search('site:reddit.com "best project management tool"')
for r in results[:10]:
    print(f"[{r['title']}] {r['url']}")
    print(f" {r['snippet']}\n")
</code></pre> <br>
<h3>DeepDive: Crawl a Subreddit</h3> <br> <pre><code>
from searchhive import SearchHive

client = SearchHive(api_key="your-api-key")
# Crawl multiple pages of a subreddit
pages = client.deep_dive( url="https://www.reddit.com/r/SaaS/", max_depth=3, max_pages=50, url_pattern="reddit.com/r/SaaS/*" ) </code></pre> <br>
<h2>Step 7: Analyze Market Research Data</h2> <br> <p>Turn scraped Reddit data into insights:</p> <br> <pre><code> from collections import Counter import pandas as pddef analyze_sentiment(df): """Basic keyword-based sentiment analysis.""" positive_words = ["love", "great", "best", "awesome", "recommend", "amazing", "perfect"] negative_words = ["hate", "terrible", "worst", "awful", "waste", "broken", "slow", "expensive"]
def sentiment_score(text):
if not text:
return 0
text = str(text).lower()
return sum(1 for w in positive_words if w in text) - sum(1 for w in negative_words if w in text)
df["sentiment"] = df["selftext"].apply(sentiment_score)
print(f"Positive: {(df['sentiment'] > 0).sum()}")
print(f"Neutral: {(df['sentiment'] == 0).sum()}")
print(f"Negative: {(df['sentiment'] < 0).sum()}")
return df
def top_keywords(df, column="title", n=20): """Extract most common words from post titles.""" stop_words = {"the", "a", "an", "is", "are", "was", "were", "for", "in", "on", "to", "and", "of", "i", "it", "my", "with"} words = [] for text in df[column].dropna(): words.extend(w.lower() for w in str(text).split() if w.lower() not in stop_words and len(w) > 2) return Counter(words).most_common(n) </code></pre> <br>
<h2>Complete Working Example</h2> <br> <pre><code>
import praw
import pandas as pd
from collections import Counter

# Configure Reddit API
# Authenticate with Reddit's API via PRAW (script-app credentials from
# reddit.com/prefs/apps).
reddit = praw.Reddit( client_id="your_client_id", client_secret="your_client_secret", user_agent="market-research/1.0" )
def reddit_market_research(query, subreddits, limit=100):
    """Complete Reddit market research pipeline.

    Searches each subreddit for `query`, saves the combined results to
    reddit_research.csv, prints summary statistics, and returns the
    DataFrame sorted by score (highest first).

    Args:
        query: Search query string.
        subreddits: Iterable of subreddit names to search.
        limit: Max results per subreddit.

    Returns:
        pandas.DataFrame of matching posts (possibly empty).
    """
    all_posts = []
    for sub in subreddits:
        for post in reddit.subreddit(sub).search(query, limit=limit):
            all_posts.append({
                "title": post.title,
                "score": post.score,
                "comments": post.num_comments,
                "subreddit": sub,
                "url": f"https://reddit.com{post.permalink}",
                "text": post.selftext[:500],
            })
    df = pd.DataFrame(all_posts)
    if df.empty:
        # Guard: sort_values/value_counts raise on a column-less empty frame.
        print(f"No posts found for '{query}'")
        return df
    df = df.sort_values("score", ascending=False)
    df.to_csv("reddit_research.csv", index=False)
    # Analysis
    print(f"\n{'='*50}")
    print(f"Found {len(df)} posts about '{query}'")
    print(f"Top subreddits: {df['subreddit'].value_counts().to_dict()}")
    print(f"Average score: {df['score'].mean():.1f}")
    print(f"Average comments: {df['comments'].mean():.1f}")
    print(f"\nTop keywords in titles:")
    for word, count in top_keywords(df)[:10]:
        print(f" {word}: {count}")
    return df
if name == "main": df = reddit_market_research( query="Notion vs Confluence", subreddits=["SaaS", "productivity", "sysadmin", "cscareerquestions"], limit=50 ) </code></pre> <br>
<h2>Common Issues</h2> <br> <br> <p><strong>Empty post content:</strong> Many posts are link-only with no <code>selftext</code>. Handle <code>None</code> values.</p> <br> <p><strong>Deleted posts/comments:</strong> Reddit allows post deletion. Check <code>if post.author</code> before accessing author data.</p> <br> <p><strong>Shadowbanned accounts:</strong> If scraping without OAuth, use a real User-Agent and add delays. SearchHive's proxy rotation avoids IP-level blocks.</p> <br> <p><strong>JSON responses instead of HTML:</strong> Reddit sometimes returns JSON. Check the <code>Content-Type</code> header and handle both formats.</p> <br> <p><strong>Search results limited to 1,000:</strong> Reddit's search API caps at ~1,000 results per query. For more, use different query terms or time filters.</p> <br> <h2>Next Steps</h2> <br> <li>**Sentiment analysis** — Use NLTK, TextBlob, or a simple keyword approach to gauge opinion</li> <li>**Trend tracking** — Schedule weekly scrapes to track how sentiment changes over time</li> <li>**Competitor monitoring** — Set up searches for your competitors' products</li> <li>**Multi-platform research** — Combine Reddit data with [social media scraping](/blog/how-to-scrape-social-media-data-for-market-research) for a complete picture</li> <li>**Data visualization** — Use matplotlib or Plotly to create trend charts and word clouds</li> <br> <hr> <br> <p>Get started with SearchHive's <a href="https://searchhive.dev">free tier</a> — 100 requests per month, no credit card required. Handles proxy rotation, CAPTCHA solving, and JS rendering on every request. <a href="https://searchhive.dev/docs">Read the docs</a>.</p> <br>