Instagram holds massive amounts of public data -- follower counts, engagement rates, post content, hashtag performance. Marketers, researchers, and competitive analysts all need this data, but Instagram's API is notoriously restrictive and rate-limited.
This tutorial shows how to extract publicly available Instagram data using Python and the SearchHive API, with practical code examples for common marketing research tasks.
Key Takeaways
- Instagram's official Graph API requires a Business/Creator account and has strict rate limits
- Public Instagram profiles and posts can be accessed through web scraping with proper tools
- SearchHive's ScrapeForge handles JavaScript rendering and session management for Instagram pages
- Always respect Instagram's terms of service and rate limits when collecting data
- Structured extraction returns clean JSON ready for analysis
Prerequisites
- Python 3.8+
- SearchHive API key (free tier with 500 credits)
- Understanding of HTML/CSS selectors
pip install requests searchhive pandas
Step 1: Scrape a Public Instagram Profile
Public Instagram profiles display follower counts, post counts, bio information, and recent posts. Here's how to extract that data:
# HTTP client and JSON handling for the SearchHive REST calls below.
import requests
import json
# Replace with your real SearchHive API key; keep it out of version control.
API_KEY="***"
# Base endpoint for all SearchHive v1 API requests.
BASE_URL = "https://api.searchhive.dev/v1"
def scrape_instagram_profile(username):
    """Fetch public profile data for an Instagram account via ScrapeForge.

    Args:
        username: Public Instagram handle (without the leading @).

    Returns:
        Dict of extracted profile fields, or an empty dict on failure.
    """
    # CSS selectors for the profile header; Instagram's auto-generated
    # class names change frequently, so expect to update these.
    extraction_rules = {
        "username": "header h2",
        "full_name": "header h1",
        "bio": "div.-vDIg span",
        "followers": "header li span",
        "following": "header li:nth-child(2) span",
        "posts_count": "header li:nth-child(1) span",
        "website": "header a[href^='http']::attr(href)",
        "is_verified": "header svg[aria-label='Verified']",
        "category": "header div._aa_c span",
    }
    payload = {
        "url": f"https://www.instagram.com/{username}/",
        "format": "json",
        "render_js": True,
        "wait_for": "header section, main",
        "extract": extraction_rules,
    }
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json=payload,
    )
    # Guard clause: report non-200 responses and fall back to an empty dict.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return {}
    return response.json().get("data", {})
# Example usage
# Fetch National Geographic's public profile and print the key stats.
profile = scrape_instagram_profile("natgeo")
print(f"Followers: {profile.get('followers')}")
print(f"Posts: {profile.get('posts_count')}")
print(f"Bio: {profile.get('bio')}")
Step 2: Extract Recent Post Data
Instagram feeds are loaded dynamically via JavaScript. ScrapeForge waits for the content to render before extracting:
def scrape_instagram_feed(username, max_posts=12):
    """Return up to *max_posts* recent posts from a public profile grid.

    Args:
        username: Public Instagram handle.
        max_posts: Maximum number of post dicts to return.

    Returns:
        List of post dicts (image, link, likes, comments, alt text);
        empty list on any non-200 response.
    """
    payload = {
        "url": f"https://www.instagram.com/{username}/",
        "format": "json",
        "render_js": True,
        # Wait for the post grid to render before extracting.
        "wait_for": "div._ac7v, article",
        "extract": {
            "posts": {
                "_container": "div._aagv",
                "image_url": "img::attr(src)",
                "post_link": "a::attr(href)",
                "likes": "button span::text",
                "comments": "button[aria-label*='Comment'] span::text",
                "alt_text": "img::attr(alt)",
            }
        },
    }
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json=payload,
    )
    if response.status_code != 200:
        return []
    all_posts = response.json().get("data", {}).get("posts", [])
    return all_posts[:max_posts]
# Example: summarize the six most recent natgeo posts.
recent_posts = scrape_instagram_feed("natgeo", max_posts=6)
for i, post in enumerate(recent_posts):
    print(f"Post {i+1}: {post.get('alt_text', 'No alt')[:60]}...")
    print(f" Likes: {post.get('likes')}, Comments: {post.get('comments')}")
    print(f" Link: {post.get('post_link')}")
Step 3: Scrape Hashtag Pages for Trend Analysis
Hashtag pages show top and recent posts, making them valuable for trend analysis and content research:
def scrape_hashtag(hashtag):
    """Extract the header stats and top-post grid from a hashtag page.

    Args:
        hashtag: Tag name, with or without a leading "#"; spaces removed.

    Returns:
        Dict with hashtag name, total post count, and top posts;
        empty dict on failure.
    """
    # Normalize e.g. "#my tag" -> "mytag" so the explore URL is valid.
    clean_tag = hashtag.strip("#").replace(" ", "")
    payload = {
        "url": f"https://www.instagram.com/explore/tags/{clean_tag}/",
        "format": "json",
        "render_js": True,
        "wait_for": "header, div._aagv",
        "extract": {
            "hashtag": "header h1",
            "total_posts": "header span span",
            "top_posts": {
                "_container": "div._aagv",
                "image_url": "img::attr(src)",
                "post_link": "a::attr(href)",
                "alt_text": "img::attr(alt)",
            },
        },
    }
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json=payload,
    )
    if response.status_code == 200:
        return response.json().get("data", {})
    return {}
# Analyze trending hashtags: the header count is a rough posting-volume signal.
hashtags = ["techstartup", "saas", "aimarketing"]
for tag in hashtags:
    print(f"#{tag}: {scrape_hashtag(tag).get('total_posts')} posts")
Step 4: Compare Competitor Engagement
Build a competitor comparison by scraping multiple profiles and calculating engagement metrics:
import time
import json
def compare_profiles(usernames):
# Compare engagement metrics across multiple Instagram profiles
results = []
for username in usernames:
profile = scrape_instagram_profile(username)
posts = scrape_instagram_feed(username, max_posts=10)
# Calculate engagement metrics
engagement_rates = []
for post in posts:
likes = post.get("likes", "0").replace(",", "")
comments = post.get("comments", "0").replace(",", "")
followers = profile.get("followers", "1").replace(",", "")
try:
total_engagement = int(likes) + int(comments)
follower_count = int(followers)
if follower_count > 0:
rate = (total_engagement / follower_count) * 100
engagement_rates.append(rate)
except (ValueError, ZeroDivisionError):
pass
avg_engagement = (
sum(engagement_rates) / len(engagement_rates)
if engagement_rates else 0
)
results.append({
"username": username,
"followers": profile.get("followers"),
"following": profile.get("following"),
"posts": profile.get("posts_count"),
"avg_engagement_rate": round(avg_engagement, 2),
"bio": profile.get("bio", "")[:100]
})
print(f"Scraped {username} - {profile.get('followers')} followers")
time.sleep(5) # Respect rate limits
return results
# Rank competitor accounts by average engagement rate (highest first).
competitors = ["competitor1", "competitor2", "competitor3"]
comparison = sorted(
    compare_profiles(competitors),
    key=lambda row: row["avg_engagement_rate"],
    reverse=True,
)
print("\nEngagement Ranking:")
for i, profile in enumerate(comparison):
    print(f" {i+1}. @{profile['username']}: "
          f"{profile['avg_engagement_rate']}% engagement")
Step 5: Discover Influencers in Your Niche
Use SwiftSearch to find Instagram profiles in your niche, then scrape their metrics:
# SwiftSearch client used below to discover Instagram profile URLs via web search.
from searchhive import SearchHive
client = SearchHive(api_key="your-api-key")
def find_influencers(niche_query, min_followers=10000):
    """Search the web for Instagram influencers in a niche and rank them.

    Args:
        niche_query: Topic to search for, e.g. "SaaS marketing".
        min_followers: Minimum follower count required to keep a profile.

    Returns:
        List of profile dicts sorted by follower count, descending.
    """
    results = client.swift_search(
        f"{niche_query} site:instagram.com top influencers",
        num_results=20
    )
    # Instagram path segments that are not usernames.
    non_profile_paths = ("p", "explore", "reel", "accounts")
    influencers = []
    seen = set()
    for result in results:
        url = result["url"]
        if "instagram.com" not in url:
            continue
        username = url.rstrip("/").split("/")[-1]
        # Skip non-profile URLs and usernames we already scraped (search
        # results often repeat the same account).
        if not username or username in non_profile_paths or username in seen:
            continue
        seen.add(username)
        profile = scrape_instagram_profile(username)
        raw_followers = profile.get("followers", "0").replace(",", "")
        try:
            followers = int(raw_followers)
        except ValueError:
            # Abbreviated ("1.2M") or missing counts would previously crash
            # the whole run with ValueError; skip them instead.
            continue
        if followers >= min_followers:
            influencers.append({
                "username": username,
                "followers": followers,
                "bio": profile.get("bio", "")[:80],
                "url": url
            })
    influencers.sort(key=lambda x: x["followers"], reverse=True)
    return influencers
# Example: discover SaaS-marketing accounts above 5k followers, show the top ten.
influencers = find_influencers("SaaS marketing", min_followers=5000)
print(f"Found {len(influencers)} potential influencers:")
shortlist = influencers[:10]
for inf in shortlist:
    print(f" @{inf['username']}: {inf['followers']:,} followers")
Step 6: Save Results and Schedule Regular Checks
import csv
import json
from datetime import datetime
def save_profile_snapshot(profile_data, filename="instagram_tracking.csv"):
    """Append a timestamped profile snapshot to a CSV tracking file.

    Args:
        profile_data: Dict of profile metrics; must contain "username".
            The caller's dict is NOT modified -- a copy gains the
            "timestamp" column (the original mutated its argument).
        filename: CSV file to append to; created with a header if needed.
    """
    row = dict(profile_data)
    row["timestamp"] = datetime.now().isoformat()
    # Write a header for a brand-new file AND for an existing zero-byte
    # file (the original open-only check would produce a headerless CSV).
    try:
        with open(filename) as f:
            needs_header = f.read(1) == ""
    except FileNotFoundError:
        needs_header = True
    with open(filename, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=row.keys())
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
    print(f"Saved snapshot for @{row['username']}")
def track_competitors_weekly(usernames, filename="instagram_tracking.csv"):
    """Snapshot follower and recent-like stats for each account (run via cron).

    Args:
        usernames: Iterable of public Instagram handles to track.
        filename: CSV file the snapshots are appended to.
    """
    for username in usernames:
        profile = scrape_instagram_profile(username)
        posts = scrape_instagram_feed(username, max_posts=10)
        # Average likes over the most recent posts, ignoring unparseable counts.
        recent_likes = []
        for post in posts:
            raw = post.get("likes", "0").replace(",", "")
            try:
                recent_likes.append(int(raw))
            except ValueError:
                pass
        avg_likes = round(sum(recent_likes) / len(recent_likes)) if recent_likes else 0
        save_profile_snapshot(
            {
                "username": username,
                "followers": profile.get("followers", ""),
                "following": profile.get("following", ""),
                "posts_count": profile.get("posts_count", ""),
                "avg_recent_likes": avg_likes,
            },
            filename,
        )
        time.sleep(5)
# Run weekly
# Example: schedule via cron (e.g. "0 9 * * 1") to build a time series.
track_competitors_weekly(["competitor1", "competitor2"])
Common Issues
Login walls: Instagram shows login prompts more aggressively for non-authenticated users. If you hit a login wall, the page won't have the data you need. ScrapeForge handles basic JS rendering, but for logged-in content, you'd need to provide session cookies.
Rate limiting: Instagram aggressively rate-limits scrapers. Space requests 5-10 seconds apart, and avoid scraping more than 50 profiles per session.
Changing selectors: Instagram frequently updates its HTML structure and CSS class names. If extraction returns empty results, the selectors need updating. Check the page structure and adjust accordingly.
Private profiles: Private profiles cannot be scraped. The API will return the login page or a "private account" message. Skip these gracefully in your code.
Legal and Ethical Notes
- Only scrape publicly available data
- Respect Instagram's Terms of Service and robots.txt
- Don't store personal data (email, phone) even if visible
- Use scraped data for competitive analysis and trend research, not for harassment or spam
- Consider using Instagram's official Graph API for Business accounts where possible
Next Steps
- Build a dashboard with Streamlit to visualize competitor trends
- Add sentiment analysis on post captions and comments
- Track posting frequency and timing patterns
- Integrate with your marketing stack (Slack alerts for competitor milestones)
Get started free with 500 credits at searchhive.dev -- no credit card required. Check the docs for the full API reference.
Related: /tutorials/how-to-monitor-brand-mentions-across-the-web-with-python | /tutorials/how-to-scrape-e-commerce-pricing-data-with-python
Analyzing Post Performance Metrics
Beyond follower counts, engagement rate is the most important Instagram metric. Here's how to build a complete engagement analysis:
def detailed_post_analysis(username, api_key):
    """Compute per-post engagement rates for a profile's recent posts.

    Args:
        username: Public Instagram handle.
        api_key: SearchHive API key.

    Returns:
        List of dicts with post number, likes, comments, and
        engagement_rate; empty list when the feed cannot be scraped.
    """
    BASE_URL = "https://api.searchhive.dev/v1"

    def to_int(raw):
        # "12,345" -> 12345; empty or unparseable (e.g. "1.2M") -> 0.
        try:
            return int(str(raw).replace(",", "") or "0")
        except ValueError:
            return 0

    response = requests.post(
        f"{BASE_URL}/scrape",
        headers={"Authorization": f"Bearer {api_key}"},
        json={
            "url": f"https://www.instagram.com/{username}/",
            "format": "json",
            "render_js": True,
            "wait_for": "div._aagv",
            "extract": {
                "posts": {
                    "_container": "div._aagv",
                    "image_url": "img::attr(src)",
                    "post_link": "a::attr(href)",
                    "likes": "button span::text",
                    "comments": "button[aria-label*='Comment'] span::text",
                    "alt_text": "img::attr(alt)"
                }
            }
        }
    )
    # Bail out instead of chaining .json() on an error response (the
    # original called .json() unconditionally and could raise or return
    # an error payload shaped nothing like the expected data).
    if response.status_code != 200:
        return []
    posts = response.json().get("data", {}).get("posts", [])
    profile = scrape_instagram_profile(username)
    followers = to_int(profile.get("followers", "1"))
    analysis = []
    for i, post in enumerate(posts):
        likes = to_int(post.get("likes", "0"))
        comments = to_int(post.get("comments", "0"))
        eng_rate = ((likes + comments) / followers) * 100 if followers > 0 else 0
        analysis.append({
            "post_number": i + 1,
            "likes": likes,
            "comments": comments,
            "engagement_rate": round(eng_rate, 3)
        })
    return analysis
# Example: average the engagement rate across the analyzed posts.
results = detailed_post_analysis("natgeo", "your-api-key")
# Guard against an empty result set (the original divided by len(results)
# unconditionally and raised ZeroDivisionError when scraping failed).
if results:
    avg_eng = sum(r['engagement_rate'] for r in results) / len(results)
    print(f"Average engagement rate: {avg_eng:.3f}%")
else:
    print("No posts found - profile may be private or selectors outdated.")
Content Strategy Insights
Use scraped data to inform your content strategy:
- Posting frequency: Track how often competitors post by monitoring their feed over time
- Content types: Categorize posts by image type (carousel, single image, video) using alt text
- Hashtag analysis: Extract hashtags from post captions and track which ones correlate with higher engagement
- Optimal timing: Note when competitors post by comparing feed snapshots taken at different times
Instagram API vs Web Scraping
| Approach | Data Access | Rate Limits | Cost | Setup |
|---|---|---|---|---|
| Instagram Graph API | Full (Business only) | 200 calls/user/hour | Free | Complex (Meta approval) |
| Official API + scraping | Full | Low | Free + credits | Medium |
| SearchHive scraping | Public data | Per-credit | $0-$49/mo | Simple |
The Graph API requires a Business/Creator account and Meta's approval process. For quick research and competitive analysis, web scraping via SearchHive is faster to set up and doesn't require account access.
Handling Instagram's Anti-Scraping Measures
Instagram is one of the most aggressively protected platforms. Key challenges:
- Login walls: After a few page loads, Instagram redirects to a login page. SearchHive's stealth mode mitigates this, but heavy scraping requires session management.
- JavaScript-heavy rendering: The feed and profile pages are single-page React applications. Always use `render_js: True` and set appropriate `wait_for` selectors.
- Dynamic class names: Instagram uses auto-generated CSS class names (like `._aagv`) that change frequently. If extraction breaks, inspect the current page structure and update selectors.
- Rate limiting: Space requests 5-10 seconds apart. For bulk research, collect URLs first via SwiftSearch, then scrape them in smaller batches over time.
- Shadow bans: Aggressive scraping from the same IP can trigger temporary blocks. SearchHive's proxy rotation prevents this.