GitHub hosts over 100 million repositories and is the largest collection of open-source software in the world. Whether you are analyzing technology trends, finding popular libraries, benchmarking competitors, or building developer tools, scraping GitHub data provides actionable intelligence. This tutorial shows you how to extract GitHub data efficiently using Python and SearchHive.
Prerequisites
- Python 3.8+ with `requests` and `pandas` installed
- A SearchHive API key (free tier includes 500 credits)
- A GitHub personal access token (optional, but recommended for higher rate limits on GitHub's REST API)
pip install requests pandas
Key Takeaways
- GitHub provides both a REST API and web pages -- scraping the web pages gives you access to data not available through the API (like trending pages, explore sections, and rendered READMEs)
- SearchHive handles GitHub's dynamic JavaScript rendering (the site is a React SPA) automatically
- You can combine GitHub's official API (for structured metadata) with SearchHive's ScrapeForge (for page content and trends) for complete coverage
- DeepDive extracts structured insights from README files and documentation pages
Step 1: Scrape GitHub Trending Repositories
GitHub's trending page (github.com/trending) is one of the most valuable pages for developer research. It is not available through the official API.
import requests
import time
import json
# SearchHive credentials and endpoint configuration.
API_KEY = "your_searchhive_key"  # replace with your own SearchHive API key
BASE_URL = "https://api.searchhive.dev/v1"  # SearchHive REST API root
# Bearer-token auth header attached to every SearchHive request below.
headers = {"Authorization": f"Bearer {API_KEY}"}
def scrape_github_trending(language=None, since="daily"):
    """Scrape GitHub's trending page through SearchHive.

    Parameters
    ----------
    language : str or None
        Optional language slug (e.g. "python") to filter trending repos.
    since : str
        Trending window: "daily", "weekly", or "monthly".

    Returns
    -------
    list[dict]
        One dict per repository (name, url, description, language,
        stars_today), or an empty list on any non-200 response.
    """
    url = "https://github.com/trending"
    if language:
        url += f"/{language}"
    url += f"?since={since}"
    response = requests.post(
        f"{BASE_URL}/scrape",
        headers=headers,
        json={
            "url": url,
            "render_js": True,
            "extract": {
                "repos": {
                    "selector": "article.Box-row",
                    "fields": {
                        "name": "h2 a",
                        "url": {"selector": "h2 a", "attr": "href"},
                        "description": "p.color-fg-muted",
                        "language": "[itemprop='programmingLanguage']",
                        "stars_today": ".d-inline-block.float-sm-right"
                    }
                }
            }
        },
        timeout=60,  # requests has no default timeout; never hang forever
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []
    repos = response.json().get("repos", [])
    # GitHub renders the trending repo name as "owner /\n   repo".
    # The original code tried .replace() with a raw newline inside the
    # string literal (a SyntaxError); collapsing ALL whitespace is the
    # robust fix and yields a clean "owner/repo" path.
    for repo in repos:
        repo["name"] = "".join(repo.get("name", "").split())
    return repos
# Fetch trending repositories across every language and show the top 10.
trending = scrape_github_trending()
print(f"Found {len(trending)} trending repos")
top_ten = trending[:10]
for repo in top_ten:
    repo_language = repo.get('language', 'Unknown')
    print(f" {repo['name']} - {repo_language}")
Step 2: Scrape GitHub Explore Topics
GitHub's topic pages show popular repositories by technology:
def scrape_github_topic(topic, sort="stars"):
    """Fetch the repositories listed on a GitHub topic page.

    Parameters
    ----------
    topic : str
        Topic slug, e.g. "machine-learning".
    sort : str
        Sort field passed to GitHub's topic page (default "stars").

    Returns
    -------
    list[dict]
        Extracted repo records, or an empty list on failure.
    """
    payload = {
        "url": f"https://github.com/topics/{topic}?o=desc&s={sort}",
        "render_js": True,
        "extract": {
            "repos": {
                "selector": "article.border-round",
                "fields": {
                    "name": "h3.f3 a",
                    "url": {"selector": "h3.f3 a", "attr": "href"},
                    "description": "div.px-3 p",
                    "stars": ".Link--muted.d-inline-block.mr-3",
                    "language": "[itemprop='programmingLanguage']",
                },
            }
        },
    }
    resp = requests.post(f"{BASE_URL}/scrape", headers=headers, json=payload)
    if resp.status_code != 200:
        return []
    return resp.json().get("repos", [])
# Collect repositories for a handful of popular topics, pausing between
# requests to stay polite.
topics = ["python", "machine-learning", "react", "rust", "web-scraping"]
all_repos = []
for topic in topics:
    found = scrape_github_topic(topic)
    all_repos.extend(found)
    print(f"Topic '{topic}': {len(found)} repos")
    time.sleep(2)
print(f"Total repos collected: {len(all_repos)}")
Step 3: Extract README Content with DeepDive
README files contain valuable information about project features, installation instructions, and usage. Use DeepDive to extract structured data from READMEs:
def analyze_readme(repo_url):
    """Run a DeepDive extraction over a repository's landing page.

    Parameters
    ----------
    repo_url : str
        Repository path in "owner/name" form (no leading slash).

    Returns
    -------
    dict
        Structured data extracted by DeepDive, or {} on failure.
    """
    target = f"https://github.com/{repo_url}"
    prompt = (
        "From this GitHub repository page, extract: "
        "1. Project description (2-3 sentences) "
        "2. Main programming language "
        "3. License type "
        "4. Number of contributors visible "
        "5. Last commit date if visible "
        "6. Key features listed in README "
        "7. Installation requirements "
        "Return as JSON."
    )
    resp = requests.post(
        f"{BASE_URL}/deepdive",
        headers=headers,
        json={"url": target, "prompt": prompt},
    )
    return resp.json() if resp.status_code == 200 else {}
# Run DeepDive analysis on the five top trending repositories and attach
# the result to each repo record.
for repo in trending[:5]:
    path = repo["url"].strip("/")
    print(f"Analyzing {path}...")
    repo["analysis"] = analyze_readme(path)
    time.sleep(2)
Step 4: Scrape Developer Profiles
Analyze developer activity and expertise by scraping profile pages:
def scrape_github_profile(username):
    """Scrape a GitHub user's profile page with SearchHive.

    Parameters
    ----------
    username : str
        GitHub login name.

    Returns
    -------
    dict
        Raw SearchHive response with "profile" and "top_repos" sections,
        or {} on failure.
    """
    extract_spec = {
        "profile": {
            "selector": ".Layout-sidebar",
            "fields": {
                "name": ".p-name",
                "bio": ".p-note",
                "location": ".p-label",
                "company": "[itemprop='worksFor']",
                "website": {"selector": "a.Link--primary", "attr": "href"},
                "followers": ".Link--muted",
                "contributions_count": ".f4.text-normal",
            },
        },
        "top_repos": {
            "selector": ".d-block.width-fit",
            "fields": {
                "name": "a",
                "language": "[itemprop='programmingLanguage']",
            },
        },
    }
    resp = requests.post(
        f"{BASE_URL}/scrape",
        headers=headers,
        json={
            "url": f"https://github.com/{username}",
            "render_js": True,
            "extract": extract_spec,
        },
    )
    return resp.json() if resp.status_code == 200 else {}
# Fetch each unique maintainer's profile for the top trending repos;
# the dict keyed by owner avoids scraping the same account twice.
profiles = {}
for repo in trending[:5]:
    owner = repo["url"].strip("/").split("/")[0]
    if owner in profiles:
        continue
    profiles[owner] = scrape_github_profile(owner)
    print(f"Scraped profile: {owner}")
    time.sleep(2)
Step 5: Build a Research Dataset
Combine all scraped data into a structured dataset for analysis:
import pandas as pd
# Build comprehensive repo dataset
# Flatten the scraped fields plus the DeepDive analysis into one flat
# record per repository.
records = []
for repo in trending:
    analysis = repo.get("analysis", {})
    records.append({
        "name": repo.get("name", ""),
        "url": repo.get("url", ""),
        "description": repo.get("description", ""),
        "language": repo.get("language", ""),
        "stars_today": repo.get("stars_today", ""),
        "features": str(analysis.get("key_features", "")),
        "license": analysis.get("license_type", "N/A"),
        "install_reqs": analysis.get("installation_requirements", ""),
    })
df = pd.DataFrame(records).drop_duplicates(subset=["name"], keep="first")
# Filter by language
python_repos = df[df["language"] == "Python"]
print(f"Total repos: {len(df)}")
print(f"Python repos: {len(python_repos)}")
# Export
df.to_csv("github_trending_research.csv", index=False)
df.to_json("github_trending_research.json", orient="records", indent=2)
print("Data exported to CSV and JSON")
Step 6: Combine with GitHub REST API
For repositories where you need exact metadata (precise star counts, fork counts, issue counts), combine web scraping with GitHub's REST API:
GITHUB_TOKEN = "ghp_your_token_here" # Optional but recommended
# Only attach the token header when a token is set; authenticating raises
# GitHub's REST API rate limit from 60 to 5,000 requests/hour.
gh_headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
def get_repo_stats(repo_path):
    """Fetch exact repository metrics from the GitHub REST API.

    Parameters
    ----------
    repo_path : str
        Repository path in "owner/name" form.

    Returns
    -------
    dict
        Star/fork/issue/watcher counts plus timestamps and language,
        or {} on any non-200 response (404, rate-limited 403, etc.).
    """
    response = requests.get(
        f"https://api.github.com/repos/{repo_path}",
        headers=gh_headers,
        timeout=30,  # requests has no default timeout; a stalled call would hang forever
    )
    # Guard clause: rate-limited or missing repos return {} rather than raising.
    if response.status_code != 200:
        return {}
    data = response.json()
    return {
        "stars": data["stargazers_count"],
        "forks": data["forks_count"],
        "open_issues": data["open_issues_count"],
        "watchers": data["watchers_count"],
        "created_at": data["created_at"],
        "updated_at": data["updated_at"],
        "language": data["language"],
    }
# Merge exact REST API metrics into the scraped dataframe row by row.
for idx, row in df.iterrows():
    stats = get_repo_stats(row["url"].strip("/"))
    for col in ("stars", "forks", "open_issues"):
        df.at[idx, col] = stats.get(col, "")
    time.sleep(1)  # GitHub API rate limit: 60/hr unauthenticated, 5000/hr with token
df.to_csv("github_complete_research.csv", index=False)
Common Issues
- GitHub rate limits: The REST API allows 60 requests/hour without authentication and 5,000/hour with a token. SearchHive's scraping does not count against GitHub's API limits since it renders the page like a browser.
- Dynamic loading: GitHub uses JavaScript extensively (infinite scroll, lazy-loaded content). SearchHive's `render_js: True` parameter handles this automatically.
- Authentication walls: Some repositories and profiles may prompt login for full content. Using SearchHive's proxy rotation helps avoid these soft blocks.
- Anti-scraping measures: GitHub blocks aggressive scraping. Space out requests, use proxies (SearchHive provides this), and respect their terms of service.
Next Steps
- Track trending repos over time to build a trend analysis dataset
- Analyze dependency networks between repositories
- Monitor competitor repositories for release activity
- Build a developer skills database by analyzing profiles and contribution patterns
Start Scraping GitHub Today
SearchHive's free tier gives you 500 credits to scrape GitHub trending pages, topic pages, and profiles with full JavaScript rendering. Get your free API key and check the docs for the complete reference.
/tutorials/web-scraping-python-beginners-guide | /tutorials/how-to-scrape-websites-behind-login-with-python | /tutorials/how-to-monitor-website-changes-with-python