#!/bin/bash
# First-Time Contributor Analyzer
#
# Analyzes PRs from first-time contributors over the last 4 weeks and prints a
# per-week breakdown, an overall summary, a trend chart and a few insights.
#
# Usage: ./scripts/analyze-first-time-contributors.sh
#
# Requires: curl, jq, python3 (3.7+ for datetime.fromisoformat).
# Network:  issues unauthenticated requests to the GitHub REST API.

set -euo pipefail

readonly REPO="sst/opencode"
readonly GITHUB_API="https://api.github.com/repos"
readonly MAX_PAGES=10
readonly PER_PAGE=100

# Start of the analysis window: midnight UTC, 28 days ago.
# The first form is BSD/macOS date syntax, the fallback is GNU date syntax.
FOUR_WEEKS_AGO=$(date -u -v-28d '+%Y-%m-%dT00:00:00Z' 2>/dev/null \
  || date -u -d '4 weeks ago' '+%Y-%m-%dT00:00:00Z')
readonly FOUR_WEEKS_AGO

# Temp files registered here are removed by cleanup() on every exit path.
TEMP_FILES=()

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Remove every temp file registered in TEMP_FILES.
# Globals: TEMP_FILES (read)
#######################################
cleanup() {
  if ((${#TEMP_FILES[@]} > 0)); then
    rm -f -- "${TEMP_FILES[@]}"
  fi
}
trap cleanup EXIT

#######################################
# Download PRs created on or after FOUR_WEEKS_AGO, newest first, and write
# them to a file as a single JSON array.
# Globals:   GITHUB_API, REPO, MAX_PAGES, PER_PAGE, FOUR_WEEKS_AGO (read)
# Arguments: $1 - path of the file to write the JSON array to
# Outputs:   progress lines on stdout; warnings and errors on stderr
# Returns:   exits the script (via die) on a failed or malformed response
#######################################
fetch_recent_prs() {
  local out_file=$1
  local all_prs="[]"
  local reached_cutoff=0
  local page url page_data message count filtered

  for ((page = 1; page <= MAX_PAGES; page++)); do
    echo " Page $page..."
    url="${GITHUB_API}/${REPO}/pulls?state=all&sort=created&direction=desc&per_page=${PER_PAGE}&page=${page}"
    page_data=$(curl -sS "$url") || die "request to GitHub failed (page $page)"

    # Error replies (rate limiting, missing repo, ...) are JSON objects rather
    # than arrays; surface the API's message instead of a confusing jq error.
    if ! jq -e 'type == "array"' <<<"$page_data" >/dev/null 2>&1; then
      message=$(jq -r '.message? // empty' <<<"$page_data" 2>/dev/null || true)
      die "unexpected GitHub API response on page $page: ${message:-response is not a JSON array}"
    fi

    count=$(jq 'length' <<<"$page_data")
    if ((count == 0)); then
      reached_cutoff=1
      break
    fi

    # Keep only PRs inside the window and only the fields used later on.
    # Data goes through stdin, not argv, because a page can be several MB.
    filtered=$(jq --arg since "$FOUR_WEEKS_AGO" \
      '[.[]
        | select(.created_at >= $since)
        | {number, created_at, author_association, user: {login: .user.login}}]' \
      <<<"$page_data")
    all_prs=$(printf '%s\n' "$all_prs" "$filtered" | jq -s 'add')

    # Results are sorted newest first, so once the oldest PR on this page
    # predates the window no later page can contain anything relevant.
    if jq -e --arg since "$FOUR_WEEKS_AGO" '.[-1].created_at < $since' \
      <<<"$page_data" >/dev/null; then
      reached_cutoff=1
      break
    fi
  done

  if ((reached_cutoff == 0)); then
    printf 'warning: stopped after %d pages without reaching %s; results may be incomplete\n' \
      "$MAX_PAGES" "$FOUR_WEEKS_AGO" >&2
  fi

  printf '%s\n' "$all_prs" >"$out_file"
}

#######################################
# Classify PRs as first-time / returning and print the report.
# Arguments: $1 - file with one "number|author|created_at|association" per line
#            $2 - ISO-8601 UTC timestamp marking the start of the window
# Outputs:   the report on stdout
#######################################
analyze_contributors() {
  local records_file=$1
  local window_start=$2

  # The program is passed on stdin ("-"), so no temp file is needed for it.
  python3 - "$records_file" "$window_start" <<'EOF'
import sys
from collections import defaultdict
from datetime import datetime, timedelta

WEEK_COUNT = 4
RULE = "=" * 90

# GitHub's author_association values for authors without earlier commits:
#   FIRST_TIME_CONTRIBUTOR - has not previously committed to this repository
#   FIRST_TIMER            - has not previously committed to GitHub at all
FIRST_TIME_ASSOCIATIONS = {'FIRST_TIME_CONTRIBUTOR', 'FIRST_TIMER'}


def parse_timestamp(value):
    """Parse a GitHub timestamp such as 2025-12-01T00:00:00Z (timezone-aware)."""
    return datetime.fromisoformat(value.replace('Z', '+00:00'))


def percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return (part / whole * 100) if whole > 0 else 0.0


records_path = sys.argv[1]
window_start = parse_timestamp(sys.argv[2])


def week_label(index):
    """Build the label for the index-th 7-day bucket counted from window_start."""
    first_day = window_start + timedelta(days=7 * index)
    # The window starts at midnight 28 days ago, so the current (partial) day
    # is day 28; it is folded into the last bucket, which therefore ends a
    # day later than the others.
    span = 6 if index < WEEK_COUNT - 1 else 7
    last_day = first_day + timedelta(days=span)
    return f"Week {index + 1}: {first_day:%b %d} - {last_day:%b %d}"


weeks = [week_label(i) for i in range(WEEK_COUNT)]


def get_week_label(date_str):
    """Map a PR creation timestamp to its bucket label ("Earlier" if before the window)."""
    elapsed_days = (parse_timestamp(date_str) - window_start).days
    if elapsed_days < 0:
        return "Earlier"
    return weeks[min(elapsed_days // 7, WEEK_COUNT - 1)]


# Read PR data: number|author|created_at|author_association
pr_data = []
with open(records_path, 'r') as f:
    for line in f:
        parts = line.strip().split('|')
        if len(parts) != 4:
            continue  # blank or malformed record
        pr_data.append({
            'number': parts[0],
            'author': parts[1],
            'created_at': parts[2],
            'author_association': parts[3]
        })

print(f"Analyzing {len(pr_data)} PRs...\n")

# The API delivers PRs newest first. The "first PR seen for this author"
# heuristic below only makes sense in chronological order, so sort ascending
# (timestamps share one fixed-width UTC format, so string order is time order).
pr_data.sort(key=lambda pr: pr['created_at'])

by_week = defaultdict(lambda: {
    'total': 0,
    'first_time': 0,
    'returning': 0,
    'first_time_authors': set()
})
all_authors = defaultdict(int)

for pr in pr_data:
    week = get_week_label(pr['created_at'])
    author = pr['author']
    assoc = pr['author_association']

    by_week[week]['total'] += 1
    all_authors[author] += 1

    # Explicit first-time associations always count. NONE (no association) is
    # treated as first-time only for the author's earliest PR in the window.
    # NOTE(review): NONE authors may have older PRs outside the window, so this
    # is a heuristic and can over-count.
    is_first_time = assoc in FIRST_TIME_ASSOCIATIONS or (
        assoc == 'NONE' and all_authors[author] == 1
    )
    if is_first_time:
        by_week[week]['first_time'] += 1
        by_week[week]['first_time_authors'].add(author)
    else:
        by_week[week]['returning'] += 1

# Print results
print(RULE)
print("FIRST-TIME CONTRIBUTOR ANALYSIS - LAST 4 WEEKS")
print(RULE + "\n")

print("PRs by Contributor Type:\n")
for week in weeks:
    if week in by_week:
        data = by_week[week]
        total = data['total']
        first_time = data['first_time']
        returning = data['returning']
        print(f"{week}: {total} PRs")
        print(f"  ✨ First-time contributors: {first_time} ({percent(first_time, total):.1f}%)")
        print(f"  ↩️ Returning contributors: {returning} ({percent(returning, total):.1f}%)")
        print()

# Overall summary
total_prs = sum(data['total'] for data in by_week.values())
total_first_time = sum(data['first_time'] for data in by_week.values())
total_returning = sum(data['returning'] for data in by_week.values())
overall_first_time_pct = percent(total_first_time, total_prs)
overall_returning_pct = percent(total_returning, total_prs)

print(RULE)
print("OVERALL SUMMARY")
print(RULE + "\n")
print(f"Total PRs (4 weeks): {total_prs}")
print(f"From first-time contributors: {total_first_time} ({overall_first_time_pct:.1f}%)")
print(f"From returning contributors: {total_returning} ({overall_returning_pct:.1f}%)")

# Count unique first-time contributors
all_first_time_authors = set()
for data in by_week.values():
    all_first_time_authors.update(data['first_time_authors'])
print(f"\nUnique first-time contributors: {len(all_first_time_authors)}")

# Week by week trend (weeks without any PR are omitted)
week_rates = [
    (week, percent(by_week[week]['first_time'], by_week[week]['total']))
    for week in weeks
    if week in by_week
]

print("\n" + RULE)
print("TREND ANALYSIS")
print(RULE + "\n")
print("First-Time Contributor Rate by Week:\n")
for week, rate in week_rates:
    bar = "█" * int(rate / 2)  # one bar character per 2 percentage points
    print(f"  {week}: {rate:5.1f}% {bar}")

print("\n" + RULE)
print("KEY INSIGHTS")
print(RULE + "\n")

# Insights are stored without a number and numbered when printed, so the
# list stays consecutive even when some insights do not apply.
insights = []

if total_first_time > 0:
    insights.append(
        f"New Contributors: {total_first_time} PRs from first-timers shows healthy\n"
        "   community growth and welcoming environment for new contributors."
    )

if overall_first_time_pct > 20:
    insights.append(
        f"High New Contributor Rate: {overall_first_time_pct:.1f}% from first-timers is\n"
        "   excellent. Indicates strong onboarding and accessible contribution process."
    )
elif overall_first_time_pct > 10:
    insights.append(
        f"Moderate New Contributor Rate: {overall_first_time_pct:.1f}% from first-timers\n"
        "   is healthy. Good balance of new and returning contributors."
    )
else:
    insights.append(
        f"Low New Contributor Rate: {overall_first_time_pct:.1f}% from first-timers.\n"
        "   Most PRs from established contributors (mature project pattern)."
    )

# Check for trend: compare the earliest and latest populated weeks
if len(week_rates) >= 3:
    first_rate = week_rates[0][1]
    last_rate = week_rates[-1][1]
    if last_rate > first_rate:
        insights.append(
            "Growing Trend: First-time contributor rate increasing\n"
            f"   ({first_rate:.1f}% → {last_rate:.1f}%). Project attracting more new contributors."
        )
    elif last_rate < first_rate:
        insights.append(
            "Declining Trend: First-time contributor rate decreasing\n"
            f"   ({first_rate:.1f}% → {last_rate:.1f}%). May indicate shifting to core contributors."
        )
    else:
        insights.append(
            "Stable Trend: First-time contributor rate relatively stable\n"
            "   across weeks. Consistent new contributor engagement."
        )

insights.append(
    f"Unique Contributors: {len(all_first_time_authors)} unique new people made their\n"
    "   first contribution. Shows breadth of community involvement."
)

for position, insight in enumerate(insights, start=1):
    print(f"{position}. {insight}\n")

print(RULE + "\n")
EOF
}

#######################################
# Entry point: fetch recent PRs, extract the per-PR records, run the report.
# Globals: TEMP_FILES (written), FOUR_WEEKS_AGO (read)
#######################################
main() {
  local tool
  for tool in curl jq python3; do
    command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
  done

  echo "Analyzing first-time contributors from last 4 weeks..."
  echo "Start date: $FOUR_WEEKS_AGO"
  echo ""

  # Create temp files
  local TEMP_PRS TEMP_CONTRIBUTORS
  TEMP_PRS=$(mktemp)
  TEMP_FILES+=("$TEMP_PRS")
  TEMP_CONTRIBUTORS=$(mktemp)
  TEMP_FILES+=("$TEMP_CONTRIBUTORS")

  # Fetch all PRs from the last 4 weeks
  echo "Fetching PRs..."
  fetch_recent_prs "$TEMP_PRS"

  local PR_COUNT
  PR_COUNT=$(jq 'length' "$TEMP_PRS")
  echo " Found $PR_COUNT PRs"
  echo ""

  echo "Checking contributor status for each PR..."
  # Flatten each PR to "number|author|created_at|author_association"
  jq -r '.[] | "\(.number)|\(.user.login)|\(.created_at)|\(.author_association)"' \
    "$TEMP_PRS" >"$TEMP_CONTRIBUTORS"
  echo ""

  analyze_contributors "$TEMP_CONTRIBUTORS" "$FOUR_WEEKS_AGO"
}

main "$@"