app.py

import streamlit as st
import sqlite3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from datetime import datetime
import os
import json
import difflib
import streamlit.components.v1  # noqa: F401 -- ensures st.components.v1 resolves on Streamlit versions that don't expose it by default
# import mimetypes # No longer needed here if guess_language_from_filepath handles it
from utils import get_database_connection, guess_language_from_filepath # Import from utils
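# Assumption: get_database_connection() returns a DB-API connection (the sqlite3
# import above suggests SQLite) that pandas.read_sql_query can consume, and
# guess_language_from_filepath maps a filepath to a syntax-highlighting language
# name (a local fallback with the same name is defined near the bottom of this file).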
# Page config
st.set_page_config(
    page_title="Diff Edits Evaluation Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for beautiful styling
st.markdown("""
<style>
    /* Import Google Fonts */
    @import url('https://fonts.googleapis.com/css2?family=Azeret+Mono:wght@400;700&display=swap');

    /* Global Styles */
    .main {
        font-family: 'Azeret Mono', monospace;
    }

    /* Hero Section */
    .hero-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 2rem;
        border-radius: 15px;
        margin-bottom: 2rem;
        color: white;
        text-align: center;
    }

    .hero-title {
        font-size: 3rem;
        font-weight: 700;
        margin-bottom: 0.5rem;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
    }

    .hero-subtitle {
        font-size: 1.2rem;
        font-weight: 300;
        opacity: 0.9;
    }

    /* Model Performance Cards */
    .model-card {
        background: white;
        border-radius: 15px;
        padding: 1.5rem;
        margin: 1rem 0;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
        border: 1px solid rgba(255,255,255,0.2);
        transition: transform 0.3s ease, box-shadow 0.3s ease;
    }

    .model-card:hover {
        transform: translateY(-5px);
        box-shadow: 0 12px 40px rgba(0,0,0,0.15);
    }

    .model-card.best-performer {
        border: 2px solid #00D4AA;
        background: linear-gradient(135deg, #f0fdf4 0%, #ecfdf5 100%);
    }

    .model-name {
        font-size: 1.5rem;
        font-weight: 600;
        margin-bottom: 1rem;
        color: #1f2937;
    }

    .success-rate {
        font-size: 3rem;
        font-weight: 700;
        margin-bottom: 0.5rem;
    }

    .success-rate.excellent { color: #10b981; }
    .success-rate.good { color: #f59e0b; }
    .success-rate.poor { color: #ef4444; }

    .metric-row {
        display: flex;
        justify-content: space-between;
        margin: 0.5rem 0;
        padding: 0.5rem;
        background: rgba(0,0,0,0.02);
        border-radius: 8px;
    }

    .metric-label {
        font-weight: 500;
        color: #6b7280;
    }

    .metric-value {
        font-weight: 600;
        color: #1f2937;
    }

    /* Performance Badge */
    .performance-badge {
        display: inline-block;
        padding: 0.25rem 0.75rem;
        border-radius: 20px;
        font-weight: 600;
        font-size: 0.875rem;
        margin-left: 1rem;
    }

    .badge-a { background: #10b981; color: white; }
    .badge-b { background: #f59e0b; color: white; }
    .badge-c { background: #ef4444; color: white; }

    /* Comparison Charts */
    .chart-container {
        background: white;
        border-radius: 15px;
        padding: 1.5rem;
        margin: 1rem 0;
        box-shadow: 0 4px 20px rgba(0,0,0,0.08);
    }

    /* Result Detail Modal */
    .result-detail {
        background: white;
        border-radius: 15px;
        padding: 2rem;
        margin: 1rem 0;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    }

    .file-viewer {
        background: #f8fafc;
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 1rem;
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        font-size: 0.875rem;
        line-height: 1.5;
        overflow-x: auto;
    }

    .diff-added {
        background-color: #dcfce7;
        color: #166534;
    }

    .diff-removed {
        background-color: #fef2f2;
        color: #dc2626;
    }

    .error-display {
        background: #fef2f2;
        border: 1px solid #fecaca;
        border-radius: 8px;
        padding: 1rem;
        color: #dc2626;
        font-family: monospace;
    }

    /* Sidebar Styling */
    .sidebar .sidebar-content {
        background: linear-gradient(180deg, #f8fafc 0%, #f1f5f9 100%);
    }

    /* Custom Metrics */
    .custom-metric {
        text-align: center;
        padding: 1rem;
        background: white;
        border-radius: 10px;
        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        margin: 0.5rem 0;
    }

    .custom-metric-value {
        font-size: 2rem;
        font-weight: 700;
        color: #1f2937;
    }

    .custom-metric-label {
        font-size: 0.875rem;
        color: #6b7280;
        font-weight: 500;
        margin-top: 0.25rem;
    }
</style>
""", unsafe_allow_html=True)
# Enhanced data loading functions
@st.cache_data
def load_all_runs():
    """Load all evaluation runs"""
    conn = get_database_connection()
    query = """
        SELECT run_id, description, created_at, system_prompt_hash
        FROM runs
        ORDER BY created_at DESC
    """
    return pd.read_sql_query(query, conn)
@st.cache_data
def load_run_comparison(run_id):
    """Load a specific run with model comparison data"""
    conn = get_database_connection()
    # Get the run details; qmark placeholders (the sqlite3 paramstyle) keep
    # run_id out of the SQL string instead of interpolating it
    run_query = """
        SELECT run_id, description, created_at, system_prompt_hash
        FROM runs
        WHERE run_id = ?
    """
    run_data = pd.read_sql_query(run_query, conn, params=(run_id,))
    if run_data.empty:
        return None, None
    # Get model performance for this run
    model_perf_query = """
        SELECT
            res.model_id,
            COUNT(*) as total_results,
            AVG(CASE WHEN res.succeeded THEN 1.0 ELSE 0.0 END) as success_rate,
            AVG(res.cost_usd) as avg_cost,
            SUM(res.cost_usd) as total_cost,
            AVG(res.time_to_first_token_ms) as avg_first_token_ms,
            AVG(res.time_to_first_edit_ms) as avg_first_edit_ms,
            AVG(res.time_round_trip_ms) as avg_round_trip_ms,
            AVG(res.completion_tokens) as avg_completion_tokens,
            AVG(res.num_edits) as avg_num_edits,
            MIN(res.time_round_trip_ms) as min_round_trip_ms,
            MAX(res.time_round_trip_ms) as max_round_trip_ms
        FROM results res
        JOIN cases c ON res.case_id = c.case_id
        WHERE c.run_id = ?
        AND (res.error_enum NOT IN (1, 6, 7) OR res.error_enum IS NULL) -- Exclude: no_tool_calls, wrong_tool_call, wrong_file_edited
        GROUP BY res.model_id
        ORDER BY success_rate DESC, avg_round_trip_ms ASC
    """
    model_performance = pd.read_sql_query(model_perf_query, conn, params=(run_id,))
    return run_data.iloc[0], model_performance
@st.cache_data
def load_latest_run_comparison():
    """Load the latest run with model comparison data"""
    conn = get_database_connection()
    # Get the latest run
    latest_run_query = """
        SELECT run_id, description, created_at, system_prompt_hash
        FROM runs
        ORDER BY created_at DESC
        LIMIT 1
    """
    latest_run = pd.read_sql_query(latest_run_query, conn)
    if latest_run.empty:
        return None, None
    return load_run_comparison(latest_run.iloc[0]['run_id'])
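# Note: this delegates to load_run_comparison, so the latest run's data is served
# from the same per-run_id st.cache_data entry and shouldn't be re-queried on rerun.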
@st.cache_data
def load_detailed_results(run_id, model_id=None, valid_only=False):
    """Load detailed results for drill-down analysis"""
    conn = get_database_connection()
    # Build the WHERE clause with bound parameters instead of string interpolation
    where_clauses = ["c.run_id = ?"]
    params = [run_id]
    if model_id:
        where_clauses.append("res.model_id = ?")
        params.append(model_id)
    # Option to filter out invalid attempts
    if valid_only:
        where_clauses.append("(res.error_enum NOT IN (1, 6, 7) OR res.error_enum IS NULL)")
    query = f"""
        SELECT
            res.*,
            c.task_id,
            c.description as case_description,
            c.tokens_in_context,
            sp.name as system_prompt_name,
            pf.name as processing_functions_name,
            orig_f.filepath as original_filepath,
            orig_f.content as original_file_content,
            edit_f.filepath as edited_filepath,
            edit_f.content as edited_file_content
        FROM results res
        JOIN cases c ON res.case_id = c.case_id
        LEFT JOIN system_prompts sp ON c.system_prompt_hash = sp.hash
        LEFT JOIN processing_functions pf ON res.processing_functions_hash = pf.hash
        LEFT JOIN files orig_f ON c.file_hash = orig_f.hash
        LEFT JOIN files edit_f ON res.file_edited_hash = edit_f.hash
        WHERE {' AND '.join(where_clauses)}
        ORDER BY res.created_at DESC
    """
    return pd.read_sql_query(query, conn, params=params)
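# Usage sketch (run/model IDs are hypothetical):
#   load_detailed_results("run_abc123", model_id="model-x")                   # every attempt
#   load_detailed_results("run_abc123", model_id="model-x", valid_only=True)  # benchmark-valid only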
def get_performance_grade(success_rate):
    """Get performance grade based on success rate"""
    if success_rate >= 0.9:
        return "A+", "excellent"
    elif success_rate >= 0.8:
        return "A", "excellent"
    elif success_rate >= 0.7:
        return "B+", "good"
    elif success_rate >= 0.6:
        return "B", "good"
    elif success_rate >= 0.5:
        return "C+", "good"
    else:
        return "C", "poor"
def get_error_description(error_enum, error_string=None):
    """Map error enum values to user-friendly descriptions"""
    error_map = {
        1: "No tool calls - Model didn't use the replace_in_file tool",
        2: "Multiple tool calls - Model called multiple tools instead of one",
        3: "Wrong tool call - Model used wrong tool (not replace_in_file)",
        4: "Missing parameters - Tool call missing required path or diff",
        5: "Wrong file edited - Model edited different file than expected",
        6: "Wrong tool call - Model used wrong tool type",
        7: "Wrong file edited - Model targeted incorrect file path",
        8: "API/Stream error - Problem with model API connection",
        9: "Configuration error - Invalid evaluation parameters",
        10: "Function error - Invalid parsing/diff functions",
        11: "Other error - Unexpected failure"
    }
    base_description = error_map.get(error_enum, f"Unknown error (code: {error_enum})")
    if error_string:
        return f"{base_description}: {error_string}"
    return base_description

def get_error_guidance(error_enum):
    """Provide specific guidance based on error type"""
    guidance_map = {
        1: "💡 The model provided a response but didn't use the replace_in_file tool. Check the raw output to see what the model actually said.",
        2: "💡 The model called multiple tools when it should only call replace_in_file once. Check the parsed tool call section.",
        3: "💡 The model used a different tool instead of replace_in_file. This might indicate confusion about the task.",
        4: "💡 The model called replace_in_file but didn't provide the required 'path' or 'diff' parameters.",
        5: "💡 The model tried to edit a different file than expected. Check the parsed tool call to see which file it targeted.",
        6: "💡 The model used the wrong tool type. Check the raw output to see what tool it attempted to use.",
        7: "💡 The model tried to edit a different file path than expected. This could indicate path confusion or hallucination.",
    }
    return guidance_map.get(error_enum, "")
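# e.g. get_error_description(4, "missing 'diff'") ->
#   "Missing parameters - Tool call missing required path or diff: missing 'diff'"
# Enums 8-11 have no guidance_map entry, so get_error_guidance returns "" for them.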
def render_hero_section(current_run, model_performance):
    """Render the hero section with key metrics"""
    run_title = current_run['description'] if current_run['description'] else f"Run {current_run['run_id'][:8]}..."
    # HTML strings stay flush-left: indented lines inside st.markdown would be
    # rendered as markdown code blocks
    st.markdown(f"""
<div class="hero-container">
    <div class="hero-title">Diff Edit Evaluation Results</div>
    <div class="hero-subtitle">A comprehensive analysis of model performance on code editing tasks.</div>
    <div class="hero-subtitle" style="font-size: 0.9rem; margin-top: 10px;">
        <strong>Current Run:</strong> {run_title} • {current_run['created_at']}
    </div>
</div>
""", unsafe_allow_html=True)
    # Key metrics row
    col1, col2, col3, col4 = st.columns(4)
    total_results = model_performance['total_results'].sum()
    overall_success = model_performance['success_rate'].mean()
    total_cost = model_performance['total_cost'].sum()
    avg_latency = model_performance['avg_round_trip_ms'].mean()
    with col1:
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value">{len(model_performance)}</div>
    <div class="custom-metric-label">Models Tested</div>
</div>
""", unsafe_allow_html=True)
    with col2:
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value">{total_results}</div>
    <div class="custom-metric-label">Valid Results</div>
</div>
""", unsafe_allow_html=True)
    with col3:
        success_color = "#10b981" if overall_success > 0.8 else "#f59e0b" if overall_success > 0.6 else "#ef4444"
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value" style="color: {success_color}">{overall_success:.1%}</div>
    <div class="custom-metric-label">Avg Success Rate</div>
</div>
""", unsafe_allow_html=True)
    with col4:
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value">${total_cost:.3f}</div>
    <div class="custom-metric-label">Total Cost</div>
</div>
""", unsafe_allow_html=True)
def render_model_comparison_cards(model_performance):
    """Render beautiful model comparison cards"""
    st.markdown("## Model Leaderboard")
    # Find best performer (rows are already sorted by success rate, then latency)
    best_model = model_performance.iloc[0]['model_id']
    for idx, model in model_performance.iterrows():
        is_best = model['model_id'] == best_model
        grade, grade_class = get_performance_grade(model['success_rate'])
        # Create a container for each model
        with st.container():
            col1, col2 = st.columns([3, 1])
            with col1:
                # Use Streamlit's native components instead of raw HTML
                if is_best:
                    st.success(f"**{model['model_id']}** - Best Performer")
                else:
                    st.info(f"**{model['model_id']}**")
                # Success rate with color coding
                success_rate = model['success_rate']
                if success_rate >= 0.8:
                    st.success(f"**Success Rate:** {success_rate:.1%} ({grade})")
                elif success_rate >= 0.6:
                    st.warning(f"**Success Rate:** {success_rate:.1%} ({grade})")
                else:
                    st.error(f"**Success Rate:** {success_rate:.1%} ({grade})")
                # Metrics in columns
                metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
                with metric_col1:
                    if pd.notna(model['avg_round_trip_ms']):
                        st.metric("Avg Latency", f"{model['avg_round_trip_ms']:.0f}ms")
                    else:
                        st.metric("Avg Latency", "N/A")
                with metric_col2:
                    if pd.notna(model['avg_cost']):
                        st.metric("Avg Cost", f"${model['avg_cost']:.4f}")
                    else:
                        st.metric("Avg Cost", "N/A")
                with metric_col3:
                    st.metric("Valid Results", f"{model['total_results']}")
                with metric_col4:
                    if pd.notna(model['avg_first_token_ms']):
                        st.metric("First Token", f"{model['avg_first_token_ms']:.0f}ms")
                    else:
                        st.metric("First Token", "N/A")
            with col2:
                st.write("")  # Add some spacing
                if st.button("Drill Down", key=f"drill_{model['model_id']}", use_container_width=True):
                    st.session_state.drill_down_model = model['model_id']
                    # Update URL with model_id for drill down
                    st.query_params["model_id"] = model['model_id']
                    st.rerun()
        st.divider()  # Add a divider between models
def render_comparison_charts(model_performance):
    """Render interactive comparison charts"""
    st.markdown("## Performance Analysis")
    col1, col2 = st.columns(2)
    with col1:
        # Time to First Edit
        fig_first_edit = px.bar(
            model_performance,
            x='model_id',
            y='avg_first_edit_ms',
            title="Time to First Edit",
            labels={'avg_first_edit_ms': 'Time to First Edit (ms)', 'model_id': 'Model'},
            color='avg_first_edit_ms',
            color_continuous_scale='bluered',
            text='avg_first_edit_ms',
            template='plotly_dark'
        )
        fig_first_edit.update_traces(texttemplate='%{text:.0f}ms', textposition='outside')
        fig_first_edit.update_layout(
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Azeret Mono, monospace"),
            margin=dict(t=50)
        )
        st.plotly_chart(fig_first_edit, use_container_width=True)
    with col2:
        # Latency vs Cost Scatter
        fig_scatter = px.scatter(
            model_performance,
            x='avg_round_trip_ms',
            y='avg_cost',
            size='total_results',
            color='success_rate',
            hover_name='model_id',
            title="Latency vs Cost Analysis",
            labels={
                'avg_round_trip_ms': 'Avg Round Trip (ms)',
                'avg_cost': 'Avg Cost ($)',
                'success_rate': 'Success Rate',
                'total_results': 'Valid Results'
            },
            color_continuous_scale='RdYlGn',
            template='plotly_dark'
        )
        fig_scatter.update_layout(
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Azeret Mono, monospace")
        )
        st.plotly_chart(fig_scatter, use_container_width=True)
def render_detailed_analysis(run_id, model_id):
    """Render detailed drill-down analysis"""
    st.markdown(f"## Detailed Analysis: {model_id}")
    # Load all results (including invalid attempts)
    detailed_results = load_detailed_results(run_id, model_id)
    # Also load only valid results for metrics
    valid_results = load_detailed_results(run_id, model_id, valid_only=True)
    if detailed_results.empty:
        st.warning("No detailed results found.")
        return
    # Show total vs valid results
    st.info(f"Showing all {len(detailed_results)} results ({len(valid_results)} valid, {len(detailed_results) - len(valid_results)} invalid)")
    # Results overview
    col1, col2, col3 = st.columns(3)
    with col1:
        success_count = valid_results['succeeded'].sum()
        total_count = len(valid_results)
        if total_count > 0:
            st.metric("Success Rate", f"{success_count}/{total_count} ({success_count/total_count:.1%} of valid results)")
        else:
            # Guard against division by zero when every attempt was invalid
            st.metric("Success Rate", "N/A (no valid results)")
    with col2:
        avg_latency = detailed_results['time_round_trip_ms'].mean()
        st.metric("Avg Latency", f"{avg_latency:.0f}ms")
    with col3:
        total_cost = detailed_results['cost_usd'].sum()
        st.metric("Total Cost", f"${total_cost:.4f}")
    # Interactive results table
    st.markdown("### 📋 Individual Results")
    # Add result selector with indicators for valid/invalid attempts
    result_options = []
    for idx, row in detailed_results.iterrows():
        # Check if this is a valid result
        is_valid = (row['error_enum'] not in [1, 6, 7]) if not pd.isna(row['error_enum']) else True
        # Create status indicator
        if is_valid:
            status = "✅" if row['succeeded'] else "❌"
        else:
            status = "⚠️"  # Warning symbol for invalid results
        # Add validity indicator to the option text
        validity_text = "" if is_valid else " [INVALID RESULT]"
        result_options.append(f"{status} {row['task_id']} - {row['time_round_trip_ms']:.0f}ms{validity_text}")
    selected_result_idx = st.selectbox(
        "Select a result to analyze:",
        range(len(result_options)),
        format_func=lambda x: result_options[x]
    )
    if selected_result_idx is not None:
        render_result_detail(detailed_results.iloc[selected_result_idx])
def render_result_detail(result):
    """Render detailed view of a single result"""
    st.markdown("### 🔬 Result Deep Dive")
    # Check if this is a valid result (only invalid if no/wrong tool call or wrong file)
    is_valid = True
    if not pd.isna(result['error_enum']):
        # Only these specific errors make a result "invalid" for the benchmark,
        # matching the SQL filter and the check in render_detailed_analysis:
        # 1 = no_tool_calls, 6 = wrong_tool_call, 7 = wrong_file_edited
        is_valid = result['error_enum'] not in [1, 6, 7]
    # Show validity warning if needed
    if not is_valid:
        st.warning("⚠️ **This is an invalid result** - The model didn't call the replace_in_file tool, used the wrong tool, or edited the wrong file. This result is excluded from success rate calculations.")
    # Result metadata
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        status_icon = "✅" if result['succeeded'] else "❌"
        st.markdown(f"**Status:** {status_icon} {'Success' if result['succeeded'] else 'Failed'}")
    with col2:
        st.markdown(f"**Task ID:** {result['task_id']}")
    with col3:
        st.markdown(f"**Round Trip:** {result['time_round_trip_ms']:.0f}ms")
    with col4:
        if pd.notna(result['cost_usd']) and result['cost_usd'] is not None:
            st.markdown(f"**Cost:** ${result['cost_usd']:.4f}")
        else:
            st.markdown("**Cost:** Free")
    # Tabbed interface for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📄 File & Edits", "🤖 Raw Output", "🔧 Parsed Tool Call", "📊 Metrics"])
    with tab1:
        render_file_and_edits_view(result)
    with tab2:
        render_raw_output_view(result)
    with tab3:
        render_parsed_tool_call_view(result)
    with tab4:
        render_metrics_view(result)
def render_file_and_edits_view(result):
    """Render side-by-side file and edits view"""
    st.markdown("#### 📄 File Content & Edit Analysis")
    # Check if we have original file content
    has_original = not pd.isna(result['original_file_content']) and result['original_file_content']
    has_edited = not pd.isna(result['edited_file_content']) and result['edited_file_content']
    if not has_original and not has_edited:
        st.warning("No file content available for this result.")
        return
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**Original File:**")
        if has_original:
            filepath = result['original_filepath'] if not pd.isna(result['original_filepath']) else 'Unknown file'
            st.markdown(f"📁 `{filepath}`")
            # Display full original file content in a scrollable code block
            with st.expander("View Original File Content", expanded=True):
                # Prepare content for the copy button (needs JS-specific escaping)
                raw_content_for_copy = result['original_file_content']
                # Escape for a JavaScript template literal: backslashes first, then
                # backticks, ${ interpolation markers, and newline variants
                js_escaped_content = raw_content_for_copy.replace('\\', '\\\\') \
                    .replace('`', '\\`') \
                    .replace('${', '\\${') \
                    .replace('\r\n', '\\n') \
                    .replace('\n', '\\n') \
                    .replace('\r', '\\n')
                unique_suffix = str(result.name if hasattr(result, 'name') else result['task_id']).replace('-', '_').replace('.', '_')
                button_id = f"copyBtnOriginal_{unique_suffix}"
                copy_button_html = f"""
                <button id="{button_id}" onclick="copyOriginalToClipboard(`{js_escaped_content}`, '{button_id}')" style="margin-bottom: 10px; padding: 5px 10px; border-radius: 5px; border: 1px solid #ccc; cursor: pointer;">Copy Original File</button>
                <script>
                if (!window.copyOriginalToClipboard) {{
                    window.copyOriginalToClipboard = async function(text, buttonId) {{
                        try {{
                            await navigator.clipboard.writeText(text);
                            const button = document.getElementById(buttonId);
                            button.innerText = 'Copied!';
                            button.style.backgroundColor = '#d4edda'; // Optional: success feedback
                            setTimeout(() => {{
                                button.innerText = 'Copy Original File';
                                button.style.backgroundColor = '';
                            }}, 2000);
                        }} catch (err) {{
                            console.error('Failed to copy original: ', err);
                            const button = document.getElementById(buttonId);
                            button.innerText = 'Copy Failed!';
                            button.style.backgroundColor = '#f8d7da'; // Optional: error feedback
                            setTimeout(() => {{
                                button.innerText = 'Copy Original File';
                                button.style.backgroundColor = '';
                            }}, 2000);
                        }}
                    }}
                }}
                </script>
                """
                st.components.v1.html(copy_button_html, height=50)
                # Prepare content for st.code (needs actual newlines)
                content_for_display = result['original_file_content']
                # Iteratively replace common escaped newline sequences with actual
                # newlines; handle double-escaped sequences before single-escaped ones.
                content_for_display = content_for_display.replace('\\\\r\\\\n', '\r\n').replace('\\\\n', '\n')  # Double escaped
                content_for_display = content_for_display.replace('\\r\\n', '\r\n').replace('\\n', '\n')  # Single escaped
                language = guess_language_from_filepath(filepath)
                st.code(content_for_display, language=language, line_numbers=False)
        else:
            st.warning("Original file content not available")
    with col2:
        st.markdown("**Edit Analysis:**")
        if not result['succeeded']:
            # Show error information
            st.error("❌ **Edit Failed**")
            # Show detailed error reason
            if not pd.isna(result['error_enum']):
                error_description = get_error_description(
                    result['error_enum'],
                    result.get('error_string')
                )
                st.markdown(f"**Reason:** {error_description}")
                # Show specific guidance based on error type
                guidance = get_error_guidance(result['error_enum'])
                if guidance:
                    st.info(guidance)
            # For valid results that failed (no error enum recorded), check for
            # diff application failures
            else:
                raw_output = result.get('raw_model_output', '')
                # Check if we have specific error information in the raw output
                if 'does not match anything in the file' in str(raw_output).lower():
                    st.warning("⚠️ **Diff Application Failed**")
                    st.info("💡 The SEARCH block in the diff didn't match any content in the original file. This usually means the model hallucinated code that doesn't exist.")
                elif 'malformatted' in str(raw_output).lower() or 'malformed' in str(raw_output).lower():
                    st.warning("⚠️ **Diff Format Error**")
                    st.info("💡 The diff format was incorrect. Check the raw tool call to see the formatting issues.")
                elif 'error:' in str(raw_output).lower():
                    # Try to extract the specific error message
                    lines = str(raw_output).split('\n')
                    error_lines = [line for line in lines if 'error:' in line.lower()]
                    if error_lines:
                        error_msg = error_lines[0].strip()
                        st.warning("⚠️ **Diff Application Failed**")
                        st.info(f"💡 {error_msg}")
                    else:
                        st.warning("⚠️ **Diff Application Failed**")
                        st.info("💡 The diff couldn't be applied to the original file. Check the raw output and parsed tool call for more details.")
                else:
                    # Generic diff application failure
                    st.warning("⚠️ **Diff Application Failed**")
                    st.info("💡 The model made a valid tool call but the diff couldn't be applied to the original file. This usually indicates a mismatch between the expected and actual file content.")
        else:
            # Show successful edit information
            st.success("✅ **Edit Successful**")
            # Show edit metrics
            metric_col1, metric_col2, metric_col3 = st.columns(3)
            with metric_col1:
                if not pd.isna(result['num_edits']):
                    st.metric("Edits", int(result['num_edits']))
            with metric_col2:
                if not pd.isna(result['num_lines_added']):
                    st.metric("Added", int(result['num_lines_added']))
            with metric_col3:
                if not pd.isna(result['num_lines_deleted']):
                    st.metric("Deleted", int(result['num_lines_deleted']))
        # Show edited file if available
        if has_edited:
            st.markdown("**Edited File:**")
            with st.expander("View Edited File Content"):
                edited_lines = result['edited_file_content'].split('\n')
                for i, line in enumerate(edited_lines[:50], 1):
                    st.text(f"{i:3d} | {line}")
                if len(edited_lines) > 50:
                    st.text(f"... ({len(edited_lines) - 50} more lines)")
        # Show raw and parsed tool calls if available
        if not pd.isna(result['parsed_tool_call_json']):
            with st.expander("View Raw Tool Call"):
                # Extract the raw tool call text from the model output
                raw_output = result['raw_model_output'] if not pd.isna(result['raw_model_output']) else ""
                # Try to extract just the tool call portion
                if raw_output and '<replace_in_file>' in raw_output:
                    # Find the tool call block
                    start_idx = raw_output.find('<replace_in_file>')
                    end_idx = raw_output.find('</replace_in_file>') + len('</replace_in_file>')
                    if start_idx != -1 and end_idx != -1:
                        raw_tool_call = raw_output[start_idx:end_idx]
                        st.code(raw_tool_call, language='xml')
                    else:
                        st.text("Tool call not found in raw output")
                else:
                    st.text("No raw tool call available")
            with st.expander("View Parsed Tool Call"):
                try:
                    parsed_call = json.loads(result['parsed_tool_call_json'])
                    st.json(parsed_call)
                except (json.JSONDecodeError, TypeError):
                    st.text(result['parsed_tool_call_json'])
def render_raw_output_view(result):
    """Render raw model output"""
    st.markdown("#### 🤖 Raw Model Output")
    if pd.isna(result['raw_model_output']) or not result['raw_model_output']:
        st.warning("No raw output available for this result.")
        return
    st.markdown("""
<div class="file-viewer">
""", unsafe_allow_html=True)
    st.text(result['raw_model_output'])
    st.markdown("</div>", unsafe_allow_html=True)

def render_parsed_tool_call_view(result):
    """Render parsed tool call analysis"""
    st.markdown("#### 🔧 Parsed Tool Call Analysis")
    if pd.isna(result['parsed_tool_call_json']) or not result['parsed_tool_call_json']:
        st.warning("No parsed tool call available for this result.")
        return
    try:
        parsed_call = json.loads(result['parsed_tool_call_json'])
        # Pretty print the JSON
        st.json(parsed_call)
        # If it's a replace_in_file call, show the diff blocks
        if isinstance(parsed_call, dict) and 'diff' in parsed_call:
            st.markdown("**Diff Blocks:**")
            st.code(parsed_call['diff'], language='diff')
    except json.JSONDecodeError:
        st.markdown("**Raw Parsed Call (Invalid JSON):**")
        st.text(result['parsed_tool_call_json'])
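# A parsed replace_in_file call is expected to look roughly like this (hypothetical shape):
#   {"path": "src/example.py", "diff": "<<<<<<< SEARCH\n...\n=======\n...\n>>>>>>> REPLACE"}
# Only the 'diff' key is inspected above; everything else is just pretty-printed.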
def render_metrics_view(result):
    """Render detailed metrics for the result"""
    st.markdown("#### 📊 Detailed Metrics")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**Timing Metrics:**")
        if not pd.isna(result['time_to_first_token_ms']):
            st.metric("Time to First Token", f"{result['time_to_first_token_ms']:.0f}ms")
        if not pd.isna(result['time_to_first_edit_ms']):
            st.metric("Time to First Edit", f"{result['time_to_first_edit_ms']:.0f}ms")
        if not pd.isna(result['time_round_trip_ms']):
            st.metric("Round Trip Time", f"{result['time_round_trip_ms']:.0f}ms")
    with col2:
        st.markdown("**Token & Cost Metrics:**")
        if not pd.isna(result['completion_tokens']):
            st.metric("Completion Tokens", int(result['completion_tokens']))
        if pd.notna(result['cost_usd']) and result['cost_usd'] is not None:
            st.metric("Cost", f"${result['cost_usd']:.4f}")
        else:
            st.metric("Cost", "Free")
        if not pd.isna(result['tokens_in_context']):
            st.metric("Context Tokens", int(result['tokens_in_context']))
def guess_language_from_filepath(filepath):
    """Guess the language for syntax highlighting from filepath.

    Note: this local definition shadows the version imported from utils above.
    """
    if not filepath or pd.isna(filepath):
        return None
    extension_map = {
        '.py': 'python',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.java': 'java',
        '.cs': 'csharp',
        '.cpp': 'cpp',
        '.c': 'c',
        '.html': 'html',
        '.css': 'css',
        '.json': 'json',
        '.sql': 'sql',
        '.md': 'markdown',
        '.rb': 'ruby',
        '.php': 'php',
        '.go': 'go',
        '.rs': 'rust',
        '.swift': 'swift',
        '.kt': 'kotlin',
        '.sh': 'bash',
        '.yaml': 'yaml',
        '.yml': 'yaml',
        '.xml': 'xml',
    }
    _, ext = os.path.splitext(filepath)
    # The original ended here without returning; map the extension
    # (case-insensitively) and fall back to None so st.code renders plain text
    return extension_map.get(ext.lower())
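# e.g. guess_language_from_filepath("src/main.py") -> "python" (hypothetical path);
# extensions missing from the map, such as ".tsx", fall back to None.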
def main():
    # Add a note about valid attempts (markdown stays flush-left so it isn't
    # rendered as a code block)
    st.sidebar.markdown("""
### Note on Metrics
Success rates are calculated based on **valid results only**.
Invalid results (where the model didn't call the diff edit tool or edited the wrong file) are excluded from calculations.
""")
    # Initialize session state
    if 'drill_down_model' not in st.session_state:
        st.session_state.drill_down_model = None
    if 'selected_run_id' not in st.session_state:
        st.session_state.selected_run_id = None
    # Handle URL parameters for direct linking
    query_params = st.query_params
    url_run_id = query_params.get("run_id")
    url_model_id = query_params.get("model_id")
    # Load all runs for sidebar
    all_runs = load_all_runs()
    if all_runs.empty:
        st.error("No evaluation runs found in the database.")
        st.stop()
    # Set initial run selection from URL or default to latest
    if url_run_id and url_run_id in all_runs['run_id'].values:
        if st.session_state.selected_run_id != url_run_id:
            st.session_state.selected_run_id = url_run_id
            st.session_state.drill_down_model = None  # Reset drill down when changing runs via URL
    elif st.session_state.selected_run_id is None:
        st.session_state.selected_run_id = all_runs.iloc[0]['run_id']  # Default to latest
    # Set drill down model from URL
    if url_model_id and st.session_state.selected_run_id == url_run_id:
        st.session_state.drill_down_model = url_model_id
    # Sidebar for run selection
    with st.sidebar:
        st.markdown("## 📊 Evaluation Runs")
        st.markdown("Select a run to analyze:")
        # Create run options with nice formatting
        run_options = []
        run_ids = []
        for idx, run in all_runs.iterrows():
            # Format the run description nicely
            date_str = run['created_at'][:10]  # Get just the date part
            time_str = run['created_at'][11:16]  # Get just the time part
            if run['description']:
                display_name = f"🚀 {run['description']}"
            else:
                display_name = f"📅 Run {run['run_id'][:8]}..."
            run_options.append(f"{display_name}\n📅 {date_str} {time_str}")
            run_ids.append(run['run_id'])
        # Default to latest run if no selection
        if st.session_state.selected_run_id is None:
            default_index = 0  # Latest run is first
            st.session_state.selected_run_id = run_ids[0]
        else:
            try:
                default_index = run_ids.index(st.session_state.selected_run_id)
            except ValueError:
                default_index = 0
                st.session_state.selected_run_id = run_ids[0]
        selected_run_idx = st.selectbox(
            "Choose run:",
            range(len(run_options)),
            format_func=lambda x: run_options[x],
            index=default_index,
            key="run_selector"
        )
        # Update selected run if changed
        if run_ids[selected_run_idx] != st.session_state.selected_run_id:
            st.session_state.selected_run_id = run_ids[selected_run_idx]
            st.session_state.drill_down_model = None  # Reset drill down when changing runs
            # Update URL with new run_id
            st.query_params["run_id"] = st.session_state.selected_run_id
            if "model_id" in st.query_params:
                del st.query_params["model_id"]  # Clear model_id when changing runs
            st.rerun()
        # Show run details in sidebar
        selected_run = all_runs.iloc[selected_run_idx]
        st.markdown("---")
        st.markdown("### 📋 Run Details")
        st.markdown(f"**Run ID:** `{selected_run['run_id'][:12]}...`")
        st.markdown(f"**Created:** {selected_run['created_at']}")
        if selected_run['description']:
            st.markdown(f"**Description:** {selected_run['description']}")
        # Show shareable URL
        st.markdown("---")
        st.markdown("### 🔗 Share This View")
        # Build current URL by dynamically deriving the base URL
        try:
            # For older Streamlit versions
            server_address = st.server.server_address
            server_port = st.server.server_port
        except AttributeError:
            # Fallback for newer Streamlit versions where st.server is removed.
            # We can't reliably get the server address/port from within the script,
            # so default to localhost and the default port; the correct network URL
            # is shown in the terminal.
            server_address = "localhost"
            server_port = 8501
        base_url = f"http://{server_address}:{server_port}"
        current_url = f"{base_url}/?run_id={st.session_state.selected_run_id}"
        if st.session_state.drill_down_model:
            current_url += f"&model_id={st.session_state.drill_down_model}"
        st.markdown("**Current URL:**")
        st.code(current_url, language=None)
        # Copy button using HTML/JS; the button passes itself (`this`) so the
        # feedback code doesn't rely on the non-standard global `event`
        copy_button_html = f"""
        <button onclick="copyToClipboard(this, '{current_url}')" style="
            padding: 8px 16px;
            border-radius: 5px;
            border: 1px solid #ccc;
            background: #f0f2f6;
            cursor: pointer;
            font-size: 14px;
            margin-top: 5px;
        ">📋 Copy Link</button>
        <script>
        function copyToClipboard(button, text) {{
            navigator.clipboard.writeText(text).then(function() {{
                // Success feedback
                button.innerText = '✅ Copied!';
                button.style.backgroundColor = '#d4edda';
                setTimeout(() => {{
                    button.innerText = '📋 Copy Link';
                    button.style.backgroundColor = '#f0f2f6';
                }}, 2000);
            }}, function(err) {{
                // Error feedback
                button.innerText = '❌ Failed';
                button.style.backgroundColor = '#f8d7da';
                setTimeout(() => {{
                    button.innerText = '📋 Copy Link';
                    button.style.backgroundColor = '#f0f2f6';
                }}, 2000);
            }});
        }}
        </script>
        """
        st.components.v1.html(copy_button_html, height=50)
    # Load data for selected run
    current_run, model_performance = load_run_comparison(st.session_state.selected_run_id)
    if current_run is None or model_performance.empty:
        st.error("No data found for the selected run.")
        st.stop()
    # Render main dashboard
    render_hero_section(current_run, model_performance)
    # Check if we're in drill-down mode
    if st.session_state.drill_down_model:
        col1, col2 = st.columns([1, 4])
        with col1:
            if st.button("Back to Overview", use_container_width=True):
                st.session_state.drill_down_model = None
                # Clear model_id from URL when going back to overview
                if "model_id" in st.query_params:
                    del st.query_params["model_id"]
                st.rerun()
        render_detailed_analysis(current_run['run_id'], st.session_state.drill_down_model)
    else:
        # Success Rate Comparison
        fig_success = px.bar(
            model_performance,
            x='model_id',
            y='success_rate',
            title="Success Rate by Model",
            labels={'success_rate': 'Success Rate', 'model_id': 'Model'},
            color='success_rate',
            color_continuous_scale='RdYlGn',
            text='success_rate',
            template='plotly_dark'
        )
        fig_success.update_traces(texttemplate='%{text:.1%}', textposition='outside')
        fig_success.update_layout(
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Azeret Mono, monospace"),
            yaxis_range=[0, 1],  # Set y-axis from 0% to 100%
            margin=dict(t=50)  # Add top margin to prevent clipping
        )
        st.plotly_chart(fig_success, use_container_width=True)
        render_model_comparison_cards(model_performance)
        render_comparison_charts(model_performance)

if __name__ == "__main__":
    main()