app.py

import streamlit as st
import sqlite3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from datetime import datetime
import os
import json
import difflib
import streamlit.components.v1  # noqa: F401 -- ensures st.components.v1 resolves on Streamlit versions that don't expose it by default
# import mimetypes # No longer needed here if guess_language_from_filepath handles it
from utils import get_database_connection, guess_language_from_filepath # Import from utils
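# Assumption: get_database_connection() returns a DB-API connection (the sqlite3
# import above suggests SQLite) that pandas.read_sql_query can consume, and
# guess_language_from_filepath maps a filepath to a syntax-highlighting language
# name (a local fallback with the same name is defined near the bottom of this file).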
# Page config
st.set_page_config(
    page_title="Diff Edits Evaluation Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for beautiful styling
st.markdown("""
<style>
    /* Import Google Fonts */
    @import url('https://fonts.googleapis.com/css2?family=Azeret+Mono:wght@400;700&display=swap');

    /* Global Styles */
    .main {
        font-family: 'Azeret Mono', monospace;
    }

    /* Hero Section */
    .hero-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 2rem;
        border-radius: 15px;
        margin-bottom: 2rem;
        color: white;
        text-align: center;
    }

    .hero-title {
        font-size: 3rem;
        font-weight: 700;
        margin-bottom: 0.5rem;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
    }

    .hero-subtitle {
        font-size: 1.2rem;
        font-weight: 300;
        opacity: 0.9;
    }

    /* Model Performance Cards */
    .model-card {
        background: white;
        border-radius: 15px;
        padding: 1.5rem;
        margin: 1rem 0;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
        border: 1px solid rgba(255,255,255,0.2);
        transition: transform 0.3s ease, box-shadow 0.3s ease;
    }

    .model-card:hover {
        transform: translateY(-5px);
        box-shadow: 0 12px 40px rgba(0,0,0,0.15);
    }

    .model-card.best-performer {
        border: 2px solid #00D4AA;
        background: linear-gradient(135deg, #f0fdf4 0%, #ecfdf5 100%);
    }

    .model-name {
        font-size: 1.5rem;
        font-weight: 600;
        margin-bottom: 1rem;
        color: #1f2937;
    }

    .success-rate {
        font-size: 3rem;
        font-weight: 700;
        margin-bottom: 0.5rem;
    }

    .success-rate.excellent { color: #10b981; }
    .success-rate.good { color: #f59e0b; }
    .success-rate.poor { color: #ef4444; }

    .metric-row {
        display: flex;
        justify-content: space-between;
        margin: 0.5rem 0;
        padding: 0.5rem;
        background: rgba(0,0,0,0.02);
        border-radius: 8px;
    }

    .metric-label {
        font-weight: 500;
        color: #6b7280;
    }

    .metric-value {
        font-weight: 600;
        color: #1f2937;
    }

    /* Performance Badge */
    .performance-badge {
        display: inline-block;
        padding: 0.25rem 0.75rem;
        border-radius: 20px;
        font-weight: 600;
        font-size: 0.875rem;
        margin-left: 1rem;
    }

    .badge-a { background: #10b981; color: white; }
    .badge-b { background: #f59e0b; color: white; }
    .badge-c { background: #ef4444; color: white; }

    /* Comparison Charts */
    .chart-container {
        background: white;
        border-radius: 15px;
        padding: 1.5rem;
        margin: 1rem 0;
        box-shadow: 0 4px 20px rgba(0,0,0,0.08);
    }

    /* Result Detail Modal */
    .result-detail {
        background: white;
        border-radius: 15px;
        padding: 2rem;
        margin: 1rem 0;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    }

    .file-viewer {
        background: #f8fafc;
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 1rem;
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        font-size: 0.875rem;
        line-height: 1.5;
        overflow-x: auto;
    }

    .diff-added {
        background-color: #dcfce7;
        color: #166534;
    }

    .diff-removed {
        background-color: #fef2f2;
        color: #dc2626;
    }

    .error-display {
        background: #fef2f2;
        border: 1px solid #fecaca;
        border-radius: 8px;
        padding: 1rem;
        color: #dc2626;
        font-family: monospace;
    }

    /* Sidebar Styling */
    .sidebar .sidebar-content {
        background: linear-gradient(180deg, #f8fafc 0%, #f1f5f9 100%);
    }

    /* Custom Metrics */
    .custom-metric {
        text-align: center;
        padding: 1rem;
        background: white;
        border-radius: 10px;
        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        margin: 0.5rem 0;
    }

    .custom-metric-value {
        font-size: 2rem;
        font-weight: 700;
        color: #1f2937;
    }

    .custom-metric-label {
        font-size: 0.875rem;
        color: #6b7280;
        font-weight: 500;
        margin-top: 0.25rem;
    }
</style>
""", unsafe_allow_html=True)
# Enhanced data loading functions
@st.cache_data
def load_all_runs():
    """Load all evaluation runs"""
    conn = get_database_connection()
    query = """
        SELECT run_id, description, created_at, system_prompt_hash
        FROM runs
        ORDER BY created_at DESC
    """
    return pd.read_sql_query(query, conn)
@st.cache_data
def load_run_comparison(run_id):
    """Load a specific run with model comparison data"""
    conn = get_database_connection()
    # Get the run details; qmark placeholders (the sqlite3 paramstyle) keep
    # run_id out of the SQL string instead of interpolating it
    run_query = """
        SELECT run_id, description, created_at, system_prompt_hash
        FROM runs
        WHERE run_id = ?
    """
    run_data = pd.read_sql_query(run_query, conn, params=(run_id,))
    if run_data.empty:
        return None, None
    # Get model performance for this run
    model_perf_query = """
        SELECT
            res.model_id,
            COUNT(*) as total_results,
            AVG(CASE WHEN res.succeeded THEN 1.0 ELSE 0.0 END) as success_rate,
            AVG(res.cost_usd) as avg_cost,
            SUM(res.cost_usd) as total_cost,
            AVG(res.time_to_first_token_ms) as avg_first_token_ms,
            AVG(res.time_to_first_edit_ms) as avg_first_edit_ms,
            AVG(res.time_round_trip_ms) as avg_round_trip_ms,
            AVG(res.completion_tokens) as avg_completion_tokens,
            AVG(res.num_edits) as avg_num_edits,
            MIN(res.time_round_trip_ms) as min_round_trip_ms,
            MAX(res.time_round_trip_ms) as max_round_trip_ms
        FROM results res
        JOIN cases c ON res.case_id = c.case_id
        WHERE c.run_id = ?
        AND (res.error_enum NOT IN (1, 6, 7) OR res.error_enum IS NULL) -- Exclude: no_tool_calls, wrong_tool_call, wrong_file_edited
        GROUP BY res.model_id
        ORDER BY success_rate DESC, avg_round_trip_ms ASC
    """
    model_performance = pd.read_sql_query(model_perf_query, conn, params=(run_id,))
    return run_data.iloc[0], model_performance
@st.cache_data
def load_latest_run_comparison():
    """Load the latest run with model comparison data"""
    conn = get_database_connection()
    # Get the latest run
    latest_run_query = """
        SELECT run_id, description, created_at, system_prompt_hash
        FROM runs
        ORDER BY created_at DESC
        LIMIT 1
    """
    latest_run = pd.read_sql_query(latest_run_query, conn)
    if latest_run.empty:
        return None, None
    return load_run_comparison(latest_run.iloc[0]['run_id'])
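# Note: this delegates to load_run_comparison, so the latest run's data is served
# from the same per-run_id st.cache_data entry and shouldn't be re-queried on rerun.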
@st.cache_data
def load_detailed_results(run_id, model_id=None, valid_only=False):
    """Load detailed results for drill-down analysis"""
    conn = get_database_connection()
    # Build the WHERE clause with bound parameters instead of string interpolation
    where_clauses = ["c.run_id = ?"]
    params = [run_id]
    if model_id:
        where_clauses.append("res.model_id = ?")
        params.append(model_id)
    # Option to filter out invalid attempts
    if valid_only:
        where_clauses.append("(res.error_enum NOT IN (1, 6, 7) OR res.error_enum IS NULL)")
    query = f"""
        SELECT
            res.*,
            c.task_id,
            c.description as case_description,
            c.tokens_in_context,
            sp.name as system_prompt_name,
            pf.name as processing_functions_name,
            orig_f.filepath as original_filepath,
            orig_f.content as original_file_content,
            edit_f.filepath as edited_filepath,
            edit_f.content as edited_file_content
        FROM results res
        JOIN cases c ON res.case_id = c.case_id
        LEFT JOIN system_prompts sp ON c.system_prompt_hash = sp.hash
        LEFT JOIN processing_functions pf ON res.processing_functions_hash = pf.hash
        LEFT JOIN files orig_f ON c.file_hash = orig_f.hash
        LEFT JOIN files edit_f ON res.file_edited_hash = edit_f.hash
        WHERE {' AND '.join(where_clauses)}
        ORDER BY res.created_at DESC
    """
    return pd.read_sql_query(query, conn, params=params)
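# Usage sketch (run/model IDs are hypothetical):
#   load_detailed_results("run_abc123", model_id="model-x")                   # every attempt
#   load_detailed_results("run_abc123", model_id="model-x", valid_only=True)  # benchmark-valid only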
def get_performance_grade(success_rate):
    """Get performance grade based on success rate"""
    if success_rate >= 0.9:
        return "A+", "excellent"
    elif success_rate >= 0.8:
        return "A", "excellent"
    elif success_rate >= 0.7:
        return "B+", "good"
    elif success_rate >= 0.6:
        return "B", "good"
    elif success_rate >= 0.5:
        return "C+", "good"
    else:
        return "C", "poor"
def get_error_description(error_enum, error_string=None):
    """Map error enum values to user-friendly descriptions"""
    error_map = {
        1: "No tool calls - Model didn't use the replace_in_file tool",
        2: "Multiple tool calls - Model called multiple tools instead of one",
        3: "Wrong tool call - Model used wrong tool (not replace_in_file)",
        4: "Missing parameters - Tool call missing required path or diff",
        5: "Wrong file edited - Model edited different file than expected",
        6: "Wrong tool call - Model used wrong tool type",
        7: "Wrong file edited - Model targeted incorrect file path",
        8: "API/Stream error - Problem with model API connection",
        9: "Configuration error - Invalid evaluation parameters",
        10: "Function error - Invalid parsing/diff functions",
        11: "Other error - Unexpected failure"
    }
    base_description = error_map.get(error_enum, f"Unknown error (code: {error_enum})")
    if error_string:
        return f"{base_description}: {error_string}"
    return base_description

def get_error_guidance(error_enum):
    """Provide specific guidance based on error type"""
    guidance_map = {
        1: "💡 The model provided a response but didn't use the replace_in_file tool. Check the raw output to see what the model actually said.",
        2: "💡 The model called multiple tools when it should only call replace_in_file once. Check the parsed tool call section.",
        3: "💡 The model used a different tool instead of replace_in_file. This might indicate confusion about the task.",
        4: "💡 The model called replace_in_file but didn't provide the required 'path' or 'diff' parameters.",
        5: "💡 The model tried to edit a different file than expected. Check the parsed tool call to see which file it targeted.",
        6: "💡 The model used the wrong tool type. Check the raw output to see what tool it attempted to use.",
        7: "💡 The model tried to edit a different file path than expected. This could indicate path confusion or hallucination.",
    }
    return guidance_map.get(error_enum, "")
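# e.g. get_error_description(4, "missing 'diff'") ->
#   "Missing parameters - Tool call missing required path or diff: missing 'diff'"
# Enums 8-11 have no guidance_map entry, so get_error_guidance returns "" for them.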
def render_hero_section(current_run, model_performance):
    """Render the hero section with key metrics"""
    run_title = current_run['description'] if current_run['description'] else f"Run {current_run['run_id'][:8]}..."
    # HTML strings stay flush-left: indented lines inside st.markdown would be
    # rendered as markdown code blocks
    st.markdown(f"""
<div class="hero-container">
    <div class="hero-title">Diff Edit Evaluation Results</div>
    <div class="hero-subtitle">A comprehensive analysis of model performance on code editing tasks.</div>
    <div class="hero-subtitle" style="font-size: 0.9rem; margin-top: 10px;">
        <strong>Current Run:</strong> {run_title} • {current_run['created_at']}
    </div>
</div>
""", unsafe_allow_html=True)
    # Key metrics row
    col1, col2, col3, col4 = st.columns(4)
    total_results = model_performance['total_results'].sum()
    overall_success = model_performance['success_rate'].mean()
    total_cost = model_performance['total_cost'].sum()
    avg_latency = model_performance['avg_round_trip_ms'].mean()
    with col1:
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value">{len(model_performance)}</div>
    <div class="custom-metric-label">Models Tested</div>
</div>
""", unsafe_allow_html=True)
    with col2:
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value">{total_results}</div>
    <div class="custom-metric-label">Valid Results</div>
</div>
""", unsafe_allow_html=True)
    with col3:
        success_color = "#10b981" if overall_success > 0.8 else "#f59e0b" if overall_success > 0.6 else "#ef4444"
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value" style="color: {success_color}">{overall_success:.1%}</div>
    <div class="custom-metric-label">Avg Success Rate</div>
</div>
""", unsafe_allow_html=True)
    with col4:
        st.markdown(f"""
<div class="custom-metric">
    <div class="custom-metric-value">${total_cost:.3f}</div>
    <div class="custom-metric-label">Total Cost</div>
</div>
""", unsafe_allow_html=True)
def render_model_comparison_cards(model_performance):
    """Render beautiful model comparison cards"""
    st.markdown("## Model Leaderboard")
    # Find best performer (rows are already sorted by success rate, then latency)
    best_model = model_performance.iloc[0]['model_id']
    for idx, model in model_performance.iterrows():
        is_best = model['model_id'] == best_model
        grade, grade_class = get_performance_grade(model['success_rate'])
        # Create a container for each model
        with st.container():
            col1, col2 = st.columns([3, 1])
            with col1:
                # Use Streamlit's native components instead of raw HTML
                if is_best:
                    st.success(f"**{model['model_id']}** - Best Performer")
                else:
                    st.info(f"**{model['model_id']}**")
                # Success rate with color coding
                success_rate = model['success_rate']
                if success_rate >= 0.8:
                    st.success(f"**Success Rate:** {success_rate:.1%} ({grade})")
                elif success_rate >= 0.6:
                    st.warning(f"**Success Rate:** {success_rate:.1%} ({grade})")
                else:
                    st.error(f"**Success Rate:** {success_rate:.1%} ({grade})")
                # Metrics in columns
                metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
                with metric_col1:
                    if pd.notna(model['avg_round_trip_ms']):
                        st.metric("Avg Latency", f"{model['avg_round_trip_ms']:.0f}ms")
                    else:
                        st.metric("Avg Latency", "N/A")
                with metric_col2:
                    if pd.notna(model['avg_cost']):
                        st.metric("Avg Cost", f"${model['avg_cost']:.4f}")
                    else:
                        st.metric("Avg Cost", "N/A")
                with metric_col3:
                    st.metric("Valid Results", f"{model['total_results']}")
                with metric_col4:
                    if pd.notna(model['avg_first_token_ms']):
                        st.metric("First Token", f"{model['avg_first_token_ms']:.0f}ms")
                    else:
                        st.metric("First Token", "N/A")
            with col2:
                st.write("")  # Add some spacing
                if st.button("Drill Down", key=f"drill_{model['model_id']}", use_container_width=True):
                    st.session_state.drill_down_model = model['model_id']
                    # Update URL with model_id for drill down
                    st.query_params["model_id"] = model['model_id']
                    st.rerun()
        st.divider()  # Add a divider between models
def render_comparison_charts(model_performance):
    """Render interactive comparison charts"""
    st.markdown("## Performance Analysis")
    col1, col2 = st.columns(2)
    with col1:
        # Time to First Edit
        fig_first_edit = px.bar(
            model_performance,
            x='model_id',
            y='avg_first_edit_ms',
            title="Time to First Edit",
            labels={'avg_first_edit_ms': 'Time to First Edit (ms)', 'model_id': 'Model'},
            color='avg_first_edit_ms',
            color_continuous_scale='bluered',
            text='avg_first_edit_ms',
            template='plotly_dark'
        )
        fig_first_edit.update_traces(texttemplate='%{text:.0f}ms', textposition='outside')
        fig_first_edit.update_layout(
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Azeret Mono, monospace"),
            margin=dict(t=50)
        )
        st.plotly_chart(fig_first_edit, use_container_width=True)
    with col2:
        # Latency vs Cost Scatter
        fig_scatter = px.scatter(
            model_performance,
            x='avg_round_trip_ms',
            y='avg_cost',
            size='total_results',
            color='success_rate',
            hover_name='model_id',
            title="Latency vs Cost Analysis",
            labels={
                'avg_round_trip_ms': 'Avg Round Trip (ms)',
                'avg_cost': 'Avg Cost ($)',
                'success_rate': 'Success Rate',
                'total_results': 'Valid Results'
            },
            color_continuous_scale='RdYlGn',
            template='plotly_dark'
        )
        fig_scatter.update_layout(
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Azeret Mono, monospace")
        )
        st.plotly_chart(fig_scatter, use_container_width=True)
def render_detailed_analysis(run_id, model_id):
    """Render detailed drill-down analysis"""
    st.markdown(f"## Detailed Analysis: {model_id}")
    # Load all results (including invalid attempts)
    detailed_results = load_detailed_results(run_id, model_id)
    # Also load only valid results for metrics
    valid_results = load_detailed_results(run_id, model_id, valid_only=True)
    if detailed_results.empty:
        st.warning("No detailed results found.")
        return
    # Show total vs valid results
    st.info(f"Showing all {len(detailed_results)} results ({len(valid_results)} valid, {len(detailed_results) - len(valid_results)} invalid)")
    # Results overview
    col1, col2, col3 = st.columns(3)
    with col1:
        success_count = valid_results['succeeded'].sum()
        total_count = len(valid_results)
        if total_count > 0:
            st.metric("Success Rate", f"{success_count}/{total_count} ({success_count/total_count:.1%} of valid results)")
        else:
            # Guard against division by zero when every attempt was invalid
            st.metric("Success Rate", "N/A (no valid results)")
    with col2:
        avg_latency = detailed_results['time_round_trip_ms'].mean()
        st.metric("Avg Latency", f"{avg_latency:.0f}ms")
    with col3:
        total_cost = detailed_results['cost_usd'].sum()
        st.metric("Total Cost", f"${total_cost:.4f}")
    # Interactive results table
    st.markdown("### 📋 Individual Results")
    # Add result selector with indicators for valid/invalid attempts
    result_options = []
    for idx, row in detailed_results.iterrows():
        # Check if this is a valid result
        is_valid = (row['error_enum'] not in [1, 6, 7]) if not pd.isna(row['error_enum']) else True
        # Create status indicator
        if is_valid:
            status = "✅" if row['succeeded'] else "❌"
        else:
            status = "⚠️"  # Warning symbol for invalid results
        # Add validity indicator to the option text
        validity_text = "" if is_valid else " [INVALID RESULT]"
        result_options.append(f"{status} {row['task_id']} - {row['time_round_trip_ms']:.0f}ms{validity_text}")
    selected_result_idx = st.selectbox(
        "Select a result to analyze:",
        range(len(result_options)),
        format_func=lambda x: result_options[x]
    )
    if selected_result_idx is not None:
        render_result_detail(detailed_results.iloc[selected_result_idx])
def render_result_detail(result):
    """Render detailed view of a single result"""
    st.markdown("### 🔬 Result Deep Dive")
    # Check if this is a valid result (only invalid if no/wrong tool call or wrong file)
    is_valid = True
    if not pd.isna(result['error_enum']):
        # Only these specific errors make a result "invalid" for the benchmark,
        # matching the SQL filter and the check in render_detailed_analysis:
        # 1 = no_tool_calls, 6 = wrong_tool_call, 7 = wrong_file_edited
        is_valid = result['error_enum'] not in [1, 6, 7]
    # Show validity warning if needed
    if not is_valid:
        st.warning("⚠️ **This is an invalid result** - The model didn't call the replace_in_file tool, used the wrong tool, or edited the wrong file. This result is excluded from success rate calculations.")
    # Result metadata
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        status_icon = "✅" if result['succeeded'] else "❌"
        st.markdown(f"**Status:** {status_icon} {'Success' if result['succeeded'] else 'Failed'}")
    with col2:
        st.markdown(f"**Task ID:** {result['task_id']}")
    with col3:
        st.markdown(f"**Round Trip:** {result['time_round_trip_ms']:.0f}ms")
    with col4:
        if pd.notna(result['cost_usd']) and result['cost_usd'] is not None:
            st.markdown(f"**Cost:** ${result['cost_usd']:.4f}")
        else:
            st.markdown("**Cost:** Free")
    # Tabbed interface for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📄 File & Edits", "🤖 Raw Output", "🔧 Parsed Tool Call", "📊 Metrics"])
    with tab1:
        render_file_and_edits_view(result)
    with tab2:
        render_raw_output_view(result)
    with tab3:
        render_parsed_tool_call_view(result)
    with tab4:
        render_metrics_view(result)
def render_file_and_edits_view(result):
    """Render side-by-side file and edits view"""
    st.markdown("#### 📄 File Content & Edit Analysis")
    # Check if we have original file content
    has_original = not pd.isna(result['original_file_content']) and result['original_file_content']
    has_edited = not pd.isna(result['edited_file_content']) and result['edited_file_content']
    if not has_original and not has_edited:
        st.warning("No file content available for this result.")
        return
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**Original File:**")
        if has_original:
            filepath = result['original_filepath'] if not pd.isna(result['original_filepath']) else 'Unknown file'
            st.markdown(f"📁 `{filepath}`")
            # Display full original file content in a scrollable code block
            with st.expander("View Original File Content", expanded=True):
                # Prepare content for the copy button (needs JS-specific escaping)
                raw_content_for_copy = result['original_file_content']
                # Escape for a JavaScript template literal: backslashes first, then
                # backticks, ${ interpolation markers, and newline variants
                js_escaped_content = raw_content_for_copy.replace('\\', '\\\\') \
                    .replace('`', '\\`') \
                    .replace('${', '\\${') \
                    .replace('\r\n', '\\n') \
                    .replace('\n', '\\n') \
                    .replace('\r', '\\n')
                unique_suffix = str(result.name if hasattr(result, 'name') else result['task_id']).replace('-', '_').replace('.', '_')
                button_id = f"copyBtnOriginal_{unique_suffix}"
                copy_button_html = f"""
                <button id="{button_id}" onclick="copyOriginalToClipboard(`{js_escaped_content}`, '{button_id}')" style="margin-bottom: 10px; padding: 5px 10px; border-radius: 5px; border: 1px solid #ccc; cursor: pointer;">Copy Original File</button>
                <script>
                if (!window.copyOriginalToClipboard) {{
                    window.copyOriginalToClipboard = async function(text, buttonId) {{
                        try {{
                            await navigator.clipboard.writeText(text);
                            const button = document.getElementById(buttonId);
                            button.innerText = 'Copied!';
                            button.style.backgroundColor = '#d4edda'; // Optional: success feedback
                            setTimeout(() => {{
                                button.innerText = 'Copy Original File';
                                button.style.backgroundColor = '';
                            }}, 2000);
                        }} catch (err) {{
                            console.error('Failed to copy original: ', err);
                            const button = document.getElementById(buttonId);
                            button.innerText = 'Copy Failed!';
                            button.style.backgroundColor = '#f8d7da'; // Optional: error feedback
                            setTimeout(() => {{
                                button.innerText = 'Copy Original File';
                                button.style.backgroundColor = '';
                            }}, 2000);
                        }}
                    }}
                }}
                </script>
                """
                st.components.v1.html(copy_button_html, height=50)
                # Prepare content for st.code (needs actual newlines)
                content_for_display = result['original_file_content']
                # Iteratively replace common escaped newline sequences with actual
                # newlines; handle double-escaped sequences before single-escaped ones.
                content_for_display = content_for_display.replace('\\\\r\\\\n', '\r\n').replace('\\\\n', '\n')  # Double escaped
                content_for_display = content_for_display.replace('\\r\\n', '\r\n').replace('\\n', '\n')  # Single escaped
                language = guess_language_from_filepath(filepath)
                st.code(content_for_display, language=language, line_numbers=False)
        else:
            st.warning("Original file content not available")
    with col2:
        st.markdown("**Edit Analysis:**")
        if not result['succeeded']:
            # Show error information
            st.error("❌ **Edit Failed**")
            # Show detailed error reason
            if not pd.isna(result['error_enum']):
                error_description = get_error_description(
                    result['error_enum'],
                    result.get('error_string')
                )
                st.markdown(f"**Reason:** {error_description}")
                # Show specific guidance based on error type
                guidance = get_error_guidance(result['error_enum'])
                if guidance:
                    st.info(guidance)
            # For valid results that failed (no error enum recorded), check for
            # diff application failures
            else:
                raw_output = result.get('raw_model_output', '')
                # Check if we have specific error information in the raw output
                if 'does not match anything in the file' in str(raw_output).lower():
                    st.warning("⚠️ **Diff Application Failed**")
                    st.info("💡 The SEARCH block in the diff didn't match any content in the original file. This usually means the model hallucinated code that doesn't exist.")
                elif 'malformatted' in str(raw_output).lower() or 'malformed' in str(raw_output).lower():
                    st.warning("⚠️ **Diff Format Error**")
                    st.info("💡 The diff format was incorrect. Check the raw tool call to see the formatting issues.")
                elif 'error:' in str(raw_output).lower():
                    # Try to extract the specific error message
                    lines = str(raw_output).split('\n')
                    error_lines = [line for line in lines if 'error:' in line.lower()]
                    if error_lines:
                        error_msg = error_lines[0].strip()
                        st.warning("⚠️ **Diff Application Failed**")
                        st.info(f"💡 {error_msg}")
                    else:
                        st.warning("⚠️ **Diff Application Failed**")
                        st.info("💡 The diff couldn't be applied to the original file. Check the raw output and parsed tool call for more details.")
                else:
                    # Generic diff application failure
                    st.warning("⚠️ **Diff Application Failed**")
                    st.info("💡 The model made a valid tool call but the diff couldn't be applied to the original file. This usually indicates a mismatch between the expected and actual file content.")
        else:
            # Show successful edit information
            st.success("✅ **Edit Successful**")
            # Show edit metrics
            metric_col1, metric_col2, metric_col3 = st.columns(3)
            with metric_col1:
                if not pd.isna(result['num_edits']):
                    st.metric("Edits", int(result['num_edits']))
            with metric_col2:
                if not pd.isna(result['num_lines_added']):
                    st.metric("Added", int(result['num_lines_added']))
            with metric_col3:
                if not pd.isna(result['num_lines_deleted']):
                    st.metric("Deleted", int(result['num_lines_deleted']))
        # Show edited file if available
        if has_edited:
            st.markdown("**Edited File:**")
            with st.expander("View Edited File Content"):
                edited_lines = result['edited_file_content'].split('\n')
                for i, line in enumerate(edited_lines[:50], 1):
                    st.text(f"{i:3d} | {line}")
                if len(edited_lines) > 50:
                    st.text(f"... ({len(edited_lines) - 50} more lines)")
        # Show raw and parsed tool calls if available
        if not pd.isna(result['parsed_tool_call_json']):
            with st.expander("View Raw Tool Call"):
                # Extract the raw tool call text from the model output
                raw_output = result['raw_model_output'] if not pd.isna(result['raw_model_output']) else ""
                # Try to extract just the tool call portion
                if raw_output and '<replace_in_file>' in raw_output:
                    # Find the tool call block
                    start_idx = raw_output.find('<replace_in_file>')
                    end_idx = raw_output.find('</replace_in_file>') + len('</replace_in_file>')
                    if start_idx != -1 and end_idx != -1:
                        raw_tool_call = raw_output[start_idx:end_idx]
                        st.code(raw_tool_call, language='xml')
                    else:
                        st.text("Tool call not found in raw output")
                else:
                    st.text("No raw tool call available")
            with st.expander("View Parsed Tool Call"):
                try:
                    parsed_call = json.loads(result['parsed_tool_call_json'])
                    st.json(parsed_call)
                except (json.JSONDecodeError, TypeError):
                    st.text(result['parsed_tool_call_json'])
def render_raw_output_view(result):
    """Render raw model output"""
    st.markdown("#### 🤖 Raw Model Output")
    if pd.isna(result['raw_model_output']) or not result['raw_model_output']:
        st.warning("No raw output available for this result.")
        return
    st.markdown("""
<div class="file-viewer">
""", unsafe_allow_html=True)
    st.text(result['raw_model_output'])
    st.markdown("</div>", unsafe_allow_html=True)

def render_parsed_tool_call_view(result):
    """Render parsed tool call analysis"""
    st.markdown("#### 🔧 Parsed Tool Call Analysis")
    if pd.isna(result['parsed_tool_call_json']) or not result['parsed_tool_call_json']:
        st.warning("No parsed tool call available for this result.")
        return
    try:
        parsed_call = json.loads(result['parsed_tool_call_json'])
        # Pretty print the JSON
        st.json(parsed_call)
        # If it's a replace_in_file call, show the diff blocks
        if isinstance(parsed_call, dict) and 'diff' in parsed_call:
            st.markdown("**Diff Blocks:**")
            st.code(parsed_call['diff'], language='diff')
    except json.JSONDecodeError:
        st.markdown("**Raw Parsed Call (Invalid JSON):**")
        st.text(result['parsed_tool_call_json'])
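# A parsed replace_in_file call is expected to look roughly like this (hypothetical shape):
#   {"path": "src/example.py", "diff": "<<<<<<< SEARCH\n...\n=======\n...\n>>>>>>> REPLACE"}
# Only the 'diff' key is inspected above; everything else is just pretty-printed.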
def render_metrics_view(result):
    """Render detailed metrics for the result"""
    st.markdown("#### 📊 Detailed Metrics")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**Timing Metrics:**")
        if not pd.isna(result['time_to_first_token_ms']):
            st.metric("Time to First Token", f"{result['time_to_first_token_ms']:.0f}ms")
        if not pd.isna(result['time_to_first_edit_ms']):
            st.metric("Time to First Edit", f"{result['time_to_first_edit_ms']:.0f}ms")
        if not pd.isna(result['time_round_trip_ms']):
            st.metric("Round Trip Time", f"{result['time_round_trip_ms']:.0f}ms")
    with col2:
        st.markdown("**Token & Cost Metrics:**")
        if not pd.isna(result['completion_tokens']):
            st.metric("Completion Tokens", int(result['completion_tokens']))
        if pd.notna(result['cost_usd']) and result['cost_usd'] is not None:
            st.metric("Cost", f"${result['cost_usd']:.4f}")
        else:
            st.metric("Cost", "Free")
        if not pd.isna(result['tokens_in_context']):
            st.metric("Context Tokens", int(result['tokens_in_context']))
def guess_language_from_filepath(filepath):
    """Guess the language for syntax highlighting from filepath.

    Note: this local definition shadows the version imported from utils above.
    """
    if not filepath or pd.isna(filepath):
        return None
    extension_map = {
        '.py': 'python',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.java': 'java',
        '.cs': 'csharp',
        '.cpp': 'cpp',
        '.c': 'c',
        '.html': 'html',
        '.css': 'css',
        '.json': 'json',
        '.sql': 'sql',
        '.md': 'markdown',
        '.rb': 'ruby',
        '.php': 'php',
        '.go': 'go',
        '.rs': 'rust',
        '.swift': 'swift',
        '.kt': 'kotlin',
        '.sh': 'bash',
        '.yaml': 'yaml',
        '.yml': 'yaml',
        '.xml': 'xml',
    }
    _, ext = os.path.splitext(filepath)
    # The original ended here without returning; map the extension
    # (case-insensitively) and fall back to None so st.code renders plain text
    return extension_map.get(ext.lower())
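# e.g. guess_language_from_filepath("src/main.py") -> "python" (hypothetical path);
# extensions missing from the map, such as ".tsx", fall back to None.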
def main():
    # Add a note about valid attempts (markdown stays flush-left so it isn't
    # rendered as a code block)
    st.sidebar.markdown("""
### Note on Metrics
Success rates are calculated based on **valid results only**.
Invalid results (where the model didn't call the diff edit tool or edited the wrong file) are excluded from calculations.
""")
    # Initialize session state
    if 'drill_down_model' not in st.session_state:
        st.session_state.drill_down_model = None
    if 'selected_run_id' not in st.session_state:
        st.session_state.selected_run_id = None
    # Handle URL parameters for direct linking
    query_params = st.query_params
    url_run_id = query_params.get("run_id")
    url_model_id = query_params.get("model_id")
    # Load all runs for sidebar
    all_runs = load_all_runs()
    if all_runs.empty:
        st.error("No evaluation runs found in the database.")
        st.stop()
    # Set initial run selection from URL or default to latest
    if url_run_id and url_run_id in all_runs['run_id'].values:
        if st.session_state.selected_run_id != url_run_id:
            st.session_state.selected_run_id = url_run_id
            st.session_state.drill_down_model = None  # Reset drill down when changing runs via URL
    elif st.session_state.selected_run_id is None:
        st.session_state.selected_run_id = all_runs.iloc[0]['run_id']  # Default to latest
    # Set drill down model from URL
    if url_model_id and st.session_state.selected_run_id == url_run_id:
        st.session_state.drill_down_model = url_model_id
    # Sidebar for run selection
    with st.sidebar:
        st.markdown("## 📊 Evaluation Runs")
        st.markdown("Select a run to analyze:")
        # Create run options with nice formatting
        run_options = []
        run_ids = []
        for idx, run in all_runs.iterrows():
            # Format the run description nicely
            date_str = run['created_at'][:10]  # Get just the date part
            time_str = run['created_at'][11:16]  # Get just the time part
            if run['description']:
                display_name = f"🚀 {run['description']}"
            else:
                display_name = f"📅 Run {run['run_id'][:8]}..."
            run_options.append(f"{display_name}\n📅 {date_str} {time_str}")
            run_ids.append(run['run_id'])
        # Default to latest run if no selection
        if st.session_state.selected_run_id is None:
            default_index = 0  # Latest run is first
            st.session_state.selected_run_id = run_ids[0]
        else:
            try:
                default_index = run_ids.index(st.session_state.selected_run_id)
            except ValueError:
                default_index = 0
                st.session_state.selected_run_id = run_ids[0]
        selected_run_idx = st.selectbox(
            "Choose run:",
            range(len(run_options)),
            format_func=lambda x: run_options[x],
            index=default_index,
            key="run_selector"
        )
        # Update selected run if changed
        if run_ids[selected_run_idx] != st.session_state.selected_run_id:
            st.session_state.selected_run_id = run_ids[selected_run_idx]
            st.session_state.drill_down_model = None  # Reset drill down when changing runs
            # Update URL with new run_id
            st.query_params["run_id"] = st.session_state.selected_run_id
            if "model_id" in st.query_params:
                del st.query_params["model_id"]  # Clear model_id when changing runs
            st.rerun()
        # Show run details in sidebar
        selected_run = all_runs.iloc[selected_run_idx]
        st.markdown("---")
        st.markdown("### 📋 Run Details")
        st.markdown(f"**Run ID:** `{selected_run['run_id'][:12]}...`")
        st.markdown(f"**Created:** {selected_run['created_at']}")
        if selected_run['description']:
            st.markdown(f"**Description:** {selected_run['description']}")
        # Show shareable URL
        st.markdown("---")
        st.markdown("### 🔗 Share This View")
        # Build current URL by dynamically deriving the base URL
        try:
            # For older Streamlit versions
            server_address = st.server.server_address
            server_port = st.server.server_port
        except AttributeError:
            # Fallback for newer Streamlit versions where st.server is removed.
            # We can't reliably get the server address/port from within the script,
            # so default to localhost and the default port; the correct network URL
            # is shown in the terminal.
            server_address = "localhost"
            server_port = 8501
        base_url = f"http://{server_address}:{server_port}"
        current_url = f"{base_url}/?run_id={st.session_state.selected_run_id}"
        if st.session_state.drill_down_model:
            current_url += f"&model_id={st.session_state.drill_down_model}"
        st.markdown("**Current URL:**")
        st.code(current_url, language=None)
        # Copy button using HTML/JS; the button passes itself (`this`) so the
        # feedback code doesn't rely on the non-standard global `event`
        copy_button_html = f"""
        <button onclick="copyToClipboard(this, '{current_url}')" style="
            padding: 8px 16px;
            border-radius: 5px;
            border: 1px solid #ccc;
            background: #f0f2f6;
            cursor: pointer;
            font-size: 14px;
            margin-top: 5px;
        ">📋 Copy Link</button>
        <script>
        function copyToClipboard(button, text) {{
            navigator.clipboard.writeText(text).then(function() {{
                // Success feedback
                button.innerText = '✅ Copied!';
                button.style.backgroundColor = '#d4edda';
                setTimeout(() => {{
                    button.innerText = '📋 Copy Link';
                    button.style.backgroundColor = '#f0f2f6';
                }}, 2000);
            }}, function(err) {{
                // Error feedback
                button.innerText = '❌ Failed';
                button.style.backgroundColor = '#f8d7da';
                setTimeout(() => {{
                    button.innerText = '📋 Copy Link';
                    button.style.backgroundColor = '#f0f2f6';
                }}, 2000);
            }});
        }}
        </script>
        """
        st.components.v1.html(copy_button_html, height=50)
    # Load data for selected run
    current_run, model_performance = load_run_comparison(st.session_state.selected_run_id)
    if current_run is None or model_performance.empty:
        st.error("No data found for the selected run.")
        st.stop()
    # Render main dashboard
    render_hero_section(current_run, model_performance)
    # Check if we're in drill-down mode
    if st.session_state.drill_down_model:
        col1, col2 = st.columns([1, 4])
        with col1:
            if st.button("Back to Overview", use_container_width=True):
                st.session_state.drill_down_model = None
                # Clear model_id from URL when going back to overview
                if "model_id" in st.query_params:
                    del st.query_params["model_id"]
                st.rerun()
        render_detailed_analysis(current_run['run_id'], st.session_state.drill_down_model)
    else:
        # Success Rate Comparison
        fig_success = px.bar(
            model_performance,
            x='model_id',
            y='success_rate',
            title="Success Rate by Model",
            labels={'success_rate': 'Success Rate', 'model_id': 'Model'},
            color='success_rate',
            color_continuous_scale='RdYlGn',
            text='success_rate',
            template='plotly_dark'
        )
        fig_success.update_traces(texttemplate='%{text:.1%}', textposition='outside')
        fig_success.update_layout(
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Azeret Mono, monospace"),
            yaxis_range=[0, 1],  # Set y-axis from 0% to 100%
            margin=dict(t=50)  # Add top margin to prevent clipping
        )
        st.plotly_chart(fig_success, use_container_width=True)
        render_model_comparison_cards(model_performance)
        render_comparison_charts(model_performance)

if __name__ == "__main__":
    main()