30 changes: 29 additions & 1 deletion backend/app/api/docs/evaluation/get_evaluation.md
@@ -5,8 +5,9 @@ Returns comprehensive evaluation information including processing status, config
**Query Parameters:**
* `get_trace_info` (optional, default: false) - Include Langfuse trace scores with Q&A context. Data is fetched from Langfuse on first request and cached for subsequent calls. Only available for completed evaluations.
* `resync_score` (optional, default: false) - Clear cached scores and re-fetch from Langfuse. Useful when evaluators have been updated. Requires `get_trace_info=true`.
* `export_format` (optional, default: row) - Controls the structure of traces in the response. Requires `get_trace_info=true` when set to "grouped". Allowed values: `row`, `grouped`.

**Score Format** (`get_trace_info=true`):
**Score Format** (`get_trace_info=true`, `export_format=row`):

```json
{
@@ -49,6 +50,33 @@ Returns comprehensive evaluation information including processing status, config
}
```

**Score Format** (`get_trace_info=true`, `export_format=grouped`):

```json
{
"summary_scores": [...],
"traces": [...],
"grouped_traces": [
{
"question_id": 1,
"question": "What is Python?",
"ground_truth_answer": "Python is a high-level programming language.",
"llm_answers": [
"Answer from evaluation run 1...",
"Answer from evaluation run 2..."
],
"trace_ids": [
"uuid-123",
"uuid-456"
],
"scores": [
[{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}],
[{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}]
]
}
]
}
```
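
As a usage illustration (not part of the documented API surface), here is a minimal Python sketch of fetching the grouped view. The base URL, run id, and auth header name are placeholders to adapt to your deployment; the route path and the `success`/`data` response envelope follow the tests in this PR.

```python
import requests

BASE_URL = "http://localhost:8000"    # placeholder: your API host
EVAL_RUN_ID = 123                     # placeholder: an existing, completed evaluation run
HEADERS = {"X-API-KEY": "<api-key>"}  # placeholder: your deployment's auth header

# Both parameters are required for the grouped view; omitting get_trace_info
# yields a 400 from the route's validation guard.
resp = requests.get(
    f"{BASE_URL}/api/v1/evaluations/{EVAL_RUN_ID}",
    params={"get_trace_info": "true", "export_format": "grouped"},
    headers=HEADERS,
    timeout=30,
)
resp.raise_for_status()
payload = resp.json()

# Each grouped entry aligns llm_answers, trace_ids, and scores positionally.
for group in payload["data"]["score"]["traces"]:
    print(group["question_id"], group["question"], len(group["llm_answers"]))
```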

**Score Details:**
* NUMERIC scores include average (`avg`) and standard deviation (`std`) in summary
* CATEGORICAL scores include distribution counts in summary
20 changes: 20 additions & 0 deletions backend/app/api/routes/evaluations/evaluation.py
@@ -13,6 +13,7 @@

from app.api.deps import AuthContextDep, SessionDep
from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud
from app.crud.evaluations.core import group_traces_by_question_id
from app.models.evaluation import EvaluationRunPublic
from app.api.permissions import Permission, require_permission
from app.services.evaluations import (
@@ -121,13 +122,25 @@ def get_evaluation_run_status(
"Requires get_trace_info=true."
),
),
export_format: str = Query(
"row",
description=(
"Controls the Traces structure."
"'grouped' collates repeated questions horizontally using Parent Question ID."
),
enum=["row", "grouped"],
),
) -> APIResponse[EvaluationRunPublic]:
"""Get evaluation run status with optional trace info."""
if resync_score and not get_trace_info:
raise HTTPException(
status_code=400,
detail="resync_score=true requires get_trace_info=true",
)
if export_format == "grouped" and not get_trace_info:
raise HTTPException(
status_code=400, detail="export_format=grouped requires get_trace_info=true"
)

eval_run, error = get_evaluation_with_scores(
session=_session,
@@ -146,6 +159,13 @@ def get_evaluation_run_status(
"to this organization"
),
)
# Apply the grouped export format to the cached trace scores
if export_format == "grouped" and eval_run.score and "traces" in eval_run.score:
try:
grouped_traces = group_traces_by_question_id(eval_run.score["traces"])
eval_run.score["traces"] = grouped_traces
except ValueError as e:
return APIResponse.failure_response(error=str(e), data=eval_run)

if error:
return APIResponse.failure_response(error=error, data=eval_run)
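One behavioural note: when the cached traces predate `question_id`, `group_traces_by_question_id` raises `ValueError` and the route degrades to a failure envelope while still returning the (ungrouped) run. A sketch of the body a client might see, assuming `APIResponse.failure_response` serializes to `success`/`error`/`data` as the tests in this PR suggest:

```python
# Hypothetical response body for export_format=grouped on a legacy evaluation run.
# Assumption: APIResponse.failure_response serializes as success/error/data; the
# error text is the ValueError message raised in group_traces_by_question_id.
expected_failure_body = {
    "success": False,
    "error": "Grouped export format is not available for this evaluation.",
    "data": {
        "id": 123,             # hypothetical run id
        "status": "completed",
        "score": {
            # Score payload is returned unchanged, still in row format.
            "summary_scores": [],
            "traces": [{"trace_id": "t-1", "question_id": None, "llm_answer": "..."}],
        },
    },
}
```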
54 changes: 54 additions & 0 deletions backend/app/crud/evaluations/core.py
@@ -1,4 +1,5 @@
import logging
from typing import Any
from uuid import UUID

from langfuse import Langfuse
@@ -352,6 +353,59 @@ def save_score(
return eval_run


def group_traces_by_question_id(
traces: list[dict[str, Any]],
) -> list[dict[str, Any]]:
"""
Group evaluation traces by question_id for horizontal comparison.

Returns:
List of grouped traces sorted by question_id:
[
{
"question_id": 1,
"question": "What is Python?",
"ground_truth_answer": "...",
"llm_answers": ["Answer 1", "Answer 2"],
"trace_ids": ["trace-1", "trace-2"],
"scores": [[...], [...]]
}
]
"""

# Grouped export requires question_id on each trace; legacy runs stored traces without it
if traces and (
traces[0].get("question_id") is None or traces[0].get("question_id") == ""
):
raise ValueError("Grouped export format is not available for this evaluation.")

groups: dict[int, list[dict[str, Any]]] = {}

for trace in traces:
question_id = trace.get("question_id")
if question_id not in groups:
groups[question_id] = []
groups[question_id].append(trace)

result: list[dict[str, Any]] = []
for question_id in sorted(groups.keys()):
group_traces = groups[question_id]
first = group_traces[0]
result.append(
{
"question_id": question_id,
"question": first.get("question", ""),
"ground_truth_answer": first.get("ground_truth_answer", ""),
"llm_answers": [t.get("llm_answer", "") for t in group_traces],
"trace_ids": [t.get("trace_id", "") for t in group_traces],
"scores": [t.get("scores", []) for t in group_traces],
}
)

logger.info(f"[group_traces_by_question_id] Created {len(result)} groups")
return result


def resolve_model_from_config(
session: Session,
eval_run: EvaluationRun,
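For reference, a small usage sketch of the new helper, mirroring the fixtures used in the tests below; the import path matches the one added to the route above:

```python
from app.crud.evaluations.core import group_traces_by_question_id

# Two runs answered question_id=1; grouping aligns their answers positionally.
traces = [
    {
        "trace_id": "t-1a",
        "question_id": 1,
        "question": "What is Python?",
        "ground_truth_answer": "A programming language",
        "llm_answer": "Python is a high-level programming language",
        "scores": [{"name": "cosine_similarity", "value": 0.82, "data_type": "NUMERIC"}],
    },
    {
        "trace_id": "t-1b",
        "question_id": 1,
        "question": "What is Python?",
        "ground_truth_answer": "A programming language",
        "llm_answer": "Python is an interpreted language",
        "scores": [{"name": "cosine_similarity", "value": 0.75, "data_type": "NUMERIC"}],
    },
]

grouped = group_traces_by_question_id(traces)
assert len(grouped) == 1
assert grouped[0]["llm_answers"] == [
    "Python is a high-level programming language",
    "Python is an interpreted language",
]
assert grouped[0]["trace_ids"] == ["t-1a", "t-1b"]
```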
142 changes: 142 additions & 0 deletions backend/app/tests/api/routes/test_evaluation.py
@@ -928,6 +928,148 @@ def test_get_evaluation_run_resync_without_trace_info_fails(
and "get_trace_info" in error_str.lower()
)

def test_get_evaluation_run_grouped_format_without_trace_info_fails(
self,
client: TestClient,
user_api_key_header: dict[str, str],
db: Session,
user_api_key: TestAuthContext,
create_test_dataset: EvaluationDataset,
) -> None:
eval_run = EvaluationRun(
run_name="test_run",
dataset_name=create_test_dataset.name,
dataset_id=create_test_dataset.id,
config={"model": "gpt-4o"},
status="completed",
total_items=3,
organization_id=user_api_key.organization_id,
project_id=user_api_key.project_id,
)
db.add(eval_run)
db.commit()
db.refresh(eval_run)

response = client.get(
f"/api/v1/evaluations/{eval_run.id}",
params={"export_format": "grouped"}, # Missing get_trace_info=true
headers=user_api_key_header,
)

assert response.status_code == 400
response_data = response.json()
error_str = response_data.get(
"detail", response_data.get("error", str(response_data))
)
assert (
"export_format" in error_str.lower()
and "get_trace_info" in error_str.lower()
)

def test_get_evaluation_run_grouped_format_success(
self,
client: TestClient,
user_api_key_header: dict[str, str],
db: Session,
user_api_key: TestAuthContext,
create_test_dataset: EvaluationDataset,
) -> None:
eval_run = EvaluationRun(
run_name="test_run",
dataset_name=create_test_dataset.name,
dataset_id=create_test_dataset.id,
config={"model": "gpt-4o"},
status="completed",
total_items=4,
score={
"traces": [
{
"trace_id": "trace-1a",
"question_id": 1,
"question": "What is Python?",
"ground_truth_answer": "A programming language",
"llm_answer": "Python is a high-level programming language",
"scores": [
{
"name": "cosine_similarity",
"value": 0.82,
"data_type": "NUMERIC",
}
],
},
{
"trace_id": "trace-1b",
"question_id": 1,
"question": "What is Python?",
"ground_truth_answer": "A programming language",
"llm_answer": "Python is an interpreted language",
"scores": [
{
"name": "cosine_similarity",
"value": 0.75,
"data_type": "NUMERIC",
}
],
},
# Only one trace for question_id=2, so its group keeps single-element lists
{
"trace_id": "trace-2a",
"question_id": 2,
"question": "What is Java?",
"ground_truth_answer": "An OOP language",
"llm_answer": "Java is a statically typed language",
"scores": [
{
"name": "cosine_similarity",
"value": 0.80,
"data_type": "NUMERIC",
}
],
},
],
"summary_scores": [
{
"avg": 0.79,
"std": 0.03,
"name": "cosine_similarity",
"data_type": "NUMERIC",
"total_pairs": 3,
}
],
},
organization_id=user_api_key.organization_id,
project_id=user_api_key.project_id,
)
db.add(eval_run)
db.commit()
db.refresh(eval_run)

response = client.get(
f"/api/v1/evaluations/{eval_run.id}",
params={
"export_format": "grouped",
"get_trace_info": True,
}, # Missing get_trace_info=true
headers=user_api_key_header,
)
Comment on lines +1047 to +1054

⚠️ Potential issue | 🟡 Minor

Stale comment: get_trace_info is actually True here.

The comment on line 1042 says `# Missing get_trace_info=true`, but `get_trace_info: True` is present in the params. This appears to be a copy-paste leftover from the previous test.

🧹 Remove stale comment
         response = client.get(
             f"/api/v1/evaluations/{eval_run.id}",
             params={
                 "export_format": "grouped",
                 "get_trace_info": True,
-            },  # Missing get_trace_info=true
+            },
             headers=user_api_key_header,
         )


assert response.status_code == 200
response_data = response.json()
assert response_data["success"] is True
data = response_data["data"]
assert data["id"] == eval_run.id
assert data["status"] == "completed"

traces = data["score"]["traces"]
assert (
isinstance(traces, list)
and len(traces) > 0
and "llm_answers" in traces[0]
and isinstance(traces[0]["llm_answers"], list)
and "trace_ids" in traces[0]
and isinstance(traces[0]["trace_ids"], list)
)


class TestGetDataset:
"""Test GET /evaluations/datasets/{dataset_id} endpoint."""