跳转到主要内容

browseruse_bench.utils.stats_utils

统计相关的工具函数。

导入

from browseruse_bench.utils import (
    calculate_metric_stats,
    calculate_all_metrics_stats,
    filter_tasks_by_label,
    generate_evaluation_summary,
)

calculate_metric_stats

为指定指标计算统计信息。
def calculate_metric_stats(
    tasks: List[Dict[str, Any]],
    metric: str,
    path: str = "evaluation_details"
) -> Dict[str, float]
tasks
list of dict
必填
任务结果列表
metric
str
必填
指标名称,例如 `ttft_ms`、`end_to_end_ms`、`steps`
path
str
默认值:"evaluation_details"
指标在任务字典中的路径
return
dict
统计字典,包含 `count`、`mean`、`min`、`max`、`median`

calculate_all_metrics_stats

为多个指标计算统计信息。
def calculate_all_metrics_stats(
    tasks: List[Dict[str, Any]],
    metrics: Optional[List[str]] = None,
    path: str = "evaluation_details"
) -> Dict[str, Dict[str, float]]
metrics
list of str
默认值:"见下方说明"
指标名称列表

返回结构

{
    "ttft_ms": {"count": 10, "mean": 1234.5, ...},
    "end_to_end_ms": {"count": 10, "mean": 5678.9, ...},
    "steps": {"count": 10, "mean": 3.2, ...},
    "usage": {
        "total_tokens": {"count": 10, "mean": 1500, ...},
        "total_cost": {"count": 10, "mean": 0.05, ...},
        ...
    }
}

filter_tasks_by_label

按 label 过滤任务。
def filter_tasks_by_label(
    tasks: List[Dict[str, Any]],
    key: str = "predicted_label",
    val: int = 1
) -> List[Dict[str, Any]]
tasks
list of dict
必填
任务结果列表
key
str
默认值:"predicted_label"
Label 键名
val
int
默认值:1
Label 值(1 = 成功,0 = 失败)

generate_evaluation_summary

生成评估汇总。
def generate_evaluation_summary(
    results: List[Dict[str, Any]],
    total: int,
    metrics: Optional[List[str]] = None
) -> Dict[str, Any]
results
list of dict
必填
评估结果列表
total
int
必填
任务总数
metrics
list of str
默认值:"见下方说明"
要计算的指标列表

返回结构

{
    "overall_statistics": {
        "total_tasks": 100,
        "evaluated_tasks": 95,
        "successful_tasks": 70,
        "failed_tasks": 25,
        "success_rate": 73.68,
        "failure_rate": 26.32
    },
    "metrics_statistics": {...},
    "successful_tasks_metrics": {...},
    "failed_tasks_metrics": {...},
    "failure_category_statistics": {...},
    "task_list": {
        "successful_task_ids": [...],
        "failed_task_ids": [...]
    }
}