Single-commit clean baseline after security scrub of niche-tells, project codenames, internal jargon, and contributor-email leaks. Contents: - 100 Rust crates (_primitives/_rust/) - 37 agent manifests (_manifests/) + generated specs (_generated/) - 67 user-invocable skills (skills/) - 33 hooks (hooks/) - Composition blocks (_blocks/) - Documentation (docs/, README.md) - TS adapter packages (_ts_packages/) - Assembler (_assembler/) - Roles (_roles/) - Templates (_templates/) - Forgejo CI (.forgejo/) Author: Denis Parfionovich <info@greendragon.info> License: see LICENSE.
115 lines
4.4 KiB
Rust
115 lines
4.4 KiB
Rust
//! Eval — compare regex-based (v1) vs firmware-based (v2) classification on
//! a hand-labelled gold set. Reports per-category precision / recall / f1,
//! overall accuracy and macro-f1, plus two confusion matrices.
//!
//! This module consumes APIs from firmware.rs (Z1) and classifier.rs (Z2).
//! If those modules have different method names at orchestrator-merge time,
//! update the call sites here — the eval LOGIC is independent of the
//! internal firmware representation.
//!
//! Constructor Pattern: this file holds only the public types + the
//! `evaluate` orchestrator. Helpers live in sibling cubes:
//!
//!   * `eval_gold`    — parse labelled JSONL, filter quality=gold
//!   * `eval_predict` — `CategoryPredictor` trait + regex / firmware impls
//!   * `eval_metrics` — pure precision / recall / f1 math
//!   * `eval_report`  — CSV write + stdout summary
//!
//! Purity: every mathematical step in eval_metrics is a pure function of
//! two integer vectors (true + predicted). Predictors are behind a trait
//! so tests can inject `MockClassifier` without Z1/Z2 on disk.
|
|
|
|
use crate::categories::compile_all;
|
|
use crate::classifier::Classifier;
|
|
use crate::eval_gold::load_gold_rows;
|
|
use crate::eval_metrics::{build_confusion, compute_metrics};
|
|
use crate::eval_predict::{
|
|
predict_all, CategoryPredictor, FirmwarePredictor, RegexPredictor,
|
|
};
|
|
use crate::eval_report::{print_summary, write_csv};
|
|
use anyhow::{Context, Result};
|
|
use std::collections::HashMap;
|
|
use std::path::PathBuf;
|
|
|
|
/// CLI input bundle — from `main.rs` eval subcommand.
///
/// Groups the three filesystem locations that [`evaluate`] reads from and
/// writes to.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EvalInput {
    /// Path handed to `load_gold_rows`; the labelled gold set to score against.
    pub gold_jsonl: PathBuf,
    /// Directory handed to `Classifier::load_from_dir` for the firmware-based
    /// classifier.
    pub model_dir: PathBuf,
    /// Destination path handed to `write_csv` for the resulting report.
    pub output_csv: PathBuf,
}
|
|
|
|
/// Full report produced by `evaluate`.
///
/// Holds one [`Metrics`] value and one confusion map per classifier, so the
/// regex-based and firmware-based results can be compared side by side.
#[derive(Debug, Clone, PartialEq)]
pub struct EvalReport {
    /// Number of gold rows the report was computed over.
    pub total_gold_rows: usize,
    /// Metrics for the regex-based predictor.
    pub regex_metrics: Metrics,
    /// Metrics for the firmware-based predictor.
    pub firmware_metrics: Metrics,
    /// Confusion counts for the regex-based predictor, keyed by a pair of
    /// category names.
    // NOTE(review): key order is presumably (true, predicted), mirroring the
    // argument order of `build_confusion` — confirm in `eval_metrics`.
    pub confusion_regex: HashMap<(String, String), usize>,
    /// Confusion counts for the firmware-based predictor; same key layout as
    /// `confusion_regex`.
    pub confusion_firmware: HashMap<(String, String), usize>,
}

/// Overall + per-category metrics for one classifier.
#[derive(Debug, Clone, PartialEq)]
pub struct Metrics {
    /// Overall accuracy across all evaluated rows.
    pub accuracy: f64,
    /// One entry per category; see [`PerCategoryMetric`].
    pub per_category: Vec<PerCategoryMetric>,
}

/// Per-category precision / recall / f1 / support, sklearn convention.
#[derive(Debug, Clone, PartialEq)]
pub struct PerCategoryMetric {
    /// Category name this row of metrics describes.
    pub category: String,
    /// Precision for this category.
    pub precision: f64,
    /// Recall for this category.
    pub recall: f64,
    /// F1 score for this category.
    pub f1: f64,
    /// Support count for this category.
    pub support: usize,
}
|
|
|
|
/// Run the full eval pipeline: load gold, run both classifiers, compute
|
|
/// metrics, write CSV, print summary.
|
|
///
|
|
/// This is the ONLY function main.rs calls; all heavy lifting is delegated
|
|
/// to sibling cubes. Kept under 30 LOC per Constructor Pattern rule.
|
|
pub fn evaluate(input: &EvalInput) -> Result<EvalReport> {
|
|
let gold = load_gold_rows(&input.gold_jsonl)
|
|
.with_context(|| format!("load gold {}", input.gold_jsonl.display()))?;
|
|
let regex_pred = RegexPredictor::new(compile_all());
|
|
let classifier = Classifier::load_from_dir(&input.model_dir)
|
|
.with_context(|| format!("load classifier {}", input.model_dir.display()))?;
|
|
let fw_pred = FirmwarePredictor::new(classifier);
|
|
let report = run_with_predictors(&gold, ®ex_pred, &fw_pred);
|
|
write_csv(&input.output_csv, &report)
|
|
.with_context(|| format!("write csv {}", input.output_csv.display()))?;
|
|
print_summary(&report);
|
|
Ok(report)
|
|
}
|
|
|
|
/// Core eval loop over gold rows + two predictors — the pure-function
|
|
/// version used by tests. Does not touch disk.
|
|
///
|
|
/// Exposed `pub(crate)` so integration tests can wire MockClassifier
|
|
/// implementations without needing Firmware files on disk.
|
|
pub(crate) fn run_with_predictors(
|
|
gold: &[GoldRow],
|
|
regex_pred: &dyn CategoryPredictor,
|
|
firmware_pred: &dyn CategoryPredictor,
|
|
) -> EvalReport {
|
|
let regex_preds = predict_all(regex_pred, gold);
|
|
let firmware_preds = predict_all(firmware_pred, gold);
|
|
let truth: Vec<&str> = gold.iter().map(|g| g.category.as_str()).collect();
|
|
let confusion_regex = build_confusion(&truth, ®ex_preds);
|
|
let confusion_firmware = build_confusion(&truth, &firmware_preds);
|
|
let regex_metrics = compute_metrics(&truth, ®ex_preds);
|
|
let firmware_metrics = compute_metrics(&truth, &firmware_preds);
|
|
EvalReport {
|
|
total_gold_rows: gold.len(),
|
|
regex_metrics,
|
|
firmware_metrics,
|
|
confusion_regex,
|
|
confusion_firmware,
|
|
}
|
|
}
|
|
|
|
/// One parsed gold row — shared input type for predictors + metrics.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GoldRow {
    /// Hand-assigned category; used as the truth label during evaluation.
    pub category: String,
    /// Text of the row.
    // NOTE(review): presumably the input each predictor classifies — confirm
    // against `eval_predict::predict_all`.
    pub text: String,
}
|