KeiSeiKit-1.0/_primitives/_rust/frustration-matrix/src/eval_gold.rs
Parfii-bot a4e667de10 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

96 lines
3.3 KiB
Rust

//! Gold-set JSONL parser.
//!
//! One file, one job: read the hand-labelled training set, drop anything
//! whose `quality != "gold"` (silver is noisy per spec), return clean
//! `GoldRow`s for the eval pipeline.
//!
//! Tolerant parse strategy: we deserialize into `serde_json::Value` rather
//! than a strict struct so upstream shape changes (extra columns, different
//! source tags) don't break the eval. Only the 4 fields named in the spec
//! are consulted: `category`, `text`, `source`, `quality`.
//!
//! One bad line is NEVER a fatal error — the whole file gets skipped only
//! if `fs::read_to_string` itself fails.
use crate::eval::GoldRow;
use anyhow::{Context, Result};
use serde_json::Value;
use std::fs;
use std::path::Path;
/// Load gold-quality rows from a JSONL file.
///
/// Rows with `quality != "gold"` are silently skipped. Rows missing the
/// `category` or `text` field are also skipped — we cannot evaluate a
/// classifier on an unlabelled example. Line-level JSON errors are
/// tolerated (one bad line does not abort the file).
pub fn load_gold_rows(path: &Path) -> Result<Vec<GoldRow>> {
let body = fs::read_to_string(path)
.with_context(|| format!("read {}", path.display()))?;
Ok(parse_jsonl_body(&body))
}
/// Pure-function variant — public inside the crate for test injection.
pub(crate) fn parse_jsonl_body(body: &str) -> Vec<GoldRow> {
body.lines()
.filter(|l| !l.trim().is_empty())
.filter_map(parse_one_line)
.collect()
}
/// Parse a single JSONL line. Returns `None` if:
/// * JSON is malformed,
/// * `quality` is absent or not "gold",
/// * `category` or `text` is absent / non-string.
fn parse_one_line(raw: &str) -> Option<GoldRow> {
let v: Value = serde_json::from_str(raw).ok()?;
if !is_gold_quality(&v) {
return None;
}
let category = v.get("category").and_then(Value::as_str)?.to_string();
let text = v.get("text").and_then(Value::as_str)?.to_string();
if category.is_empty() || text.is_empty() {
return None;
}
Some(GoldRow { category, text })
}
/// Gold filter: spec says "skip rows with quality != gold (silver is noisy)".
/// Missing quality field → not gold, drop.
fn is_gold_quality(v: &Value) -> bool {
v.get("quality").and_then(Value::as_str) == Some("gold")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn keeps_gold_drops_silver_and_broken() {
let body = concat!(
r#"{"category":"a","text":"hello","source":"x","quality":"gold"}"#,
"\n",
r#"{"category":"b","text":"world","source":"x","quality":"silver"}"#,
"\n",
"not valid json",
"\n",
r#"{"category":"c","text":"","quality":"gold"}"#,
"\n",
r#"{"text":"no cat","quality":"gold"}"#,
"\n",
r#"{"category":"d","text":"kept","quality":"gold"}"#,
"\n",
);
let rows = parse_jsonl_body(body);
assert_eq!(rows.len(), 2);
assert_eq!(rows[0].category, "a");
assert_eq!(rows[0].text, "hello");
assert_eq!(rows[1].category, "d");
}
#[test]
fn empty_body_returns_empty_vec() {
assert!(parse_jsonl_body("").is_empty());
assert!(parse_jsonl_body("\n\n\n").is_empty());
}
}