KeiSeiKit-1.0/_primitives/_rust/frustration-matrix/src/classifier.rs
Parfii-bot 0be354a920 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

185 lines
6.4 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Firmware-based likelihood-ratio classifier.
//!
//! One cube, one responsibility: given per-category firmwares + a neutral
//! baseline, assign a message to the category with the highest
//! length-normalized log-likelihood ratio
//! `log P(msg|cat) log P(msg|neutral)` / chars(msg).
//!
//! ASSUMPTION: `firmware.rs` is supplied by a parallel Z1 agent (RULE 0.13).
//! Expected API: `Firmware::{train_from_dir, train_from_text,
//! log_likelihood, save, load}`. We adapt call sites here; never edit Z1.
//!
//! Ratio removes base-rate language entropy (~1 bit/char). Length-normalize
//! so long messages don't trivially outscore short ones.
//!
//! Defaults (see constants below): MIN_LEN = 20 — DERIVED from internal
//! n-gram entropy (` §4 l.54:
//! "7-9 chars of context = almost full predictability"; max_depth = 8 → 2
//! full prediction windows = 16 → 20 with safety margin). THRESHOLD = 0.0
//! — any net-positive ratio counts; permissive default for tuning later.
use crate::firmware::Firmware;
use anyhow::{anyhow, Context, Result};
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs;
use std::path::Path;
/// Default minimum message length (chars) below which classification is
/// skipped. See module docs §Defaults for derivation.
pub const MIN_LEN: usize = 20;
/// Default normalized log-ratio threshold. See module docs §Defaults.
pub const THRESHOLD: f64 = 0.0;
/// Stem used for the baseline firmware file inside the model directory.
const NEUTRAL_STEM: &str = "neutral";
/// File extension expected for firmware files on disk.
const FIRMWARE_EXT: &str = "fw";
/// Bundle of trained firmwares: per-category + neutral baseline.
#[derive(Debug)]
pub struct Classifier {
/// Category name (file stem) → trained firmware.
pub categories: HashMap<String, Firmware>,
/// Baseline firmware; ratios are reported against this.
pub neutral: Firmware,
}
/// Result of classifying a single message. `scores` is always populated
/// (sorted desc by `normalized`) for diagnostics.
pub struct ClassificationResult {
pub best_category: Option<String>,
pub scores: Vec<CategoryScore>,
}
/// Per-category score for one input. `log_ratio = ll(cat) ll(neutral)`;
/// `normalized = log_ratio / chars(msg)` is the length-fair ranking key.
pub struct CategoryScore {
pub category: String,
pub log_ratio: f64,
pub normalized: f64,
}
impl Classifier {
/// Load bundle from directory. `<dir>/neutral.fw` REQUIRED; every
/// other `<stem>.fw` becomes a category keyed by `<stem>`.
pub fn load_from_dir(dir: &Path) -> Result<Self> {
let files = list_firmware_files(dir)?;
let (neutral_path, category_paths) = partition_neutral(files)?;
let neutral =
Firmware::load(&neutral_path).context("load neutral firmware")?;
let categories = load_category_map(&category_paths)?;
Ok(Classifier { categories, neutral })
}
/// Classify one message. `best_category = None` when msg is shorter
/// than `min_len` chars or no category's `normalized` clears `threshold`.
pub fn classify(
&self,
msg: &str,
min_len: usize,
threshold: f64,
) -> ClassificationResult {
let char_len = msg.chars().count();
if char_len < min_len {
return ClassificationResult {
best_category: None,
scores: Vec::new(),
};
}
let scores = self.score_all(msg, char_len);
let best_category = pick_best(&scores, threshold);
ClassificationResult { best_category, scores }
}
/// Compute + sort scores for every loaded category. Split out to keep
/// `classify` small.
fn score_all(&self, msg: &str, char_len: usize) -> Vec<CategoryScore> {
let neutral_ll = self.neutral.log_likelihood(msg);
let len_f = char_len.max(1) as f64;
let mut scores: Vec<CategoryScore> = self
.categories
.iter()
.map(|(cat, fw)| {
let log_ratio = fw.log_likelihood(msg) - neutral_ll;
CategoryScore {
category: cat.clone(),
log_ratio,
normalized: log_ratio / len_f,
}
})
.collect();
scores.sort_by(|a, b| {
b.normalized
.partial_cmp(&a.normalized)
.unwrap_or(std::cmp::Ordering::Equal)
});
scores
}
}
/// Pick the winning category if its normalized score clears `threshold`.
fn pick_best(scores: &[CategoryScore], threshold: f64) -> Option<String> {
scores
.first()
.filter(|s| s.normalized > threshold)
.map(|s| s.category.clone())
}
/// List `*.fw` files under `dir`. Sorted for deterministic test output.
fn list_firmware_files(dir: &Path) -> Result<Vec<std::path::PathBuf>> {
let entries = fs::read_dir(dir)
.with_context(|| format!("read_dir {}", dir.display()))?;
let mut out = Vec::new();
for entry in entries {
let e = entry.context("read dir entry")?;
let p = e.path();
if p.is_file() && p.extension() == Some(OsStr::new(FIRMWARE_EXT)) {
out.push(p);
}
}
out.sort();
Ok(out)
}
/// Split the list into `(neutral_path, category_paths)`; err if no neutral.
fn partition_neutral(
files: Vec<std::path::PathBuf>,
) -> Result<(std::path::PathBuf, Vec<std::path::PathBuf>)> {
let mut neutral: Option<std::path::PathBuf> = None;
let mut categories = Vec::new();
for p in files {
if file_stem(&p) == Some(NEUTRAL_STEM) {
neutral = Some(p);
} else {
categories.push(p);
}
}
let neutral = neutral.ok_or_else(|| {
anyhow!("model dir missing required {NEUTRAL_STEM}.{FIRMWARE_EXT}")
})?;
Ok((neutral, categories))
}
/// Build `HashMap<stem, Firmware>` from the category path list.
fn load_category_map(
paths: &[std::path::PathBuf],
) -> Result<HashMap<String, Firmware>> {
let mut out = HashMap::new();
for p in paths {
let stem = file_stem(p)
.ok_or_else(|| anyhow!("firmware file has no stem: {}", p.display()))?
.to_string();
let fw = Firmware::load(p)
.with_context(|| format!("load firmware {}", p.display()))?;
out.insert(stem, fw);
}
Ok(out)
}
/// UTF-8 file stem as `&str`, or `None` for non-UTF-8 / missing stems.
fn file_stem(p: &Path) -> Option<&str> {
p.file_stem().and_then(|s| s.to_str())
}