KeiSeiKit-1.0/_primitives/_rust/kei-frustration-loop/src/auto_train.rs
Parfii-bot 0be354a920 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

151 lines
5.3 KiB
Rust

//! Auto-retrain trigger logic.
//!
//! When the per-user feedback log crosses `threshold` rows, the orchestrator
//! invokes `auto_train`: rebuild the corpus from confirmed-correct hits
//! (treating `Wrong` and `NewCategory(_)` as exclusions / extension only),
//! retrain the firmware, and atomic-swap it into place.
//!
//! Default threshold = 20 (see `DEFAULT_THRESHOLD`); overridable via the
//! `KEI_FRUSTRATION_THRESHOLD` env var or the `--threshold` CLI flag.
//!
//! Constructor Pattern: this cube only orchestrates `feedback`,
//! `frustration_matrix::firmware`, `frustration_matrix::firmware_corpus`,
//! `persistence`. No format decisions.
use crate::feedback::{count_pending, read_all, Feedback, Label};
use crate::persistence::atomic_write;
use anyhow::{Context, Result};
use frustration_matrix::firmware::{Firmware, DEFAULT_MAX_DEPTH};
use frustration_matrix::firmware_corpus::load_corpus_text;
use serde::{Deserialize, Serialize};
use std::path::Path;
/// Default feedback-row count at which `should_retrain` flips to `true`.
pub const DEFAULT_THRESHOLD: usize = 20;
/// Env var name that overrides `DEFAULT_THRESHOLD`.
pub const THRESHOLD_ENV: &str = "KEI_FRUSTRATION_THRESHOLD";
/// Result returned to the CLI for the `auto-train` subcommand.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct TrainReport {
/// True iff a retrain happened (false = under threshold or no corpus).
pub trained: bool,
/// Total UTF-8 chars used to train the new firmware.
pub corpus_size: usize,
/// Path of the per-user firmware after the swap.
pub output_path: String,
/// Feedback row count seen at trigger time.
pub feedback_count: usize,
/// Threshold the count was compared against.
pub threshold: usize,
}
/// Resolve effective threshold: explicit `cli_threshold` (if `Some`) wins,
/// then `KEI_FRUSTRATION_THRESHOLD` env var, then `DEFAULT_THRESHOLD`.
pub fn resolve_threshold(cli_threshold: Option<usize>) -> usize {
if let Some(t) = cli_threshold {
return t;
}
if let Ok(raw) = std::env::var(THRESHOLD_ENV) {
if let Ok(parsed) = raw.trim().parse::<usize>() {
return parsed;
}
}
DEFAULT_THRESHOLD
}
/// True iff the feedback log at `path` has at least `threshold` rows.
pub fn should_retrain(path: &Path, threshold: usize) -> Result<bool> {
Ok(count_pending(path)? >= threshold)
}
/// Rebuild the per-user firmware from the supplied corpus + feedback log.
///
/// Treats `Label::Correct` rows as additional positive corpus material
/// appended to the corpus loaded from `traces_dir`. `Label::Wrong` rows
/// are dropped. `Label::NewCategory(_)` rows are also appended to corpus
/// (the slug is informational; the message text is the training signal).
pub fn auto_train(
traces_dir: &Path,
feedback_path: &Path,
output: &Path,
threshold: usize,
) -> Result<TrainReport> {
let count = count_pending(feedback_path)?;
let mut report = TrainReport {
trained: false,
corpus_size: 0,
output_path: output.display().to_string(),
feedback_count: count,
threshold,
};
if count < threshold {
return Ok(report);
}
let corpus = build_corpus(traces_dir, feedback_path)?;
if corpus.is_empty() {
return Ok(report);
}
let firmware = Firmware::train_from_text(&corpus, DEFAULT_MAX_DEPTH);
save_firmware_atomic(&firmware, output)?;
report.trained = true;
report.corpus_size = corpus.chars().count();
Ok(report)
}
/// Concatenate base-corpus text with every kept feedback message into one
/// big training buffer. Newline-separated to keep n-grams from bleeding.
fn build_corpus(traces_dir: &Path, feedback_path: &Path) -> Result<String> {
let mut buf = load_corpus_text(traces_dir)
.with_context(|| format!("load corpus from {}", traces_dir.display()))?;
let extra = collect_feedback_text(feedback_path)?;
if !extra.is_empty() {
if !buf.is_empty() && !buf.ends_with('\n') {
buf.push('\n');
}
buf.push_str(&extra);
}
Ok(buf)
}
/// Pull every kept feedback message body into one newline-joined string.
fn collect_feedback_text(path: &Path) -> Result<String> {
let rows = read_all(path)?;
let mut buf = String::new();
for fb in rows {
if !keep_for_corpus(&fb) {
continue;
}
if !buf.is_empty() {
buf.push('\n');
}
buf.push_str(&fb.message);
}
Ok(buf)
}
/// Decide whether one feedback row contributes to the new corpus.
/// `Wrong` rows are excluded (the user said the original classification
/// was bogus); everything else feeds the rebuild.
fn keep_for_corpus(fb: &Feedback) -> bool {
!matches!(fb.label, Label::Wrong)
}
/// Save firmware via tmp-sibling + atomic_write so a crash mid-write
/// never leaves a half-baked `.gz` in place of the previous one.
fn save_firmware_atomic(firmware: &Firmware, dest: &Path) -> Result<()> {
let tmp = {
let mut s = dest.as_os_str().to_owned();
s.push(".autotrain-tmp");
std::path::PathBuf::from(s)
};
firmware
.save(&tmp)
.with_context(|| format!("save tmp {}", tmp.display()))?;
let bytes = std::fs::read(&tmp)
.with_context(|| format!("read tmp {}", tmp.display()))?;
let _ = std::fs::remove_file(&tmp);
atomic_write(dest, &bytes)?;
Ok(())
}