KeiSeiKit-1.0/_primitives/_rust/frustration-matrix/tests/firmware.rs
Parfii-bot 0be354a920 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

124 lines
5.2 KiB
Rust

//! Firmware tests — cover training, save/load, multilingual alphabet,
//! unigram fallback, and size budget (<50 KB at depth 4 on 1 MB corpus).
//!
//! Like `tests/integration.rs`, we link source modules via `#[path]` so
//! the binary crate doesn't need to export a library surface.
#[path = "../src/jsonl.rs"]
mod jsonl;
#[path = "../src/firmware_ngram.rs"]
mod firmware_ngram;
#[path = "../src/firmware.rs"]
mod firmware;
#[path = "../src/firmware_corpus.rs"]
mod firmware_corpus;
use firmware::Firmware;
use std::fs;
use tempfile::tempdir;
// ---------------------------------------------------------------
// 1. train_bigram_from_trivial_text
// "abab" has seen only (a→b) and (b→a). Querying "ab" must score
// higher than "ac" whose second-position transition is unseen.
// ---------------------------------------------------------------
#[test]
fn train_bigram_from_trivial_text() {
let fw = Firmware::train_from_text("abababab", 1);
let ll_ab = fw.log_likelihood("ab");
let ll_ac = fw.log_likelihood("ac");
assert!(
ll_ab > ll_ac,
"ll_ab={} should exceed ll_ac={} (seen vs unseen transition)",
ll_ab,
ll_ac,
);
assert!(ll_ac.is_finite(), "ll_ac must be finite, got {}", ll_ac);
}
// ---------------------------------------------------------------
// 2. save_load_roundtrip — alphabet + ngrams identical after a round trip.
// ---------------------------------------------------------------
#[test]
fn save_load_roundtrip() {
let dir = tempdir().expect("tempdir");
let corpus = dir.path().join("corpus");
fs::create_dir_all(&corpus).expect("mkdir corpus");
fs::write(corpus.join("a.txt"), "the quick brown fox jumps over the lazy dog").unwrap();
fs::write(corpus.join("b.txt"), "the rain in spain falls mainly on the plain").unwrap();
let fw = Firmware::train_from_dir(&corpus, 3).expect("train");
let out = dir.path().join("fw.json.gz");
fw.save(&out).expect("save");
let loaded = Firmware::load(&out).expect("load");
assert_eq!(loaded.alphabet, fw.alphabet, "alphabet mismatch");
assert_eq!(loaded.max_depth, fw.max_depth, "max_depth mismatch");
assert_eq!(loaded.total_chars, fw.total_chars, "total_chars mismatch");
assert_eq!(loaded.ngrams, fw.ngrams, "ngrams mismatch");
let t = "the fox";
assert!(
(loaded.log_likelihood(t) - fw.log_likelihood(t)).abs() < 1e-9,
"log_likelihood differs after roundtrip",
);
}
// ---------------------------------------------------------------
// 3. multilingual_corpus_splits_alphabet — Cyrillic and Latin both present.
// ---------------------------------------------------------------
#[test]
fn multilingual_corpus_splits_alphabet() {
// Repeat each char enough times to clear the min_count=2 filter.
let text = "privet privet мир мир hello hello world world";
let fw = Firmware::train_from_text(text, 2);
let has_latin = fw.alphabet.iter().any(|c| matches!(*c, 'a'..='z'));
let has_cyrillic = fw.alphabet.iter().any(|c| {
let u = *c as u32;
(0x0400..=0x04FF).contains(&u)
});
assert!(has_latin, "alphabet missing Latin: {:?}", fw.alphabet);
assert!(has_cyrillic, "alphabet missing Cyrillic: {:?}", fw.alphabet);
}
// ---------------------------------------------------------------
// 4. unseen_context_falls_back_to_unigram — finite log-lik even when
// context is not in the n-gram map.
// ---------------------------------------------------------------
#[test]
fn unseen_context_falls_back_to_unigram() {
let fw = Firmware::train_from_text("the the the the", 3);
// 'x' never seen — unigram fallback must use the floor, not -inf.
let ll = fw.log_likelihood("xyz");
assert!(ll.is_finite(), "log_likelihood returned non-finite: {}", ll);
// `t` is in the alphabet but a context like "zz" is unseen;
// back-off to unigram must give a non-zero finite value.
let ll2 = fw.log_likelihood("zzt");
assert!(ll2.is_finite(), "log_likelihood after backoff: {}", ll2);
}
// ---------------------------------------------------------------
// 5. depth_4_on_small_corpus_stays_under_50kb — size budget sanity.
// Generates a 1 MB corpus of predictable prose; saves; asserts
// file size < 50 KB. This is the internal compression-ratio target.
// ---------------------------------------------------------------
#[test]
fn depth_4_on_small_corpus_stays_under_50kb() {
let dir = tempdir().expect("tempdir");
let corpus = dir.path().join("corpus");
fs::create_dir_all(&corpus).expect("mkdir");
// 1 MB of natural-ish English — repeated sentences keep context
// count bounded while still exercising depth-4 branching.
let sentence = "the quick brown fox jumps over the lazy dog near the old oak tree. ";
let mut buf = String::with_capacity(1_048_576);
while buf.len() < 1_048_576 {
buf.push_str(sentence);
}
fs::write(corpus.join("prose.txt"), &buf).unwrap();
let fw = Firmware::train_from_dir(&corpus, 4).expect("train");
let out = dir.path().join("fw.json.gz");
fw.save(&out).expect("save");
let size = fs::metadata(&out).expect("stat").len();
assert!(
size < 50 * 1024,
"firmware file size {} bytes exceeds 50 KB budget",
size,
);
}