KeiSeiKit-1.0/_primitives/_rust/frustration-matrix/tests/firmware.rs

//! Firmware tests — cover training, save/load, multilingual alphabet,
//! unigram fallback, and size budget (<50 KB at depth 4 on a 1 MB corpus).
//!
//! Like `tests/integration.rs`, we link source modules via `#[path]` so
//! the binary crate doesn't need to export a library surface.

#[path = "../src/jsonl.rs"]
mod jsonl;
#[path = "../src/firmware_ngram.rs"]
mod firmware_ngram;
#[path = "../src/firmware.rs"]
mod firmware;
#[path = "../src/firmware_corpus.rs"]
mod firmware_corpus;

use firmware::Firmware;
use std::fs;
use tempfile::tempdir;

// ---------------------------------------------------------------
// 1. train_bigram_from_trivial_text
//    Training on "abababab" observes only the transitions (a→b) and
//    (b→a). "ab" must therefore score higher than "ac", whose
//    second-position transition is unseen.
// ---------------------------------------------------------------
/// A model trained on an alternating "ab" sequence must score the pair
/// "ab" above "ac", and the score for "ac" must still be finite.
#[test]
fn train_bigram_from_trivial_text() {
    let model = Firmware::train_from_text("abababab", 1);
    let score = |s: &str| model.log_likelihood(s);
    let (seen, unseen) = (score("ab"), score("ac"));

    // Ordering first: 'c' never occurs in the training text, so "ac"
    // has to rank strictly below "ab".
    assert!(
        seen > unseen,
        "ll_ab={} should exceed ll_ac={} (seen vs unseen transition)",
        seen,
        unseen,
    );
    // The penalised score must not be infinite or NaN.
    assert!(unseen.is_finite(), "ll_ac must be finite, got {}", unseen);
}

// ---------------------------------------------------------------
// 2. save_load_roundtrip — alphabet + ngrams identical after a round trip.
// ---------------------------------------------------------------
/// Trains on a two-file corpus in a temporary directory, saves the model,
/// loads it back, and checks that `alphabet`, `max_depth`, `total_chars`
/// and `ngrams` are equal and that a sample string scores the same
/// (within 1e-9) before and after the round trip.
#[test]
fn save_load_roundtrip() {
    let dir = tempdir().expect("tempdir");
    let corpus = dir.path().join("corpus");
    fs::create_dir_all(&corpus).expect("mkdir corpus");
    // Use `expect` like every other fallible step so a failed fixture
    // write identifies which file could not be created.
    fs::write(corpus.join("a.txt"), "the quick brown fox jumps over the lazy dog")
        .expect("write a.txt");
    fs::write(corpus.join("b.txt"), "the rain in spain falls mainly on the plain")
        .expect("write b.txt");

    let fw = Firmware::train_from_dir(&corpus, 3).expect("train");
    let out = dir.path().join("fw.json.gz");
    fw.save(&out).expect("save");
    let loaded = Firmware::load(&out).expect("load");

    assert_eq!(loaded.alphabet, fw.alphabet, "alphabet mismatch");
    assert_eq!(loaded.max_depth, fw.max_depth, "max_depth mismatch");
    assert_eq!(loaded.total_chars, fw.total_chars, "total_chars mismatch");
    assert_eq!(loaded.ngrams, fw.ngrams, "ngrams mismatch");

    let t = "the fox";
    let after = loaded.log_likelihood(t);
    let before = fw.log_likelihood(t);
    // Report both values: a NaN difference (e.g. two infinities) also
    // fails this comparison and would otherwise be indistinguishable
    // from genuine drift.
    assert!(
        (after - before).abs() < 1e-9,
        "log_likelihood differs after roundtrip: before={}, after={}",
        before,
        after,
    );
}

// ---------------------------------------------------------------
// 3. multilingual_corpus_splits_alphabet — Cyrillic and Latin both present.
// ---------------------------------------------------------------
/// Training on mixed Latin/Cyrillic text must leave at least one lowercase
/// ASCII letter and at least one character from U+0400..=U+04FF in the
/// trained alphabet.
#[test]
fn multilingual_corpus_splits_alphabet() {
    // Every word occurs twice, so each character appears at least twice.
    // NOTE(review): the original comment attributes this to a min_count=2
    // filter in training; that filter is not visible from this file.
    let model = Firmware::train_from_text("privet privet мир мир hello hello world world", 2);

    let cyrillic_block = '\u{0400}'..='\u{04FF}';
    let latin_found = model.alphabet.iter().any(|ch| ch.is_ascii_lowercase());
    let cyrillic_found = model.alphabet.iter().any(|ch| cyrillic_block.contains(ch));

    assert!(latin_found, "alphabet missing Latin: {:?}", model.alphabet);
    assert!(cyrillic_found, "alphabet missing Cyrillic: {:?}", model.alphabet);
}

// ---------------------------------------------------------------
// 4. unseen_context_falls_back_to_unigram — finite log-lik even when
//    context is not in the n-gram map.
// ---------------------------------------------------------------
/// Strings built from characters or contexts that never occur in the
/// training text must still receive a finite log-likelihood.
#[test]
fn unseen_context_falls_back_to_unigram() {
    let model = Firmware::train_from_text("the the the the", 3);

    // None of 'x', 'y', 'z' occurs in the training text; the score must
    // be neither infinite nor NaN.
    let all_unseen = model.log_likelihood("xyz");
    assert!(
        all_unseen.is_finite(),
        "log_likelihood returned non-finite: {}",
        all_unseen
    );

    // 't' occurs in the training text but never after "zz"; the score
    // must again be finite.
    let unseen_context = model.log_likelihood("zzt");
    assert!(
        unseen_context.is_finite(),
        "log_likelihood after backoff: {}",
        unseen_context
    );
}

// ---------------------------------------------------------------
// 5. depth_4_on_small_corpus_stays_under_50kb — size budget sanity.
//    Generates a 1 MB corpus of predictable prose; saves; asserts
//    file size < 50 KB. This is the internal compression-ratio target.
// ---------------------------------------------------------------
/// Size-budget check: writes roughly 1 MiB of one repeated sentence to a
/// temporary corpus, trains at depth 4, saves the model, and asserts that
/// the saved file is smaller than 50 KiB.
#[test]
fn depth_4_on_small_corpus_stays_under_50kb() {
    /// Minimum corpus size in bytes; the fill loop stops at the first
    /// whole sentence that reaches it.
    const CORPUS_BYTES: usize = 1_048_576;
    /// Exclusive upper bound on the saved firmware file size, in bytes.
    const BUDGET_BYTES: u64 = 50 * 1024;

    let dir = tempdir().expect("tempdir");
    let corpus = dir.path().join("corpus");
    fs::create_dir_all(&corpus).expect("mkdir");

    // One sentence repeated to fill the corpus: the set of distinct
    // contexts stays small while depth-4 contexts are still exercised.
    let sentence = "the quick brown fox jumps over the lazy dog near the old oak tree. ";
    // The loop appends whole sentences, so it can overshoot CORPUS_BYTES
    // by up to one sentence; reserve that slack up front so the final
    // push does not reallocate the whole buffer.
    let mut buf = String::with_capacity(CORPUS_BYTES + sentence.len());
    while buf.len() < CORPUS_BYTES {
        buf.push_str(sentence);
    }
    // `expect` rather than `unwrap`, matching the other fallible steps.
    fs::write(corpus.join("prose.txt"), &buf).expect("write prose.txt");

    let fw = Firmware::train_from_dir(&corpus, 4).expect("train");
    let out = dir.path().join("fw.json.gz");
    fw.save(&out).expect("save");

    let size = fs::metadata(&out).expect("stat").len();
    assert!(
        size < BUDGET_BYTES,
        "firmware file size {} bytes exceeds 50 KB budget",
        size,
    );
}