KeiSeiKit-1.0/_primitives/_rust/frustration-matrix/src/firmware.rs
Parfii-bot 0be354a920 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

170 lines
6.8 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Byte-level n-gram language firmware.
//!
//! Encodes `P(next_char | last_k_chars)` for k ∈ 1..=max_depth as a sparse
//! hashmap of `(context, next_char) → count`. Compact: ~10-50 KB for a
//! single language class. Replaces BPE/word-embeddings for likelihood
//! scoring.
//!
//! Theoretical backing: internal calibration-6 (Shannon entropy on
//! space-separated token streams; Phase 5 entropy curve: 3 chars → 1.91 bits,
//! 7 chars → 0.59 bits — depth-4 is the knee).
//!
//! Constructor Pattern: this file holds struct + API only. Corpus loading
//! is in `firmware_corpus.rs`, the n-gram accumulator in `firmware_ngram.rs`.
use crate::firmware_corpus::load_corpus_text;
use crate::firmware_ngram::NGramStats;
use anyhow::{Context, Result};
use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use flate2::Compression;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::io::{Read, Write};
use std::path::Path;
/// Default maximum context length (in characters) for n-gram contexts.
///
/// Rationale, as recorded from an internal calibration entropy curve:
/// 0 chars → 4.48 bits, 3 chars → 1.91 bits (57% lower), 7 chars →
/// 0.59 bits (87% lower). Depth 4 is the knee — the most marginal gain per
/// KB of storage on corpora in the 10-25 MB range. Beyond k=4 the sparse
/// map size grows ~3× per depth for ~15% entropy reduction.
///
/// NOTE(review): not referenced inside this file; presumably callers pass
/// it as the `max_depth` argument of the training functions — confirm.
pub const DEFAULT_MAX_DEPTH: usize = 4;
/// Minimum count required to retain an n-gram entry.
///
/// The value `2` is carried over from an internal predecessor
/// implementation (`min_count=2`). The intent is to drop hapax legomena
/// (entries observed only once), which inflate size with no predictive
/// value.
///
/// `Firmware::train_from_text` passes this to `NGramStats::new`; the
/// filtering itself is presumably applied inside that accumulator, which is
/// not visible from this file.
pub const DEFAULT_MIN_COUNT: u32 = 2;
/// Compact n-gram language-model firmware.
///
/// Although the module describes itself as "byte-level", every context and
/// target handled by this type is a Rust `char` (Unicode scalar value):
/// scoring iterates `text.chars()` and the maps below are keyed by `char`.
///
/// Fields are `pub` to match the spec. `ngrams` keys are UTF-8 context
/// strings (1..=max_depth chars long). Inner maps hold counts of each
/// observed next-character, not probabilities — this keeps storage integer
/// and defers division to `log_likelihood`.
///
/// The derived `Serialize`/`Deserialize` impls are what `save` and `load`
/// use for the on-disk JSON form.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Firmware {
/// Chars that passed `min_count`. The unigram fallback looks a char up by
/// its position in this vector. Documented as sorted by codepoint; that
/// ordering is produced by the builder and is not enforced by this type.
pub alphabet: Vec<char>,
/// `P(char)` per alphabet index: `unigram[i]` belongs to `alphabet[i]`.
/// Used as the fallback when no stored context matches.
pub unigram: Vec<f64>,
/// Longest context length (in chars) consulted when scoring; contexts of
/// length `k ∈ 1..=max_depth` are the ones stored.
pub max_depth: usize,
/// `context → (next_char → count)`. Sparse: only observed contexts.
pub ngrams: HashMap<String, HashMap<char, u32>>,
/// Total chars scanned during training (before `min_count` filter).
/// Also determines the floor probability `1 / (total_chars + 1)` given to
/// chars that have no usable unigram entry.
pub total_chars: u64,
}
impl Firmware {
/// Build a firmware from the corpus files found under `path`.
///
/// Text extraction is delegated to `firmware_corpus::load_corpus_text`.
/// According to the original notes it reads `.md` / `.txt` / `.jsonl`
/// files, keeps only user turns from `.jsonl`, and drops `### Assistant`
/// blocks from `.md` — see that module for the authoritative rules.
///
/// `max_depth` is forwarded unchanged to [`Firmware::train_from_text`].
///
/// # Errors
/// Propagates the loader's failure, annotated with the corpus path.
pub fn train_from_dir(path: &Path, max_depth: usize) -> Result<Self> {
    load_corpus_text(path)
        .with_context(|| format!("load corpus from {}", path.display()))
        .map(|corpus| Self::train_from_text(&corpus, max_depth))
}
/// Build a firmware from text already held in memory (tests, one-shot use).
///
/// A `max_depth` of zero is raised to one before training. The retention
/// threshold handed to the accumulator is always `DEFAULT_MIN_COUNT`.
pub fn train_from_text(text: &str, max_depth: usize) -> Self {
    // Clamp so the accumulator is never asked for a zero-length context.
    let effective_depth = if max_depth == 0 { 1 } else { max_depth };
    let mut accumulator = NGramStats::new(effective_depth, DEFAULT_MIN_COUNT);
    accumulator.observe_text(text);
    accumulator.finalize()
}
/// Natural-log likelihood of `text` under this firmware.
///
/// Each character is scored against the longest stored context that
/// precedes it (at most `max_depth` chars), backing off to shorter contexts
/// when the longer one was never observed and finally to the unigram table.
/// Characters without a usable unigram entry receive a floor probability of
/// `1 / (total_chars + 1)`, so the result stays finite (no `-inf`).
///
/// Returns `0.0` for empty input.
pub fn log_likelihood(&self, text: &str) -> f64 {
    let symbols: Vec<char> = text.chars().collect();
    // Explicit left fold from 0.0 keeps the summation order position-by-position.
    (0..symbols.len()).fold(0.0_f64, |acc, pos| acc + self.log_prob_at(&symbols, pos))
}
/// Persist to gzipped JSON. JSON keeps the file human-grepable; gzip
/// brings a 25 MB-trained firmware well under 50 KB (internal phase
/// reported 2981× compression ratio).
pub fn save(&self, path: &Path) -> Result<()> {
if let Some(parent) = path.parent() {
if !parent.as_os_str().is_empty() {
fs::create_dir_all(parent)
.with_context(|| format!("mkdir {}", parent.display()))?;
}
}
let file = fs::File::create(path)
.with_context(|| format!("create {}", path.display()))?;
let mut enc = GzEncoder::new(file, Compression::best());
let json = serde_json::to_vec(self).context("serialize firmware")?;
enc.write_all(&json).context("gz write")?;
enc.finish().context("gz finish")?;
Ok(())
}
/// Load from gzipped JSON produced by `save`.
pub fn load(path: &Path) -> Result<Self> {
let file = fs::File::open(path)
.with_context(|| format!("open {}", path.display()))?;
let mut dec = GzDecoder::new(file);
let mut buf = Vec::new();
dec.read_to_end(&mut buf).context("gz read")?;
let fw: Firmware = serde_json::from_slice(&buf).context("parse firmware json")?;
Ok(fw)
}
/// Natural log of the probability of `chars[i]` given the chars before it.
///
/// Tries the longest context first — up to `max_depth` chars, limited by
/// how many chars precede position `i` — and moves to shorter ones while
/// the context is unknown. If no stored context matches, the unigram
/// estimate is used.
///
/// Panics if `i` is out of bounds for `chars`; the only caller in this
/// file iterates valid indices.
fn log_prob_at(&self, chars: &[char], i: usize) -> f64 {
    let target = chars[i];
    let deepest = i.min(self.max_depth);
    (1..=deepest)
        .rev()
        .find_map(|width| {
            // Materialise the `width` chars immediately preceding position `i`.
            let ctx: String = chars[i - width..i].iter().collect();
            self.prob_in_context(&ctx, target)
        })
        .map_or_else(|| self.log_prob_unigram(target), f64::ln)
}
/// Probability of `target` following context `ctx`.
///
/// Returns `None` when `ctx` is not a stored context, or when its counts
/// sum to zero, so the caller can back off to a shorter context.
///
/// For a stored context:
/// * a seen transition yields its relative frequency `count / total`;
/// * an unseen transition yields `1 / (total + |alphabet|)` (with the
///   alphabet size treated as at least 1), keeping the value strictly
///   positive without disturbing seen transitions.
///
/// Because only unseen transitions are smoothed, the values over all
/// possible targets do not sum to exactly one; treat the result as a score.
fn prob_in_context(&self, ctx: &str, target: char) -> Option<f64> {
    let next_map = self.ngrams.get(ctx)?;

    // Sum in u64: counts are u32 and may come from a deserialized file, so a
    // u32 accumulator could overflow (a panic with overflow checks on, a
    // silently wrapped total otherwise).
    let total: u64 = next_map.values().map(|&c| u64::from(c)).sum();
    if total == 0 {
        return None;
    }
    let total = total as f64;

    match next_map.get(&target).copied().unwrap_or(0) {
        // Target never followed this context: hand back a small positive mass.
        0 => {
            let alpha = self.alphabet.len().max(1) as f64;
            Some(1.0 / (total + alpha))
        }
        count => Some(f64::from(count) / total),
    }
}
/// Natural log of the unigram probability of `target`.
///
/// Looks `target` up in `alphabet` and reads the probability stored at the
/// same index in `unigram`. If the char is not in the alphabet, has no
/// matching `unigram` slot, or its stored probability is not strictly
/// positive, the floor `1 / (total_chars + 1)` is used so the result is
/// always finite.
fn log_prob_unigram(&self, target: char) -> f64 {
    let known = self
        .alphabet
        .iter()
        .position(|c| *c == target)
        // Checked access: a firmware loaded from disk may carry a `unigram`
        // vector shorter than `alphabet`; fall through to the floor instead
        // of panicking on an out-of-bounds index.
        .and_then(|idx| self.unigram.get(idx).copied())
        .filter(|p| *p > 0.0);

    match known {
        Some(p) => p.ln(),
        None => (1.0 / (self.total_chars as f64 + 1.0)).ln(),
    }
}
}