KeiSeiKit-1.0/_primitives/_rust/frustration-matrix/src/firmware_ngram.rs
Parfii-bot a4e667de10 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

130 lines
4.2 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! N-gram statistics accumulator — a pure cube.
//!
//! Single-pass scan over a UTF-8 string: for every position `i`, observe
//! contexts of every length `k ∈ 1..=max_depth` ending at `i-1` paired
//! with the char at `i`. Final step drops every entry seen fewer than
//! `min_count` times (hapax legomena when `min_count` is 2) and builds the
//! alphabet + unigram vector on alphabet indices.
//!
//! Constructor Pattern: no IO; `Firmware` is used only as the output type.
//! Produces owned `HashMap`s that `NGramStats::finalize` moves into the
//! `Firmware` struct.
use crate::firmware::Firmware;
use std::collections::HashMap;
/// Mutable accumulator for one training pass.
///
/// Filled incrementally via `observe_text` and consumed by `finalize`.
/// `Debug` and `Clone` are derived so the accumulator can be inspected in
/// logs/tests and snapshotted before being consumed.
#[derive(Debug, Clone)]
pub struct NGramStats {
    /// Longest context length, in chars, recorded for each position.
    max_depth: usize,
    /// Minimum occurrence count an entry needs to survive `finalize`.
    min_count: u32,
    /// Number of chars observed so far, summed over all chunks.
    total_chars: u64,
    /// Occurrences of each individual char.
    unigram_counts: HashMap<char, u64>,
    /// Context string -> next char -> how often that char followed the context.
    ngram_counts: HashMap<String, HashMap<char, u32>>,
}
impl NGramStats {
    /// Create an empty accumulator.
    ///
    /// * `max_depth` — longest context length (in chars) recorded per
    ///   position; with `0` only unigrams are counted.
    /// * `min_count` — threshold applied by `finalize`; entries seen fewer
    ///   times are dropped.
    pub fn new(max_depth: usize, min_count: u32) -> Self {
        Self {
            max_depth,
            min_count,
            total_chars: 0,
            unigram_counts: HashMap::new(),
            ngram_counts: HashMap::new(),
        }
    }

    /// Consume a chunk of UTF-8 text. Character-boundary-safe: we iterate
    /// over `chars()` and rebuild context strings via `collect::<String>()`,
    /// never `&text[i..j]` byte slices (see markdown.rs note on `×`).
    ///
    /// Contexts never span two calls: the char buffer is rebuilt for every
    /// chunk, so the first char of a chunk has no recorded context.
    pub fn observe_text(&mut self, text: &str) {
        let chars: Vec<char> = text.chars().collect();
        for (i, &ch) in chars.iter().enumerate() {
            self.count_unigram(ch);
            self.count_ngrams_at(&chars, i);
        }
    }

    /// Record one occurrence of `ch` and bump the running char total.
    fn count_unigram(&mut self, ch: char) {
        *self.unigram_counts.entry(ch).or_insert(0) += 1;
        self.total_chars += 1;
    }

    /// For position `i`, record every context of length `k ∈ 1..=max_depth`
    /// ending at `i-1` with next-char `chars[i]`. Skipped at `i=0`.
    ///
    /// Near the start of the chunk the context length is capped at `i`, the
    /// number of chars actually available before the position.
    fn count_ngrams_at(&mut self, chars: &[char], i: usize) {
        if i == 0 {
            return;
        }
        let next = chars[i];
        let max_back = self.max_depth.min(i);
        for back in 1..=max_back {
            let ctx: String = chars[i - back..i].iter().collect();
            let count = self
                .ngram_counts
                .entry(ctx)
                .or_default()
                .entry(next)
                .or_insert(0);
            // Saturate instead of overflowing: a wrapped `u32` would panic in
            // debug builds and, in release, could drop a very frequent pair
            // below `min_count` so that `finalize` discards it.
            *count = count.saturating_add(1);
        }
    }

    /// Build the final `Firmware`, consuming the accumulator.
    ///
    /// The alphabet keeps chars whose unigram count reaches `min_count`; the
    /// unigram vector is aligned to that alphabet and normalised by the total
    /// number of observed chars. Each `(context, next_char)` pair below
    /// `min_count` is dropped, as is any context left with no successors.
    pub fn finalize(self) -> Firmware {
        let alphabet = build_alphabet(&self.unigram_counts, self.min_count);
        let unigram = build_unigram(&alphabet, &self.unigram_counts, self.total_chars);
        let ngrams = filter_ngrams(self.ngram_counts, self.min_count);
        Firmware {
            alphabet,
            unigram,
            max_depth: self.max_depth,
            ngrams,
            total_chars: self.total_chars,
        }
    }
}
/// Collect every char whose count reaches `min_count` and return them in
/// ascending codepoint order.
///
/// Sorting removes the dependence on `HashMap` iteration order, so the
/// result is identical from run to run — required by save/load round-trip
/// tests.
fn build_alphabet(counts: &HashMap<char, u64>, min_count: u32) -> Vec<char> {
    // Widen once so the comparison happens in the counters' own type.
    let threshold = u64::from(min_count);
    let mut alphabet = Vec::new();
    for (&symbol, &seen) in counts {
        if seen >= threshold {
            alphabet.push(symbol);
        }
    }
    alphabet.sort_unstable();
    alphabet
}
/// Probability of each alphabet char, in alphabet order: `count / total`.
///
/// A char missing from `counts` gets `0.0`. When `total` is zero the result
/// is all zeros, which avoids dividing by zero.
fn build_unigram(
    alphabet: &[char],
    counts: &HashMap<char, u64>,
    total: u64,
) -> Vec<f64> {
    match total {
        0 => vec![0.0; alphabet.len()],
        _ => {
            let denom = total as f64;
            let mut probs = Vec::with_capacity(alphabet.len());
            for symbol in alphabet {
                let seen = counts.get(symbol).map_or(0, |&n| n);
                probs.push(seen as f64 / denom);
            }
            probs
        }
    }
}
/// Prune the n-gram table: discard every `(context, next_char)` entry whose
/// count is below `min_count`, then discard any context left without
/// successors.
fn filter_ngrams(
    mut raw: HashMap<String, HashMap<char, u32>>,
    min_count: u32,
) -> HashMap<String, HashMap<char, u32>> {
    raw.retain(|_, successors| {
        // Prune the inner map first; the outer entry survives only if
        // something is left afterwards.
        successors.retain(|_, seen| *seen >= min_count);
        !successors.is_empty()
    });
    raw
}