Single-commit clean baseline after security scrub of niche-tells, project codenames, internal jargon, and contributor-email leaks. Contents: - 100 Rust crates (_primitives/_rust/) - 37 agent manifests (_manifests/) + generated specs (_generated/) - 67 user-invocable skills (skills/) - 33 hooks (hooks/) - Composition blocks (_blocks/) - Documentation (docs/, README.md) - TS adapter packages (_ts_packages/) - Assembler (_assembler/) - Roles (_roles/) - Templates (_templates/) - Forgejo CI (.forgejo/) Author: Denis Parfionovich <info@greendragon.info> License: see LICENSE.
156 lines
5 KiB
Rust
156 lines
5 KiB
Rust
//! DNA similarity kernel for unseen task-classes.
//!
//! When a new task arrives whose `task_class_dna` is not in the ledger,
//! we transfer learning from similar past task-classes via this kernel.
//!
//! ```text
//! K(d, d') = α_role  · 1[role = role']
//!          + α_caps  · |caps ∩ caps'| / |caps ∪ caps'|
//!          + α_scope · 1[scope = scope']
//!          + α_body  · jaccard(body8, body8')   ← coarse character-bigram Jaccard on hex
//! ```
//!
//! The weights are recalibrated by the `calibrate` CLI subcommand from
//! observed outcomes; the defaults below are seed values.
//!
//! Constructor Pattern: pure-fn cube. No SQL, no I/O. Caller composes
//! with `posterior::from_ledger` to weight transferred posteriors.
|
||
|
||
use crate::dna_class;
|
||
|
||
/// Per-component weights of the DNA similarity kernel.
///
/// Each field scales one term of the kernel sum computed by `similarity`.
/// The struct is `Copy`, so it is passed by value.
#[derive(Debug, Clone, Copy)]
pub struct KernelWeights {
    /// Added in full when both DNAs have a role and the roles are equal.
    pub alpha_role: f64,
    /// Multiplied by the Jaccard index of the two `-`-separated caps token sets.
    pub alpha_caps: f64,
    /// Added in full when both DNAs have a scope sha and the two are equal.
    pub alpha_scope: f64,
    /// Multiplied by the character-bigram Jaccard index of the two body shas.
    pub alpha_body: f64,
}
|
||
|
||
impl Default for KernelWeights {
|
||
fn default() -> Self {
|
||
Self {
|
||
alpha_role: 0.40,
|
||
alpha_caps: 0.25,
|
||
alpha_scope: 0.25,
|
||
alpha_body: 0.10,
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Similarity score in [0, 1]. Higher = more similar.
|
||
pub fn similarity(a: &str, b: &str, w: KernelWeights) -> f64 {
|
||
let mut s = 0.0;
|
||
|
||
// role match
|
||
if let (Some(ra), Some(rb)) = (dna_class::role(a), dna_class::role(b)) {
|
||
if ra == rb {
|
||
s += w.alpha_role;
|
||
}
|
||
}
|
||
|
||
// caps Jaccard on '-' tokens
|
||
if let (Some(ca), Some(cb)) = (dna_class::caps(a), dna_class::caps(b)) {
|
||
s += w.alpha_caps * jaccard_caps(ca, cb);
|
||
}
|
||
|
||
// scope_sha exact match
|
||
if let (Some(sa), Some(sb)) = (dna_class::scope_sha(a), dna_class::scope_sha(b)) {
|
||
if sa == sb {
|
||
s += w.alpha_scope;
|
||
}
|
||
}
|
||
|
||
// body sha hex similarity (only meaningful for task_class_dna inputs)
|
||
if let (Some(ba), Some(bb)) = (dna_class::body_sha(a), dna_class::body_sha(b)) {
|
||
s += w.alpha_body * jaccard_hex(ba, bb);
|
||
}
|
||
|
||
s.clamp(0.0, 1.0)
|
||
}
|
||
|
||
/// Jaccard index of two capability lists written as `-`-separated tokens.
///
/// Empty tokens (from leading, trailing or doubled `-`) are ignored and
/// duplicates collapse, so each input is treated as a set. Two empty sets
/// count as a full match (`1.0`); otherwise the result is
/// `|A ∩ B| / |A ∪ B|`, a value in `[0, 1]`.
fn jaccard_caps(a: &str, b: &str) -> f64 {
    /// Splits a caps string into its set of non-empty tokens.
    fn tokens(caps: &str) -> std::collections::BTreeSet<&str> {
        caps.split('-').filter(|t| !t.is_empty()).collect()
    }

    let (set_a, set_b) = (tokens(a), tokens(b));
    if set_a.is_empty() && set_b.is_empty() {
        return 1.0;
    }

    // At least one set is non-empty here, so the union has >= 1 element and
    // the division is always defined; no zero-union fallback is needed.
    let shared = set_a.intersection(&set_b).count();
    let total = set_a.len() + set_b.len() - shared;
    shared as f64 / total as f64
}
|
||
|
||
/// Coarse character-bigram Jaccard for two short hex strings.
///
/// Each input is reduced to the set of its adjacent byte pairs and the result
/// is `|A ∩ B| / |A ∪ B|`, a value in `[0, 1]`. Inputs shorter than two bytes
/// have no bigrams, so they fall back to exact equality (`1.0` when equal,
/// `0.0` otherwise). The comparison is byte-wise and therefore case-sensitive.
fn jaccard_hex(a: &str, b: &str) -> f64 {
    /// Collects the set of adjacent byte pairs of `s`.
    fn bigrams(s: &str) -> std::collections::BTreeSet<[u8; 2]> {
        s.as_bytes().windows(2).map(|w| [w[0], w[1]]).collect()
    }

    if a.len() < 2 || b.len() < 2 {
        return if a == b { 1.0 } else { 0.0 };
    }

    // Both inputs have >= 2 bytes, so each set holds at least one bigram and
    // the union is never empty; the division is always defined.
    let (set_a, set_b) = (bigrams(a), bigrams(b));
    let shared = set_a.intersection(&set_b).count();
    let total = set_a.len() + set_b.len() - shared;
    shared as f64 / total as f64
}
|
||
|
||
// Unit tests for the kernel. The expected values in the comments assume the
// `dna_class` accessors return the four `::`-separated segments
// (role, caps, scope sha, body sha) in that order.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn identical_dna_full_match() {
        let dna = "code-implementer-rust::?::e3929e37::041b7526";
        let s = similarity(dna, dna, KernelWeights::default());
        // role(0.40) + caps_jaccard=1(0.25) + scope(0.25) + body_jaccard=1(0.10) = 1.00
        assert!((s - 1.0).abs() < 1e-9);
    }

    #[test]
    fn different_role_only_keeps_other_signals() {
        let a = "role-a::cap1-cap2::s1::b1";
        let b = "role-b::cap1-cap2::s1::b1";
        let s = similarity(a, b, KernelWeights::default());
        // role miss(0) + caps(0.25) + scope(0.25) + body bigram match
        // body "b1" has len 2 → bigram set {[b,1]} on both sides → jaccard 1.0
        // total = 0 + 0.25 + 0.25 + 0.10 = 0.60
        assert!((s - 0.60).abs() < 1e-9, "got {}", s);
    }

    #[test]
    fn caps_partial_overlap() {
        let a = "r::a-b-c::s::body1234";
        let b = "r::a-b-d::s::body5678";
        let s = similarity(a, b, KernelWeights::default());
        // role(0.40) + caps Jaccard 2/4 = 0.5 → 0.125 + scope(0.25) + body bigram
        // body bigrams: {bo,od,dy,y1,12,23,34} vs {bo,od,dy,y5,56,67,78}:
        //   shared = {bo,od,dy} = 3, union = 11, jaccard = 3/11 ≈ 0.273
        // body weight: 0.10 * 0.273 ≈ 0.0273
        // total ≈ 0.40 + 0.125 + 0.25 + 0.027 = 0.802
        assert!(s > 0.78 && s < 0.82, "got {}", s);
    }

    #[test]
    fn unrelated_dna_low_score() {
        let a = "role-x::FOO-BAR::deadbeef::aaaa1111";
        let b = "role-y::QUX-ZAP::cafebabe::ffff8888";
        let s = similarity(a, b, KernelWeights::default());
        // role miss(0) + caps Jaccard 0(0) + scope miss(0) + body bigrams
        // body bigrams {aa,a1,11} vs {ff,f8,88} are disjoint → 0
        assert!(s < 0.10, "got {}", s);
    }

    #[test]
    fn empty_caps_treated_as_full_match() {
        // "?" placeholder caps in some real DNAs. Presumably "?" reaches the
        // caps Jaccard as a single literal token that matches itself, rather
        // than as an empty set — verify against `dna_class::caps`.
        let a = "r::?::s::b1";
        let b = "r::?::s::b1";
        let s = similarity(a, b, KernelWeights::default());
        assert!((s - 1.0).abs() < 1e-9);
    }
}
|