KeiSeiKit-1.0/_primitives/_rust/kei-router/src/extract.rs
Parfii-bot adc007b7b0 feat(primitives): 10 Rust crates extracted from LBM (Genesis-scrubbed)
- kei-router — keyword-dispatch meta-tool (CfC ML fallback removed)
- kei-sage — Obsidian-style knowledge graph, FTS5 + BFS + PageRank
- kei-task — task DAG with deps, milestones, dependency-chain queries
- kei-chat-store — Claude conversation session persistence + FTS search
- kei-crossdomain — typed-edge store + BFS cross-domain glue
- kei-search-core — 3-wave deep research with microcent budget cap
- kei-content-store — asset + prompt + campaign registry
- kei-social-store — people + interactions CRM (lite)
- kei-curator — edge-decay graph hygiene utility
- kei-auth — multi-tenant session tokens (replaces single-bearer)

Genesis-scan pre-import pass: skipped pkg/mxl1/*, pkg/inference/*, pkg/trainer/*,
pkg/nc01/*, internal/ml/* (all Genesis/CfC adjacent, sensitive IP).
Security: skipped tools_threat/radio/protocol/med/mlreg (offensive/banned).
Domain verticals skipped: hr/legal/infra/ops/api/osint/edu/geo/hw/finance.

New 'mcp' profile in MANIFEST.toml bundles all 10 for MCP server deployment.

Workspace now 24 crates, cargo check --workspace clean, 94 workspace tests pass.
2026-04-22 12:48:56 +08:00

167 lines
4.8 KiB
Rust

//! Param extraction — regex scans the raw query for path / limit / id / URI / KV.
//!
//! Ported from LBM pkg/keirouter/extract.go.
use regex::Regex;
use std::collections::HashMap;
use std::sync::OnceLock;
/// Structured parameters pulled out of a raw natural-language query.
///
/// Every field starts at its `Default` value (empty string, `0`, empty map)
/// and is only overwritten when the corresponding pattern matches.
#[derive(Clone, Debug, Default)]
pub struct Extracted {
    /// Filesystem path (absolute preferred, then relative) or, failing both,
    /// a `note://vault/…` URI found in the query. Empty when none matched.
    pub path: String,
    /// Raw text of the first bracketed list of double-quoted strings
    /// (e.g. `["a", "b"]`) found in the query. Empty when none matched.
    pub paths: String,
    /// Number following `limit` / `max` / `top`, or a bare-number fallback.
    /// `0` when nothing matched.
    pub limit: i64,
    /// Number following `depth`. `0` when nothing matched.
    pub depth: i64,
    /// Number following `id` / `unit`. `0` when nothing matched.
    pub id: i64,
    /// Lowercased query with the path and numeric directives stripped out.
    /// Left empty when nothing remains after stripping.
    pub query: String,
    /// Trimmed, lowercased copy of the raw query.
    pub text: String,
    /// `text` with the first occurrence of the (lowercased) path removed.
    pub text_clean: String,
    /// A `scheme://…` URI from the query that is not a `note://` URI.
    /// Empty when none was picked.
    pub uri: String,
    /// `key=value` pairs found in the query; a repeated key keeps the last value.
    pub kv: HashMap<String, String>,
}
/// Compiles `pat` into a [`Regex`].
///
/// # Panics
/// Panics if `pat` is not a valid regular expression. The callers in this
/// file only pass hard-coded literals, so a failure is a programming error.
fn re(pat: &str) -> Regex {
    let compiled = Regex::new(pat);
    compiled.expect("invalid regex pattern in kei-router")
}
/// Lazily compiled matcher for absolute paths.
///
/// Matches at the start of the input or after whitespace; capture group 1 is
/// one or more `/segment` components made of word characters, `.`, `~` or `-`.
fn re_abs_path() -> &'static Regex {
    const PATTERN: &str = r"(?:^|\s)((?:/[\w.~-]+)+(?:\.\w+)?)";
    static ABS_PATH: OnceLock<Regex> = OnceLock::new();
    ABS_PATH.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for relative paths.
///
/// Matches at the start of the input or after whitespace; capture group 1 is
/// one or more `dir/` components followed by a file name with an extension.
fn re_rel_path() -> &'static Regex {
    const PATTERN: &str = r"(?:^|\s)((?:[\w.-]+/)+[\w.-]+\.\w+)";
    static REL_PATH: OnceLock<Regex> = OnceLock::new();
    REL_PATH.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for a bracketed list of one or more double-quoted
/// strings, optionally separated by commas (e.g. `["a", "b"]`).
fn re_json_arr() -> &'static Regex {
    const PATTERN: &str = r#"\[(?:\s*"[^"]*"\s*,?\s*)+\]"#;
    static JSON_ARR: OnceLock<Regex> = OnceLock::new();
    JSON_ARR.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for `limit` / `max` / `top` followed by an optional
/// `=` or `:` and a run of digits; capture group 1 holds the digits.
fn re_number() -> &'static Regex {
    const PATTERN: &str = r"\b(?:limit|max|top)\s*[=:]?\s*(\d+)";
    static LIMIT_NUM: OnceLock<Regex> = OnceLock::new();
    LIMIT_NUM.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for `depth` followed by an optional `=` or `:` and
/// a run of digits; capture group 1 holds the digits.
fn re_depth() -> &'static Regex {
    const PATTERN: &str = r"\b(?:depth)\s*[=:]?\s*(\d+)";
    static DEPTH_NUM: OnceLock<Regex> = OnceLock::new();
    DEPTH_NUM.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for `id` / `unit` followed by an optional `=`, `:`
/// or `#` and a run of digits; capture group 1 holds the digits.
fn re_id_num() -> &'static Regex {
    const PATTERN: &str = r"\b(?:id|unit)\s*[=:#]?\s*(\d+)";
    static ID_NUM: OnceLock<Regex> = OnceLock::new();
    ID_NUM.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for a standalone number of one to four digits,
/// delimited by word boundaries; capture group 1 holds the digits.
fn re_bare_num() -> &'static Regex {
    const PATTERN: &str = r"\b(\d{1,4})\b";
    static BARE_NUM: OnceLock<Regex> = OnceLock::new();
    BARE_NUM.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for `note://vault/…` URIs whose tail consists of
/// word characters, `/`, `.` or `-`.
fn re_vault_uri() -> &'static Regex {
    const PATTERN: &str = r"\bnote://vault/[\w/.\-]+";
    static VAULT_URI: OnceLock<Regex> = OnceLock::new();
    VAULT_URI.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for any `scheme://…` URI whose tail consists of
/// word characters, `/`, `.`, `+` or `-`; capture group 1 is the whole URI.
fn re_domain_uri() -> &'static Regex {
    const PATTERN: &str = r"\b(\w+://[\w/.+\-]+)";
    static DOMAIN_URI: OnceLock<Regex> = OnceLock::new();
    DOMAIN_URI.get_or_init(|| re(PATTERN))
}
/// Lazily compiled matcher for `key=value` pairs.
///
/// Capture group 1 is the key (word characters); group 2 is the value, made of
/// word characters, `:`, `/`, `.`, `_`, `+` or `-`.
fn re_kv() -> &'static Regex {
    const PATTERN: &str = r"\b(\w+)=([\w://._+\-]+)";
    static KEY_VALUE: OnceLock<Regex> = OnceLock::new();
    KEY_VALUE.get_or_init(|| re(PATTERN))
}
/// Parses `s` as a decimal `i64`, yielding `0` when parsing fails
/// (empty input, non-numeric text, or a value outside the `i64` range).
fn parse_i64(s: &str) -> i64 {
    let parsed: Result<i64, _> = s.parse();
    parsed.unwrap_or_default()
}
/// Fills `e.paths` and `e.path` from the raw (case-preserved) `query`.
///
/// * `e.paths` receives the text of the first quoted-string array, if any.
/// * `e.path` receives the first absolute path; if it is still empty, the
///   first relative path; if it is still empty, the first `note://vault/…` URI.
fn extract_paths(query: &str, e: &mut Extracted) {
    if let Some(array) = re_json_arr().find(query) {
        e.paths = array.as_str().to_owned();
    }

    // An absolute path always wins when present.
    if let Some(abs) = re_abs_path().captures(query).and_then(|caps| caps.get(1)) {
        e.path = abs.as_str().to_owned();
    }

    // Relative path is only consulted while no path has been recorded.
    if e.path.is_empty() {
        if let Some(rel) = re_rel_path().captures(query).and_then(|caps| caps.get(1)) {
            e.path = rel.as_str().to_owned();
        }
    }

    // Last resort: treat a vault note URI as the path.
    if e.path.is_empty() {
        if let Some(note) = re_vault_uri().find(query) {
            e.path = note.as_str().to_owned();
        }
    }
}
fn extract_numbers(text: &str, e: &mut Extracted) {
if let Some(c) = re_number().captures(text) {
if let Some(m) = c.get(1) {
e.limit = parse_i64(m.as_str());
}
}
if let Some(c) = re_depth().captures(text) {
if let Some(m) = c.get(1) {
e.depth = parse_i64(m.as_str());
}
}
if let Some(c) = re_id_num().captures(text) {
if let Some(m) = c.get(1) {
e.id = parse_i64(m.as_str());
}
}
if e.limit == 0 && e.id == 0 {
if let Some(c) = re_bare_num().captures(text) {
if let Some(m) = c.get(1) {
let n = parse_i64(m.as_str());
if n > 0 && n <= 500 {
e.limit = n;
}
}
}
}
}
/// Fills `e.uri` and `e.kv` from the raw (case-preserved) `query`.
///
/// * `e.uri` receives the first `scheme://…` URI that does not start with
///   `note://`. All URI matches are scanned, so a leading `note://` URI no
///   longer hides a later URI in the same query.
/// * `e.kv` receives every `key=value` pair; a repeated key keeps the value
///   of its last occurrence.
fn extract_uri_kv(query: &str, e: &mut Extracted) {
    let domain_uri = re_domain_uri()
        .find_iter(query)
        .map(|m| m.as_str())
        .find(|candidate| !candidate.starts_with("note://"));
    if let Some(uri) = domain_uri {
        e.uri = uri.to_string();
    }

    // `extend` inserts in match order, so later duplicates overwrite earlier ones.
    e.kv.extend(re_kv().captures_iter(query).filter_map(|caps| {
        let key = caps.get(1)?;
        let value = caps.get(2)?;
        Some((key.as_str().to_string(), value.as_str().to_string()))
    }));
}
/// Derives `e.query` and `e.text_clean` from `e.text` and `e.path`.
///
/// * `e.query`: `e.text` with the first occurrence of the lowercased path
///   removed, then every limit / depth / id directive removed, then trimmed.
///   It is only assigned when the result is non-empty.
/// * `e.text_clean`: `e.text` with the first occurrence of the lowercased path
///   replaced by a space and trimmed; a plain copy of `e.text` when no path
///   was recorded.
fn build_clean_query(e: &mut Extracted) {
    // `e.text` is lowercased by the caller, so the path must be lowercased to match it.
    let lowered_path = (!e.path.is_empty()).then(|| e.path.to_lowercase());

    let mut stripped = match &lowered_path {
        Some(needle) => e.text.replacen(needle.as_str(), "", 1),
        None => e.text.clone(),
    };
    for directive in [re_number(), re_depth(), re_id_num()] {
        stripped = directive.replace_all(&stripped, "").into_owned();
    }
    let residual = stripped.trim();
    if !residual.is_empty() {
        e.query = residual.to_string();
    }

    e.text_clean = match &lowered_path {
        Some(needle) => e.text.replacen(needle.as_str(), " ", 1).trim().to_string(),
        None => e.text.clone(),
    };
}
/// Parse a raw NL query into structured [`Extracted`] params.
///
/// Paths, URIs and `key=value` pairs are read from the original `query`
/// (case preserved); numeric directives are read from the trimmed, lowercased
/// copy stored in `text`. The cleaned `query` / `text_clean` fields are
/// derived last, once the path is known.
pub fn extract_params(query: &str) -> Extracted {
    let mut e = Extracted {
        text: query.trim().to_lowercase(),
        ..Default::default()
    };
    extract_paths(query, &mut e);

    // Move `text` out for the duration of the call instead of cloning it:
    // `extract_numbers` needs `&str` and `&mut Extracted` at the same time.
    let text = std::mem::take(&mut e.text);
    extract_numbers(&text, &mut e);
    e.text = text;

    extract_uri_kv(query, &mut e);
    build_clean_query(&mut e);
    e
}