Merge feat/stream-c-kei-sage-substrate — kei-sage walks atoms/*.md

This commit is contained in:
Parfii-bot 2026-04-23 00:10:44 +08:00
commit 2361a21d15
9 changed files with 604 additions and 0 deletions

View file

@ -1968,6 +1968,7 @@ dependencies = [
"rusqlite",
"serde",
"serde_json",
"serde_yaml",
"tempfile",
]

View file

@ -18,6 +18,7 @@ rusqlite = { version = "0.31", features = ["bundled"] }
clap = { version = "4", features = ["derive"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
serde_yaml = "0.9"
anyhow = "1"
chrono = { version = "0.4", default-features = false, features = ["clock"] }

View file

@ -0,0 +1,73 @@
//! CLI handlers for `atoms-*` subcommands — walks, indexes, queries atoms.
//!
//! Separate from `main.rs` to keep both files under Constructor Pattern
//! 200-LOC limit. `main.rs` wires clap, this module implements the verbs.
use crate::atom_index::index_atoms;
use crate::atoms::{discover_atoms, AtomRecord};
use crate::bfs::bfs;
use crate::pagerank::pagerank;
use crate::search::fts_search;
use crate::store::Store;
use anyhow::Result;
use std::path::{Path, PathBuf};
/// Default directory scanned for atoms: `$HOME/.claude/agents/_primitives/_rust`.
///
/// `HOME` is read with `var_os` so a home directory whose path is not valid
/// UTF-8 is still honoured. When `HOME` is unset the path is resolved
/// relative to the current directory (`.`).
pub fn default_atoms_root() -> PathBuf {
    let home = std::env::var_os("HOME").map_or_else(|| PathBuf::from("."), PathBuf::from);
    home.join(".claude/agents/_primitives/_rust")
}
/// `atoms-discover`: list every atom found under `root` as a TSV table.
///
/// Writes a header plus one `full_id / kind / stability / md_path` row per
/// atom to stdout, and a one-line summary to stderr. Nothing is persisted.
///
/// # Errors
/// Propagates any error returned by `discover_atoms`.
pub fn cmd_atoms_discover(root: &Path) -> Result<()> {
    let atoms = discover_atoms(root)?;

    println!("full_id\tkind\tstability\tmd_path");
    atoms.iter().for_each(|atom| {
        let kind = atom.kind.as_str();
        let location = atom.md_path.display();
        println!("{}\t{}\t{}\t{}", atom.full_id, kind, atom.stability, location);
    });

    // Summary goes to stderr so stdout stays machine-readable.
    let total = atoms.len();
    eprintln!("discovered {} atom(s) under {}", total, root.display());
    Ok(())
}
/// `atoms-rank`: re-ingest atoms from `root`, then print the first `limit`
/// entries returned by `pagerank` as `score<TAB>path` (score to 6 decimals).
///
/// # Errors
/// Propagates errors from ingestion and from `pagerank`.
pub fn cmd_atoms_rank(store: &Store, root: &Path, limit: usize) -> Result<()> {
    ingest(store, root)?;
    let ranking = pagerank(store)?;
    ranking
        .into_iter()
        .take(limit)
        .for_each(|(path, score)| println!("{:.6}\t{}", score, path));
    Ok(())
}
/// `atoms-related`: re-ingest atoms from `root`, then print what `bfs`
/// returns for `atom_id` with the given `depth`.
///
/// Each output row is `edge_type<TAB>path<TAB>(depth N)`.
///
/// # Errors
/// Propagates errors from ingestion and from `bfs`.
pub fn cmd_atoms_related(store: &Store, root: &Path, atom_id: &str, depth: i64) -> Result<()> {
    ingest(store, root)?;
    let neighbours = bfs(store, atom_id, depth)?;
    for hit in neighbours {
        println!("{}\t{}\t(depth {})", hit.edge_type, hit.path, hit.depth);
    }
    Ok(())
}
/// `atoms-search`: re-ingest atoms from `root`, run `fts_search` for `query`,
/// and print only the hits whose `unit_type` is `"atom"`.
///
/// Each output row is `id<TAB>category<TAB>vault_path`.
///
/// # Errors
/// Propagates errors from ingestion and from `fts_search`.
pub fn cmd_atoms_search(store: &Store, root: &Path, query: &str, limit: i64) -> Result<()> {
    ingest(store, root)?;
    // NOTE(review): `limit` is handed to `fts_search` before the atom filter
    // runs, so presumably fewer than `limit` atoms can be printed when other
    // unit types match too — confirm against `fts_search`.
    let hits = fts_search(store, query, limit)?;
    for unit in hits.into_iter().filter(|u| u.unit_type == "atom") {
        println!("{}\t{}\t{}", unit.id, unit.category, unit.vault_path);
    }
    Ok(())
}
/// Discover atoms under `root`, index them into `store`, and report the
/// resulting unit/edge counts on stderr.
///
/// Returns the discovered records so callers can reuse them.
///
/// # Errors
/// Propagates errors from `discover_atoms` and `index_atoms`.
fn ingest(store: &Store, root: &Path) -> Result<Vec<AtomRecord>> {
    let discovered = discover_atoms(root)?;
    let counts = index_atoms(store, &discovered)?;
    let (units, edges) = (counts.units_indexed, counts.edges_indexed);
    eprintln!(
        "indexed {} atom(s), {} wikilink edge(s) from {}",
        units,
        edges,
        root.display()
    );
    Ok(discovered)
}

View file

@ -0,0 +1,63 @@
//! Persist discovered atoms into the kei-sage Store as Units + typed edges.
//!
//! Unit-type = `"atom"`; `vault_path` = atom full_id (e.g. `kei-task::create`).
//! Edge-type = `"atom_related"` for wikilinks between atoms. Idempotent:
//! re-ingesting the same corpus replaces existing rows by vault_path.
use crate::atoms::{resolve_wikilinks, AtomRecord};
use crate::edges::add_edge;
use crate::store::Store;
use crate::types::Unit;
use anyhow::Result;
/// Counters reported by `index_atoms` after one ingestion pass.
///
/// Derives the usual value-type traits so callers can log, copy, and compare
/// stats (e.g. in `assert_eq!`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct IndexStats {
    /// Number of atom records written to the store as units.
    pub units_indexed: usize,
    /// Number of atom-to-atom wikilink edges written to the store.
    pub edges_indexed: usize,
}
/// Persist `records` into `store`: first every atom as a unit, then the
/// wikilink edges between them.
///
/// # Errors
/// Returns the first error raised while writing units or edges.
pub fn index_atoms(store: &Store, records: &[AtomRecord]) -> Result<IndexStats> {
    // Fields are evaluated in source order, so units are written before edges.
    let stats = IndexStats {
        units_indexed: index_units(store, records)?,
        edges_indexed: index_edges(store, records)?,
    };
    Ok(stats)
}
/// Write one unit per record via `Store::add_unit`; returns how many were
/// written. Stops at the first store error.
fn index_units(store: &Store, records: &[AtomRecord]) -> Result<usize> {
    for record in records {
        let unit = record_to_unit(record);
        store.add_unit(&unit)?;
    }
    // Reaching this point means every record was added.
    Ok(records.len())
}
/// Map an `AtomRecord` onto a store `Unit`.
///
/// The atom id is used for both `title` and `vault_path`, the stability
/// label is stored as `evidence_grade`, and the atom kind as `category`.
/// Remaining `Unit` fields keep their `Default` values.
fn record_to_unit(rec: &AtomRecord) -> Unit {
    let atom_id = &rec.full_id;
    let md_location = rec.md_path.to_string_lossy();
    Unit {
        unit_type: "atom".into(),
        title: atom_id.clone(),
        content: build_content(rec),
        evidence_grade: rec.stability.clone(),
        source_path: md_location.into(),
        vault_path: atom_id.clone(),
        category: rec.kind.as_str().into(),
        ..Default::default()
    }
}
/// Build the text stored as unit content: a `[keywords] a, b, c` header
/// line, a blank line, then the atom's markdown body.
fn build_content(rec: &AtomRecord) -> String {
    let keyword_list = rec.keywords.join(", ");
    ["[keywords] ", keyword_list.as_str(), "\n\n", rec.body.as_str()].concat()
}
/// Write an `"atom_related"` edge (weight `1.0`) for every atom-to-atom
/// wikilink in `records`; returns how many were written. Stops at the first
/// store error.
fn index_edges(store: &Store, records: &[AtomRecord]) -> Result<usize> {
    let links = resolve_wikilinks(records);
    for (from, to) in &links {
        add_edge(store, from, to, "atom_related", 1.0)?;
    }
    Ok(links.len())
}

View file

@ -0,0 +1,121 @@
//! Frontmatter splitting + wikilink extraction helpers for atom `.md` files.
//!
//! Pure functions, no I/O. See `atoms.rs` for the discovery walker.
use anyhow::{anyhow, Result};
/// Split a `.md` document into `(frontmatter_yaml, body)`.
///
/// The text must open with a `---` line (LF or CRLF). The frontmatter runs
/// up to the next line that is exactly `---`; the body is everything after
/// that closing line.
///
/// # Errors
/// Fails when the opening or the closing `---` delimiter is missing.
pub fn split_frontmatter(text: &str) -> Result<(&str, &str)> {
    let opened = text
        .strip_prefix("---\n")
        .or_else(|| text.strip_prefix("---\r\n"));
    let after_open = match opened {
        Some(rest) => rest,
        None => return Err(anyhow!("missing leading --- frontmatter delimiter")),
    };

    let (fm_len, delim_len) = match find_closing_delim(after_open) {
        Some(span) => span,
        None => return Err(anyhow!("missing closing --- frontmatter delimiter")),
    };

    let frontmatter = &after_open[..fm_len];
    let body = after_open.get(fm_len + delim_len..).unwrap_or("");
    Ok((frontmatter, body))
}
/// Locate the first line of `s` that is exactly `---` (ignoring its trailing
/// `\r`/`\n`).
///
/// Returns `(byte_offset_of_line, byte_length_of_line_including_newline)`,
/// or `None` when no such line exists.
fn find_closing_delim(s: &str) -> Option<(usize, usize)> {
    let mut offset = 0;
    while offset < s.len() {
        let remaining = &s[offset..];
        // A line runs through its '\n'; the final line may lack one.
        let line_len = remaining.find('\n').map_or(remaining.len(), |nl| nl + 1);
        let content = remaining[..line_len].trim_end_matches(|c| c == '\n' || c == '\r');
        if content == "---" {
            return Some((offset, line_len));
        }
        offset += line_len;
    }
    None
}
/// Parse one `[[target]]` wikilink.
///
/// Surrounding whitespace is ignored both outside and inside the brackets.
/// Returns `None` when `raw` is not wrapped in `[[` … `]]` or when the
/// target is empty.
pub fn parse_wikilink(raw: &str) -> Option<String> {
    let target = raw.trim().strip_prefix("[[")?.strip_suffix("]]")?.trim();
    if target.is_empty() {
        return None;
    }
    Some(target.to_owned())
}
/// Decide whether a wikilink target refers to an atom.
///
/// Atoms use `<crate>::<verb>`. Targets that start with `rules/` or with
/// `rule ` (e.g. `rules/RULE 0.12`, `RULE 0.12`) point at rule notes and are
/// rejected. The prefix match ignores ASCII case, because rule notes are
/// written in upper case (`RULE …`).
pub fn is_atom_target(target: &str) -> bool {
    const NON_ATOM_PREFIXES: [&str; 2] = ["rules/", "rule "];
    let bytes = target.as_bytes();
    !NON_ATOM_PREFIXES.iter().any(|prefix| {
        // Compare on bytes so a multi-byte character near the cut cannot panic.
        bytes
            .get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
/// Split an atom id of the form `<crate>::<verb>` at the first `::`.
///
/// # Errors
/// Fails when there is no `::` separator or when either side is empty.
pub fn split_atom_id(id: &str) -> Result<(String, String)> {
    if let Some((krate, verb)) = id.split_once("::") {
        if !krate.is_empty() && !verb.is_empty() {
            return Ok((krate.to_owned(), verb.to_owned()));
        }
    }
    Err(anyhow!("atom id must be <crate>::<verb>, got {id}"))
}
// Unit tests for the pure parsing helpers in this module.
#[cfg(test)]
mod tests {
use super::*;
// LF input: frontmatter and body are split around the closing `---` line.
#[test]
fn split_basic() {
let src = "---\nfoo: bar\n---\nbody text\n";
let (fm, body) = split_frontmatter(src).unwrap();
assert_eq!(fm, "foo: bar\n");
assert_eq!(body, "body text\n");
}
// CRLF input is accepted; only the frontmatter content is checked here.
#[test]
fn split_crlf() {
let src = "---\r\nfoo: bar\r\n---\r\nbody\r\n";
let (fm, _body) = split_frontmatter(src).unwrap();
assert!(fm.contains("foo: bar"));
}
// Text without a leading `---` line is an error.
#[test]
fn split_missing_start() {
assert!(split_frontmatter("no frontmatter\n").is_err());
}
// An opening delimiter without a closing one is an error.
#[test]
fn split_missing_end() {
assert!(split_frontmatter("---\nfoo: bar\nbody\n").is_err());
}
// A well-formed wikilink yields its target without the brackets.
#[test]
fn wikilink_simple() {
assert_eq!(
parse_wikilink("[[kei-task::create]]"),
Some("kei-task::create".into())
);
}
// Plain text and a whitespace-only target are both rejected.
#[test]
fn wikilink_none() {
assert_eq!(parse_wikilink("just text"), None);
assert_eq!(parse_wikilink("[[ ]]"), None);
}
// Atom ids pass the filter; `rules/...` targets do not.
#[test]
fn atom_target_filter() {
assert!(is_atom_target("kei-task::create"));
assert!(!is_atom_target("rules/RULE 0.12"));
}
// A `<crate>::<verb>` id splits into its two components.
#[test]
fn split_id_ok() {
let (c, v) = split_atom_id("kei-task::create").unwrap();
assert_eq!(c, "kei-task");
assert_eq!(v, "create");
}
// A missing separator or an empty crate part is an error.
#[test]
fn split_id_bad() {
assert!(split_atom_id("no-separator").is_err());
assert!(split_atom_id("::empty").is_err());
}
}

View file

@ -0,0 +1,179 @@
//! Substrate-atom discovery + frontmatter parsing + wikilink extraction.
//!
//! Walks `<root>/<crate>/atoms/*.md`, parses YAML frontmatter, returns
//! `AtomRecord`. Tolerant: skips files with invalid frontmatter (logs to
//! stderr, continues scan). See `docs/SUBSTRATE-SCHEMA.md` §Graph contract.
use crate::atom_parse::{is_atom_target, parse_wikilink, split_atom_id, split_frontmatter};
use anyhow::{anyhow, Context, Result};
use serde::Deserialize;
use std::fs;
use std::path::{Path, PathBuf};
use std::str::FromStr;
/// Kind of an atom, as declared by the `kind:` frontmatter field.
///
/// Parsed from text via `FromStr` and rendered back with `as_str`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AtomKind {
/// Frontmatter value `command`.
Command,
/// Frontmatter value `query`.
Query,
/// Frontmatter value `stream`.
Stream,
/// Frontmatter value `transform`.
Transform,
}
/// Parses an atom kind from text; surrounding whitespace and ASCII case are
/// ignored. Unknown names produce an error that quotes the normalized input.
impl FromStr for AtomKind {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self> {
        const KINDS: [(&str, AtomKind); 4] = [
            ("command", AtomKind::Command),
            ("query", AtomKind::Query),
            ("stream", AtomKind::Stream),
            ("transform", AtomKind::Transform),
        ];
        let wanted = s.trim().to_ascii_lowercase();
        KINDS
            .iter()
            .find(|(name, _)| *name == wanted)
            .map(|(_, kind)| kind.clone())
            .ok_or_else(|| anyhow!("unknown atom kind: {wanted}"))
    }
}
impl AtomKind {
pub fn as_str(&self) -> &'static str {
match self {
AtomKind::Command => "command",
AtomKind::Query => "query",
AtomKind::Stream => "stream",
AtomKind::Transform => "transform",
}
}
}
/// One atom parsed from an `atoms/*.md` file: frontmatter fields plus body.
#[derive(Debug, Clone)]
pub struct AtomRecord {
/// Full atom id exactly as written in the `atom:` frontmatter field.
pub full_id: String,
/// Parsed `kind:` frontmatter field.
pub kind: AtomKind,
/// Crate part of the atom id (text before the first `::`).
pub crate_name: String,
/// Verb part of the atom id (text after the first `::`).
pub verb: String,
/// `version:` frontmatter field; empty string when absent.
pub version: String,
/// Path of the `.md` file this record was read from.
pub md_path: PathBuf,
/// `input.schema` joined onto the directory of the `.md` file, if given.
pub input_schema: Option<PathBuf>,
/// `output.schema` joined onto the directory of the `.md` file, if given.
pub output_schema: Option<PathBuf>,
/// Raw `related:` entries, still in `[[...]]` wikilink form.
pub related: Vec<String>,
/// `keywords:` frontmatter list.
pub keywords: Vec<String>,
/// `stability:` frontmatter field; `"unknown"` when absent.
pub stability: String,
/// Markdown text following the closing frontmatter delimiter.
pub body: String,
}
/// Shape of the `input:` / `output:` frontmatter mappings.
#[derive(Debug, Deserialize)]
struct SchemaRef {
/// Optional schema path; `build_record` joins it onto the atom file's directory.
schema: Option<String>,
}
/// Deserialization target for an atom file's YAML frontmatter.
///
/// `atom` and `kind` are required; every other field falls back to its
/// serde default when missing.
#[derive(Debug, Deserialize)]
struct Frontmatter {
/// Atom id, expected as `<crate>::<verb>`.
atom: String,
/// Atom kind name; converted to `AtomKind` in `build_record`.
kind: String,
/// Optional version string.
#[serde(default)]
version: Option<String>,
/// Optional input schema reference.
#[serde(default)]
input: Option<SchemaRef>,
/// Optional output schema reference.
#[serde(default)]
output: Option<SchemaRef>,
/// Related entries as raw strings (wikilinks are parsed later).
#[serde(default)]
related: Vec<String>,
/// Keyword list.
#[serde(default)]
keywords: Vec<String>,
/// Optional stability label.
#[serde(default)]
stability: Option<String>,
}
/// Walk `<root>/<crate>/atoms/*.md` and return every atom that parses.
///
/// A `root` that is not a directory yields an empty list. Files that fail to
/// parse are skipped by `collect_from_crate`. Results are sorted by
/// `md_path`: `read_dir` makes no ordering guarantee, and a stable order
/// keeps CLI output and index insertion order reproducible across runs and
/// platforms.
///
/// # Errors
/// Fails when `root` itself cannot be listed or one of its directory
/// entries cannot be read.
pub fn discover_atoms(root: &Path) -> Result<Vec<AtomRecord>> {
    let mut out = Vec::new();
    if !root.is_dir() {
        return Ok(out);
    }
    for entry in fs::read_dir(root).with_context(|| format!("read_dir {}", root.display()))? {
        let crate_dir = entry?.path();
        if crate_dir.is_dir() {
            collect_from_crate(&crate_dir, &mut out);
        }
    }
    out.sort_by(|a, b| a.md_path.cmp(&b.md_path));
    Ok(out)
}
/// Append every parseable atom in `<crate_dir>/atoms/*.md` to `out`.
///
/// Best-effort by design: a missing `atoms/` directory is ignored, and an
/// unreadable directory, unreadable directory entry, or unparseable file is
/// reported on stderr and skipped so one bad item never aborts the scan.
fn collect_from_crate(crate_dir: &Path, out: &mut Vec<AtomRecord>) {
    let atoms_dir = crate_dir.join("atoms");
    if !atoms_dir.is_dir() {
        return;
    }
    // Directory name is passed to the parser as the crate name.
    let crate_name = crate_dir
        .file_name()
        .and_then(|s| s.to_str())
        .unwrap_or("")
        .to_string();
    let iter = match fs::read_dir(&atoms_dir) {
        Ok(it) => it,
        Err(e) => {
            eprintln!("skip {}: {}", atoms_dir.display(), e);
            return;
        }
    };
    for entry in iter {
        // Log entry-level I/O errors instead of dropping them silently.
        let path = match entry {
            Ok(entry) => entry.path(),
            Err(e) => {
                eprintln!("skip entry in {}: {}", atoms_dir.display(), e);
                continue;
            }
        };
        if !is_md_file(&path) {
            continue;
        }
        match parse_atom_file(&path, &crate_name) {
            Ok(rec) => out.push(rec),
            Err(e) => eprintln!("skip {}: {}", path.display(), e),
        }
    }
}
/// True when `path` is an existing regular file whose extension is exactly
/// `md` (case-sensitive).
fn is_md_file(path: &Path) -> bool {
    if !path.is_file() {
        return false;
    }
    matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"))
}
fn parse_atom_file(path: &Path, crate_name: &str) -> Result<AtomRecord> {
let text = fs::read_to_string(path)
.with_context(|| format!("read {}", path.display()))?;
let (fm_text, body) = split_frontmatter(&text)?;
let fm: Frontmatter =
serde_yaml::from_str(fm_text).with_context(|| "parse frontmatter YAML")?;
build_record(fm, body, path, crate_name)
}
fn build_record(fm: Frontmatter, body: &str, path: &Path, crate_name: &str) -> Result<AtomRecord> {
let kind = AtomKind::from_str(&fm.kind)?;
let (crate_from_id, verb) = split_atom_id(&fm.atom)?;
let md_dir = path.parent().unwrap_or(path).to_path_buf();
Ok(AtomRecord {
full_id: fm.atom.clone(),
kind,
crate_name: if crate_from_id.is_empty() {
crate_name.to_string()
} else {
crate_from_id
},
verb,
version: fm.version.unwrap_or_default(),
md_path: path.to_path_buf(),
input_schema: fm.input.and_then(|s| s.schema).map(|s| md_dir.join(&s)),
output_schema: fm.output.and_then(|s| s.schema).map(|s| md_dir.join(&s)),
related: fm.related,
keywords: fm.keywords,
stability: fm.stability.unwrap_or_else(|| "unknown".into()),
body: body.to_string(),
})
}
/// Collect `(source_atom_id, target)` pairs from every record's `related:`
/// wikilinks, in record order and then entry order.
///
/// Entries that are not wikilinks are ignored, and so are targets rejected
/// by `is_atom_target` (rules, notes) — the graph covers atoms only.
pub fn resolve_wikilinks(records: &[AtomRecord]) -> Vec<(String, String)> {
    records
        .iter()
        .flat_map(|rec| {
            rec.related
                .iter()
                .filter_map(|raw| parse_wikilink(raw))
                .filter(|target| is_atom_target(target))
                .map(move |target| (rec.full_id.clone(), target))
        })
        .collect()
}

View file

@ -2,6 +2,10 @@
//!
//! Port of LBM internal/sage. Constructor Pattern: one concept per file.
pub mod atom_cli;
pub mod atom_index;
pub mod atom_parse;
pub mod atoms;
pub mod bfs;
pub mod edges;
pub mod import;

View file

@ -1,6 +1,9 @@
//! kei-sage CLI — import / search / related / rank / add / edit.
use clap::{Parser, Subcommand};
use kei_sage::atom_cli::{
cmd_atoms_discover, cmd_atoms_related, cmd_atoms_search, cmd_atoms_rank, default_atoms_root,
};
use kei_sage::bfs::bfs;
use kei_sage::edges::add_edge;
use kei_sage::import::import_vault;
@ -39,6 +42,23 @@ enum Cmd {
#[arg(long)] grade: Option<String>,
},
Link { src: String, dst: String, #[arg(long, default_value = "related")] edge_type: String },
AtomsDiscover {
#[arg(long)] root: Option<PathBuf>,
},
AtomsRank {
#[arg(long)] root: Option<PathBuf>,
#[arg(long, default_value_t = 20)] limit: usize,
},
AtomsRelated {
atom_id: String,
#[arg(long)] root: Option<PathBuf>,
#[arg(long, default_value_t = 2)] depth: i64,
},
AtomsSearch {
query: String,
#[arg(long)] root: Option<PathBuf>,
#[arg(long, default_value_t = 20)] limit: i64,
},
}
fn db_path(cli_db: Option<PathBuf>) -> PathBuf {
@ -65,6 +85,14 @@ fn dispatch(store: &Store, cmd: Cmd) -> anyhow::Result<()> {
Cmd::Edit { id, title, content, grade } =>
cmd_edit(store, id, title, content, grade),
Cmd::Link { src, dst, edge_type } => cmd_link(store, &src, &dst, &edge_type),
Cmd::AtomsDiscover { root } =>
cmd_atoms_discover(&root.unwrap_or_else(default_atoms_root)),
Cmd::AtomsRank { root, limit } =>
cmd_atoms_rank(store, &root.unwrap_or_else(default_atoms_root), limit),
Cmd::AtomsRelated { atom_id, root, depth } =>
cmd_atoms_related(store, &root.unwrap_or_else(default_atoms_root), &atom_id, depth),
Cmd::AtomsSearch { query, root, limit } =>
cmd_atoms_search(store, &root.unwrap_or_else(default_atoms_root), &query, limit),
}
}

View file

@ -0,0 +1,134 @@
//! Integration smoke test for atom discovery + wikilink resolution.
//!
//! Creates a temp root holding one fake crate (`kei-task`) with
//! `atoms/<verb>.md` files, then checks discovery, frontmatter parsing,
//! wikilink filtering, and indexing into an in-memory store.
use kei_sage::atom_index::index_atoms;
use kei_sage::atoms::{discover_atoms, resolve_wikilinks, AtomKind};
use kei_sage::Store;
use std::fs;
use tempfile::tempdir;
// Fixture: a `command` atom with schemas, keywords, one atom wikilink and
// one `rules/...` wikilink in `related:`.
const ATOM_A: &str = r#"---
atom: kei-task::create
kind: command
version: "0.1.0"
input:
schema: schemas/create-input.json
output:
schema: schemas/create-output.json
stability: stable
keywords: [task, todo]
related:
- "[[kei-task::add-dependency]]"
- "[[rules/RULE 0.12]]"
---
# kei-task::create
Creates a task.
"#;
// Fixture: a second `command` atom with an empty `related:` list; it is the
// wikilink target of ATOM_A.
const ATOM_B: &str = r#"---
atom: kei-task::add-dependency
kind: command
version: "0.1.0"
stability: beta
keywords: [task, dag]
related: []
---
# kei-task::add-dependency
Links two tasks.
"#;
// Fixture: a markdown file with no leading `---` frontmatter delimiter.
const ATOM_BAD: &str = r#"not-yaml-frontmatter
just a plain markdown file
"#;
/// Test helper: write `body` to `<root>/<crate_name>/atoms/<verb>.md`,
/// creating the directories as needed. Panics on any I/O failure.
fn write_atom(root: &std::path::Path, crate_name: &str, verb: &str, body: &str) {
    let target_dir = root.join(crate_name).join("atoms");
    fs::create_dir_all(&target_dir).unwrap();
    let file_name = format!("{verb}.md");
    fs::write(target_dir.join(file_name), body).unwrap();
}
// Two atom files in one crate directory are both discovered; order is not
// asserted, only membership by id.
#[test]
fn discover_returns_both_records() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-task", "create", ATOM_A);
write_atom(tmp.path(), "kei-task", "add-dependency", ATOM_B);
let recs = discover_atoms(tmp.path()).unwrap();
assert_eq!(recs.len(), 2, "expected 2 records, got {}", recs.len());
let ids: Vec<&str> = recs.iter().map(|r| r.full_id.as_str()).collect();
assert!(ids.contains(&"kei-task::create"));
assert!(ids.contains(&"kei-task::add-dependency"));
}
// Every frontmatter field of ATOM_A ends up in the matching AtomRecord
// field, and the markdown body is kept.
#[test]
fn frontmatter_fields_parsed() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-task", "create", ATOM_A);
let recs = discover_atoms(tmp.path()).unwrap();
let rec = recs.iter().find(|r| r.full_id == "kei-task::create").unwrap();
assert_eq!(rec.kind, AtomKind::Command);
assert_eq!(rec.crate_name, "kei-task");
assert_eq!(rec.verb, "create");
assert_eq!(rec.version, "0.1.0");
assert_eq!(rec.stability, "stable");
assert!(rec.keywords.contains(&"task".to_string()));
assert!(rec.input_schema.is_some());
assert!(rec.output_schema.is_some());
assert!(rec.body.contains("Creates a task"));
}
// A file without frontmatter is skipped; discovery still succeeds and
// returns the one valid atom.
#[test]
fn invalid_frontmatter_is_skipped_not_fatal() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-task", "create", ATOM_A);
write_atom(tmp.path(), "kei-task", "broken", ATOM_BAD);
let recs = discover_atoms(tmp.path()).unwrap();
assert_eq!(recs.len(), 1);
assert_eq!(recs[0].full_id, "kei-task::create");
}
// Of ATOM_A's two `related:` wikilinks only the atom-to-atom one becomes an
// edge; ATOM_B contributes none.
#[test]
fn wikilinks_filter_rule_targets() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-task", "create", ATOM_A);
write_atom(tmp.path(), "kei-task", "add-dependency", ATOM_B);
let recs = discover_atoms(tmp.path()).unwrap();
let edges = resolve_wikilinks(&recs);
// Only atom-to-atom edges remain; `[[rules/RULE 0.12]]` filtered.
assert_eq!(edges.len(), 1);
assert_eq!(edges[0].0, "kei-task::create");
assert_eq!(edges[0].1, "kei-task::add-dependency");
}
// An existing but empty root directory yields no records and no error.
#[test]
fn empty_root_returns_empty() {
let tmp = tempdir().unwrap();
let recs = discover_atoms(tmp.path()).unwrap();
assert!(recs.is_empty());
}
// Indexing the two fixture atoms into an in-memory store reports 2 units
// and 1 edge, and the store's unit count matches.
#[test]
fn index_atoms_persists_units_and_edges() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-task", "create", ATOM_A);
write_atom(tmp.path(), "kei-task", "add-dependency", ATOM_B);
let recs = discover_atoms(tmp.path()).unwrap();
let store = Store::open_memory().unwrap();
let stats = index_atoms(&store, &recs).unwrap();
assert_eq!(stats.units_indexed, 2);
assert_eq!(stats.edges_indexed, 1);
assert_eq!(store.count_units().unwrap(), 2);
}