diff --git a/_primitives/_rust/Cargo.lock b/_primitives/_rust/Cargo.lock index a70234e..d0d520d 100644 --- a/_primitives/_rust/Cargo.lock +++ b/_primitives/_rust/Cargo.lock @@ -1083,12 +1083,13 @@ checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" [[package]] name = "fancy-regex" -version = "0.11.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" dependencies = [ "bit-set", - "regex", + "regex-automata", + "regex-syntax", ] [[package]] @@ -1167,9 +1168,9 @@ dependencies = [ [[package]] name = "fraction" -version = "0.13.1" +version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3027ae1df8d41b4bed2241c8fdad4acc1e7af60c8e17743534b545e77182d678" +checksum = "e076045bb43dac435333ed5f04caf35c7463631d0dae2deb2638d94dd0a5b872" dependencies = [ "lazy_static", "num", @@ -1818,13 +1819,13 @@ dependencies = [ [[package]] name = "jsonschema" -version = "0.17.1" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a071f4f7efc9a9118dfb627a0a94ef247986e1ab8606a4c806ae2b3aa3b6978" +checksum = "fa0f4bea31643be4c6a678e9aa4ae44f0db9e5609d5ca9dc9083d06eb3e9a27a" dependencies = [ "ahash", "anyhow", - "base64 0.21.7", + "base64 0.22.1", "bytecount", "fancy-regex", "fraction", @@ -1858,6 +1859,17 @@ dependencies = [ "tempfile", ] +[[package]] +name = "kei-atom-discovery" +version = "0.1.0" +dependencies = [ + "serde", + "serde_yaml_ng", + "tempfile", + "thiserror 1.0.69", + "walkdir", +] + [[package]] name = "kei-auth" version = "0.1.0" @@ -2047,10 +2059,12 @@ dependencies = [ "anyhow", "clap", "jsonschema", + "kei-atom-discovery", "serde", "serde_json", - "serde_yaml", + "serde_yaml_ng", "tempfile", + "url", "walkdir", ] @@ -2061,10 +2075,10 @@ dependencies = [ "anyhow", "chrono", 
"clap", + "kei-atom-discovery", "rusqlite", "serde", "serde_json", - "serde_yaml", "tempfile", ] @@ -3080,6 +3094,19 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "serde_yaml_ng" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sha1" version = "0.10.6" diff --git a/_primitives/_rust/Cargo.toml b/_primitives/_rust/Cargo.toml index 8961401..8cfff5c 100644 --- a/_primitives/_rust/Cargo.toml +++ b/_primitives/_rust/Cargo.toml @@ -33,6 +33,8 @@ members = [ "kei-forge", # v1 substrate — atom invocation runtime + schema linter (Stream D) "kei-runtime", + # v1 substrate — shared atom discovery + frontmatter + safe path (Stream E) + "kei-atom-discovery", ] [workspace.package] diff --git a/_primitives/_rust/kei-atom-discovery/Cargo.toml b/_primitives/_rust/kei-atom-discovery/Cargo.toml new file mode 100644 index 0000000..ba86e4a --- /dev/null +++ b/_primitives/_rust/kei-atom-discovery/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "kei-atom-discovery" +version = "0.1.0" +edition = "2021" +rust-version = "1.75" +description = "Shared atom discovery + frontmatter parsing + safe path join" + +[lib] +name = "kei_atom_discovery" +path = "src/lib.rs" + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde_yaml_ng = "0.10" +walkdir = "2" +thiserror = "1" + +[dev-dependencies] +tempfile = "3" + +[package.metadata.keisei] +backend = "none" +description = "Shared atom discovery + frontmatter parsing + safe path join" diff --git a/_primitives/_rust/kei-atom-discovery/src/error.rs b/_primitives/_rust/kei-atom-discovery/src/error.rs new file mode 100644 index 0000000..26907e4 --- /dev/null +++ b/_primitives/_rust/kei-atom-discovery/src/error.rs @@ -0,0 +1,47 @@ +//! Typed errors for atom discovery + frontmatter parsing. +//! +//! 
Every failure mode is a distinct variant — callers pattern-match by variant,
+//! not by `to_string()` scraping.
+
+use std::path::PathBuf;
+use thiserror::Error;
+
+/// Failure modes of path joining, frontmatter splitting, YAML parsing and
+/// atom-id / kind validation. The `#[error]` strings are the `Display` output.
+#[derive(Debug, Error)]
+pub enum Error {
+    /// `rel`, once joined onto `base`, ends up outside `base`.
+    #[error("path escape: `{rel}` escapes base `{}`", base.display())]
+    PathEscape { base: PathBuf, rel: String },
+
+    /// The supplied relative path was absolute; payload is the offending input.
+    #[error("path absolute not allowed: `{0}`")]
+    PathAbsolute(String),
+
+    /// The supplied relative path contained a `..` component.
+    #[error("path contains parent component (..): `{0}`")]
+    PathParent(String),
+
+    /// Canonicalising `path` failed with the wrapped I/O error.
+    #[error("canonicalize `{}`: {source}", path.display())]
+    Canonicalize {
+        path: PathBuf,
+        #[source]
+        source: std::io::Error,
+    },
+
+    /// The text does not begin with a `---` fence line.
+    #[error("frontmatter missing leading --- delimiter")]
+    FrontmatterMissingStart,
+
+    /// No closing `---` fence line was found after the opening one.
+    #[error("frontmatter missing closing --- delimiter")]
+    FrontmatterMissingEnd,
+
+    /// The frontmatter section is `got` bytes long, above the `limit`.
+    #[error("frontmatter exceeds {limit} bytes (got {got})")]
+    FrontmatterTooLarge { limit: usize, got: usize },
+
+    /// The frontmatter was not valid YAML for the expected shape.
+    #[error("yaml parse: {0}")]
+    Yaml(#[from] serde_yaml_ng::Error),
+
+    /// The atom id is not two non-empty halves separated by `::`.
+    #[error("atom id must be `<crate>::<verb>`, got `{0}`")]
+    BadAtomId(String),
+
+    /// `kind` is not one of command / query / stream / transform.
+    #[error("unknown atom kind: `{0}`")]
+    UnknownKind(String),
+
+    /// Any other I/O failure (for example reading an atom file).
+    #[error("io: {0}")]
+    Io(#[from] std::io::Error),
+}
diff --git a/_primitives/_rust/kei-atom-discovery/src/frontmatter.rs b/_primitives/_rust/kei-atom-discovery/src/frontmatter.rs
new file mode 100644
index 0000000..785c65a
--- /dev/null
+++ b/_primitives/_rust/kei-atom-discovery/src/frontmatter.rs
@@ -0,0 +1,150 @@
+//! Frontmatter schema + YAML parsing.
+//!
+//! Locked schema per `docs/SUBSTRATE-SCHEMA.md`. `input`/`output` are
+//! REQUIRED for command/query/stream, OPTIONAL for transform.
+//!
+//! YAML parser is `serde_yaml_ng` (maintained fork of the archived
+//! `serde_yaml` crate). A 64 KiB size cap is enforced pre-parse as a
+//! billion-laughs mitigation.
+
+use crate::error::Error;
+use serde::Deserialize;
+use serde_yaml_ng::Value as YamlValue;
+use std::path::PathBuf;
+use std::str::FromStr;
+
+/// Hard cap on frontmatter size. 64 KiB is 100× any realistic atom spec.
+pub const MAX_FRONTMATTER_BYTES: usize = 64 * 1024;
+
+/// The four atom kinds accepted in the `kind:` frontmatter field.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum AtomKind {
+    Command,
+    Query,
+    Stream,
+    Transform,
+}
+
+impl AtomKind {
+    /// Lowercase name of the kind, the same spelling `from_str` accepts.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            AtomKind::Command => "command",
+            AtomKind::Query => "query",
+            AtomKind::Stream => "stream",
+            AtomKind::Transform => "transform",
+        }
+    }
+}
+
+impl FromStr for AtomKind {
+    type Err = Error;
+
+    /// Parses a kind name, ignoring surrounding whitespace and ASCII case.
+    ///
+    /// # Errors
+    /// Returns [`Error::UnknownKind`] carrying the trimmed, lowercased input
+    /// when it is not one of the four known kinds.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.trim().to_ascii_lowercase().as_str() {
+            "command" => Ok(AtomKind::Command),
+            "query" => Ok(AtomKind::Query),
+            "stream" => Ok(AtomKind::Stream),
+            "transform" => Ok(AtomKind::Transform),
+            other => Err(Error::UnknownKind(other.to_string())),
+        }
+    }
+}
+
+/// One typed `side_effects` entry: an operation applied to a domain.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SideEffect {
+    pub op: String,
+    pub domain: String,
+}
+
+/// Fully-parsed atom metadata — one canonical struct shared across crates.
+#[derive(Debug, Clone)]
+pub struct AtomMeta {
+    /// The `atom:` value exactly as written in the frontmatter.
+    pub full_id: String,
+    /// Part of `full_id` before the first `::`.
+    pub crate_name: String,
+    /// Part of `full_id` after the first `::`.
+    pub verb: String,
+    pub kind: AtomKind,
+    /// `version:` value; empty string when the field is absent.
+    pub version: String,
+    /// Path of the markdown file this metadata was read from.
+    pub md_path: PathBuf,
+    /// `input.schema` joined onto the atom's directory; `None` when the field
+    /// is absent or the path was rejected by `safe_join`.
+    pub input_schema: Option<PathBuf>,
+    /// `output.schema`, resolved the same way as `input_schema`.
+    pub output_schema: Option<PathBuf>,
+    pub side_effects: Vec<SideEffect>,
+    /// `idempotent:` value; `false` when the field is absent.
+    pub idempotent: bool,
+    /// `stability:` value; `"unknown"` when the field is absent.
+    pub stability: String,
+    pub keywords: Vec<String>,
+    /// Raw `related:` entries, e.g. `[[kei-task::add-dependency]]`.
+    pub related: Vec<String>,
+    /// Markdown text that follows the closing frontmatter fence.
+    pub body: String,
+}
+
+/// Raw deserialisation target for the YAML frontmatter. It is `pub` (and
+/// re-exported from the crate root) so callers can deserialise it themselves;
+/// `AtomMeta` is the validated shape most consumers want.
+#[derive(Debug, Deserialize)]
+pub struct Frontmatter {
+    pub atom: String,
+    pub kind: String,
+    #[serde(default)]
+    pub version: Option<String>,
+    #[serde(default)]
+    pub input: Option<SchemaRef>,
+    #[serde(default)]
+    pub output: Option<SchemaRef>,
+    // Kept as raw YAML so malformed entries can be skipped one by one instead
+    // of failing the whole document.
+    #[serde(default)]
+    pub side_effects: Vec<YamlValue>,
+    #[serde(default)]
+    pub idempotent: Option<bool>,
+    #[serde(default)]
+    pub stability: Option<String>,
+    #[serde(default)]
+    pub keywords: Vec<String>,
+    #[serde(default)]
+    pub related: Vec<String>,
+}
+
+/// The `input:` / `output:` mapping; `schema` is a path relative to the atom's
+/// directory.
+#[derive(Debug, Deserialize)]
+pub struct SchemaRef {
+    pub schema: Option<String>,
+}
+
+/// Split a markdown file into (frontmatter_yaml, body).
Enforces a 64 KiB
+/// byte cap on the **frontmatter section** (not the whole file) before any
+/// YAML parsing happens (billion-laughs mitigation).
+///
+/// The text must start with a `---` line; the frontmatter ends at the next
+/// line that is exactly `---`. Both `\n` and `\r\n` line endings are accepted.
+/// The returned slices borrow from `md_text`; the body is empty when nothing
+/// follows the closing fence.
+///
+/// # Errors
+/// - [`Error::FrontmatterMissingStart`] if the text does not begin with `---`.
+/// - [`Error::FrontmatterMissingEnd`] if no closing `---` line is found.
+/// - [`Error::FrontmatterTooLarge`] if the frontmatter section is longer than
+///   [`MAX_FRONTMATTER_BYTES`].
+pub fn parse_frontmatter(md_text: &str) -> Result<(&str, &str), Error> {
+    let rest = md_text
+        .strip_prefix("---\n")
+        .or_else(|| md_text.strip_prefix("---\r\n"))
+        .ok_or(Error::FrontmatterMissingStart)?;
+    let (end_off, end_len) =
+        find_closing_delim(rest).ok_or(Error::FrontmatterMissingEnd)?;
+    // Only the frontmatter is capped; a large markdown body is allowed.
+    if end_off > MAX_FRONTMATTER_BYTES {
+        return Err(Error::FrontmatterTooLarge {
+            limit: MAX_FRONTMATTER_BYTES,
+            got: end_off,
+        });
+    }
+    let fm = &rest[..end_off];
+    let body_start = end_off + end_len;
+    Ok((fm, rest.get(body_start..).unwrap_or("")))
+}
+
+/// Finds the first line of `s` that is exactly `---` (line ending ignored).
+/// Returns `(byte offset of that line, its length including the line ending)`.
+fn find_closing_delim(s: &str) -> Option<(usize, usize)> {
+    let mut i = 0;
+    for line in s.split_inclusive('\n') {
+        let trimmed = line.trim_end_matches(&['\n', '\r'][..]);
+        if trimmed == "---" {
+            return Some((i, line.len()));
+        }
+        i += line.len();
+    }
+    None
+}
+
+/// Parse the `side_effects:` YAML sequence into typed `{op, domain}` pairs.
+/// Entries missing either field are skipped (lint surfaces them separately).
+pub fn parse_side_effects(raw: &[YamlValue]) -> Vec<SideEffect> {
+    raw.iter().filter_map(side_effect_from_yaml).collect()
+}
+
+/// Converts one YAML entry; `None` unless both `op` and `domain` are strings.
+fn side_effect_from_yaml(v: &YamlValue) -> Option<SideEffect> {
+    let op = v.get("op").and_then(|x| x.as_str())?.to_string();
+    let domain = v.get("domain").and_then(|x| x.as_str())?.to_string();
+    Some(SideEffect { op, domain })
+}
diff --git a/_primitives/_rust/kei-atom-discovery/src/lib.rs b/_primitives/_rust/kei-atom-discovery/src/lib.rs
new file mode 100644
index 0000000..6eb1cc9
--- /dev/null
+++ b/_primitives/_rust/kei-atom-discovery/src/lib.rs
@@ -0,0 +1,21 @@
+//! kei-atom-discovery — shared substrate-atom discovery primitives.
+//!
+//! Single authoritative implementation of:
+//!
- `AtomMeta` / `AtomKind` / `SideEffect` — locked frontmatter schema
+//! - `parse_frontmatter` — YAML split with 64 KiB cap (billion-laughs guard)
+//! - `discover_atoms` — walks `<root>/*/atoms/*.md`, symlink-safe
+//! - `parse_wikilink` — strict `[[target]]` matcher
+//! - `safe_join` — path-traversal-safe base+rel join
+//!
+//! Both `kei-sage` and `kei-runtime` consume this crate — no parallel
+//! frontmatter structs, no parallel YAML parsers.
+
+pub mod error;
+pub mod frontmatter;
+pub mod walk;
+
+pub use error::Error;
+pub use frontmatter::{
+    parse_frontmatter, AtomKind, AtomMeta, Frontmatter, SideEffect, MAX_FRONTMATTER_BYTES,
+};
+pub use walk::{discover_atoms, is_atom_target, parse_wikilink, safe_join, split_atom_id};
diff --git a/_primitives/_rust/kei-atom-discovery/src/walk.rs b/_primitives/_rust/kei-atom-discovery/src/walk.rs
new file mode 100644
index 0000000..9022c10
--- /dev/null
+++ b/_primitives/_rust/kei-atom-discovery/src/walk.rs
@@ -0,0 +1,136 @@
+//! Filesystem walk for atom discovery + path-safety primitives.
+//!
+//! `discover_atoms` enumerates `<root>/*/atoms/*.md` with `follow_links(false)`.
+//! `safe_join` is the authoritative base+rel path-join — rejects absolute
+//! components and `..`, canonicalises, asserts base containment.
+
+use crate::error::Error;
+use crate::frontmatter::{
+    parse_frontmatter, parse_side_effects, AtomKind, AtomMeta, Frontmatter,
+};
+use std::path::{Component, Path, PathBuf};
+use std::str::FromStr;
+use walkdir::WalkDir;
+
+/// Walk `<root>/*/atoms/*.md`. Skip-on-invalid: malformed files emit a
+/// stderr warning and are dropped. Never follows symlinks.
+pub fn discover_atoms(root: &Path) -> Vec<AtomMeta> {
+    let mut out = Vec::new();
+    for entry in WalkDir::new(root)
+        .max_depth(3)
+        .follow_links(false)
+        .into_iter()
+        .flatten()
+    {
+        // With `follow_links(false)` the entry's file type describes the entry
+        // itself, so a symlink — even one pointing at a regular file — is not
+        // `is_file()` and is skipped. `Path::is_file` would follow the link
+        // and let a symlinked `.md` be read.
+        if !entry.file_type().is_file() || !is_atom_md(entry.path()) {
+            continue;
+        }
+        match parse_one(entry.path()) {
+            Ok(meta) => out.push(meta),
+            Err(e) => eprintln!("warn: skip {}: {}", entry.path().display(), e),
+        }
+    }
+    out
+}
+
+/// Lexical test only: `path` has the extension `md` and sits directly inside
+/// a directory named `atoms`. The caller checks that it is a regular file.
+fn is_atom_md(path: &Path) -> bool {
+    path.extension().and_then(|s| s.to_str()) == Some("md")
+        && path
+            .parent()
+            .and_then(|p| p.file_name())
+            .is_some_and(|n| n == "atoms")
+}
+
+/// Reads one atom file, splits off and deserialises its frontmatter, and
+/// builds the metadata. Any failure is returned to the caller.
+fn parse_one(md_path: &Path) -> Result<AtomMeta, Error> {
+    let text = std::fs::read_to_string(md_path)?;
+    let (fm_text, body) = parse_frontmatter(&text)?;
+    let fm: Frontmatter = serde_yaml_ng::from_str(fm_text)?;
+    build_meta(fm, body, md_path)
+}
+
+/// Validates `kind` and the atom id, resolves schema paths against the atom's
+/// directory, and fills in defaults for the optional fields.
+fn build_meta(fm: Frontmatter, body: &str, md_path: &Path) -> Result<AtomMeta, Error> {
+    let kind = AtomKind::from_str(&fm.kind)?;
+    let (crate_name, verb) = split_atom_id(&fm.atom)?;
+    let md_dir = md_path.parent().unwrap_or(md_path);
+    let input_schema =
+        resolve_opt_schema(md_dir, fm.input.as_ref().and_then(|s| s.schema.as_deref()));
+    let output_schema =
+        resolve_opt_schema(md_dir, fm.output.as_ref().and_then(|s| s.schema.as_deref()));
+    Ok(AtomMeta {
+        // `fm` is owned and the id is not needed again, so move it.
+        full_id: fm.atom,
+        crate_name,
+        verb,
+        kind,
+        version: fm.version.unwrap_or_default(),
+        md_path: md_path.to_path_buf(),
+        input_schema,
+        output_schema,
+        side_effects: parse_side_effects(&fm.side_effects),
+        idempotent: fm.idempotent.unwrap_or(false),
+        stability: fm.stability.unwrap_or_else(|| "unknown".into()),
+        keywords: fm.keywords,
+        related: fm.related,
+        body: body.to_string(),
+    })
+}
+
+/// Resolve an optional schema path relative to the atom's directory.
+/// Silently drops entries that fail `safe_join` — lint catches them separately.
+fn resolve_opt_schema(md_dir: &Path, rel: Option<&str>) -> Option<PathBuf> {
+    rel.and_then(|r| safe_join(md_dir, r).ok())
+}
+
+/// Split `<crate>::<verb>` atom id into components.
+/// The split happens at the first `::`, so `a::b::c` yields `("a", "b::c")`.
+///
+/// # Errors
+/// Returns [`Error::BadAtomId`] when `::` is missing or either half is empty.
+pub fn split_atom_id(id: &str) -> Result<(String, String), Error> {
+    match id.split_once("::") {
+        Some((c, v)) if !c.is_empty() && !v.is_empty() => Ok((c.into(), v.into())),
+        _ => Err(Error::BadAtomId(id.to_string())),
+    }
+}
+
+/// Parse a single wikilink `[[target]]`. Returns `None` if not a wikilink,
+/// empty, or if the inner body contains a stray bracket (e.g. `[[[foo]]`).
+pub fn parse_wikilink(raw: &str) -> Option<String> {
+    let t = raw.trim();
+    let inner = t.strip_prefix("[[").and_then(|s| s.strip_suffix("]]"))?;
+    let inner = inner.trim();
+    if inner.is_empty() || inner.contains('[') || inner.contains(']') {
+        None
+    } else {
+        Some(inner.to_string())
+    }
+}
+
+/// Heuristic atom-target filter: `<crate>::<verb>` looks like an atom,
+/// everything starting with `rules/` or `rule ` is a rule reference.
+/// Only the two rule prefixes are actually tested; any other target counts
+/// as an atom.
+pub fn is_atom_target(target: &str) -> bool {
+    !target.starts_with("rules/") && !target.starts_with("rule ")
+}
+
+/// Safe base+rel path join. Rejects absolute paths, parent (`..`) components,
+/// rooted or drive-prefixed paths, and post-canonicalise escapes from `base`.
+///
+/// Returns `base.join(rel)` as written (not canonicalised). The containment
+/// check runs only when both `base` and the joined path can be canonicalised,
+/// i.e. both exist; otherwise the lexical checks are the only guard.
+///
+/// # Errors
+/// - [`Error::PathAbsolute`] if `rel` is absolute, starts with a root, or
+///   carries a drive/UNC prefix.
+/// - [`Error::PathParent`] if `rel` contains a `..` component.
+/// - [`Error::PathEscape`] if the canonical joined path is outside the
+///   canonical `base` (for example through a symlink).
+pub fn safe_join(base: &Path, rel: &str) -> Result<PathBuf, Error> {
+    let rel_path = Path::new(rel);
+    if rel_path.is_absolute() {
+        return Err(Error::PathAbsolute(rel.to_string()));
+    }
+    for comp in rel_path.components() {
+        match comp {
+            Component::ParentDir => return Err(Error::PathParent(rel.to_string())),
+            // On Windows `\foo` (root, no prefix) and `C:foo` (prefix, no root)
+            // are not "absolute", yet `Path::join` lets them replace all or
+            // part of `base`. Treat them like absolute paths.
+            Component::RootDir | Component::Prefix(_) => {
+                return Err(Error::PathAbsolute(rel.to_string()));
+            }
+            Component::CurDir | Component::Normal(_) => {}
+        }
+    }
+    let joined = base.join(rel_path);
+    // Canonicalise lazily — if either path doesn't exist yet, fall back to
+    // the lexical checks already done above.
+    let base_canon = base.canonicalize().ok();
+    let joined_canon = joined.canonicalize().ok();
+    if let (Some(bc), Some(jc)) = (base_canon, joined_canon) {
+        if !jc.starts_with(&bc) {
+            return Err(Error::PathEscape {
+                base: bc,
+                rel: rel.to_string(),
+            });
+        }
+    }
+    Ok(joined)
+}
diff --git a/_primitives/_rust/kei-atom-discovery/tests/smoke.rs b/_primitives/_rust/kei-atom-discovery/tests/smoke.rs
new file mode 100644
index 0000000..8b6c526
--- /dev/null
+++ b/_primitives/_rust/kei-atom-discovery/tests/smoke.rs
@@ -0,0 +1,132 @@
+//! Smoke tests covering the 4 critical fixes consolidated in this crate.
+
+use kei_atom_discovery::{
+    discover_atoms, parse_frontmatter, parse_wikilink, safe_join, AtomKind, Error,
+    MAX_FRONTMATTER_BYTES,
+};
+use std::fs;
+use std::path::Path;
+use tempfile::tempdir;
+
+const ATOM_OK: &str = r#"---
+atom: kei-task::create
+kind: command
+version: "0.1.0"
+input:
+  schema: schemas/create-input.json
+output:
+  schema: schemas/create-output.json
+side_effects:
+  - { op: write, domain: kei-task-db }
+idempotent: false
+stability: stable
+keywords: [task, todo]
+related:
+  - "[[kei-task::add-dependency]]"
+  - "[[rules/RULE 0.12]]"
+---
+# kei-task::create
+Body text.
+"#;
+
+/// Writes `<root>/<crate_name>/atoms/<verb>.md` with `body`, plus the two
+/// empty-object schema files that `ATOM_OK` refers to.
+fn write_atom(root: &Path, crate_name: &str, verb: &str, body: &str) {
+    let atoms_dir = root.join(crate_name).join("atoms");
+    fs::create_dir_all(atoms_dir.join("schemas")).unwrap();
+    fs::write(atoms_dir.join(format!("{verb}.md")), body).unwrap();
+    fs::write(atoms_dir.join("schemas").join("create-input.json"), "{}").unwrap();
+    fs::write(atoms_dir.join("schemas").join("create-output.json"), "{}").unwrap();
+}
+
+// FIX 2 happy path — shared Frontmatter correctly parses and exposes typed kind
+#[test]
+fn discovery_returns_well_formed_atom_meta() {
+    let tmp = tempdir().unwrap();
+    write_atom(tmp.path(), "kei-task", "create", ATOM_OK);
+    let atoms = discover_atoms(tmp.path());
+    assert_eq!(atoms.len(), 1);
+    let a = &atoms[0];
+    assert_eq!(a.full_id, "kei-task::create");
+    assert_eq!(a.kind, AtomKind::Command);
+    assert_eq!(a.crate_name, "kei-task");
+    assert_eq!(a.verb, "create");
+    assert!(a.input_schema.is_some());
+    assert!(a.output_schema.is_some());
+    assert_eq!(a.side_effects.len(), 1);
+    assert_eq!(a.side_effects[0].op, "write");
+    assert_eq!(a.side_effects[0].domain, "kei-task-db");
+    assert!(a.body.contains("Body text"));
+}
+
+// FIX 1 — path traversal rejection via safe_join
+#[test]
+fn safe_join_rejects_parent_component() {
+    let tmp = tempdir().unwrap();
+    let err = safe_join(tmp.path(), "../etc/shadow").unwrap_err();
+    assert!(matches!(err, Error::PathParent(_)));
+}
+
+#[test]
+fn safe_join_rejects_absolute_path() {
+    let tmp = tempdir().unwrap();
+    let err = safe_join(tmp.path(), "/etc/shadow").unwrap_err();
+    assert!(matches!(err, Error::PathAbsolute(_)));
+}
+
+#[test]
+fn safe_join_accepts_plain_relative() {
+    let tmp = tempdir().unwrap();
+    let target = tmp.path().join("schemas");
+    fs::create_dir_all(&target).unwrap();
+    let joined = safe_join(tmp.path(), "schemas").unwrap();
+    assert!(joined.ends_with("schemas"));
+}
+
+// FIX 3 — YAML size cap enforced pre-parse
+#[test]
+fn frontmatter_size_cap_enforced() {
+    let huge = "x".repeat(MAX_FRONTMATTER_BYTES + 100);
+    let md = format!("---\n{huge}\n---\nbody\n");
+    let err = parse_frontmatter(&md).unwrap_err();
+    assert!(matches!(err, Error::FrontmatterTooLarge { .. }));
+}
+
+#[test]
+fn frontmatter_missing_start_rejected() {
+    let err = parse_frontmatter("no fence\nbody\n").unwrap_err();
+    assert!(matches!(err, Error::FrontmatterMissingStart));
+}
+
+#[test]
+fn frontmatter_missing_end_rejected() {
+    let err = parse_frontmatter("---\nkey: val\nno-end\n").unwrap_err();
+    assert!(matches!(err, Error::FrontmatterMissingEnd));
+}
+
+// FIX — symlink not followed (walkdir follow_links=false)
+#[test]
+fn discover_does_not_follow_symlinks() {
+    let tmp = tempdir().unwrap();
+    write_atom(tmp.path(), "kei-real", "create", ATOM_OK);
+    // Create a symlink named `kei-link` pointing at `kei-real`.
+    #[cfg(unix)]
+    {
+        let target = tmp.path().join("kei-real");
+        let link = tmp.path().join("kei-link");
+        std::os::unix::fs::symlink(&target, &link).unwrap();
+    }
+    let atoms = discover_atoms(tmp.path());
+    // Only 1 atom — symlinked tree is NOT walked.
+ assert_eq!(atoms.len(), 1, "symlink was traversed — follow_links must be false"); +} + +// Wikilink strictness +#[test] +fn wikilink_malformed_returns_none() { + assert_eq!(parse_wikilink("[[[foo]]"), None); // triple-bracket open + assert_eq!(parse_wikilink("foo"), None); + assert_eq!(parse_wikilink("[[ ]]"), None); + assert_eq!( + parse_wikilink("[[kei-task::create]]"), + Some("kei-task::create".to_string()) + ); +} diff --git a/_primitives/_rust/kei-runtime/Cargo.toml b/_primitives/_rust/kei-runtime/Cargo.toml index 4a534ec..846e6df 100644 --- a/_primitives/_rust/kei-runtime/Cargo.toml +++ b/_primitives/_rust/kei-runtime/Cargo.toml @@ -17,10 +17,15 @@ path = "src/lib.rs" clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_json = "1" -serde_yaml = "0.9" -jsonschema = { version = "0.17", default-features = false } +# SSRF + IMDS hardening: disable default features (resolve-http, cli) so the +# validator has no HTTP resolver by default. We configure a file-only +# resolver explicitly in `validate.rs`. +jsonschema = { version = "0.18", default-features = false, features = ["resolve-file"] } anyhow = "1" walkdir = "2" +serde_yaml_ng = "0.10" +kei-atom-discovery = { path = "../kei-atom-discovery" } +url = "2" [dev-dependencies] tempfile = "3" diff --git a/_primitives/_rust/kei-runtime/src/discover.rs b/_primitives/_rust/kei-runtime/src/discover.rs index 5f75446..49d8669 100644 --- a/_primitives/_rust/kei-runtime/src/discover.rs +++ b/_primitives/_rust/kei-runtime/src/discover.rs @@ -1,92 +1,20 @@ -//! Atom discovery — walks `/*/atoms/*.md`, parses YAML frontmatter. +//! Atom discovery — thin façade over `kei-atom-discovery`. //! -//! Skip-on-invalid policy: missing/malformed frontmatter emits stderr warn, -//! record is dropped (never panics, never fails the walk). +//! Re-exports `AtomMeta` and `AtomKind` from the shared crate so all runtime +//! modules share exactly one frontmatter-parser implementation. 
-use serde::Deserialize; -use std::path::{Path, PathBuf}; -use walkdir::WalkDir; +use kei_atom_discovery as shared; +use std::path::Path; -/// Parsed frontmatter fields needed by the runtime. -#[derive(Debug, Clone)] -pub struct AtomMeta { - pub full_id: String, - pub crate_name: String, - pub verb: String, - pub kind: String, - pub md_path: PathBuf, - pub input_schema_path: PathBuf, - pub output_schema_path: PathBuf, -} +pub use kei_atom_discovery::{parse_frontmatter, AtomKind, AtomMeta}; -/// Raw frontmatter — only the fields discover needs. -#[derive(Debug, Deserialize)] -struct Frontmatter { - atom: String, - kind: String, - input: SchemaRef, - output: SchemaRef, -} - -#[derive(Debug, Deserialize)] -struct SchemaRef { - schema: String, -} - -/// Walks `/*/atoms/*.md`. Returns one `AtomMeta` per parseable file. +/// Walk `/*/atoms/*.md`. Delegates to `kei-atom-discovery::discover_atoms`. pub fn walk_atoms(root: &Path) -> Vec { - let mut out = Vec::new(); - for entry in WalkDir::new(root).max_depth(3).into_iter().flatten() { - if !is_atom_md(entry.path()) { - continue; - } - match parse_one(entry.path()) { - Ok(meta) => out.push(meta), - Err(e) => eprintln!("warn: skip {}: {}", entry.path().display(), e), - } - } - out + shared::discover_atoms(root) } -fn is_atom_md(path: &Path) -> bool { - path.is_file() - && path.extension().is_some_and(|e| e == "md") - && path - .parent() - .and_then(|p| p.file_name()) - .is_some_and(|n| n == "atoms") -} - -fn parse_one(md_path: &Path) -> Result { - let body = std::fs::read_to_string(md_path).map_err(|e| format!("read: {e}"))?; - let fm = extract_frontmatter(&body).ok_or_else(|| "no frontmatter".to_string())?; - let parsed: Frontmatter = serde_yaml::from_str(fm).map_err(|e| format!("yaml: {e}"))?; - let (crate_name, verb) = split_atom_id(&parsed.atom)?; - let atom_dir = md_path.parent().ok_or("no parent dir")?; - Ok(AtomMeta { - full_id: parsed.atom.clone(), - crate_name, - verb, - kind: parsed.kind, - md_path: 
md_path.to_path_buf(), - input_schema_path: atom_dir.join(&parsed.input.schema), - output_schema_path: atom_dir.join(&parsed.output.schema), - }) -} - -/// Returns the frontmatter body (between the two `---` fences), or None. +/// Backwards-compatible split — returns the frontmatter YAML body (no body +/// trailing). Returns `None` if the file has no frontmatter fences. pub fn extract_frontmatter(body: &str) -> Option<&str> { - let rest = body.strip_prefix("---\n").or_else(|| body.strip_prefix("---\r\n"))?; - let end = rest.find("\n---").or_else(|| rest.find("\r\n---"))?; - Some(&rest[..end]) -} - -fn split_atom_id(id: &str) -> Result<(String, String), String> { - let (crate_name, verb) = id - .split_once("::") - .ok_or_else(|| format!("atom id missing `::`: {id}"))?; - if crate_name.is_empty() || verb.is_empty() { - return Err(format!("atom id has empty half: {id}")); - } - Ok((crate_name.to_string(), verb.to_string())) + shared::parse_frontmatter(body).ok().map(|(fm, _)| fm) } diff --git a/_primitives/_rust/kei-runtime/src/invoke.rs b/_primitives/_rust/kei-runtime/src/invoke.rs index 1bf2bcf..1797866 100644 --- a/_primitives/_rust/kei-runtime/src/invoke.rs +++ b/_primitives/_rust/kei-runtime/src/invoke.rs @@ -15,6 +15,7 @@ pub enum InvokeError { AtomNotFound(String), InputParse(String), InputInvalid(String), + MissingInputSchema(String), } impl std::fmt::Display for InvokeError { @@ -23,6 +24,7 @@ impl std::fmt::Display for InvokeError { Self::AtomNotFound(id) => write!(f, "atom not found: {id}"), Self::InputParse(e) => write!(f, "input parse: {e}"), Self::InputInvalid(e) => write!(f, "input invalid: {e}"), + Self::MissingInputSchema(id) => write!(f, "atom `{id}` declares no input schema"), } } } @@ -37,14 +39,15 @@ pub struct Output { } /// Invoke an atom by full ID with a JSON input string. -/// -/// MVP contract: discover atom → parse input → validate against schema → -/// return stub acknowledgement. Exec wire-up is a follow-up. 
pub fn invoke(root: &Path, atom_id: &str, input_json: &str) -> Result { let meta = find_atom(root, atom_id)?; let input: Value = serde_json::from_str(input_json).map_err(|e| InvokeError::InputParse(e.to_string()))?; - validate_input(&meta.input_schema_path, &input) + let schema = meta + .input_schema + .as_ref() + .ok_or_else(|| InvokeError::MissingInputSchema(atom_id.to_string()))?; + validate_input(schema, &input) .map_err(|e| InvokeError::InputInvalid(e.to_string()))?; Ok(Output { error: "atom invocation not yet implemented — wire needs Stream B atom impls".to_string(), diff --git a/_primitives/_rust/kei-runtime/src/lint.rs b/_primitives/_rust/kei-runtime/src/lint.rs index aa6a4ff..a3145b6 100644 --- a/_primitives/_rust/kei-runtime/src/lint.rs +++ b/_primitives/_rust/kei-runtime/src/lint.rs @@ -3,7 +3,8 @@ //! Checks (from SUBSTRATE-SCHEMA §Validation): //! 1. Frontmatter has required fields (atom, kind, version, input, output, //! side_effects, idempotent, stability). -//! 2. Schema paths resolve to existing JSON files. +//! 2. Schema paths resolve to existing JSON files inside the atom's dir +//! (safe_join — rejects `..` and absolute paths). //! 3. JSON Schemas declare draft-07 via `$schema`. //! 4. `kind` ∈ {command, query, stream, transform}. //! 5. `side_effects` entries are `{op, domain}` objects. @@ -11,7 +12,8 @@ //! refs allowed). 
use crate::discover::extract_frontmatter; -use serde_yaml::Value as YamlValue; +use kei_atom_discovery::safe_join; +use serde_yaml_ng::Value as YamlValue; use std::collections::HashSet; use std::path::{Path, PathBuf}; use walkdir::WalkDir; @@ -51,6 +53,7 @@ pub fn schema_lint(root: &Path) -> LintReport { fn find_atom_files(root: &Path) -> Vec { WalkDir::new(root) .max_depth(3) + .follow_links(false) .into_iter() .flatten() .filter(|e| { @@ -67,7 +70,7 @@ fn collect_atom_ids(root: &Path) -> HashSet { for md in find_atom_files(root) { if let Ok(body) = std::fs::read_to_string(&md) { if let Some(fm) = extract_frontmatter(&body) { - if let Ok(y) = serde_yaml::from_str::(fm) { + if let Ok(y) = serde_yaml_ng::from_str::(fm) { if let Some(id) = y.get("atom").and_then(|v| v.as_str()) { ids.insert(id.to_string()); } @@ -82,7 +85,7 @@ fn lint_one(md_path: &Path, known_atoms: &HashSet) -> Result<(), Vec) { } fn check_schema_files(md_path: &Path, fm: &YamlValue, errs: &mut Vec) { + let Some(md_dir) = md_path.parent() else { + errs.push("md_path has no parent dir".to_string()); + return; + }; for key in &["input", "output"] { let Some(rel) = fm.get(key).and_then(|v| v.get("schema")).and_then(|v| v.as_str()) else { continue; }; - let full = md_path.parent().map(|p| p.join(rel)).unwrap_or_else(|| PathBuf::from(rel)); + let full = match safe_join(md_dir, rel) { + Ok(p) => p, + Err(e) => { + errs.push(format!("{key} schema path unsafe: {e}")); + continue; + } + }; if !full.exists() { errs.push(format!("{key} schema missing: {}", full.display())); continue; diff --git a/_primitives/_rust/kei-runtime/src/main.rs b/_primitives/_rust/kei-runtime/src/main.rs index c6de9a6..f069953 100644 --- a/_primitives/_rust/kei-runtime/src/main.rs +++ b/_primitives/_rust/kei-runtime/src/main.rs @@ -77,11 +77,11 @@ fn run_list_atoms(root: PathBuf, crate_name: Option, kind: Option Result<(), ValidationErr .map_err(|e| ValidationError(format!("read {}: {e}", schema_path.display())))?; let schema_json: 
Value = serde_json::from_str(&schema_text) .map_err(|e| ValidationError(format!("parse {}: {e}", schema_path.display())))?; + let root = schema_path.parent().unwrap_or(schema_path).to_path_buf(); let compiled = JSONSchema::options() .with_draft(jsonschema::Draft::Draft7) + .with_resolver(LocalFileResolver::new(root)) .compile(&schema_json) .map_err(|e| ValidationError(format!("compile: {e}")))?; if let Err(errors) = compiled.validate(value) { @@ -43,3 +53,54 @@ fn validate_value(schema_path: &Path, value: &Value) -> Result<(), ValidationErr } Ok(()) } + +/// `$ref` resolver that rejects every scheme except `file://`, AND rejects +/// any path that is not inside `root` (canonicalised). +#[derive(Debug)] +pub struct LocalFileResolver { + root: PathBuf, +} + +impl LocalFileResolver { + pub fn new(root: PathBuf) -> Self { + Self { root } + } +} + +impl SchemaResolver for LocalFileResolver { + fn resolve( + &self, + _root_schema: &Value, + url: &Url, + _original_reference: &str, + ) -> Result, SchemaResolverError> { + if url.scheme() != "file" { + return Err(anyhow::anyhow!( + "remote $ref rejected — only file:// is allowed (got {})", + url.scheme() + )); + } + let path = url + .to_file_path() + .map_err(|_| anyhow::anyhow!("invalid file URL: {url}"))?; + let canon = path + .canonicalize() + .map_err(|e| anyhow::anyhow!("canonicalize {}: {e}", path.display()))?; + let root_canon = self + .root + .canonicalize() + .map_err(|e| anyhow::anyhow!("canonicalize root {}: {e}", self.root.display()))?; + if !canon.starts_with(&root_canon) { + return Err(anyhow::anyhow!( + "file $ref escapes schema root: {} not under {}", + canon.display(), + root_canon.display() + )); + } + let f = std::fs::File::open(&canon) + .map_err(|e| anyhow::anyhow!("open {}: {e}", canon.display()))?; + let doc: Value = serde_json::from_reader(f) + .map_err(|e| anyhow::anyhow!("parse {}: {e}", canon.display()))?; + Ok(Arc::new(doc)) + } +} diff --git a/_primitives/_rust/kei-runtime/tests/discover_smoke.rs 
b/_primitives/_rust/kei-runtime/tests/discover_smoke.rs index 28a29c1..cc77a0c 100644 --- a/_primitives/_rust/kei-runtime/tests/discover_smoke.rs +++ b/_primitives/_rust/kei-runtime/tests/discover_smoke.rs @@ -1,6 +1,6 @@ //! Integration test — walk_atoms returns 2 well-formed records from temp root. -use kei_runtime::discover::walk_atoms; +use kei_runtime::discover::{walk_atoms, AtomKind}; use std::fs; use std::path::Path; @@ -43,8 +43,16 @@ fn walk_atoms_finds_two_records() { assert_eq!(atoms[0].full_id, "kei-alpha::search"); assert_eq!(atoms[0].crate_name, "kei-alpha"); assert_eq!(atoms[0].verb, "search"); - assert_eq!(atoms[0].kind, "query"); + assert_eq!(atoms[0].kind, AtomKind::Query); assert_eq!(atoms[1].full_id, "kei-beta::fetch"); - assert!(atoms[1].input_schema_path.ends_with("schemas/fetch-input.json")); - assert!(atoms[1].output_schema_path.ends_with("schemas/fetch-output.json")); + assert!(atoms[1] + .input_schema + .as_ref() + .unwrap() + .ends_with("schemas/fetch-input.json")); + assert!(atoms[1] + .output_schema + .as_ref() + .unwrap() + .ends_with("schemas/fetch-output.json")); } diff --git a/_primitives/_rust/kei-sage/Cargo.toml b/_primitives/_rust/kei-sage/Cargo.toml index 9bf1080..04dea72 100644 --- a/_primitives/_rust/kei-sage/Cargo.toml +++ b/_primitives/_rust/kei-sage/Cargo.toml @@ -18,9 +18,9 @@ rusqlite = { version = "0.31", features = ["bundled"] } clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_json = "1" -serde_yaml = "0.9" anyhow = "1" chrono = { version = "0.4", default-features = false, features = ["clock"] } +kei-atom-discovery = { path = "../kei-atom-discovery" } [dev-dependencies] tempfile = "3" diff --git a/_primitives/_rust/kei-sage/src/atom_parse.rs b/_primitives/_rust/kei-sage/src/atom_parse.rs index 8e44e6c..6bef3f8 100644 --- a/_primitives/_rust/kei-sage/src/atom_parse.rs +++ b/_primitives/_rust/kei-sage/src/atom_parse.rs @@ -1,60 +1,23 @@ -//! 
Frontmatter splitting + wikilink extraction helpers for atom `.md` files. +//! Sage-local aliases over `kei-atom-discovery` helpers. //! -//! Pure functions, no I/O. See `atoms.rs` for the discovery walker. +//! Historical sage API: `split_frontmatter`, `parse_wikilink`, `is_atom_target`, +//! `split_atom_id`. All now delegate to the shared crate; kept here so sage +//! internals compile without touch. use anyhow::{anyhow, Result}; +use kei_atom_discovery as shared; -/// Split a `.md` file into (frontmatter_yaml, body). Frontmatter must start -/// with `---\n` and end with a line that is exactly `---`. +pub use shared::{is_atom_target, parse_wikilink}; + +/// Split a `.md` file into (frontmatter_yaml, body). Delegates to the shared +/// `parse_frontmatter` — preserves the sage `anyhow::Result` return type. pub fn split_frontmatter(text: &str) -> Result<(&str, &str)> { - let rest = text - .strip_prefix("---\n") - .or_else(|| text.strip_prefix("---\r\n")) - .ok_or_else(|| anyhow!("missing leading --- frontmatter delimiter"))?; - let end = find_closing_delim(rest) - .ok_or_else(|| anyhow!("missing closing --- frontmatter delimiter"))?; - let fm = &rest[..end.0]; - let body_start = end.0 + end.1; - Ok((fm, rest.get(body_start..).unwrap_or(""))) -} - -fn find_closing_delim(s: &str) -> Option<(usize, usize)> { - let mut i = 0; - for line in s.split_inclusive('\n') { - let trimmed = line.trim_end_matches(&['\n', '\r'][..]); - if trimmed == "---" { - return Some((i, line.len())); - } - i += line.len(); - } - None -} - -/// Parse a single wikilink `[[target]]`. Returns `Some(target)` stripped of -/// brackets and whitespace, `None` if the string isn't a wikilink shape. 
-pub fn parse_wikilink(raw: &str) -> Option { - let t = raw.trim(); - let inner = t.strip_prefix("[[").and_then(|s| s.strip_suffix("]]"))?; - let inner = inner.trim(); - if inner.is_empty() { - None - } else { - Some(inner.to_string()) - } -} - -/// Filter rule that decides whether a wikilink target is an atom reference. -/// Atoms use `::`; we exclude `rules/*` and `rule*` targets. -pub fn is_atom_target(target: &str) -> bool { - !target.starts_with("rules/") && !target.starts_with("rule ") + shared::parse_frontmatter(text).map_err(|e| anyhow!(e.to_string())) } /// Split `::` atom id into components. pub fn split_atom_id(id: &str) -> Result<(String, String)> { - match id.split_once("::") { - Some((c, v)) if !c.is_empty() && !v.is_empty() => Ok((c.into(), v.into())), - _ => Err(anyhow!("atom id must be ::, got {id}")), - } + shared::split_atom_id(id).map_err(|e| anyhow!(e.to_string())) } #[cfg(test)] diff --git a/_primitives/_rust/kei-sage/src/atoms.rs b/_primitives/_rust/kei-sage/src/atoms.rs index 8273382..53fc192 100644 --- a/_primitives/_rust/kei-sage/src/atoms.rs +++ b/_primitives/_rust/kei-sage/src/atoms.rs @@ -1,169 +1,30 @@ -//! Substrate-atom discovery + frontmatter parsing + wikilink extraction. +//! Substrate-atom discovery — thin façade over `kei-atom-discovery`. //! -//! Walks `//atoms/*.md`, parses YAML frontmatter, returns -//! `AtomRecord`. Tolerant: skips files with invalid frontmatter (logs to -//! stderr, continues scan). See `docs/SUBSTRATE-SCHEMA.md` §Graph contract. +//! Historical `AtomRecord` is preserved as a type alias for `AtomMeta` so +//! that downstream sage modules (`atom_index`, `atom_cli`) keep compiling. 
-use crate::atom_parse::{is_atom_target, parse_wikilink, split_atom_id, split_frontmatter}; -use anyhow::{anyhow, Context, Result}; -use serde::Deserialize; -use std::fs; -use std::path::{Path, PathBuf}; -use std::str::FromStr; +use crate::atom_parse::{is_atom_target, parse_wikilink}; +use anyhow::Result; +use kei_atom_discovery as shared; +use std::path::Path; -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum AtomKind { - Command, - Query, - Stream, - Transform, -} +pub use kei_atom_discovery::AtomKind; -impl FromStr for AtomKind { - type Err = anyhow::Error; - fn from_str(s: &str) -> Result { - match s.trim().to_ascii_lowercase().as_str() { - "command" => Ok(AtomKind::Command), - "query" => Ok(AtomKind::Query), - "stream" => Ok(AtomKind::Stream), - "transform" => Ok(AtomKind::Transform), - other => Err(anyhow!("unknown atom kind: {other}")), - } - } -} - -impl AtomKind { - pub fn as_str(&self) -> &'static str { - match self { - AtomKind::Command => "command", - AtomKind::Query => "query", - AtomKind::Stream => "stream", - AtomKind::Transform => "transform", - } - } -} - -#[derive(Debug, Clone)] -pub struct AtomRecord { - pub full_id: String, - pub kind: AtomKind, - pub crate_name: String, - pub verb: String, - pub version: String, - pub md_path: PathBuf, - pub input_schema: Option, - pub output_schema: Option, - pub related: Vec, - pub keywords: Vec, - pub stability: String, - pub body: String, -} - -#[derive(Debug, Deserialize)] -struct SchemaRef { - schema: Option, -} - -#[derive(Debug, Deserialize)] -struct Frontmatter { - atom: String, - kind: String, - #[serde(default)] - version: Option, - #[serde(default)] - input: Option, - #[serde(default)] - output: Option, - #[serde(default)] - related: Vec, - #[serde(default)] - keywords: Vec, - #[serde(default)] - stability: Option, -} +/// Legacy alias: sage used to call this `AtomRecord`. New code should use +/// `AtomMeta` directly (identical shape, authored in `kei-atom-discovery`). 
+pub type AtomRecord = shared::AtomMeta; +/// Walk `/*/atoms/*.md` and return parsed atom metadata. +/// Tolerant: invalid frontmatter → stderr warning + skipped record. pub fn discover_atoms(root: &Path) -> Result> { - let mut out = Vec::new(); if !root.is_dir() { - return Ok(out); + return Ok(Vec::new()); } - for entry in fs::read_dir(root).with_context(|| format!("read_dir {}", root.display()))? { - let crate_dir = entry?.path(); - if crate_dir.is_dir() { - collect_from_crate(&crate_dir, &mut out); - } - } - Ok(out) -} - -fn collect_from_crate(crate_dir: &Path, out: &mut Vec) { - let atoms_dir = crate_dir.join("atoms"); - if !atoms_dir.is_dir() { - return; - } - let crate_name = crate_dir - .file_name() - .and_then(|s| s.to_str()) - .unwrap_or("") - .to_string(); - let iter = match fs::read_dir(&atoms_dir) { - Ok(it) => it, - Err(e) => { - eprintln!("skip {}: {}", atoms_dir.display(), e); - return; - } - }; - for entry in iter.flatten() { - let path = entry.path(); - if !is_md_file(&path) { - continue; - } - match parse_atom_file(&path, &crate_name) { - Ok(rec) => out.push(rec), - Err(e) => eprintln!("skip {}: {}", path.display(), e), - } - } -} - -fn is_md_file(path: &Path) -> bool { - path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("md") -} - -fn parse_atom_file(path: &Path, crate_name: &str) -> Result { - let text = fs::read_to_string(path) - .with_context(|| format!("read {}", path.display()))?; - let (fm_text, body) = split_frontmatter(&text)?; - let fm: Frontmatter = - serde_yaml::from_str(fm_text).with_context(|| "parse frontmatter YAML")?; - build_record(fm, body, path, crate_name) -} - -fn build_record(fm: Frontmatter, body: &str, path: &Path, crate_name: &str) -> Result { - let kind = AtomKind::from_str(&fm.kind)?; - let (crate_from_id, verb) = split_atom_id(&fm.atom)?; - let md_dir = path.parent().unwrap_or(path).to_path_buf(); - Ok(AtomRecord { - full_id: fm.atom.clone(), - kind, - crate_name: if crate_from_id.is_empty() { - 
crate_name.to_string() - } else { - crate_from_id - }, - verb, - version: fm.version.unwrap_or_default(), - md_path: path.to_path_buf(), - input_schema: fm.input.and_then(|s| s.schema).map(|s| md_dir.join(&s)), - output_schema: fm.output.and_then(|s| s.schema).map(|s| md_dir.join(&s)), - related: fm.related, - keywords: fm.keywords, - stability: fm.stability.unwrap_or_else(|| "unknown".into()), - body: body.to_string(), - }) + Ok(shared::discover_atoms(root)) } /// Extract `(source_atom_id, target)` edges from `related:` wikilinks. -/// Non-atom targets (rules, notes) are filtered out here — scope: atoms only. +/// Non-atom targets (rules, notes) are filtered out — scope: atoms only. pub fn resolve_wikilinks(records: &[AtomRecord]) -> Vec<(String, String)> { let mut out = Vec::new(); for rec in records {