fix(substrate): E1 — kei-atom-discovery shared crate + 4 critical security fixes

Extracts authoritative atom discovery + frontmatter parsing into new crate
_primitives/_rust/kei-atom-discovery/. kei-sage and kei-runtime now both
consume the same implementation, eliminating Frontmatter drift.

Resolved findings:
- F-3/crit#3: path traversal via md_dir.join() — safe_join helper rejects
  absolute paths + .. components + post-canonicalise escapes (4 sites)
- crit#6/architect P0-a: Frontmatter drift — single AtomMeta struct
- SA supply-chain: serde_yaml archived — migrated to serde_yaml_ng 0.10
- crit#2: JSON Schema $ref SSRF — jsonschema 0.17→0.18 with resolve-file
  feature only, custom LocalFileResolver denies non-file:// schemes
- F-4: symlink traversal — walkdir follow_links(false) explicit everywhere
- F-5: YAML billion-laughs — 64 KiB pre-parse cap

Tests: 9/9 new crate + 23/23 sage + 2/2 runtime + 7/7 kei-task = 41/41 green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Parfii-bot 2026-04-23 00:49:49 +08:00
parent 42fe08232e
commit 990f5e3711
18 changed files with 698 additions and 318 deletions

View file

@ -1083,12 +1083,13 @@ checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fancy-regex"
version = "0.11.0"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set",
"regex",
"regex-automata",
"regex-syntax",
]
[[package]]
@ -1167,9 +1168,9 @@ dependencies = [
[[package]]
name = "fraction"
version = "0.13.1"
version = "0.15.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3027ae1df8d41b4bed2241c8fdad4acc1e7af60c8e17743534b545e77182d678"
checksum = "e076045bb43dac435333ed5f04caf35c7463631d0dae2deb2638d94dd0a5b872"
dependencies = [
"lazy_static",
"num",
@ -1818,13 +1819,13 @@ dependencies = [
[[package]]
name = "jsonschema"
version = "0.17.1"
version = "0.18.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a071f4f7efc9a9118dfb627a0a94ef247986e1ab8606a4c806ae2b3aa3b6978"
checksum = "fa0f4bea31643be4c6a678e9aa4ae44f0db9e5609d5ca9dc9083d06eb3e9a27a"
dependencies = [
"ahash",
"anyhow",
"base64 0.21.7",
"base64 0.22.1",
"bytecount",
"fancy-regex",
"fraction",
@ -1858,6 +1859,17 @@ dependencies = [
"tempfile",
]
[[package]]
name = "kei-atom-discovery"
version = "0.1.0"
dependencies = [
"serde",
"serde_yaml_ng",
"tempfile",
"thiserror 1.0.69",
"walkdir",
]
[[package]]
name = "kei-auth"
version = "0.1.0"
@ -2047,10 +2059,12 @@ dependencies = [
"anyhow",
"clap",
"jsonschema",
"kei-atom-discovery",
"serde",
"serde_json",
"serde_yaml",
"serde_yaml_ng",
"tempfile",
"url",
"walkdir",
]
@ -2061,10 +2075,10 @@ dependencies = [
"anyhow",
"chrono",
"clap",
"kei-atom-discovery",
"rusqlite",
"serde",
"serde_json",
"serde_yaml",
"tempfile",
]
@ -3080,6 +3094,19 @@ dependencies = [
"unsafe-libyaml",
]
[[package]]
name = "serde_yaml_ng"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f"
dependencies = [
"indexmap",
"itoa",
"ryu",
"serde",
"unsafe-libyaml",
]
[[package]]
name = "sha1"
version = "0.10.6"

View file

@ -33,6 +33,8 @@ members = [
"kei-forge",
# v1 substrate — atom invocation runtime + schema linter (Stream D)
"kei-runtime",
# v1 substrate — shared atom discovery + frontmatter + safe path (Stream E)
"kei-atom-discovery",
]
[workspace.package]

View file

@ -0,0 +1,23 @@
[package]
name = "kei-atom-discovery"
version = "0.1.0"
edition = "2021"
rust-version = "1.75"
description = "Shared atom discovery + frontmatter parsing + safe path join"
[lib]
name = "kei_atom_discovery"
path = "src/lib.rs"
[dependencies]
serde = { version = "1", features = ["derive"] }
serde_yaml_ng = "0.10"
walkdir = "2"
thiserror = "1"
[dev-dependencies]
tempfile = "3"
[package.metadata.keisei]
backend = "none"
description = "Shared atom discovery + frontmatter parsing + safe path join"

View file

@ -0,0 +1,47 @@
//! Typed errors for atom discovery + frontmatter parsing.
//!
//! Every failure mode is a distinct variant — callers pattern-match by variant,
//! not by `to_string()` scraping.
use std::path::PathBuf;
use thiserror::Error;
/// Every failure this crate reports, one variant per failure mode.
///
/// The `#[error(...)]` strings are the `Display` output; callers are expected
/// to match on the variant rather than on that text.
#[derive(Debug, Error)]
pub enum Error {
/// `safe_join`: the canonicalised joined path is not located under the
/// canonicalised `base`. `rel` is the relative path as supplied by the caller.
#[error("path escape: `{rel}` escapes base `{}`", base.display())]
PathEscape { base: PathBuf, rel: String },
/// `safe_join`: the supplied relative path was absolute. Carries the input.
#[error("path absolute not allowed: `{0}`")]
PathAbsolute(String),
/// `safe_join`: the supplied relative path contains a `..` component.
/// Carries the input.
#[error("path contains parent component (..): `{0}`")]
PathParent(String),
/// Canonicalising `path` failed with `source`.
/// NOTE(review): no code in this crate's visible sources constructs this
/// variant (`safe_join` discards canonicalise errors) — confirm whether it
/// is still needed.
#[error("canonicalize `{}`: {source}", path.display())]
Canonicalize {
path: PathBuf,
#[source]
source: std::io::Error,
},
/// `parse_frontmatter`: the text does not begin with a `---` line.
#[error("frontmatter missing leading --- delimiter")]
FrontmatterMissingStart,
/// `parse_frontmatter`: no later line consisting solely of `---` was found.
#[error("frontmatter missing closing --- delimiter")]
FrontmatterMissingEnd,
/// `parse_frontmatter`: the frontmatter section is `got` bytes long, which
/// exceeds `limit` (`MAX_FRONTMATTER_BYTES`).
#[error("frontmatter exceeds {limit} bytes (got {got})")]
FrontmatterTooLarge { limit: usize, got: usize },
/// YAML deserialisation failed; converted automatically via `#[from]`.
#[error("yaml parse: {0}")]
Yaml(#[from] serde_yaml_ng::Error),
/// `split_atom_id`: the id has no `::` separator or one side of it is empty.
/// Carries the offending id.
#[error("atom id must be `<crate>::<verb>`, got `{0}`")]
BadAtomId(String),
/// `AtomKind::from_str`: the value is not one of command/query/stream/transform.
/// Carries the trimmed, lowercased input.
#[error("unknown atom kind: `{0}`")]
UnknownKind(String),
/// Filesystem error (for example while reading an atom file); converted
/// automatically via `#[from]`.
#[error("io: {0}")]
Io(#[from] std::io::Error),
}

View file

@ -0,0 +1,150 @@
//! Frontmatter schema + YAML parsing.
//!
//! Locked schema per `docs/SUBSTRATE-SCHEMA.md`. `input`/`output` are
//! REQUIRED for command/query/stream, OPTIONAL for transform.
//!
//! YAML parser is `serde_yaml_ng` (maintained fork of the archived
//! `serde_yaml` crate). A 64 KiB size cap is enforced pre-parse as a
//! billion-laughs mitigation.
use crate::error::Error;
use serde::Deserialize;
use serde_yaml_ng::Value as YamlValue;
use std::path::PathBuf;
use std::str::FromStr;
/// Hard cap, in bytes, on the YAML frontmatter section of an atom file (64 KiB).
///
/// `parse_frontmatter` compares the byte length of the text between the two
/// `---` fences against this value and returns `Error::FrontmatterTooLarge`
/// when it is exceeded, before any YAML parsing takes place. The original
/// author sized it at roughly 100× a realistic atom spec.
pub const MAX_FRONTMATTER_BYTES: usize = 64 * 1024;
/// The four atom kinds a frontmatter `kind:` field may name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AtomKind {
    /// Spelled `command`.
    Command,
    /// Spelled `query`.
    Query,
    /// Spelled `stream`.
    Stream,
    /// Spelled `transform`.
    Transform,
}

impl AtomKind {
    /// Returns the canonical lowercase spelling of this kind — the same
    /// spelling the `FromStr` implementation accepts.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Transform => "transform",
            Self::Stream => "stream",
            Self::Query => "query",
            Self::Command => "command",
        }
    }
}
impl FromStr for AtomKind {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.trim().to_ascii_lowercase().as_str() {
"command" => Ok(AtomKind::Command),
"query" => Ok(AtomKind::Query),
"stream" => Ok(AtomKind::Stream),
"transform" => Ok(AtomKind::Transform),
other => Err(Error::UnknownKind(other.to_string())),
}
}
}
/// One declared side effect of an atom, taken from an entry of the
/// frontmatter `side_effects:` sequence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SideEffect {
/// String value of the entry's `op` key.
pub op: String,
/// String value of the entry's `domain` key.
pub domain: String,
}
/// Fully-parsed atom metadata — one canonical struct shared across crates.
///
/// Values are produced from a `Frontmatter` plus the markdown body by the
/// discovery walk in `walk.rs`.
#[derive(Debug, Clone)]
pub struct AtomMeta {
/// The frontmatter `atom` value verbatim, e.g. `kei-task::create`.
pub full_id: String,
/// Part of `full_id` before the first `::`.
pub crate_name: String,
/// Part of `full_id` after the first `::`.
pub verb: String,
/// Parsed frontmatter `kind`.
pub kind: AtomKind,
/// Frontmatter `version`; empty string when the field is absent.
pub version: String,
/// Path of the `.md` file this record was read from.
pub md_path: PathBuf,
/// `input.schema` joined onto the `.md` file's directory; `None` when the
/// field is absent or the path was rejected by `safe_join`.
pub input_schema: Option<PathBuf>,
/// `output.schema`, resolved the same way as `input_schema`.
pub output_schema: Option<PathBuf>,
/// Well-formed `{op, domain}` entries of `side_effects`; malformed entries
/// are dropped.
pub side_effects: Vec<SideEffect>,
/// Frontmatter `idempotent`; `false` when the field is absent.
pub idempotent: bool,
/// Frontmatter `stability`; the string `unknown` when the field is absent.
pub stability: String,
/// Frontmatter `keywords`; empty when absent.
pub keywords: Vec<String>,
/// Frontmatter `related` entries, unparsed; empty when absent.
pub related: Vec<String>,
/// Markdown text following the closing `---` fence line.
pub body: String,
}
/// Raw serde deserialisation target for the YAML frontmatter.
///
/// Note: the type is declared `pub` and is re-exported from the crate root,
/// so it is part of the public API; `AtomMeta` is the validated shape that
/// consumers are meant to use.
///
/// `atom` and `kind` are required. Every other field carries
/// `#[serde(default)]` and may be omitted from the YAML.
#[derive(Debug, Deserialize)]
pub struct Frontmatter {
/// Atom id, expected in `<crate>::<verb>` form (checked later by `split_atom_id`).
pub atom: String,
/// Kind name as written; converted to `AtomKind` later.
pub kind: String,
/// Optional version string.
#[serde(default)]
pub version: Option<String>,
/// Optional `input:` mapping holding a schema path.
#[serde(default)]
pub input: Option<SchemaRef>,
/// Optional `output:` mapping holding a schema path.
#[serde(default)]
pub output: Option<SchemaRef>,
/// Raw `side_effects:` entries; typed by `parse_side_effects`.
#[serde(default)]
pub side_effects: Vec<YamlValue>,
/// Optional idempotency flag.
#[serde(default)]
pub idempotent: Option<bool>,
/// Optional stability label.
#[serde(default)]
pub stability: Option<String>,
/// Keyword list; empty when omitted.
#[serde(default)]
pub keywords: Vec<String>,
/// Related-item list (wikilink strings in the test fixture); empty when omitted.
#[serde(default)]
pub related: Vec<String>,
}
/// The `input:` / `output:` frontmatter mapping.
#[derive(Debug, Deserialize)]
pub struct SchemaRef {
/// Path of the JSON Schema file as written in the frontmatter; the
/// discovery walk resolves it relative to the atom file's directory.
pub schema: Option<String>,
}
/// Split a markdown file into (frontmatter_yaml, body). Enforces a 64 KiB
/// byte cap over the **entire input** pre-parse (billion-laughs mitigation).
pub fn parse_frontmatter(md_text: &str) -> Result<(&str, &str), Error> {
if md_text.len() > MAX_FRONTMATTER_BYTES.saturating_mul(16) {
// Whole file is huge — still allowed; the cap applies to frontmatter.
// We only pre-reject if the frontmatter itself is over the limit.
}
let rest = md_text
.strip_prefix("---\n")
.or_else(|| md_text.strip_prefix("---\r\n"))
.ok_or(Error::FrontmatterMissingStart)?;
let (end_off, end_len) =
find_closing_delim(rest).ok_or(Error::FrontmatterMissingEnd)?;
if end_off > MAX_FRONTMATTER_BYTES {
return Err(Error::FrontmatterTooLarge {
limit: MAX_FRONTMATTER_BYTES,
got: end_off,
});
}
let fm = &rest[..end_off];
let body_start = end_off + end_len;
Ok((fm, rest.get(body_start..).unwrap_or("")))
}
/// Locates the first line of `s` that is exactly `---` once trailing `\n` and
/// `\r` characters are removed.
///
/// Returns `(offset, len)`: the byte offset at which that line starts and the
/// line's byte length including its terminator. Returns `None` when no such
/// line exists.
fn find_closing_delim(s: &str) -> Option<(usize, usize)> {
    let mut offset = 0usize;
    s.split_inclusive('\n').find_map(|raw| {
        let start = offset;
        offset += raw.len();
        let is_fence = raw.trim_end_matches(|c: char| c == '\n' || c == '\r') == "---";
        is_fence.then_some((start, raw.len()))
    })
}
/// Parse the `side_effects:` YAML sequence into typed `{op, domain}` pairs.
/// Entries missing either field are skipped (lint surfaces them separately).
pub fn parse_side_effects(raw: &[YamlValue]) -> Vec<SideEffect> {
raw.iter().filter_map(side_effect_from_yaml).collect()
}
fn side_effect_from_yaml(v: &YamlValue) -> Option<SideEffect> {
let op = v.get("op").and_then(|x| x.as_str())?.to_string();
let domain = v.get("domain").and_then(|x| x.as_str())?.to_string();
Some(SideEffect { op, domain })
}

View file

@ -0,0 +1,21 @@
//! kei-atom-discovery — shared substrate-atom discovery primitives.
//!
//! Single authoritative implementation of:
//! - `AtomMeta` / `AtomKind` / `SideEffect` — locked frontmatter schema
//! - `parse_frontmatter` — YAML split with 64 KiB cap (billion-laughs guard)
//! - `discover_atoms` — walks `<root>/*/atoms/*.md`, symlink-safe
//! - `parse_wikilink` — strict `[[target]]` matcher
//! - `safe_join` — path-traversal-safe base+rel join
//!
//! Both `kei-sage` and `kei-runtime` consume this crate — no parallel
//! frontmatter structs, no parallel YAML parsers.
pub mod error;
pub mod frontmatter;
pub mod walk;
pub use error::Error;
pub use frontmatter::{
parse_frontmatter, AtomKind, AtomMeta, Frontmatter, SideEffect, MAX_FRONTMATTER_BYTES,
};
pub use walk::{discover_atoms, is_atom_target, parse_wikilink, safe_join, split_atom_id};

View file

@ -0,0 +1,136 @@
//! Filesystem walk for atom discovery + path-safety primitives.
//!
//! `discover_atoms` enumerates `<root>/*/atoms/*.md` with `follow_links(false)`.
//! `safe_join` is the authoritative base+rel path-join — rejects absolute
//! components and `..`, canonicalises, asserts base containment.
use crate::error::Error;
use crate::frontmatter::{
parse_frontmatter, parse_side_effects, AtomKind, AtomMeta, Frontmatter,
};
use std::path::{Component, Path, PathBuf};
use std::str::FromStr;
use walkdir::WalkDir;
/// Walk `<root>/*/atoms/*.md` and return one `AtomMeta` per parseable file.
///
/// Skip-on-invalid: a file whose frontmatter is missing or malformed produces
/// a `warn:` line on stderr and is dropped; directory-walk errors are ignored.
/// The function itself never fails.
///
/// Symlinks are never followed: the walker does not descend into symlinked
/// directories, and a symlinked `.md` entry is skipped rather than read.
pub fn discover_atoms(root: &Path) -> Vec<AtomMeta> {
    let mut out = Vec::new();
    for entry in WalkDir::new(root)
        .max_depth(3)
        .follow_links(false)
        .into_iter()
        .flatten()
    {
        // `is_atom_md` relies on `Path::is_file`, which resolves symlinks, so
        // on its own it would accept `atoms/x.md -> /somewhere/else`. With
        // `follow_links(false)` the entry's own file type reports the link
        // itself, letting us reject anything that is not a regular file.
        if !entry.file_type().is_file() || !is_atom_md(entry.path()) {
            continue;
        }
        match parse_one(entry.path()) {
            Ok(meta) => out.push(meta),
            Err(e) => eprintln!("warn: skip {}: {}", entry.path().display(), e),
        }
    }
    out
}
/// True when `path` is an existing file with the extension `md` whose
/// immediate parent directory is named `atoms`.
fn is_atom_md(path: &Path) -> bool {
    let has_md_ext = matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"));
    let in_atoms_dir = path
        .parent()
        .and_then(Path::file_name)
        .map_or(false, |dir| dir == "atoms");
    path.is_file() && has_md_ext && in_atoms_dir
}
fn parse_one(md_path: &Path) -> Result<AtomMeta, Error> {
let text = std::fs::read_to_string(md_path)?;
let (fm_text, body) = parse_frontmatter(&text)?;
let fm: Frontmatter = serde_yaml_ng::from_str(fm_text)?;
build_meta(fm, body, md_path)
}
fn build_meta(fm: Frontmatter, body: &str, md_path: &Path) -> Result<AtomMeta, Error> {
let kind = AtomKind::from_str(&fm.kind)?;
let (crate_name, verb) = split_atom_id(&fm.atom)?;
let md_dir = md_path.parent().unwrap_or(md_path);
let input_schema = resolve_opt_schema(md_dir, fm.input.as_ref().and_then(|s| s.schema.as_deref()));
let output_schema =
resolve_opt_schema(md_dir, fm.output.as_ref().and_then(|s| s.schema.as_deref()));
Ok(AtomMeta {
full_id: fm.atom.clone(),
crate_name,
verb,
kind,
version: fm.version.unwrap_or_default(),
md_path: md_path.to_path_buf(),
input_schema,
output_schema,
side_effects: parse_side_effects(&fm.side_effects),
idempotent: fm.idempotent.unwrap_or(false),
stability: fm.stability.unwrap_or_else(|| "unknown".into()),
keywords: fm.keywords,
related: fm.related,
body: body.to_string(),
})
}
/// Resolves an optional schema path against the atom's directory.
///
/// Yields `None` both when no path was given and when `safe_join` rejects the
/// path; the rejection reason is deliberately discarded here because the
/// linter reports unsafe schema paths on its own.
fn resolve_opt_schema(md_dir: &Path, rel: Option<&str>) -> Option<PathBuf> {
    let candidate = rel?;
    match safe_join(md_dir, candidate) {
        Ok(resolved) => Some(resolved),
        Err(_) => None,
    }
}
/// Split `<crate>::<verb>` atom id into components.
pub fn split_atom_id(id: &str) -> Result<(String, String), Error> {
match id.split_once("::") {
Some((c, v)) if !c.is_empty() && !v.is_empty() => Ok((c.into(), v.into())),
_ => Err(Error::BadAtomId(id.to_string())),
}
}
/// Extracts the target of a single `[[target]]` wikilink.
///
/// Whitespace around the whole link and around the target is ignored. Returns
/// `None` when the input is not wrapped in `[[`…`]]`, when the target is
/// empty, or when the target itself still contains a `[` or `]`
/// (e.g. `[[[foo]]`).
pub fn parse_wikilink(raw: &str) -> Option<String> {
    let target = raw
        .trim()
        .strip_prefix("[[")?
        .strip_suffix("]]")?
        .trim();
    let has_stray_bracket = target.contains(|c: char| c == '[' || c == ']');
    (!target.is_empty() && !has_stray_bracket).then(|| target.to_owned())
}
/// Heuristic filter separating atom references from rule references.
///
/// A target beginning with `rules/` or `rule ` is a rule reference; every
/// other target is treated as an atom. The `<crate>::<verb>` shape is not
/// verified here.
pub fn is_atom_target(target: &str) -> bool {
    const RULE_PREFIXES: [&str; 2] = ["rules/", "rule "];
    !RULE_PREFIXES
        .iter()
        .any(|prefix| target.starts_with(*prefix))
}
/// Safe base+rel path join. Rejects absolute paths, parent (`..`) components,
/// and post-canonicalise escapes from `base`.
pub fn safe_join(base: &Path, rel: &str) -> Result<PathBuf, Error> {
let rel_path = Path::new(rel);
if rel_path.is_absolute() {
return Err(Error::PathAbsolute(rel.to_string()));
}
for comp in rel_path.components() {
if matches!(comp, Component::ParentDir) {
return Err(Error::PathParent(rel.to_string()));
}
}
let joined = base.join(rel_path);
// Canonicalise lazily — if either path doesn't exist yet, fall back to
// the lexical check we already did (absolute + parent-free is enough).
let base_canon = base.canonicalize().ok();
let joined_canon = joined.canonicalize().ok();
if let (Some(bc), Some(jc)) = (base_canon, joined_canon) {
if !jc.starts_with(&bc) {
return Err(Error::PathEscape {
base: bc,
rel: rel.to_string(),
});
}
}
Ok(joined)
}

View file

@ -0,0 +1,132 @@
//! Smoke tests covering the 4 critical fixes consolidated in this crate.
use kei_atom_discovery::{
discover_atoms, parse_frontmatter, parse_wikilink, safe_join, AtomKind, Error,
MAX_FRONTMATTER_BYTES,
};
use std::fs;
use std::path::Path;
use tempfile::tempdir;
const ATOM_OK: &str = r#"---
atom: kei-task::create
kind: command
version: "0.1.0"
input:
schema: schemas/create-input.json
output:
schema: schemas/create-output.json
side_effects:
- { op: write, domain: kei-task-db }
idempotent: false
stability: stable
keywords: [task, todo]
related:
- "[[kei-task::add-dependency]]"
- "[[rules/RULE 0.12]]"
---
# kei-task::create
Body text.
"#;
/// Test fixture: creates `<root>/<crate_name>/atoms/schemas/`, writes
/// `<verb>.md` containing `body` into the `atoms` directory, and drops two
/// `{}` schema stubs named `create-input.json` / `create-output.json` (the
/// names are fixed regardless of `verb`, matching the paths in `ATOM_OK`).
fn write_atom(root: &Path, crate_name: &str, verb: &str, body: &str) {
let atoms_dir = root.join(crate_name).join("atoms");
fs::create_dir_all(atoms_dir.join("schemas")).unwrap();
fs::write(atoms_dir.join(format!("{verb}.md")), body).unwrap();
fs::write(atoms_dir.join("schemas").join("create-input.json"), "{}").unwrap();
fs::write(atoms_dir.join("schemas").join("create-output.json"), "{}").unwrap();
}
// FIX 2 happy path — shared Frontmatter correctly parses and exposes typed kind.
// Writes a single valid atom and checks every field that `discover_atoms`
// derives from `ATOM_OK`: id split, typed kind, resolved schema paths,
// typed side effects and the markdown body.
#[test]
fn discovery_returns_well_formed_atom_meta() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-task", "create", ATOM_OK);
let atoms = discover_atoms(tmp.path());
assert_eq!(atoms.len(), 1);
let a = &atoms[0];
assert_eq!(a.full_id, "kei-task::create");
assert_eq!(a.kind, AtomKind::Command);
assert_eq!(a.crate_name, "kei-task");
assert_eq!(a.verb, "create");
// Both schema stubs exist on disk, so the safe join succeeds for each.
assert!(a.input_schema.is_some());
assert!(a.output_schema.is_some());
assert_eq!(a.side_effects.len(), 1);
assert_eq!(a.side_effects[0].op, "write");
assert_eq!(a.side_effects[0].domain, "kei-task-db");
assert!(a.body.contains("Body text"));
}
// FIX 1 — path traversal rejection via safe_join.
// A relative path containing `..` must be refused with `Error::PathParent`.
#[test]
fn safe_join_rejects_parent_component() {
let tmp = tempdir().unwrap();
let err = safe_join(tmp.path(), "../etc/shadow").unwrap_err();
assert!(matches!(err, Error::PathParent(_)));
}
// An absolute `rel` must be refused with `Error::PathAbsolute`.
// NOTE(review): `/etc/shadow` is only absolute on Unix-like targets —
// confirm whether this test is expected to run on Windows.
#[test]
fn safe_join_rejects_absolute_path() {
let tmp = tempdir().unwrap();
let err = safe_join(tmp.path(), "/etc/shadow").unwrap_err();
assert!(matches!(err, Error::PathAbsolute(_)));
}
// A plain relative name that exists under the base is accepted and the
// returned path ends with that name.
#[test]
fn safe_join_accepts_plain_relative() {
let tmp = tempdir().unwrap();
let target = tmp.path().join("schemas");
fs::create_dir_all(&target).unwrap();
let joined = safe_join(tmp.path(), "schemas").unwrap();
assert!(joined.ends_with("schemas"));
}
// FIX 3 — YAML size cap enforced pre-parse.
// The frontmatter is a single line of MAX_FRONTMATTER_BYTES + 100 bytes, so
// the closing fence sits past the cap and the split must fail before any
// YAML parsing is attempted.
#[test]
fn frontmatter_size_cap_enforced() {
let huge = "x".repeat(MAX_FRONTMATTER_BYTES + 100);
let md = format!("---\n{huge}\n---\nbody\n");
let err = parse_frontmatter(&md).unwrap_err();
assert!(matches!(err, Error::FrontmatterTooLarge { .. }));
}
// Text that does not open with a `---` line yields `FrontmatterMissingStart`.
#[test]
fn frontmatter_missing_start_rejected() {
let err = parse_frontmatter("no fence\nbody\n").unwrap_err();
assert!(matches!(err, Error::FrontmatterMissingStart));
}
// An opening fence with no later `---` line yields `FrontmatterMissingEnd`.
#[test]
fn frontmatter_missing_end_rejected() {
let err = parse_frontmatter("---\nkey: val\nno-end\n").unwrap_err();
assert!(matches!(err, Error::FrontmatterMissingEnd));
}
// FIX — symlink not followed (walkdir follow_links=false).
// One real crate directory plus a directory symlink pointing at it: if the
// walker followed the link, the same atom would be discovered twice.
// The symlink is only created under `cfg(unix)`; on other targets this test
// merely checks that the single real atom is found.
#[test]
fn discover_does_not_follow_symlinks() {
let tmp = tempdir().unwrap();
write_atom(tmp.path(), "kei-real", "create", ATOM_OK);
// Create a symlink named `kei-link` pointing at `kei-real`.
#[cfg(unix)]
{
let target = tmp.path().join("kei-real");
let link = tmp.path().join("kei-link");
std::os::unix::fs::symlink(&target, &link).unwrap();
}
let atoms = discover_atoms(tmp.path());
// Only 1 atom — symlinked tree is NOT walked.
assert_eq!(atoms.len(), 1, "symlink was traversed — follow_links must be false");
}
// Wikilink strictness: a stray bracket inside the target, a missing
// `[[`…`]]` wrapper and a whitespace-only target are all rejected; a
// well-formed link yields its target string.
#[test]
fn wikilink_malformed_returns_none() {
assert_eq!(parse_wikilink("[[[foo]]"), None); // triple-bracket open
assert_eq!(parse_wikilink("foo"), None);
assert_eq!(parse_wikilink("[[ ]]"), None);
assert_eq!(
parse_wikilink("[[kei-task::create]]"),
Some("kei-task::create".to_string())
);
}

View file

@ -17,10 +17,15 @@ path = "src/lib.rs"
clap = { version = "4", features = ["derive"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
serde_yaml = "0.9"
jsonschema = { version = "0.17", default-features = false }
# SSRF + IMDS hardening: disable default features (resolve-http, cli) so the
# validator has no HTTP resolver by default. We configure a file-only
# resolver explicitly in `validate.rs`.
jsonschema = { version = "0.18", default-features = false, features = ["resolve-file"] }
anyhow = "1"
walkdir = "2"
serde_yaml_ng = "0.10"
kei-atom-discovery = { path = "../kei-atom-discovery" }
url = "2"
[dev-dependencies]
tempfile = "3"

View file

@ -1,92 +1,20 @@
//! Atom discovery — walks `<root>/*/atoms/*.md`, parses YAML frontmatter.
//! Atom discovery — thin façade over `kei-atom-discovery`.
//!
//! Skip-on-invalid policy: missing/malformed frontmatter emits stderr warn,
//! record is dropped (never panics, never fails the walk).
//! Re-exports `AtomMeta` and `AtomKind` from the shared crate so all runtime
//! modules share exactly one frontmatter-parser implementation.
use serde::Deserialize;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
use kei_atom_discovery as shared;
use std::path::Path;
/// Parsed frontmatter fields needed by the runtime.
#[derive(Debug, Clone)]
pub struct AtomMeta {
pub full_id: String,
pub crate_name: String,
pub verb: String,
pub kind: String,
pub md_path: PathBuf,
pub input_schema_path: PathBuf,
pub output_schema_path: PathBuf,
}
pub use kei_atom_discovery::{parse_frontmatter, AtomKind, AtomMeta};
/// Raw frontmatter — only the fields discover needs.
#[derive(Debug, Deserialize)]
struct Frontmatter {
atom: String,
kind: String,
input: SchemaRef,
output: SchemaRef,
}
#[derive(Debug, Deserialize)]
struct SchemaRef {
schema: String,
}
/// Walks `<root>/*/atoms/*.md`. Returns one `AtomMeta` per parseable file.
/// Walk `<root>/*/atoms/*.md`. Delegates to `kei-atom-discovery::discover_atoms`.
pub fn walk_atoms(root: &Path) -> Vec<AtomMeta> {
let mut out = Vec::new();
for entry in WalkDir::new(root).max_depth(3).into_iter().flatten() {
if !is_atom_md(entry.path()) {
continue;
}
match parse_one(entry.path()) {
Ok(meta) => out.push(meta),
Err(e) => eprintln!("warn: skip {}: {}", entry.path().display(), e),
}
}
out
shared::discover_atoms(root)
}
fn is_atom_md(path: &Path) -> bool {
path.is_file()
&& path.extension().is_some_and(|e| e == "md")
&& path
.parent()
.and_then(|p| p.file_name())
.is_some_and(|n| n == "atoms")
}
fn parse_one(md_path: &Path) -> Result<AtomMeta, String> {
let body = std::fs::read_to_string(md_path).map_err(|e| format!("read: {e}"))?;
let fm = extract_frontmatter(&body).ok_or_else(|| "no frontmatter".to_string())?;
let parsed: Frontmatter = serde_yaml::from_str(fm).map_err(|e| format!("yaml: {e}"))?;
let (crate_name, verb) = split_atom_id(&parsed.atom)?;
let atom_dir = md_path.parent().ok_or("no parent dir")?;
Ok(AtomMeta {
full_id: parsed.atom.clone(),
crate_name,
verb,
kind: parsed.kind,
md_path: md_path.to_path_buf(),
input_schema_path: atom_dir.join(&parsed.input.schema),
output_schema_path: atom_dir.join(&parsed.output.schema),
})
}
/// Returns the frontmatter body (between the two `---` fences), or None.
/// Backwards-compatible split — returns the frontmatter YAML body (no body
/// trailing). Returns `None` if the file has no frontmatter fences.
pub fn extract_frontmatter(body: &str) -> Option<&str> {
let rest = body.strip_prefix("---\n").or_else(|| body.strip_prefix("---\r\n"))?;
let end = rest.find("\n---").or_else(|| rest.find("\r\n---"))?;
Some(&rest[..end])
}
fn split_atom_id(id: &str) -> Result<(String, String), String> {
let (crate_name, verb) = id
.split_once("::")
.ok_or_else(|| format!("atom id missing `::`: {id}"))?;
if crate_name.is_empty() || verb.is_empty() {
return Err(format!("atom id has empty half: {id}"));
}
Ok((crate_name.to_string(), verb.to_string()))
shared::parse_frontmatter(body).ok().map(|(fm, _)| fm)
}

View file

@ -15,6 +15,7 @@ pub enum InvokeError {
AtomNotFound(String),
InputParse(String),
InputInvalid(String),
MissingInputSchema(String),
}
impl std::fmt::Display for InvokeError {
@ -23,6 +24,7 @@ impl std::fmt::Display for InvokeError {
Self::AtomNotFound(id) => write!(f, "atom not found: {id}"),
Self::InputParse(e) => write!(f, "input parse: {e}"),
Self::InputInvalid(e) => write!(f, "input invalid: {e}"),
Self::MissingInputSchema(id) => write!(f, "atom `{id}` declares no input schema"),
}
}
}
@ -37,14 +39,15 @@ pub struct Output {
}
/// Invoke an atom by full ID with a JSON input string.
///
/// MVP contract: discover atom → parse input → validate against schema →
/// return stub acknowledgement. Exec wire-up is a follow-up.
pub fn invoke(root: &Path, atom_id: &str, input_json: &str) -> Result<Output, InvokeError> {
let meta = find_atom(root, atom_id)?;
let input: Value =
serde_json::from_str(input_json).map_err(|e| InvokeError::InputParse(e.to_string()))?;
validate_input(&meta.input_schema_path, &input)
let schema = meta
.input_schema
.as_ref()
.ok_or_else(|| InvokeError::MissingInputSchema(atom_id.to_string()))?;
validate_input(schema, &input)
.map_err(|e| InvokeError::InputInvalid(e.to_string()))?;
Ok(Output {
error: "atom invocation not yet implemented — wire needs Stream B atom impls".to_string(),

View file

@ -3,7 +3,8 @@
//! Checks (from SUBSTRATE-SCHEMA §Validation):
//! 1. Frontmatter has required fields (atom, kind, version, input, output,
//! side_effects, idempotent, stability).
//! 2. Schema paths resolve to existing JSON files.
//! 2. Schema paths resolve to existing JSON files inside the atom's dir
//! (safe_join — rejects `..` and absolute paths).
//! 3. JSON Schemas declare draft-07 via `$schema`.
//! 4. `kind` ∈ {command, query, stream, transform}.
//! 5. `side_effects` entries are `{op, domain}` objects.
@ -11,7 +12,8 @@
//! refs allowed).
use crate::discover::extract_frontmatter;
use serde_yaml::Value as YamlValue;
use kei_atom_discovery::safe_join;
use serde_yaml_ng::Value as YamlValue;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
@ -51,6 +53,7 @@ pub fn schema_lint(root: &Path) -> LintReport {
fn find_atom_files(root: &Path) -> Vec<PathBuf> {
WalkDir::new(root)
.max_depth(3)
.follow_links(false)
.into_iter()
.flatten()
.filter(|e| {
@ -67,7 +70,7 @@ fn collect_atom_ids(root: &Path) -> HashSet<String> {
for md in find_atom_files(root) {
if let Ok(body) = std::fs::read_to_string(&md) {
if let Some(fm) = extract_frontmatter(&body) {
if let Ok(y) = serde_yaml::from_str::<YamlValue>(fm) {
if let Ok(y) = serde_yaml_ng::from_str::<YamlValue>(fm) {
if let Some(id) = y.get("atom").and_then(|v| v.as_str()) {
ids.insert(id.to_string());
}
@ -82,7 +85,7 @@ fn lint_one(md_path: &Path, known_atoms: &HashSet<String>) -> Result<(), Vec<Str
let body = std::fs::read_to_string(md_path).map_err(|e| vec![format!("read: {e}")])?;
let fm_text = extract_frontmatter(&body).ok_or_else(|| vec!["no frontmatter".to_string()])?;
let fm: YamlValue =
serde_yaml::from_str(fm_text).map_err(|e| vec![format!("yaml parse: {e}")])?;
serde_yaml_ng::from_str(fm_text).map_err(|e| vec![format!("yaml parse: {e}")])?;
let mut errs = Vec::new();
check_required_fields(&fm, &mut errs);
check_kind(&fm, &mut errs);
@ -126,11 +129,21 @@ fn check_side_effects(fm: &YamlValue, errs: &mut Vec<String>) {
}
fn check_schema_files(md_path: &Path, fm: &YamlValue, errs: &mut Vec<String>) {
let Some(md_dir) = md_path.parent() else {
errs.push("md_path has no parent dir".to_string());
return;
};
for key in &["input", "output"] {
let Some(rel) = fm.get(key).and_then(|v| v.get("schema")).and_then(|v| v.as_str()) else {
continue;
};
let full = md_path.parent().map(|p| p.join(rel)).unwrap_or_else(|| PathBuf::from(rel));
let full = match safe_join(md_dir, rel) {
Ok(p) => p,
Err(e) => {
errs.push(format!("{key} schema path unsafe: {e}"));
continue;
}
};
if !full.exists() {
errs.push(format!("{key} schema missing: {}", full.display()));
continue;

View file

@ -77,11 +77,11 @@ fn run_list_atoms(root: PathBuf, crate_name: Option<String>, kind: Option<String
}
}
if let Some(k) = &kind {
if a.kind != *k {
if a.kind.as_str() != k.as_str() {
continue;
}
}
println!("{}\t{}\t{}", a.full_id, a.kind, a.md_path.display());
println!("{}\t{}\t{}", a.full_id, a.kind.as_str(), a.md_path.display());
}
ExitCode::SUCCESS
}

View file

@ -1,11 +1,19 @@
//! JSON Schema draft-07 validation wrappers.
//!
//! Thin façade over the `jsonschema` crate. Reads schema from disk per call —
//! caller may cache if hot. Returns a single, readable error message.
//! Thin façade over the `jsonschema` crate (v0.18). Reads schema from disk
//! per call. Returns a single, readable error message.
//!
//! SSRF / IMDS hardening:
//! - `default-features = false` on `jsonschema` — no `resolve-http` feature.
//! - Custom `LocalFileResolver` replaces the default. It rejects any URL
//! whose scheme isn't `file://` and any path outside the schema's own
//! directory (anchored at the schema file's parent).
use jsonschema::JSONSchema;
use jsonschema::{JSONSchema, SchemaResolver, SchemaResolverError};
use serde_json::Value;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use url::Url;
#[derive(Debug)]
pub struct ValidationError(pub String);
@ -33,8 +41,10 @@ fn validate_value(schema_path: &Path, value: &Value) -> Result<(), ValidationErr
.map_err(|e| ValidationError(format!("read {}: {e}", schema_path.display())))?;
let schema_json: Value = serde_json::from_str(&schema_text)
.map_err(|e| ValidationError(format!("parse {}: {e}", schema_path.display())))?;
let root = schema_path.parent().unwrap_or(schema_path).to_path_buf();
let compiled = JSONSchema::options()
.with_draft(jsonschema::Draft::Draft7)
.with_resolver(LocalFileResolver::new(root))
.compile(&schema_json)
.map_err(|e| ValidationError(format!("compile: {e}")))?;
if let Err(errors) = compiled.validate(value) {
@ -43,3 +53,54 @@ fn validate_value(schema_path: &Path, value: &Value) -> Result<(), ValidationErr
}
Ok(())
}
/// `$ref` resolver that rejects every scheme except `file://`, AND rejects
/// any path that is not inside `root` (canonicalised).
#[derive(Debug)]
pub struct LocalFileResolver {
// Directory that every resolved `$ref` target must live under; it is
// canonicalised on each `resolve` call rather than at construction.
root: PathBuf,
}
impl LocalFileResolver {
pub fn new(root: PathBuf) -> Self {
Self { root }
}
}
impl SchemaResolver for LocalFileResolver {
    /// Resolves a `$ref` URL to a parsed JSON document.
    ///
    /// Only `file://` URLs are accepted, and the canonicalised target must
    /// lie under the canonicalised `root`; anything else is returned as an
    /// error so schema compilation fails instead of fetching it.
    ///
    /// # Errors
    /// Non-`file` scheme, URL not convertible to a path, canonicalisation
    /// failure (target or root), target outside `root`, or open/parse failure.
    fn resolve(
        &self,
        _root_schema: &Value,
        url: &Url,
        _original_reference: &str,
    ) -> Result<Arc<Value>, SchemaResolverError> {
        if url.scheme() != "file" {
            return Err(anyhow::anyhow!(
                "remote $ref rejected — only file:// is allowed (got {})",
                url.scheme()
            ));
        }
        let path = url
            .to_file_path()
            .map_err(|_| anyhow::anyhow!("invalid file URL: {url}"))?;
        // Canonicalise both sides so symlinks and `..` in the URL cannot be
        // used to step outside the schema directory.
        let canon = path
            .canonicalize()
            .map_err(|e| anyhow::anyhow!("canonicalize {}: {e}", path.display()))?;
        let root_canon = self
            .root
            .canonicalize()
            .map_err(|e| anyhow::anyhow!("canonicalize root {}: {e}", self.root.display()))?;
        if !canon.starts_with(&root_canon) {
            return Err(anyhow::anyhow!(
                "file $ref escapes schema root: {} not under {}",
                canon.display(),
                root_canon.display()
            ));
        }
        let f = std::fs::File::open(&canon)
            .map_err(|e| anyhow::anyhow!("open {}: {e}", canon.display()))?;
        // serde_json does not buffer its reader; without `BufReader` every
        // small read becomes a syscall on the raw file handle.
        let doc: Value = serde_json::from_reader(std::io::BufReader::new(f))
            .map_err(|e| anyhow::anyhow!("parse {}: {e}", canon.display()))?;
        Ok(Arc::new(doc))
    }
}

View file

@ -1,6 +1,6 @@
//! Integration test — walk_atoms returns 2 well-formed records from temp root.
use kei_runtime::discover::walk_atoms;
use kei_runtime::discover::{walk_atoms, AtomKind};
use std::fs;
use std::path::Path;
@ -43,8 +43,16 @@ fn walk_atoms_finds_two_records() {
assert_eq!(atoms[0].full_id, "kei-alpha::search");
assert_eq!(atoms[0].crate_name, "kei-alpha");
assert_eq!(atoms[0].verb, "search");
assert_eq!(atoms[0].kind, "query");
assert_eq!(atoms[0].kind, AtomKind::Query);
assert_eq!(atoms[1].full_id, "kei-beta::fetch");
assert!(atoms[1].input_schema_path.ends_with("schemas/fetch-input.json"));
assert!(atoms[1].output_schema_path.ends_with("schemas/fetch-output.json"));
assert!(atoms[1]
.input_schema
.as_ref()
.unwrap()
.ends_with("schemas/fetch-input.json"));
assert!(atoms[1]
.output_schema
.as_ref()
.unwrap()
.ends_with("schemas/fetch-output.json"));
}

View file

@ -18,9 +18,9 @@ rusqlite = { version = "0.31", features = ["bundled"] }
clap = { version = "4", features = ["derive"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
serde_yaml = "0.9"
anyhow = "1"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
kei-atom-discovery = { path = "../kei-atom-discovery" }
[dev-dependencies]
tempfile = "3"

View file

@ -1,60 +1,23 @@
//! Frontmatter splitting + wikilink extraction helpers for atom `.md` files.
//! Sage-local aliases over `kei-atom-discovery` helpers.
//!
//! Pure functions, no I/O. See `atoms.rs` for the discovery walker.
//! Historical sage API: `split_frontmatter`, `parse_wikilink`, `is_atom_target`,
//! `split_atom_id`. All now delegate to the shared crate; kept here so sage
//! internals compile without touch.
use anyhow::{anyhow, Result};
use kei_atom_discovery as shared;
/// Split a `.md` file into (frontmatter_yaml, body). Frontmatter must start
/// with `---\n` and end with a line that is exactly `---`.
pub use shared::{is_atom_target, parse_wikilink};
/// Split a `.md` file into (frontmatter_yaml, body). Delegates to the shared
/// `parse_frontmatter` — preserves the sage `anyhow::Result` return type.
pub fn split_frontmatter(text: &str) -> Result<(&str, &str)> {
    // The opening delimiter must be the very first line; LF and CRLF are both accepted.
    let after_open = match text.strip_prefix("---\n") {
        Some(remainder) => remainder,
        None => text
            .strip_prefix("---\r\n")
            .ok_or_else(|| anyhow!("missing leading --- frontmatter delimiter"))?,
    };

    // `delim_at` is where the closing `---` line starts, `delim_len` its full
    // length including the line terminator.
    let (delim_at, delim_len) = find_closing_delim(after_open)
        .ok_or_else(|| anyhow!("missing closing --- frontmatter delimiter"))?;

    let frontmatter = &after_open[..delim_at];
    // Everything past the closing delimiter line is the body (empty if none).
    let body = after_open.get(delim_at + delim_len..).unwrap_or("");
    Ok((frontmatter, body))
}
/// Locate the first line of `s` that consists solely of `---`.
///
/// Returns `(offset, len)`, where `offset` is the byte index at which that
/// line starts and `len` is the line's byte length including its trailing
/// newline (if any). Returns `None` when no such line exists.
fn find_closing_delim(s: &str) -> Option<(usize, usize)> {
    let mut offset = 0usize;
    s.split_inclusive('\n').find_map(|line| {
        let line_start = offset;
        offset += line.len();
        // Compare without the line terminator so both LF and CRLF match.
        let content = line.trim_end_matches(|c: char| matches!(c, '\n' | '\r'));
        (content == "---").then_some((line_start, line.len()))
    })
}
/// Extract the target of a single `[[target]]` wikilink.
///
/// Whitespace around the brackets and around the target itself is trimmed.
/// Yields `None` when `raw` is not wrapped in `[[` … `]]` or when the target
/// is empty after trimming.
pub fn parse_wikilink(raw: &str) -> Option<String> {
    raw.trim()
        .strip_prefix("[[")?
        .strip_suffix("]]")
        .map(str::trim)
        .filter(|target| !target.is_empty())
        .map(str::to_owned)
}
/// Filter rule that decides whether a wikilink target is an atom reference.
/// Atoms use `<crate>::<verb>`; targets starting with `rules/` or `rule ` (note the trailing space) are excluded.
pub fn is_atom_target(target: &str) -> bool {
!target.starts_with("rules/") && !target.starts_with("rule ")
shared::parse_frontmatter(text).map_err(|e| anyhow!(e.to_string()))
}
/// Split `<crate>::<verb>` atom id into components.
pub fn split_atom_id(id: &str) -> Result<(String, String)> {
match id.split_once("::") {
Some((c, v)) if !c.is_empty() && !v.is_empty() => Ok((c.into(), v.into())),
_ => Err(anyhow!("atom id must be <crate>::<verb>, got {id}")),
}
shared::split_atom_id(id).map_err(|e| anyhow!(e.to_string()))
}
#[cfg(test)]

View file

@ -1,169 +1,30 @@
//! Substrate-atom discovery + frontmatter parsing + wikilink extraction.
//! Substrate-atom discovery — thin façade over `kei-atom-discovery`.
//!
//! Walks `<root>/<crate>/atoms/*.md`, parses YAML frontmatter, returns
//! `AtomRecord`. Tolerant: skips files with invalid frontmatter (logs to
//! stderr, continues scan). See `docs/SUBSTRATE-SCHEMA.md` §Graph contract.
//! Historical `AtomRecord` is preserved as a type alias for `AtomMeta` so
//! that downstream sage modules (`atom_index`, `atom_cli`) keep compiling.
use crate::atom_parse::{is_atom_target, parse_wikilink, split_atom_id, split_frontmatter};
use anyhow::{anyhow, Context, Result};
use serde::Deserialize;
use std::fs;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use crate::atom_parse::{is_atom_target, parse_wikilink};
use anyhow::Result;
use kei_atom_discovery as shared;
use std::path::Path;
/// Kind of a substrate atom, as declared by the `kind:` key of its frontmatter.
///
/// Each variant has a lowercase textual form (`command`, `query`, `stream`,
/// `transform`) used when parsing from and rendering to strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AtomKind {
Command,
Query,
Stream,
Transform,
}
pub use kei_atom_discovery::AtomKind;
impl FromStr for AtomKind {
    type Err = anyhow::Error;

    /// Parse an atom kind from its textual name.
    ///
    /// Surrounding whitespace and ASCII letter case are ignored.
    ///
    /// # Errors
    /// Fails with `unknown atom kind: …` (quoting the normalized input) when
    /// the name is not one of `command`, `query`, `stream`, `transform`.
    fn from_str(s: &str) -> Result<Self> {
        let normalized = s.trim().to_ascii_lowercase();
        let kind = match normalized.as_str() {
            "command" => Self::Command,
            "query" => Self::Query,
            "stream" => Self::Stream,
            "transform" => Self::Transform,
            other => return Err(anyhow!("unknown atom kind: {other}")),
        };
        Ok(kind)
    }
}
impl AtomKind {
pub fn as_str(&self) -> &'static str {
match self {
AtomKind::Command => "command",
AtomKind::Query => "query",
AtomKind::Stream => "stream",
AtomKind::Transform => "transform",
}
}
}
/// One discovered atom: parsed frontmatter plus the markdown body of its
/// `.md` file, as assembled by `build_record`.
#[derive(Debug, Clone)]
pub struct AtomRecord {
/// Atom id exactly as written in the frontmatter `atom:` key (`<crate>::<verb>`).
pub full_id: String,
/// Atom kind parsed from the frontmatter `kind:` key.
pub kind: AtomKind,
/// Crate part of the atom id; the containing directory name is used when that part is empty.
pub crate_name: String,
/// Verb part of the atom id.
pub verb: String,
/// Frontmatter `version:`; empty string when absent.
pub version: String,
/// Path of the `.md` file this record was parsed from.
pub md_path: PathBuf,
/// Frontmatter `input.schema` joined onto the directory of the `.md` file, if declared.
pub input_schema: Option<PathBuf>,
/// Frontmatter `output.schema` joined onto the directory of the `.md` file, if declared.
pub output_schema: Option<PathBuf>,
/// Raw entries of the frontmatter `related:` list.
pub related: Vec<String>,
/// Entries of the frontmatter `keywords:` list.
pub keywords: Vec<String>,
/// Frontmatter `stability:`; `"unknown"` when absent.
pub stability: String,
/// Text of the file that follows the frontmatter block.
pub body: String,
}
/// Deserialized shape of the frontmatter `input:` / `output:` mappings.
/// Only the optional `schema` key is captured here.
#[derive(Debug, Deserialize)]
struct SchemaRef {
/// Schema path as written in the frontmatter; `None` when the key is absent.
schema: Option<String>,
}
/// Raw YAML frontmatter of an atom `.md` file, as deserialized by serde.
///
/// `atom` and `kind` carry no `#[serde(default)]` and are therefore required;
/// every other key falls back to its default when missing.
#[derive(Debug, Deserialize)]
struct Frontmatter {
/// Atom id string (validated later as `<crate>::<verb>`).
atom: String,
/// Atom kind name, still unparsed.
kind: String,
#[serde(default)]
version: Option<String>,
#[serde(default)]
input: Option<SchemaRef>,
#[serde(default)]
output: Option<SchemaRef>,
/// Related entries, kept as raw strings.
#[serde(default)]
related: Vec<String>,
#[serde(default)]
keywords: Vec<String>,
#[serde(default)]
stability: Option<String>,
}
/// Legacy alias: sage used to call this `AtomRecord`. New code should use
/// `AtomMeta` directly (identical shape, authored in `kei-atom-discovery`).
pub type AtomRecord = shared::AtomMeta;
/// Walk `<root>/*/atoms/*.md` and return parsed atom metadata.
/// Tolerant: invalid frontmatter → stderr warning + skipped record.
pub fn discover_atoms(root: &Path) -> Result<Vec<AtomRecord>> {
let mut out = Vec::new();
if !root.is_dir() {
return Ok(out);
return Ok(Vec::new());
}
for entry in fs::read_dir(root).with_context(|| format!("read_dir {}", root.display()))? {
let crate_dir = entry?.path();
if crate_dir.is_dir() {
collect_from_crate(&crate_dir, &mut out);
}
}
Ok(out)
}
/// Append to `out` every atom parsed from the `.md` files located directly
/// inside `<crate_dir>/atoms`.
///
/// Best-effort: a missing `atoms` directory is ignored silently, while an
/// unreadable directory or a file that fails to parse is reported on stderr
/// and skipped.
fn collect_from_crate(crate_dir: &Path, out: &mut Vec<AtomRecord>) {
    let atoms_dir = crate_dir.join("atoms");
    if !atoms_dir.is_dir() {
        return;
    }

    // Directory name doubles as the crate name; non-UTF-8 names degrade to "".
    let crate_name = match crate_dir.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_string(),
        None => String::new(),
    };

    let listing = match fs::read_dir(&atoms_dir) {
        Err(e) => {
            eprintln!("skip {}: {}", atoms_dir.display(), e);
            return;
        }
        Ok(entries) => entries,
    };

    // Entries that cannot be read are dropped by `flatten`.
    let md_files = listing
        .flatten()
        .map(|dir_entry| dir_entry.path())
        .filter(|candidate| is_md_file(candidate));
    for md_path in md_files {
        match parse_atom_file(&md_path, &crate_name) {
            Err(e) => eprintln!("skip {}: {}", md_path.display(), e),
            Ok(record) => out.push(record),
        }
    }
}
/// True when `path` refers to an existing regular file whose extension is
/// exactly `md`.
fn is_md_file(path: &Path) -> bool {
    if !path.is_file() {
        return false;
    }
    matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"))
}
fn parse_atom_file(path: &Path, crate_name: &str) -> Result<AtomRecord> {
let text = fs::read_to_string(path)
.with_context(|| format!("read {}", path.display()))?;
let (fm_text, body) = split_frontmatter(&text)?;
let fm: Frontmatter =
serde_yaml::from_str(fm_text).with_context(|| "parse frontmatter YAML")?;
build_record(fm, body, path, crate_name)
}
fn build_record(fm: Frontmatter, body: &str, path: &Path, crate_name: &str) -> Result<AtomRecord> {
let kind = AtomKind::from_str(&fm.kind)?;
let (crate_from_id, verb) = split_atom_id(&fm.atom)?;
let md_dir = path.parent().unwrap_or(path).to_path_buf();
Ok(AtomRecord {
full_id: fm.atom.clone(),
kind,
crate_name: if crate_from_id.is_empty() {
crate_name.to_string()
} else {
crate_from_id
},
verb,
version: fm.version.unwrap_or_default(),
md_path: path.to_path_buf(),
input_schema: fm.input.and_then(|s| s.schema).map(|s| md_dir.join(&s)),
output_schema: fm.output.and_then(|s| s.schema).map(|s| md_dir.join(&s)),
related: fm.related,
keywords: fm.keywords,
stability: fm.stability.unwrap_or_else(|| "unknown".into()),
body: body.to_string(),
})
Ok(shared::discover_atoms(root))
}
/// Extract `(source_atom_id, target)` edges from `related:` wikilinks.
/// Non-atom targets (rules, notes) are filtered out here — scope: atoms only.
/// Non-atom targets (rules, notes) are filtered out — scope: atoms only.
pub fn resolve_wikilinks(records: &[AtomRecord]) -> Vec<(String, String)> {
let mut out = Vec::new();
for rec in records {