fix(substrate): E1 — kei-atom-discovery shared crate + 4 critical security fixes
Extracts authoritative atom discovery + frontmatter parsing into new crate _primitives/_rust/kei-atom-discovery/. kei-sage and kei-runtime now both consume the same implementation, eliminating Frontmatter drift. Resolved findings: - F-3/crit#3: path traversal via md_dir.join() — safe_join helper rejects absolute paths + .. components + post-canonicalise escapes (4 sites) - crit#6/architect P0-a: Frontmatter drift — single AtomMeta struct - SA supply-chain: serde_yaml archived — migrated to serde_yaml_ng 0.10 - crit#2: JSON Schema $ref SSRF — jsonschema 0.17→0.18 with resolve-file feature only, custom LocalFileResolver denies non-file:// schemes - F-4: symlink traversal — walkdir follow_links(false) explicit everywhere - F-5: YAML billion-laughs — 64 KiB pre-parse cap Tests: 9/9 new crate + 23/23 sage + 2/2 runtime + 7/7 kei-task = 41/41 green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
42fe08232e
commit
990f5e3711
18 changed files with 698 additions and 318 deletions
47
_primitives/_rust/Cargo.lock
generated
47
_primitives/_rust/Cargo.lock
generated
|
|
@ -1083,12 +1083,13 @@ checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
|||
|
||||
[[package]]
|
||||
name = "fancy-regex"
|
||||
version = "0.11.0"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2"
|
||||
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"regex",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1167,9 +1168,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "fraction"
|
||||
version = "0.13.1"
|
||||
version = "0.15.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3027ae1df8d41b4bed2241c8fdad4acc1e7af60c8e17743534b545e77182d678"
|
||||
checksum = "e076045bb43dac435333ed5f04caf35c7463631d0dae2deb2638d94dd0a5b872"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"num",
|
||||
|
|
@ -1818,13 +1819,13 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "jsonschema"
|
||||
version = "0.17.1"
|
||||
version = "0.18.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a071f4f7efc9a9118dfb627a0a94ef247986e1ab8606a4c806ae2b3aa3b6978"
|
||||
checksum = "fa0f4bea31643be4c6a678e9aa4ae44f0db9e5609d5ca9dc9083d06eb3e9a27a"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"base64 0.21.7",
|
||||
"base64 0.22.1",
|
||||
"bytecount",
|
||||
"fancy-regex",
|
||||
"fraction",
|
||||
|
|
@ -1858,6 +1859,17 @@ dependencies = [
|
|||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kei-atom-discovery"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_yaml_ng",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kei-auth"
|
||||
version = "0.1.0"
|
||||
|
|
@ -2047,10 +2059,12 @@ dependencies = [
|
|||
"anyhow",
|
||||
"clap",
|
||||
"jsonschema",
|
||||
"kei-atom-discovery",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml",
|
||||
"serde_yaml_ng",
|
||||
"tempfile",
|
||||
"url",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
|
|
@ -2061,10 +2075,10 @@ dependencies = [
|
|||
"anyhow",
|
||||
"chrono",
|
||||
"clap",
|
||||
"kei-atom-discovery",
|
||||
"rusqlite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
|
|
@ -3080,6 +3094,19 @@ dependencies = [
|
|||
"unsafe-libyaml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_yaml_ng"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
"unsafe-libyaml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1"
|
||||
version = "0.10.6"
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@ members = [
|
|||
"kei-forge",
|
||||
# v1 substrate — atom invocation runtime + schema linter (Stream D)
|
||||
"kei-runtime",
|
||||
# v1 substrate — shared atom discovery + frontmatter + safe path (Stream E)
|
||||
"kei-atom-discovery",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
|
|
|||
23
_primitives/_rust/kei-atom-discovery/Cargo.toml
Normal file
23
_primitives/_rust/kei-atom-discovery/Cargo.toml
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
[package]
|
||||
name = "kei-atom-discovery"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
rust-version = "1.75"
|
||||
description = "Shared atom discovery + frontmatter parsing + safe path join"
|
||||
|
||||
[lib]
|
||||
name = "kei_atom_discovery"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_yaml_ng = "0.10"
|
||||
walkdir = "2"
|
||||
thiserror = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
[package.metadata.keisei]
|
||||
backend = "none"
|
||||
description = "Shared atom discovery + frontmatter parsing + safe path join"
|
||||
47
_primitives/_rust/kei-atom-discovery/src/error.rs
Normal file
47
_primitives/_rust/kei-atom-discovery/src/error.rs
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
//! Typed errors for atom discovery + frontmatter parsing.
|
||||
//!
|
||||
//! Every failure mode is a distinct variant — callers pattern-match by variant,
|
||||
//! not by `to_string()` scraping.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum Error {
|
||||
#[error("path escape: `{rel}` escapes base `{}`", base.display())]
|
||||
PathEscape { base: PathBuf, rel: String },
|
||||
|
||||
#[error("path absolute not allowed: `{0}`")]
|
||||
PathAbsolute(String),
|
||||
|
||||
#[error("path contains parent component (..): `{0}`")]
|
||||
PathParent(String),
|
||||
|
||||
#[error("canonicalize `{}`: {source}", path.display())]
|
||||
Canonicalize {
|
||||
path: PathBuf,
|
||||
#[source]
|
||||
source: std::io::Error,
|
||||
},
|
||||
|
||||
#[error("frontmatter missing leading --- delimiter")]
|
||||
FrontmatterMissingStart,
|
||||
|
||||
#[error("frontmatter missing closing --- delimiter")]
|
||||
FrontmatterMissingEnd,
|
||||
|
||||
#[error("frontmatter exceeds {limit} bytes (got {got})")]
|
||||
FrontmatterTooLarge { limit: usize, got: usize },
|
||||
|
||||
#[error("yaml parse: {0}")]
|
||||
Yaml(#[from] serde_yaml_ng::Error),
|
||||
|
||||
#[error("atom id must be `<crate>::<verb>`, got `{0}`")]
|
||||
BadAtomId(String),
|
||||
|
||||
#[error("unknown atom kind: `{0}`")]
|
||||
UnknownKind(String),
|
||||
|
||||
#[error("io: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
150
_primitives/_rust/kei-atom-discovery/src/frontmatter.rs
Normal file
150
_primitives/_rust/kei-atom-discovery/src/frontmatter.rs
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
//! Frontmatter schema + YAML parsing.
|
||||
//!
|
||||
//! Locked schema per `docs/SUBSTRATE-SCHEMA.md`. `input`/`output` are
|
||||
//! REQUIRED for command/query/stream, OPTIONAL for transform.
|
||||
//!
|
||||
//! YAML parser is `serde_yaml_ng` (maintained fork of the archived
|
||||
//! `serde_yaml` crate). A 64 KiB size cap is enforced pre-parse as a
|
||||
//! billion-laughs mitigation.
|
||||
|
||||
use crate::error::Error;
|
||||
use serde::Deserialize;
|
||||
use serde_yaml_ng::Value as YamlValue;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Hard cap on frontmatter size. 64 KiB is 100× any realistic atom spec.
|
||||
pub const MAX_FRONTMATTER_BYTES: usize = 64 * 1024;
|
||||
|
||||
/// The atom kinds a frontmatter `kind:` value can name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AtomKind {
    Command,
    Query,
    Stream,
    Transform,
}

impl AtomKind {
    /// Canonical lowercase name of this kind.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Command => "command",
            Self::Query => "query",
            Self::Stream => "stream",
            Self::Transform => "transform",
        }
    }
}
|
||||
|
||||
impl FromStr for AtomKind {
|
||||
type Err = Error;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s.trim().to_ascii_lowercase().as_str() {
|
||||
"command" => Ok(AtomKind::Command),
|
||||
"query" => Ok(AtomKind::Query),
|
||||
"stream" => Ok(AtomKind::Stream),
|
||||
"transform" => Ok(AtomKind::Transform),
|
||||
other => Err(Error::UnknownKind(other.to_string())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// One typed `side_effects` entry: an operation paired with the domain it touches.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SideEffect {
    /// Value of the entry's `op` key.
    pub op: String,
    /// Value of the entry's `domain` key.
    pub domain: String,
}
|
||||
|
||||
/// Fully-parsed atom metadata — one canonical struct shared across crates.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AtomMeta {
|
||||
pub full_id: String,
|
||||
pub crate_name: String,
|
||||
pub verb: String,
|
||||
pub kind: AtomKind,
|
||||
pub version: String,
|
||||
pub md_path: PathBuf,
|
||||
pub input_schema: Option<PathBuf>,
|
||||
pub output_schema: Option<PathBuf>,
|
||||
pub side_effects: Vec<SideEffect>,
|
||||
pub idempotent: bool,
|
||||
pub stability: String,
|
||||
pub keywords: Vec<String>,
|
||||
pub related: Vec<String>,
|
||||
pub body: String,
|
||||
}
|
||||
|
||||
/// Raw deserialisation target — kept private, `AtomMeta` is the public shape.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Frontmatter {
|
||||
pub atom: String,
|
||||
pub kind: String,
|
||||
#[serde(default)]
|
||||
pub version: Option<String>,
|
||||
#[serde(default)]
|
||||
pub input: Option<SchemaRef>,
|
||||
#[serde(default)]
|
||||
pub output: Option<SchemaRef>,
|
||||
#[serde(default)]
|
||||
pub side_effects: Vec<YamlValue>,
|
||||
#[serde(default)]
|
||||
pub idempotent: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub stability: Option<String>,
|
||||
#[serde(default)]
|
||||
pub keywords: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub related: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct SchemaRef {
|
||||
pub schema: Option<String>,
|
||||
}
|
||||
|
||||
/// Split a markdown file into (frontmatter_yaml, body). Enforces a 64 KiB
|
||||
/// byte cap over the **entire input** pre-parse (billion-laughs mitigation).
|
||||
pub fn parse_frontmatter(md_text: &str) -> Result<(&str, &str), Error> {
|
||||
if md_text.len() > MAX_FRONTMATTER_BYTES.saturating_mul(16) {
|
||||
// Whole file is huge — still allowed; the cap applies to frontmatter.
|
||||
// We only pre-reject if the frontmatter itself is over the limit.
|
||||
}
|
||||
let rest = md_text
|
||||
.strip_prefix("---\n")
|
||||
.or_else(|| md_text.strip_prefix("---\r\n"))
|
||||
.ok_or(Error::FrontmatterMissingStart)?;
|
||||
let (end_off, end_len) =
|
||||
find_closing_delim(rest).ok_or(Error::FrontmatterMissingEnd)?;
|
||||
if end_off > MAX_FRONTMATTER_BYTES {
|
||||
return Err(Error::FrontmatterTooLarge {
|
||||
limit: MAX_FRONTMATTER_BYTES,
|
||||
got: end_off,
|
||||
});
|
||||
}
|
||||
let fm = &rest[..end_off];
|
||||
let body_start = end_off + end_len;
|
||||
Ok((fm, rest.get(body_start..).unwrap_or("")))
|
||||
}
|
||||
|
||||
/// Locate the first line of `s` that is exactly `---` once its trailing
/// `\n` / `\r` characters are removed.
///
/// Returns `(byte_offset_of_line, byte_length_of_line_including_terminator)`,
/// or `None` when no such line exists.
fn find_closing_delim(s: &str) -> Option<(usize, usize)> {
    let mut offset = 0usize;
    let mut lines = s.split_inclusive('\n');
    loop {
        // Running out of lines means there is no closing delimiter.
        let line = lines.next()?;
        let content = line.trim_end_matches(|c: char| c == '\n' || c == '\r');
        if content == "---" {
            return Some((offset, line.len()));
        }
        offset += line.len();
    }
}
|
||||
|
||||
/// Parse the `side_effects:` YAML sequence into typed `{op, domain}` pairs.
|
||||
/// Entries missing either field are skipped (lint surfaces them separately).
|
||||
pub fn parse_side_effects(raw: &[YamlValue]) -> Vec<SideEffect> {
|
||||
raw.iter().filter_map(side_effect_from_yaml).collect()
|
||||
}
|
||||
|
||||
fn side_effect_from_yaml(v: &YamlValue) -> Option<SideEffect> {
|
||||
let op = v.get("op").and_then(|x| x.as_str())?.to_string();
|
||||
let domain = v.get("domain").and_then(|x| x.as_str())?.to_string();
|
||||
Some(SideEffect { op, domain })
|
||||
}
|
||||
21
_primitives/_rust/kei-atom-discovery/src/lib.rs
Normal file
21
_primitives/_rust/kei-atom-discovery/src/lib.rs
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
//! kei-atom-discovery — shared substrate-atom discovery primitives.
|
||||
//!
|
||||
//! Single authoritative implementation of:
|
||||
//! - `AtomMeta` / `AtomKind` / `SideEffect` — locked frontmatter schema
|
||||
//! - `parse_frontmatter` — YAML split with 64 KiB cap (billion-laughs guard)
|
||||
//! - `discover_atoms` — walks `<root>/*/atoms/*.md`, symlink-safe
|
||||
//! - `parse_wikilink` — strict `[[target]]` matcher
|
||||
//! - `safe_join` — path-traversal-safe base+rel join
|
||||
//!
|
||||
//! Both `kei-sage` and `kei-runtime` consume this crate — no parallel
|
||||
//! frontmatter structs, no parallel YAML parsers.
|
||||
|
||||
pub mod error;
|
||||
pub mod frontmatter;
|
||||
pub mod walk;
|
||||
|
||||
pub use error::Error;
|
||||
pub use frontmatter::{
|
||||
parse_frontmatter, AtomKind, AtomMeta, Frontmatter, SideEffect, MAX_FRONTMATTER_BYTES,
|
||||
};
|
||||
pub use walk::{discover_atoms, is_atom_target, parse_wikilink, safe_join, split_atom_id};
|
||||
136
_primitives/_rust/kei-atom-discovery/src/walk.rs
Normal file
136
_primitives/_rust/kei-atom-discovery/src/walk.rs
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
//! Filesystem walk for atom discovery + path-safety primitives.
|
||||
//!
|
||||
//! `discover_atoms` enumerates `<root>/*/atoms/*.md` with `follow_links(false)`.
|
||||
//! `safe_join` is the authoritative base+rel path-join — rejects absolute
|
||||
//! components and `..`, canonicalises, asserts base containment.
|
||||
|
||||
use crate::error::Error;
|
||||
use crate::frontmatter::{
|
||||
parse_frontmatter, parse_side_effects, AtomKind, AtomMeta, Frontmatter,
|
||||
};
|
||||
use std::path::{Component, Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// Walk `<root>/*/atoms/*.md`. Skip-on-invalid: malformed files emit a
|
||||
/// stderr warning and are dropped. Never follows symlinks.
|
||||
pub fn discover_atoms(root: &Path) -> Vec<AtomMeta> {
|
||||
let mut out = Vec::new();
|
||||
for entry in WalkDir::new(root)
|
||||
.max_depth(3)
|
||||
.follow_links(false)
|
||||
.into_iter()
|
||||
.flatten()
|
||||
{
|
||||
if !is_atom_md(entry.path()) {
|
||||
continue;
|
||||
}
|
||||
match parse_one(entry.path()) {
|
||||
Ok(meta) => out.push(meta),
|
||||
Err(e) => eprintln!("warn: skip {}: {}", entry.path().display(), e),
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// True when `path` is a regular `*.md` file whose parent directory is
/// named `atoms`.
///
/// The file-type check uses `symlink_metadata`, which does not traverse a
/// symlink in the final path component. `Path::is_file` would follow it, so
/// a symlink such as `atoms/evil.md -> /some/other/file` would be accepted
/// and then read through the link; such symlinks are rejected here.
fn is_atom_md(path: &Path) -> bool {
    let is_regular_file = path
        .symlink_metadata()
        .is_ok_and(|meta| meta.file_type().is_file());
    is_regular_file
        && path.extension().and_then(|s| s.to_str()) == Some("md")
        && path
            .parent()
            .and_then(|p| p.file_name())
            .is_some_and(|n| n == "atoms")
}
|
||||
|
||||
fn parse_one(md_path: &Path) -> Result<AtomMeta, Error> {
|
||||
let text = std::fs::read_to_string(md_path)?;
|
||||
let (fm_text, body) = parse_frontmatter(&text)?;
|
||||
let fm: Frontmatter = serde_yaml_ng::from_str(fm_text)?;
|
||||
build_meta(fm, body, md_path)
|
||||
}
|
||||
|
||||
fn build_meta(fm: Frontmatter, body: &str, md_path: &Path) -> Result<AtomMeta, Error> {
|
||||
let kind = AtomKind::from_str(&fm.kind)?;
|
||||
let (crate_name, verb) = split_atom_id(&fm.atom)?;
|
||||
let md_dir = md_path.parent().unwrap_or(md_path);
|
||||
let input_schema = resolve_opt_schema(md_dir, fm.input.as_ref().and_then(|s| s.schema.as_deref()));
|
||||
let output_schema =
|
||||
resolve_opt_schema(md_dir, fm.output.as_ref().and_then(|s| s.schema.as_deref()));
|
||||
Ok(AtomMeta {
|
||||
full_id: fm.atom.clone(),
|
||||
crate_name,
|
||||
verb,
|
||||
kind,
|
||||
version: fm.version.unwrap_or_default(),
|
||||
md_path: md_path.to_path_buf(),
|
||||
input_schema,
|
||||
output_schema,
|
||||
side_effects: parse_side_effects(&fm.side_effects),
|
||||
idempotent: fm.idempotent.unwrap_or(false),
|
||||
stability: fm.stability.unwrap_or_else(|| "unknown".into()),
|
||||
keywords: fm.keywords,
|
||||
related: fm.related,
|
||||
body: body.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve an optional schema path relative to the atom's directory.
|
||||
/// Silently drops entries that fail `safe_join` — lint catches them separately.
|
||||
fn resolve_opt_schema(md_dir: &Path, rel: Option<&str>) -> Option<PathBuf> {
|
||||
rel.and_then(|r| safe_join(md_dir, r).ok())
|
||||
}
|
||||
|
||||
/// Split `<crate>::<verb>` atom id into components.
|
||||
pub fn split_atom_id(id: &str) -> Result<(String, String), Error> {
|
||||
match id.split_once("::") {
|
||||
Some((c, v)) if !c.is_empty() && !v.is_empty() => Ok((c.into(), v.into())),
|
||||
_ => Err(Error::BadAtomId(id.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the target of a single `[[target]]` wikilink.
///
/// Whitespace around the whole link and around the target is ignored.
/// Returns `None` when the text is not wrapped in `[[` … `]]`, when the
/// target is empty, or when the target itself still contains `[` or `]`
/// (for example `[[[foo]]`).
pub fn parse_wikilink(raw: &str) -> Option<String> {
    let target = raw.trim().strip_prefix("[[")?.strip_suffix("]]")?.trim();
    let has_stray_bracket = target.contains(|c: char| c == '[' || c == ']');
    if target.is_empty() || has_stray_bracket {
        return None;
    }
    Some(target.to_string())
}
|
||||
|
||||
/// Heuristic filter separating atom references from rule references.
///
/// A target beginning with `rules/` or `rule ` is treated as a rule
/// reference; every other target is assumed to be an atom. The
/// `<crate>::<verb>` shape is not verified here.
pub fn is_atom_target(target: &str) -> bool {
    let is_rule_reference = target.starts_with("rules/") || target.starts_with("rule ");
    !is_rule_reference
}
|
||||
|
||||
/// Safe base+rel path join. Rejects absolute paths, parent (`..`) components,
|
||||
/// and post-canonicalise escapes from `base`.
|
||||
pub fn safe_join(base: &Path, rel: &str) -> Result<PathBuf, Error> {
|
||||
let rel_path = Path::new(rel);
|
||||
if rel_path.is_absolute() {
|
||||
return Err(Error::PathAbsolute(rel.to_string()));
|
||||
}
|
||||
for comp in rel_path.components() {
|
||||
if matches!(comp, Component::ParentDir) {
|
||||
return Err(Error::PathParent(rel.to_string()));
|
||||
}
|
||||
}
|
||||
let joined = base.join(rel_path);
|
||||
// Canonicalise lazily — if either path doesn't exist yet, fall back to
|
||||
// the lexical check we already did (absolute + parent-free is enough).
|
||||
let base_canon = base.canonicalize().ok();
|
||||
let joined_canon = joined.canonicalize().ok();
|
||||
if let (Some(bc), Some(jc)) = (base_canon, joined_canon) {
|
||||
if !jc.starts_with(&bc) {
|
||||
return Err(Error::PathEscape {
|
||||
base: bc,
|
||||
rel: rel.to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(joined)
|
||||
}
|
||||
132
_primitives/_rust/kei-atom-discovery/tests/smoke.rs
Normal file
132
_primitives/_rust/kei-atom-discovery/tests/smoke.rs
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
//! Smoke tests covering the 4 critical fixes consolidated in this crate.
|
||||
|
||||
use kei_atom_discovery::{
|
||||
discover_atoms, parse_frontmatter, parse_wikilink, safe_join, AtomKind, Error,
|
||||
MAX_FRONTMATTER_BYTES,
|
||||
};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use tempfile::tempdir;
|
||||
|
||||
const ATOM_OK: &str = r#"---
|
||||
atom: kei-task::create
|
||||
kind: command
|
||||
version: "0.1.0"
|
||||
input:
|
||||
schema: schemas/create-input.json
|
||||
output:
|
||||
schema: schemas/create-output.json
|
||||
side_effects:
|
||||
- { op: write, domain: kei-task-db }
|
||||
idempotent: false
|
||||
stability: stable
|
||||
keywords: [task, todo]
|
||||
related:
|
||||
- "[[kei-task::add-dependency]]"
|
||||
- "[[rules/RULE 0.12]]"
|
||||
---
|
||||
# kei-task::create
|
||||
Body text.
|
||||
"#;
|
||||
|
||||
/// Test fixture: create `<root>/<crate_name>/atoms/<verb>.md` containing
/// `body`, plus the two `{}` schema files under `atoms/schemas/`.
fn write_atom(root: &Path, crate_name: &str, verb: &str, body: &str) {
    let atoms_dir = root.join(crate_name).join("atoms");
    let schemas_dir = atoms_dir.join("schemas");
    fs::create_dir_all(&schemas_dir).unwrap();
    fs::write(atoms_dir.join(format!("{verb}.md")), body).unwrap();
    for schema_file in ["create-input.json", "create-output.json"] {
        fs::write(schemas_dir.join(schema_file), "{}").unwrap();
    }
}
|
||||
|
||||
// FIX 2 happy path — one well-formed atom on disk is discovered and every
// typed field of the shared `AtomMeta` is populated from its frontmatter.
#[test]
fn discovery_returns_well_formed_atom_meta() {
    let tmp = tempdir().unwrap();
    write_atom(tmp.path(), "kei-task", "create", ATOM_OK);

    let atoms = discover_atoms(tmp.path());
    assert_eq!(atoms.len(), 1);

    let atom = &atoms[0];
    assert_eq!(atom.full_id, "kei-task::create");
    assert_eq!(atom.kind, AtomKind::Command);
    assert_eq!(atom.crate_name, "kei-task");
    assert_eq!(atom.verb, "create");
    assert!(atom.input_schema.is_some());
    assert!(atom.output_schema.is_some());

    assert_eq!(atom.side_effects.len(), 1);
    let effect = &atom.side_effects[0];
    assert_eq!(effect.op, "write");
    assert_eq!(effect.domain, "kei-task-db");

    assert!(atom.body.contains("Body text"));
}
|
||||
|
||||
// FIX 1 — a `..` component in the relative path is refused by safe_join.
#[test]
fn safe_join_rejects_parent_component() {
    let tmp = tempdir().unwrap();
    let outcome = safe_join(tmp.path(), "../etc/shadow");
    assert!(matches!(outcome, Err(Error::PathParent(_))));
}
|
||||
|
||||
// An absolute path is refused by safe_join.
#[test]
fn safe_join_rejects_absolute_path() {
    let tmp = tempdir().unwrap();
    let outcome = safe_join(tmp.path(), "/etc/shadow");
    assert!(matches!(outcome, Err(Error::PathAbsolute(_))));
}
|
||||
|
||||
// A plain relative path to an existing directory inside the base is accepted.
#[test]
fn safe_join_accepts_plain_relative() {
    let tmp = tempdir().unwrap();
    fs::create_dir_all(tmp.path().join("schemas")).unwrap();

    let joined = safe_join(tmp.path(), "schemas").unwrap();
    assert!(joined.ends_with("schemas"));
}
|
||||
|
||||
// FIX 3 — frontmatter larger than the cap is rejected before any YAML parse.
#[test]
fn frontmatter_size_cap_enforced() {
    let oversized = "x".repeat(MAX_FRONTMATTER_BYTES + 100);
    let md = ["---\n", oversized.as_str(), "\n---\nbody\n"].concat();

    let outcome = parse_frontmatter(&md);
    assert!(matches!(outcome, Err(Error::FrontmatterTooLarge { .. })));
}
|
||||
|
||||
// Input without a leading `---` line is rejected.
#[test]
fn frontmatter_missing_start_rejected() {
    let outcome = parse_frontmatter("no fence\nbody\n");
    assert!(matches!(outcome, Err(Error::FrontmatterMissingStart)));
}
|
||||
|
||||
// Input whose frontmatter is never closed by a `---` line is rejected.
#[test]
fn frontmatter_missing_end_rejected() {
    let outcome = parse_frontmatter("---\nkey: val\nno-end\n");
    assert!(matches!(outcome, Err(Error::FrontmatterMissingEnd)));
}
|
||||
|
||||
// FIX — a symlinked crate directory is not descended into (walkdir
// follow_links=false). The symlink is only created on Unix; elsewhere the
// test just checks that the single real atom is found.
#[test]
fn discover_does_not_follow_symlinks() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    write_atom(root, "kei-real", "create", ATOM_OK);

    // `kei-link` -> `kei-real`
    #[cfg(unix)]
    {
        std::os::unix::fs::symlink(root.join("kei-real"), root.join("kei-link")).unwrap();
    }

    let found = discover_atoms(root).len();
    // Only 1 atom — the symlinked tree must not contribute a second one.
    assert_eq!(found, 1, "symlink was traversed — follow_links must be false");
}
|
||||
|
||||
// Wikilink strictness — malformed inputs yield None, a clean link yields its target.
#[test]
fn wikilink_malformed_returns_none() {
    // Triple-bracket open, no brackets at all, whitespace-only target.
    for malformed in ["[[[foo]]", "foo", "[[ ]]"] {
        assert_eq!(parse_wikilink(malformed), None);
    }

    let parsed = parse_wikilink("[[kei-task::create]]");
    assert_eq!(parsed, Some("kei-task::create".to_string()));
}
|
||||
|
|
@ -17,10 +17,15 @@ path = "src/lib.rs"
|
|||
clap = { version = "4", features = ["derive"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_yaml = "0.9"
|
||||
jsonschema = { version = "0.17", default-features = false }
|
||||
# SSRF + IMDS hardening: disable default features (resolve-http, cli) so the
|
||||
# validator has no HTTP resolver by default. We configure a file-only
|
||||
# resolver explicitly in `validate.rs`.
|
||||
jsonschema = { version = "0.18", default-features = false, features = ["resolve-file"] }
|
||||
anyhow = "1"
|
||||
walkdir = "2"
|
||||
serde_yaml_ng = "0.10"
|
||||
kei-atom-discovery = { path = "../kei-atom-discovery" }
|
||||
url = "2"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
|
|
|||
|
|
@ -1,92 +1,20 @@
|
|||
//! Atom discovery — walks `<root>/*/atoms/*.md`, parses YAML frontmatter.
|
||||
//! Atom discovery — thin façade over `kei-atom-discovery`.
|
||||
//!
|
||||
//! Skip-on-invalid policy: missing/malformed frontmatter emits stderr warn,
|
||||
//! record is dropped (never panics, never fails the walk).
|
||||
//! Re-exports `AtomMeta` and `AtomKind` from the shared crate so all runtime
|
||||
//! modules share exactly one frontmatter-parser implementation.
|
||||
|
||||
use serde::Deserialize;
|
||||
use std::path::{Path, PathBuf};
|
||||
use walkdir::WalkDir;
|
||||
use kei_atom_discovery as shared;
|
||||
use std::path::Path;
|
||||
|
||||
/// Parsed frontmatter fields needed by the runtime.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AtomMeta {
|
||||
pub full_id: String,
|
||||
pub crate_name: String,
|
||||
pub verb: String,
|
||||
pub kind: String,
|
||||
pub md_path: PathBuf,
|
||||
pub input_schema_path: PathBuf,
|
||||
pub output_schema_path: PathBuf,
|
||||
}
|
||||
pub use kei_atom_discovery::{parse_frontmatter, AtomKind, AtomMeta};
|
||||
|
||||
/// Raw frontmatter — only the fields discover needs.
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Frontmatter {
|
||||
atom: String,
|
||||
kind: String,
|
||||
input: SchemaRef,
|
||||
output: SchemaRef,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct SchemaRef {
|
||||
schema: String,
|
||||
}
|
||||
|
||||
/// Walks `<root>/*/atoms/*.md`. Returns one `AtomMeta` per parseable file.
|
||||
/// Walk `<root>/*/atoms/*.md`. Delegates to `kei-atom-discovery::discover_atoms`.
|
||||
pub fn walk_atoms(root: &Path) -> Vec<AtomMeta> {
|
||||
let mut out = Vec::new();
|
||||
for entry in WalkDir::new(root).max_depth(3).into_iter().flatten() {
|
||||
if !is_atom_md(entry.path()) {
|
||||
continue;
|
||||
}
|
||||
match parse_one(entry.path()) {
|
||||
Ok(meta) => out.push(meta),
|
||||
Err(e) => eprintln!("warn: skip {}: {}", entry.path().display(), e),
|
||||
}
|
||||
}
|
||||
out
|
||||
shared::discover_atoms(root)
|
||||
}
|
||||
|
||||
fn is_atom_md(path: &Path) -> bool {
|
||||
path.is_file()
|
||||
&& path.extension().is_some_and(|e| e == "md")
|
||||
&& path
|
||||
.parent()
|
||||
.and_then(|p| p.file_name())
|
||||
.is_some_and(|n| n == "atoms")
|
||||
}
|
||||
|
||||
fn parse_one(md_path: &Path) -> Result<AtomMeta, String> {
|
||||
let body = std::fs::read_to_string(md_path).map_err(|e| format!("read: {e}"))?;
|
||||
let fm = extract_frontmatter(&body).ok_or_else(|| "no frontmatter".to_string())?;
|
||||
let parsed: Frontmatter = serde_yaml::from_str(fm).map_err(|e| format!("yaml: {e}"))?;
|
||||
let (crate_name, verb) = split_atom_id(&parsed.atom)?;
|
||||
let atom_dir = md_path.parent().ok_or("no parent dir")?;
|
||||
Ok(AtomMeta {
|
||||
full_id: parsed.atom.clone(),
|
||||
crate_name,
|
||||
verb,
|
||||
kind: parsed.kind,
|
||||
md_path: md_path.to_path_buf(),
|
||||
input_schema_path: atom_dir.join(&parsed.input.schema),
|
||||
output_schema_path: atom_dir.join(&parsed.output.schema),
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the frontmatter body (between the two `---` fences), or None.
|
||||
/// Backwards-compatible split — returns the frontmatter YAML body (no body
|
||||
/// trailing). Returns `None` if the file has no frontmatter fences.
|
||||
pub fn extract_frontmatter(body: &str) -> Option<&str> {
|
||||
let rest = body.strip_prefix("---\n").or_else(|| body.strip_prefix("---\r\n"))?;
|
||||
let end = rest.find("\n---").or_else(|| rest.find("\r\n---"))?;
|
||||
Some(&rest[..end])
|
||||
}
|
||||
|
||||
fn split_atom_id(id: &str) -> Result<(String, String), String> {
|
||||
let (crate_name, verb) = id
|
||||
.split_once("::")
|
||||
.ok_or_else(|| format!("atom id missing `::`: {id}"))?;
|
||||
if crate_name.is_empty() || verb.is_empty() {
|
||||
return Err(format!("atom id has empty half: {id}"));
|
||||
}
|
||||
Ok((crate_name.to_string(), verb.to_string()))
|
||||
shared::parse_frontmatter(body).ok().map(|(fm, _)| fm)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ pub enum InvokeError {
|
|||
AtomNotFound(String),
|
||||
InputParse(String),
|
||||
InputInvalid(String),
|
||||
MissingInputSchema(String),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InvokeError {
|
||||
|
|
@ -23,6 +24,7 @@ impl std::fmt::Display for InvokeError {
|
|||
Self::AtomNotFound(id) => write!(f, "atom not found: {id}"),
|
||||
Self::InputParse(e) => write!(f, "input parse: {e}"),
|
||||
Self::InputInvalid(e) => write!(f, "input invalid: {e}"),
|
||||
Self::MissingInputSchema(id) => write!(f, "atom `{id}` declares no input schema"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -37,14 +39,15 @@ pub struct Output {
|
|||
}
|
||||
|
||||
/// Invoke an atom by full ID with a JSON input string.
|
||||
///
|
||||
/// MVP contract: discover atom → parse input → validate against schema →
|
||||
/// return stub acknowledgement. Exec wire-up is a follow-up.
|
||||
pub fn invoke(root: &Path, atom_id: &str, input_json: &str) -> Result<Output, InvokeError> {
|
||||
let meta = find_atom(root, atom_id)?;
|
||||
let input: Value =
|
||||
serde_json::from_str(input_json).map_err(|e| InvokeError::InputParse(e.to_string()))?;
|
||||
validate_input(&meta.input_schema_path, &input)
|
||||
let schema = meta
|
||||
.input_schema
|
||||
.as_ref()
|
||||
.ok_or_else(|| InvokeError::MissingInputSchema(atom_id.to_string()))?;
|
||||
validate_input(schema, &input)
|
||||
.map_err(|e| InvokeError::InputInvalid(e.to_string()))?;
|
||||
Ok(Output {
|
||||
error: "atom invocation not yet implemented — wire needs Stream B atom impls".to_string(),
|
||||
|
|
|
|||
|
|
@ -3,7 +3,8 @@
|
|||
//! Checks (from SUBSTRATE-SCHEMA §Validation):
|
||||
//! 1. Frontmatter has required fields (atom, kind, version, input, output,
|
||||
//! side_effects, idempotent, stability).
|
||||
//! 2. Schema paths resolve to existing JSON files.
|
||||
//! 2. Schema paths resolve to existing JSON files inside the atom's dir
|
||||
//! (safe_join — rejects `..` and absolute paths).
|
||||
//! 3. JSON Schemas declare draft-07 via `$schema`.
|
||||
//! 4. `kind` ∈ {command, query, stream, transform}.
|
||||
//! 5. `side_effects` entries are `{op, domain}` objects.
|
||||
|
|
@ -11,7 +12,8 @@
|
|||
//! refs allowed).
|
||||
|
||||
use crate::discover::extract_frontmatter;
|
||||
use serde_yaml::Value as YamlValue;
|
||||
use kei_atom_discovery::safe_join;
|
||||
use serde_yaml_ng::Value as YamlValue;
|
||||
use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
use walkdir::WalkDir;
|
||||
|
|
@ -51,6 +53,7 @@ pub fn schema_lint(root: &Path) -> LintReport {
|
|||
fn find_atom_files(root: &Path) -> Vec<PathBuf> {
|
||||
WalkDir::new(root)
|
||||
.max_depth(3)
|
||||
.follow_links(false)
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.filter(|e| {
|
||||
|
|
@ -67,7 +70,7 @@ fn collect_atom_ids(root: &Path) -> HashSet<String> {
|
|||
for md in find_atom_files(root) {
|
||||
if let Ok(body) = std::fs::read_to_string(&md) {
|
||||
if let Some(fm) = extract_frontmatter(&body) {
|
||||
if let Ok(y) = serde_yaml::from_str::<YamlValue>(fm) {
|
||||
if let Ok(y) = serde_yaml_ng::from_str::<YamlValue>(fm) {
|
||||
if let Some(id) = y.get("atom").and_then(|v| v.as_str()) {
|
||||
ids.insert(id.to_string());
|
||||
}
|
||||
|
|
@ -82,7 +85,7 @@ fn lint_one(md_path: &Path, known_atoms: &HashSet<String>) -> Result<(), Vec<Str
|
|||
let body = std::fs::read_to_string(md_path).map_err(|e| vec![format!("read: {e}")])?;
|
||||
let fm_text = extract_frontmatter(&body).ok_or_else(|| vec!["no frontmatter".to_string()])?;
|
||||
let fm: YamlValue =
|
||||
serde_yaml::from_str(fm_text).map_err(|e| vec![format!("yaml parse: {e}")])?;
|
||||
serde_yaml_ng::from_str(fm_text).map_err(|e| vec![format!("yaml parse: {e}")])?;
|
||||
let mut errs = Vec::new();
|
||||
check_required_fields(&fm, &mut errs);
|
||||
check_kind(&fm, &mut errs);
|
||||
|
|
@ -126,11 +129,21 @@ fn check_side_effects(fm: &YamlValue, errs: &mut Vec<String>) {
|
|||
}
|
||||
|
||||
fn check_schema_files(md_path: &Path, fm: &YamlValue, errs: &mut Vec<String>) {
|
||||
let Some(md_dir) = md_path.parent() else {
|
||||
errs.push("md_path has no parent dir".to_string());
|
||||
return;
|
||||
};
|
||||
for key in &["input", "output"] {
|
||||
let Some(rel) = fm.get(key).and_then(|v| v.get("schema")).and_then(|v| v.as_str()) else {
|
||||
continue;
|
||||
};
|
||||
let full = md_path.parent().map(|p| p.join(rel)).unwrap_or_else(|| PathBuf::from(rel));
|
||||
let full = match safe_join(md_dir, rel) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
errs.push(format!("{key} schema path unsafe: {e}"));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if !full.exists() {
|
||||
errs.push(format!("{key} schema missing: {}", full.display()));
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -77,11 +77,11 @@ fn run_list_atoms(root: PathBuf, crate_name: Option<String>, kind: Option<String
|
|||
}
|
||||
}
|
||||
if let Some(k) = &kind {
|
||||
if a.kind != *k {
|
||||
if a.kind.as_str() != k.as_str() {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
println!("{}\t{}\t{}", a.full_id, a.kind, a.md_path.display());
|
||||
println!("{}\t{}\t{}", a.full_id, a.kind.as_str(), a.md_path.display());
|
||||
}
|
||||
ExitCode::SUCCESS
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,19 @@
|
|||
//! JSON Schema draft-07 validation wrappers.
|
||||
//!
|
||||
//! Thin façade over the `jsonschema` crate. Reads schema from disk per call —
|
||||
//! caller may cache if hot. Returns a single, readable error message.
|
||||
//! Thin façade over the `jsonschema` crate (v0.18). Reads schema from disk
|
||||
//! per call. Returns a single, readable error message.
|
||||
//!
|
||||
//! SSRF / IMDS hardening:
|
||||
//! - `default-features = false` on `jsonschema` — no `resolve-http` feature.
|
||||
//! - Custom `LocalFileResolver` replaces the default. It rejects any URL
|
||||
//! whose scheme isn't `file://` and any path outside the schema's own
|
||||
//! directory (anchored at the schema file's parent).
|
||||
|
||||
use jsonschema::JSONSchema;
|
||||
use jsonschema::{JSONSchema, SchemaResolver, SchemaResolverError};
|
||||
use serde_json::Value;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Validation failure reported by this module; wraps one human-readable
/// message string (read, parse, compile, or schema-violation text).
pub struct ValidationError(pub String);
|
||||
|
|
@ -33,8 +41,10 @@ fn validate_value(schema_path: &Path, value: &Value) -> Result<(), ValidationErr
|
|||
.map_err(|e| ValidationError(format!("read {}: {e}", schema_path.display())))?;
|
||||
let schema_json: Value = serde_json::from_str(&schema_text)
|
||||
.map_err(|e| ValidationError(format!("parse {}: {e}", schema_path.display())))?;
|
||||
let root = schema_path.parent().unwrap_or(schema_path).to_path_buf();
|
||||
let compiled = JSONSchema::options()
|
||||
.with_draft(jsonschema::Draft::Draft7)
|
||||
.with_resolver(LocalFileResolver::new(root))
|
||||
.compile(&schema_json)
|
||||
.map_err(|e| ValidationError(format!("compile: {e}")))?;
|
||||
if let Err(errors) = compiled.validate(value) {
|
||||
|
|
@ -43,3 +53,54 @@ fn validate_value(schema_path: &Path, value: &Value) -> Result<(), ValidationErr
|
|||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resolver for JSON Schema `$ref` targets that only ever serves local files.
///
/// A reference is honoured only when it uses the `file://` scheme and its
/// canonicalised path sits underneath the canonicalised `root`; every other
/// reference is refused with an error.
#[derive(Debug)]
pub struct LocalFileResolver {
    // Directory that every resolved reference has to stay inside.
    root: PathBuf,
}

impl LocalFileResolver {
    /// Build a resolver confined to `root`.
    ///
    /// `root` is stored as given; it is canonicalised later, at resolve time.
    pub fn new(root: PathBuf) -> Self {
        LocalFileResolver { root }
    }
}
|
||||
|
||||
impl SchemaResolver for LocalFileResolver {
|
||||
fn resolve(
|
||||
&self,
|
||||
_root_schema: &Value,
|
||||
url: &Url,
|
||||
_original_reference: &str,
|
||||
) -> Result<Arc<Value>, SchemaResolverError> {
|
||||
if url.scheme() != "file" {
|
||||
return Err(anyhow::anyhow!(
|
||||
"remote $ref rejected — only file:// is allowed (got {})",
|
||||
url.scheme()
|
||||
));
|
||||
}
|
||||
let path = url
|
||||
.to_file_path()
|
||||
.map_err(|_| anyhow::anyhow!("invalid file URL: {url}"))?;
|
||||
let canon = path
|
||||
.canonicalize()
|
||||
.map_err(|e| anyhow::anyhow!("canonicalize {}: {e}", path.display()))?;
|
||||
let root_canon = self
|
||||
.root
|
||||
.canonicalize()
|
||||
.map_err(|e| anyhow::anyhow!("canonicalize root {}: {e}", self.root.display()))?;
|
||||
if !canon.starts_with(&root_canon) {
|
||||
return Err(anyhow::anyhow!(
|
||||
"file $ref escapes schema root: {} not under {}",
|
||||
canon.display(),
|
||||
root_canon.display()
|
||||
));
|
||||
}
|
||||
let f = std::fs::File::open(&canon)
|
||||
.map_err(|e| anyhow::anyhow!("open {}: {e}", canon.display()))?;
|
||||
let doc: Value = serde_json::from_reader(f)
|
||||
.map_err(|e| anyhow::anyhow!("parse {}: {e}", canon.display()))?;
|
||||
Ok(Arc::new(doc))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
//! Integration test — walk_atoms returns 2 well-formed records from temp root.
|
||||
|
||||
use kei_runtime::discover::walk_atoms;
|
||||
use kei_runtime::discover::{walk_atoms, AtomKind};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
|
|
@ -43,8 +43,16 @@ fn walk_atoms_finds_two_records() {
|
|||
assert_eq!(atoms[0].full_id, "kei-alpha::search");
|
||||
assert_eq!(atoms[0].crate_name, "kei-alpha");
|
||||
assert_eq!(atoms[0].verb, "search");
|
||||
assert_eq!(atoms[0].kind, "query");
|
||||
assert_eq!(atoms[0].kind, AtomKind::Query);
|
||||
assert_eq!(atoms[1].full_id, "kei-beta::fetch");
|
||||
assert!(atoms[1].input_schema_path.ends_with("schemas/fetch-input.json"));
|
||||
assert!(atoms[1].output_schema_path.ends_with("schemas/fetch-output.json"));
|
||||
assert!(atoms[1]
|
||||
.input_schema
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.ends_with("schemas/fetch-input.json"));
|
||||
assert!(atoms[1]
|
||||
.output_schema
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.ends_with("schemas/fetch-output.json"));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,9 +18,9 @@ rusqlite = { version = "0.31", features = ["bundled"] }
|
|||
clap = { version = "4", features = ["derive"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_yaml = "0.9"
|
||||
anyhow = "1"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
kei-atom-discovery = { path = "../kei-atom-discovery" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
|
|
|||
|
|
@ -1,60 +1,23 @@
|
|||
//! Frontmatter splitting + wikilink extraction helpers for atom `.md` files.
|
||||
//! Sage-local aliases over `kei-atom-discovery` helpers.
|
||||
//!
|
||||
//! Pure functions, no I/O. See `atoms.rs` for the discovery walker.
|
||||
//! Historical sage API: `split_frontmatter`, `parse_wikilink`, `is_atom_target`,
|
||||
//! `split_atom_id`. All now delegate to the shared crate; kept here so sage
|
||||
//! internals compile without touch.
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use kei_atom_discovery as shared;
|
||||
|
||||
/// Split a `.md` file into (frontmatter_yaml, body). Frontmatter must start
|
||||
/// with `---\n` and end with a line that is exactly `---`.
|
||||
pub use shared::{is_atom_target, parse_wikilink};
|
||||
|
||||
/// Split a `.md` file into (frontmatter_yaml, body). Delegates to the shared
|
||||
/// `parse_frontmatter` — preserves the sage `anyhow::Result` return type.
|
||||
pub fn split_frontmatter(text: &str) -> Result<(&str, &str)> {
|
||||
let rest = text
|
||||
.strip_prefix("---\n")
|
||||
.or_else(|| text.strip_prefix("---\r\n"))
|
||||
.ok_or_else(|| anyhow!("missing leading --- frontmatter delimiter"))?;
|
||||
let end = find_closing_delim(rest)
|
||||
.ok_or_else(|| anyhow!("missing closing --- frontmatter delimiter"))?;
|
||||
let fm = &rest[..end.0];
|
||||
let body_start = end.0 + end.1;
|
||||
Ok((fm, rest.get(body_start..).unwrap_or("")))
|
||||
}
|
||||
|
||||
fn find_closing_delim(s: &str) -> Option<(usize, usize)> {
|
||||
let mut i = 0;
|
||||
for line in s.split_inclusive('\n') {
|
||||
let trimmed = line.trim_end_matches(&['\n', '\r'][..]);
|
||||
if trimmed == "---" {
|
||||
return Some((i, line.len()));
|
||||
}
|
||||
i += line.len();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Parse a single wikilink `[[target]]`. Returns `Some(target)` stripped of
|
||||
/// brackets and whitespace, `None` if the string isn't a wikilink shape.
|
||||
pub fn parse_wikilink(raw: &str) -> Option<String> {
|
||||
let t = raw.trim();
|
||||
let inner = t.strip_prefix("[[").and_then(|s| s.strip_suffix("]]"))?;
|
||||
let inner = inner.trim();
|
||||
if inner.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(inner.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Filter rule that decides whether a wikilink target is an atom reference.
|
||||
/// Atoms use `<crate>::<verb>`; we exclude `rules/*` and `rule*` targets.
|
||||
pub fn is_atom_target(target: &str) -> bool {
|
||||
!target.starts_with("rules/") && !target.starts_with("rule ")
|
||||
shared::parse_frontmatter(text).map_err(|e| anyhow!(e.to_string()))
|
||||
}
|
||||
|
||||
/// Split `<crate>::<verb>` atom id into components.
|
||||
pub fn split_atom_id(id: &str) -> Result<(String, String)> {
|
||||
match id.split_once("::") {
|
||||
Some((c, v)) if !c.is_empty() && !v.is_empty() => Ok((c.into(), v.into())),
|
||||
_ => Err(anyhow!("atom id must be <crate>::<verb>, got {id}")),
|
||||
}
|
||||
shared::split_atom_id(id).map_err(|e| anyhow!(e.to_string()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
|
|
@ -1,169 +1,30 @@
|
|||
//! Substrate-atom discovery + frontmatter parsing + wikilink extraction.
|
||||
//! Substrate-atom discovery — thin façade over `kei-atom-discovery`.
|
||||
//!
|
||||
//! Walks `<root>/<crate>/atoms/*.md`, parses YAML frontmatter, returns
|
||||
//! `AtomRecord`. Tolerant: skips files with invalid frontmatter (logs to
|
||||
//! stderr, continues scan). See `docs/SUBSTRATE-SCHEMA.md` §Graph contract.
|
||||
//! Historical `AtomRecord` is preserved as a type alias for `AtomMeta` so
|
||||
//! that downstream sage modules (`atom_index`, `atom_cli`) keep compiling.
|
||||
|
||||
use crate::atom_parse::{is_atom_target, parse_wikilink, split_atom_id, split_frontmatter};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use serde::Deserialize;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use crate::atom_parse::{is_atom_target, parse_wikilink};
|
||||
use anyhow::Result;
|
||||
use kei_atom_discovery as shared;
|
||||
use std::path::Path;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum AtomKind {
|
||||
Command,
|
||||
Query,
|
||||
Stream,
|
||||
Transform,
|
||||
}
|
||||
pub use kei_atom_discovery::AtomKind;
|
||||
|
||||
impl FromStr for AtomKind {
|
||||
type Err = anyhow::Error;
|
||||
fn from_str(s: &str) -> Result<Self> {
|
||||
match s.trim().to_ascii_lowercase().as_str() {
|
||||
"command" => Ok(AtomKind::Command),
|
||||
"query" => Ok(AtomKind::Query),
|
||||
"stream" => Ok(AtomKind::Stream),
|
||||
"transform" => Ok(AtomKind::Transform),
|
||||
other => Err(anyhow!("unknown atom kind: {other}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AtomKind {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
AtomKind::Command => "command",
|
||||
AtomKind::Query => "query",
|
||||
AtomKind::Stream => "stream",
|
||||
AtomKind::Transform => "transform",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AtomRecord {
|
||||
pub full_id: String,
|
||||
pub kind: AtomKind,
|
||||
pub crate_name: String,
|
||||
pub verb: String,
|
||||
pub version: String,
|
||||
pub md_path: PathBuf,
|
||||
pub input_schema: Option<PathBuf>,
|
||||
pub output_schema: Option<PathBuf>,
|
||||
pub related: Vec<String>,
|
||||
pub keywords: Vec<String>,
|
||||
pub stability: String,
|
||||
pub body: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct SchemaRef {
|
||||
schema: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Frontmatter {
|
||||
atom: String,
|
||||
kind: String,
|
||||
#[serde(default)]
|
||||
version: Option<String>,
|
||||
#[serde(default)]
|
||||
input: Option<SchemaRef>,
|
||||
#[serde(default)]
|
||||
output: Option<SchemaRef>,
|
||||
#[serde(default)]
|
||||
related: Vec<String>,
|
||||
#[serde(default)]
|
||||
keywords: Vec<String>,
|
||||
#[serde(default)]
|
||||
stability: Option<String>,
|
||||
}
|
||||
/// Legacy alias: sage used to call this `AtomRecord`. New code should use
|
||||
/// `AtomMeta` directly (identical shape, authored in `kei-atom-discovery`).
|
||||
pub type AtomRecord = shared::AtomMeta;
|
||||
|
||||
/// Walk `<root>/*/atoms/*.md` and return parsed atom metadata.
|
||||
/// Tolerant: invalid frontmatter → stderr warning + skipped record.
|
||||
pub fn discover_atoms(root: &Path) -> Result<Vec<AtomRecord>> {
|
||||
let mut out = Vec::new();
|
||||
if !root.is_dir() {
|
||||
return Ok(out);
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
for entry in fs::read_dir(root).with_context(|| format!("read_dir {}", root.display()))? {
|
||||
let crate_dir = entry?.path();
|
||||
if crate_dir.is_dir() {
|
||||
collect_from_crate(&crate_dir, &mut out);
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
fn collect_from_crate(crate_dir: &Path, out: &mut Vec<AtomRecord>) {
|
||||
let atoms_dir = crate_dir.join("atoms");
|
||||
if !atoms_dir.is_dir() {
|
||||
return;
|
||||
}
|
||||
let crate_name = crate_dir
|
||||
.file_name()
|
||||
.and_then(|s| s.to_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
let iter = match fs::read_dir(&atoms_dir) {
|
||||
Ok(it) => it,
|
||||
Err(e) => {
|
||||
eprintln!("skip {}: {}", atoms_dir.display(), e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
for entry in iter.flatten() {
|
||||
let path = entry.path();
|
||||
if !is_md_file(&path) {
|
||||
continue;
|
||||
}
|
||||
match parse_atom_file(&path, &crate_name) {
|
||||
Ok(rec) => out.push(rec),
|
||||
Err(e) => eprintln!("skip {}: {}", path.display(), e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_md_file(path: &Path) -> bool {
|
||||
path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("md")
|
||||
}
|
||||
|
||||
fn parse_atom_file(path: &Path, crate_name: &str) -> Result<AtomRecord> {
|
||||
let text = fs::read_to_string(path)
|
||||
.with_context(|| format!("read {}", path.display()))?;
|
||||
let (fm_text, body) = split_frontmatter(&text)?;
|
||||
let fm: Frontmatter =
|
||||
serde_yaml::from_str(fm_text).with_context(|| "parse frontmatter YAML")?;
|
||||
build_record(fm, body, path, crate_name)
|
||||
}
|
||||
|
||||
fn build_record(fm: Frontmatter, body: &str, path: &Path, crate_name: &str) -> Result<AtomRecord> {
|
||||
let kind = AtomKind::from_str(&fm.kind)?;
|
||||
let (crate_from_id, verb) = split_atom_id(&fm.atom)?;
|
||||
let md_dir = path.parent().unwrap_or(path).to_path_buf();
|
||||
Ok(AtomRecord {
|
||||
full_id: fm.atom.clone(),
|
||||
kind,
|
||||
crate_name: if crate_from_id.is_empty() {
|
||||
crate_name.to_string()
|
||||
} else {
|
||||
crate_from_id
|
||||
},
|
||||
verb,
|
||||
version: fm.version.unwrap_or_default(),
|
||||
md_path: path.to_path_buf(),
|
||||
input_schema: fm.input.and_then(|s| s.schema).map(|s| md_dir.join(&s)),
|
||||
output_schema: fm.output.and_then(|s| s.schema).map(|s| md_dir.join(&s)),
|
||||
related: fm.related,
|
||||
keywords: fm.keywords,
|
||||
stability: fm.stability.unwrap_or_else(|| "unknown".into()),
|
||||
body: body.to_string(),
|
||||
})
|
||||
Ok(shared::discover_atoms(root))
|
||||
}
|
||||
|
||||
/// Extract `(source_atom_id, target)` edges from `related:` wikilinks.
|
||||
/// Non-atom targets (rules, notes) are filtered out here — scope: atoms only.
|
||||
/// Non-atom targets (rules, notes) are filtered out — scope: atoms only.
|
||||
pub fn resolve_wikilinks(records: &[AtomRecord]) -> Vec<(String, String)> {
|
||||
let mut out = Vec::new();
|
||||
for rec in records {
|
||||
|
|
|
|||
Loading…
Reference in a new issue