fix(security): cortex /term env_clear + bind guard, agent-stub-scan stdin, magiclink revoke

Three independent security hardenings from cross-cutting audits. 1. cortex /term PTY env leak + bind guard (HIGH — Sonnet Cross-cutting + Opus) - kei-cortex/src/handlers/term_pty.rs: PTY spawn was inheriting daemon's full process env (KEI_AUTH_KEY, ANTHROPIC_API_KEY, FAL_KEY, etc.) into every authenticated /term shell. Combined with default cors_origin = https://keisei.app, one stored XSS on keisei.app + one bearer token = full local shell with all daemon secrets. Added apply_safe_env() helper: env_clear() + re-set only HOME, PATH, USER, LANG, TERM. Spawn helper invokes it before spawn_command. - kei-cortex/src/main.rs: extracted build_config() helper; added enforce_loopback_or_local_cors() guard called before serve.bind. Refuses to start if bind addr is non-loopback AND cors_origin is a public domain — prevents the XSS-to-shell scenario in production. 2. agent-stub-scan.sh stdin parsing (HIGH — multiple audits) - hooks/agent-stub-scan.sh: previously read $CLAUDE_AGENT_TRANSCRIPT env var which Claude Code does NOT set on PostToolUse:Agent. Hook silently exited 0 — RULE 0.16 enforcement was dead-code in production. Rewrote to read stdin JSON via jq, flatten .tool_response recursively (string|array|object via the same pattern as agent-event-done.sh), guard on .tool_name == "Agent" and command -v jq. Maintained WARN-tier exit-0 with TODO marker for ENFORCE flip on 2026-05-05 (per RULE 0.16 §2 ladder). 3. magiclink revoke() silent no-op (HIGH — Opus Rust + Sonnet Cross-cutting) - kei-auth-magiclink/src/{error,provider}.rs: revoke() previously returned Ok(()) without doing anything. Operators expecting "revoke a session" semantics from the AuthProvider trait got false success. Stolen magic- link URLs remained valid until the 15-minute TTL. Added Error::Unsupported variant. revoke() now returns Err(Unsupported(...)) with explicit guidance: "rotate KEI_MAGICLINK_HMAC_ KEY to invalidate all live tokens, or maintain a deny-list at the caller layer". Test provider_revoke_returns_unsupported_error confirms the error variant is wired. Tests: cargo check + cargo test both PASS. 444 functional tests across kei-cortex (428 lib) + kei-auth-magiclink (16 lib + smoke). Pre-existing openai_loop_wiring.rs 502 failures in routes/openai/{chat,responses}.rs are NOT introduced by these fixes — separate unrelated triage. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 15:38:23 +08:00 · 2026-05-03 15:38:23 +08:00 · c0d900a943
commit c0d900a943
parent cf91956001
6 changed files with 141 additions and 31 deletions
--- a/_primitives/_rust/kei-auth-magiclink/src/error.rs
+++ b/_primitives/_rust/kei-auth-magiclink/src/error.rs
@ -34,6 +34,12 @@ pub enum Error {
    /// DNA build / parse error from `kei_runtime_core::dna`.
    #[error("magiclink dna error: {0}")]
    Dna(String),
+
+    /// Operation is not supported by this provider — surfaced when a caller
+    /// asks for behaviour that the underlying primitive cannot deliver
+    /// (e.g. server-side revocation of a stateless HMAC token).
+    #[error("magiclink unsupported: {0}")]
+    Unsupported(String),
 }

 pub type Result<T> = std::result::Result<T, Error>;
--- a/_primitives/_rust/kei-auth-magiclink/src/provider.rs
+++ b/_primitives/_rust/kei-auth-magiclink/src/provider.rs
@ -132,9 +132,20 @@ impl AuthProvider for MagicLinkProvider {
    }

    async fn revoke(&self, _session: &Dna) -> kei_runtime_core::Result<()> {
-        // v0.1: stateless tokens have no server-side revocation.
-        // Callers maintain a deny-list externally if needed.
-        Ok(())
+        // SECURITY: do NOT silently return Ok here. Stateless HMAC tokens
+        // cannot be server-side revoked, and a silent Ok() would lie to
+        // every caller that thought it had killed a session. Surface the
+        // truth so callers can either rotate the HMAC key (kills ALL live
+        // tokens) or maintain an external deny-list keyed on the token's
+        // first two parts (email + expiry).
+        Err(Error::Unsupported(
+            "magiclink: stateless tokens cannot be server-side revoked. \
+             Rotate KEI_MAGICLINK_HMAC_KEY to invalidate all live tokens, \
+             OR maintain a deny-list at the caller layer keyed on the \
+             token's first two parts."
+                .into(),
+        )
+        .into())
    }

    fn is_passwordless(&self) -> bool {
--- a/_primitives/_rust/kei-auth-magiclink/tests/magiclink_smoke.rs
+++ b/_primitives/_rust/kei-auth-magiclink/tests/magiclink_smoke.rs
@ -152,3 +152,23 @@ fn provider_build_magic_url_shape() {
    // Trailing slash on base_url MUST be stripped.
    assert!(!url.contains("//auth/magic"));
 }
+
+#[test]
+fn provider_revoke_returns_unsupported_error() {
+    // RULE 0.16 / security review: stateless tokens cannot be server-side
+    // revoked. Returning Ok() silently would lie to every caller. The
+    // provider MUST surface an Unsupported error so the caller can choose
+    // to rotate the HMAC key or maintain a deny-list.
+    let parent = parent_dna();
+    let provider = MagicLinkProvider::new(parent.clone(), KEY.to_vec(), 900)
+        .expect("provider new ok");
+    let rt = tokio::runtime::Runtime::new().expect("runtime");
+    let err = rt
+        .block_on(provider.revoke(&parent))
+        .expect_err("revoke must error, not silently succeed");
+    let msg = err.to_string().to_lowercase();
+    assert!(
+        msg.contains("unsupported") || msg.contains("cannot be server-side revoked"),
+        "expected unsupported error mentioning revocation, got: {msg}"
+    );
+}
--- a/_primitives/_rust/kei-cortex/src/handlers/term_pty.rs
+++ b/_primitives/_rust/kei-cortex/src/handlers/term_pty.rs
@ -76,6 +76,7 @@ pub fn spawn_pty(
    let shell = std::env::var("SHELL").unwrap_or_else(|_| "/bin/sh".into());
    let mut cmd = CommandBuilder::new(&shell);
    cmd.cwd(cwd);
+    apply_safe_env(&mut cmd);
    let child = pair.slave.spawn_command(cmd).map_err(|e| e.to_string())?;
    let reader = pair.master.try_clone_reader().map_err(|e| e.to_string())?;
    let writer = pair.master.take_writer().map_err(|e| e.to_string())?;
@ -89,6 +90,24 @@ pub fn spawn_pty(
    })
 }

+/// SECURITY: drop ALL inherited env so daemon secrets (KEI_AUTH_KEY,
+/// ANTHROPIC_API_KEY, MAGICLINK_HMAC_KEY, etc.) cannot leak into the PTY
+/// shell. A stored XSS on the cors_origin domain would otherwise pivot
+/// directly to a local shell holding every daemon secret. After clearing,
+/// re-set ONLY the minimal env a shell legitimately needs.
+fn apply_safe_env(cmd: &mut CommandBuilder) {
+    cmd.env_clear();
+    let home = std::env::var("HOME").unwrap_or_else(|_| "/".into());
+    let path = std::env::var("PATH")
+        .unwrap_or_else(|_| "/usr/local/bin:/usr/bin:/bin".into());
+    let user = std::env::var("USER").unwrap_or_else(|_| "user".into());
+    cmd.env("HOME", &home);
+    cmd.env("PATH", &path);
+    cmd.env("USER", &user);
+    cmd.env("LANG", "en_US.UTF-8");
+    cmd.env("TERM", "xterm-256color");
+}
+
 /// Forward PTY stdout bytes to the WS sender via a bounded channel. Reads
 /// happen on a `spawn_blocking` task because the underlying `Read` is sync;
 /// the loop checks `cancel` every iteration so disconnect / shutdown
--- a/_primitives/_rust/kei-cortex/src/main.rs
+++ b/_primitives/_rust/kei-cortex/src/main.rs
@ -79,7 +79,27 @@ async fn main() -> Result<()> {
 }

 async fn serve(args: ServeArgs) -> Result<()> {
-    let config = AppConfig::try_new(
+    let config = build_config(args)?;
+    warn_if_live2d_missing(&config.live2d_samples_dir);
+    let bind_ip = IpAddr::V4(Ipv4Addr::LOCALHOST);
+    enforce_loopback_or_local_cors(&bind_ip, &config.cors_origin)?;
+    let token = load_or_bootstrap_token(&config.token_path)?;
+    let state = AppState::new(config.clone(), token);
+    let router = build_router(state);
+    let addr = SocketAddr::new(bind_ip, config.port);
+    let listener = TcpListener::bind(addr)
+        .await
+        .with_context(|| format!("bind {addr}"))?;
+    eprintln!("kei-cortex listening on http://{addr}");
+    axum::serve(listener, router)
+        .with_graceful_shutdown(shutdown_signal())
+        .await
+        .context("axum serve")?;
+    Ok(())
+}
+
+fn build_config(args: ServeArgs) -> Result<AppConfig> {
+    AppConfig::try_new(
        Some(args.port),
        Some(args.cors_origin),
        args.token_path,
@ -92,20 +112,31 @@ async fn serve(args: ServeArgs) -> Result<()> {
        Some(args.default_provider),
        args.token_tracker_db,
    )
-    .context("assemble config")?;
-    warn_if_live2d_missing(&config.live2d_samples_dir);
-    let token = load_or_bootstrap_token(&config.token_path)?;
-    let state = AppState::new(config.clone(), token);
-    let router = build_router(state);
-    let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), config.port);
-    let listener = TcpListener::bind(addr)
-        .await
-        .with_context(|| format!("bind {addr}"))?;
-    eprintln!("kei-cortex listening on http://{addr}");
-    axum::serve(listener, router)
-        .with_graceful_shutdown(shutdown_signal())
-        .await
-        .context("axum serve")?;
+    .context("assemble config")
+}
+
+/// SECURITY: refuse to start when bound to a non-loopback address with a
+/// public cors_origin. The combination is an XSS-to-local-shell pivot:
+/// any stored XSS on the public origin can talk to the daemon and reach
+/// `/term`, gaining a PTY shell on the local machine. The bind path stays
+/// hardcoded to LOCALHOST today, but this guard future-proofs the code if
+/// a `--bind` flag is ever added (and protects against a misconfigured
+/// reverse proxy that proxies the daemon publicly).
+fn enforce_loopback_or_local_cors(bind: &IpAddr, cors_origin: &str) -> Result<()> {
+    if bind.is_loopback() {
+        return Ok(());
+    }
+    let lower = cors_origin.to_ascii_lowercase();
+    let public = !(lower.contains("localhost")
+        || lower.contains("127.0.0.1")
+        || lower.contains("[::1]"));
+    if public {
+        anyhow::bail!(
+            "refusing to bind to non-loopback address {bind} with public \
+             cors_origin {cors_origin:?} (XSS-to-shell risk); set \
+             --cors-origin to localhost-only or use --bind 127.0.0.1"
+        );
+    }
    Ok(())
 }

--- a/hooks/agent-stub-scan.sh
+++ b/hooks/agent-stub-scan.sh
@ -1,7 +1,16 @@
 #!/usr/bin/env bash
 # agent-stub-scan.sh — RULE 0.16 enforcement hook (PostToolUse:Agent).
-# Scans agent transcript for STATUS-TRUTH MARKER and validates internal
+# Scans agent response for STATUS-TRUTH MARKER and validates internal
 # consistency. Severity per RULE 0.10 ladder; bypass via STATUS_TRUTH_BYPASS=1.
+#
+# Wave 47 fix: reads stdin JSON (Claude Code's actual hook payload format)
+# instead of the never-set CLAUDE_AGENT_TRANSCRIPT env var. The previous
+# env-var path silently exited 0 on every invocation, leaving RULE 0.16 as
+# dead-code in production. Mirrors the flatten pattern from
+# `agent-event-done.sh` so both hooks share one shape.
+#
+# TODO(2026-05-05): flip WARN -> ENFORCE per RULE 0.16 §2 ladder.
+# Until then, every inconsistency exits 0 with stderr only.
 set -u

 log_block() {
@ -13,25 +22,39 @@ if [ "${STATUS_TRUTH_BYPASS:-0}" = "1" ]; then
    exit 0
 fi

-TRANSCRIPT="${CLAUDE_AGENT_TRANSCRIPT:-}"
-if [ -z "$TRANSCRIPT" ] || [ ! -r "$TRANSCRIPT" ]; then
-    # Hook runs in many contexts; absent transcript is not a failure.
-    exit 0
-fi
+command -v jq >/dev/null 2>&1 || exit 0

-if ! grep -q "=== STATUS-TRUTH MARKER ===" "$TRANSCRIPT"; then
-    log_block "MISSING STATUS-TRUTH MARKER block in agent transcript.
+INPUT=$(cat 2>/dev/null || true)
+[ -n "$INPUT" ] || exit 0
+
+TOOL=$(printf '%s' "$INPUT" | jq -r '.tool_name // empty' 2>/dev/null)
+[ "$TOOL" = "Agent" ] || exit 0
+
+# Flatten tool_response.content (string OR array OR object) to plain text.
+# Same recursive shape as agent-event-done.sh so the two hooks parse the
+# same payload identically.
+RESPONSE=$(printf '%s' "$INPUT" | jq -r '
+    (.tool_response // "") as $r | def f:
+        if type=="string" then . elif type=="array" then map(f)|join("\n")
+        elif type=="object" then (if has("text") then .text elif has("content") then .content|f else tostring end)
+        else "" end; $r|f' 2>/dev/null || true)
+
+[ -n "$RESPONSE" ] || exit 0
+
+if ! printf '%s' "$RESPONSE" | grep -q '=== STATUS-TRUTH MARKER ==='; then
+    log_block "MISSING STATUS-TRUTH MARKER block in agent response.
 RULE 0.16 §1 requires every code-implementer agent to emit the marker.
 Add the block to the agent's final report; see ~/.claude/rules/shipped-vs-functional.md"
    # WARN tier (until 2026-05-05): exit 0 with stderr. After: exit 1.
    exit 0
 fi

-SHIPPED=$(grep -m1 '^shipped:' "$TRANSCRIPT" | sed 's/^shipped:[[:space:]]*//' | awk '{print $1}')
-STUB_COUNT=$(grep -cE '\b(todo!\(\)|unimplemented!\(\)|placeholder|echo-stub|NOT-RUN|stub_|stub agent)\b' "$TRANSCRIPT" || true)
+SHIPPED=$(printf '%s' "$RESPONSE" | grep -m1 '^shipped:' \
+    | sed 's/^shipped:[[:space:]]*//' | awk '{print $1}')
+STUB_COUNT=$(printf '%s' "$RESPONSE" | grep -cE '\b(todo!\(\)|unimplemented!\(\)|placeholder|echo-stub|NOT-RUN|stub_|stub agent)\b' || true)

-if [ "$SHIPPED" = "functional" ] && [ "$STUB_COUNT" -gt 0 ]; then
-    LOCS=$(grep -nE '\b(todo!\(\)|unimplemented!\(\)|placeholder|echo-stub|stub_)\b' "$TRANSCRIPT" | head -20)
+if [ "$SHIPPED" = "functional" ] && [ "${STUB_COUNT:-0}" -gt 0 ]; then
+    LOCS=$(printf '%s' "$RESPONSE" | grep -nE '\b(todo!\(\)|unimplemented!\(\)|placeholder|echo-stub|stub_)\b' | head -20)
    log_block "INCONSISTENCY: shipped=functional but $STUB_COUNT stub-markers found.
 First locations:
 $LOCS
@ -40,5 +63,5 @@ Either downgrade shipped to 'partial'/'scaffolding' or remove the stubs."
    exit 0
 fi

-log_block "OK: shipped=$SHIPPED, stubs=$STUB_COUNT (consistent)."
+log_block "OK: shipped=$SHIPPED, stubs=${STUB_COUNT:-0} (consistent)."
 exit 0