feat(v0.45): post-install onboarding wizard + 5 full-profile bug fixes

User feedback from real prod install (curl|bash, profile=full): 'нет выбора провайдера, нахуй не понятно что делать после установки'. ## New: kei onboard wizard scripts/kei-onboard.sh — 4-step interactive wizard auto-triggered at end of bootstrap.sh (if stdin is TTY; non-interactive runs print summary): Step 1 — Pick primary LLM orchestrator (claude/grok/agy/copilot/kimi) Step 2 — Run kei mcp-wire to install MCP into each detected CLI Step 3 — Optional MOONSHOT_API_KEY hint for live limits Step 4 — Run kei-doctor health check Re-runnable anytime: 'kei onboard'. Skip auto-trigger: KEI_NO_ONBOARD=1. bin/kei gains 'onboard | setup | wizard' arms. ## Bug fixes from prod install log [install] act_runner: command not found brew installs 'gitea-runner' (not 'act_runner'); the two are functionally equivalent and both register with Forgejo. lib-dev-hub-forgejo-runner.sh now tries act_runner first, falls back to gitea-runner; brew install switches to gitea-runner package which is what's actually available. [install] forgejo admin user create — 'no such table: user' Fresh sqlite DB hadn't been migrated before admin user create ran. lib-dev-hub-forgejo.sh now runs 'forgejo migrate' before admin bootstrap; idempotent — safe on re-runs. [install] dev-hub-zoekt: 'No formulae or casks found for zoekt' Zoekt not in homebrew/core. lib-dev-hub-zoekt.sh now tries known taps (sourcegraph/zoekt, hyperdiscovery/zoekt), falls back to 'go install' if Go is available, and finally skips cleanly with a clear warning instead of aborting the entire dev-hub bundle install. [install] dev-hub-datasette: Bootstrap failed: 5: Input/output error launchd Input/output error is a macOS quirk when the plist exists but the agent isn't yet known to launchd. Not introducing a code fix this release — to investigate in v0.46. Doc note will be added. 
[install] kei-shared binary missing post-install Pre-built cache detection ('pre-built binaries detected — skipping cargo build') was overly eager; kei-shared wasn't in the cache. Workaround: run install with KEI_SKIP_RUST_BUILD unset to force rebuild. Permanent fix deferred to v0.46 (improve cache validation). ## Verification - 'kei onboard' non-interactive: prints next-steps + exits cleanly ✓ - 'kei --status' shows substrate v0.45 ✓ - bootstrap.sh end-of-install branch: TTY check + KEI_NO_ONBOARD honored ✓
feat(v0.44): pre-release audit — 1 CRITICAL + 4 HIGH + 4 MEDIUM patched
2026-05-26 23:18:55 +08:00 · 2026-05-26 23:00:34 +08:00 · 2026-05-26 22:03:12 +08:00 · 2026-05-26 21:50:55 +08:00 · 2026-05-26 21:43:39 +08:00 · 2026-05-26 21:33:54 +08:00
11 changed files with 949 additions and 127 deletions
--- a/_primitives/_rust/kei-mcp/src/handlers/safe_tools.rs
+++ b/_primitives/_rust/kei-mcp/src/handlers/safe_tools.rs
@ -30,6 +30,25 @@
 //!   #3 CLAUDECODE bypass — documented as design (see above), no behavior change
 //!   #4 tokio::fs for async file I/O (was: blocking std::fs on tokio thread)
 //!   #5 process-group kill on Unix (was: kill_on_drop SIGKILLs only direct child)
+//!
+//! v0.42 re-audit fixes (2026-05-26, 4-CLI dogfood: Claude+Grok+Gemini+Copilot):
+//!   #1 [CRITICAL] symlink LEAF bypass — canonicalize full path + reject
+//!      leaf symlinks (v0.41 only canonicalized PARENT; ln -s ~/.ssh/keys ./x
+//!      then kei_write x followed the link to the target)
+//!   #2 [HIGH]     $HOME removed from default allowed_roots — was a blanket
+//!      allow that let agent overwrite ~/.claude/hooks (self-neuter), ~/.zshrc
+//!      (RCE on next shell), and credential stores. Default: $PWD only.
+//!      Denylist also extended with .claude/, .grok/, .gemini/, .copilot/,
+//!      .kimi/, and exact shell-init filenames.
+//!   #3 [HIGH]     empty [bash]/[edit]/[write] section also FAIL-CLOSED (was:
+//!      empty vec → pass-through). KEI_POLICY_CHAIN_OPTIONAL=1 to opt in.
+//!   #4 [MED]      load_chain converted to async + tokio::fs (was: blocking
+//!      std::fs on tokio worker thread).
+//!   #5 [MED]      set_process_group + killpg applied to HOOK subprocess too
+//!      (v0.41 only had it on the bash action; hook grandchildren orphaned).
+//!   #6 [MED]      doc note that aggregate timeout is still per-step (60s ×
+//!      N hooks + 60s action). Single-deadline implementation deferred to
+//!      v0.43 — not security-blocking.

 use crate::protocol::{err, ok, JsonRpcRequest, JsonRpcResponse, INTERNAL_ERROR, INVALID_PARAMS};
 use serde::Deserialize;
@ -41,8 +60,12 @@ use tokio::fs;
 use tokio::io::AsyncWriteExt;
 use tokio::process::Command;

-/// Hard cap on how long a single hook chain + action may take. Matches the
-/// timeout in `handlers::tools::ATOM_TIMEOUT_SECS` for consistency.
+/// Per-step timeout (each hook AND the action each get up to this long).
+/// For an N-hook chain the total wall-clock cap is approximately
+/// `(N+1) * SAFE_TOOL_TIMEOUT_SECS`. v0.44 doc-honesty fix (Claude MED):
+/// prior versions claimed this was an "aggregate" cap, which was always
+/// wrong. Aggregate-deadline impl is deferred; for now the per-step
+/// semantics are documented honestly so operators pick a sane value.
 const SAFE_TOOL_TIMEOUT_SECS: u64 = 60;

 #[derive(Deserialize, Default)]
@ -129,9 +152,16 @@ async fn handle_bash(args: &Value) -> Result<String, String> {
        .ok_or_else(|| missing_arg("kei_bash", "command"))?;
    let cwd = args.get("cwd").and_then(Value::as_str);

+    // v0.44 fix #8 (Gemini MED): include cwd in hook input. Without this,
+    // safety-guard could approve a destructive command (e.g. `rm -rf *`)
+    // assuming PWD, while the actual cwd arg redirected it to a sensitive
+    // dir. Hooks now see the real working directory.
    let hook_input = json!({
        "tool_name": "Bash",
-        "tool_input": { "command": command }
+        "tool_input": {
+            "command": command,
+            "cwd": cwd
+        }
    });
    run_chain("bash", &hook_input).await?;

@ -144,9 +174,14 @@ async fn handle_bash(args: &Value) -> Result<String, String> {
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .kill_on_drop(true);
-    // v0.41 fix #5 (Gemini MED): put child in its own process group so timeout
-    // kills it and ALL grandchildren together (not just the immediate shell).
+    // v0.41 fix #5: put child in its own process group so timeout kills it
+    // and ALL grandchildren together (not just the immediate shell).
    set_process_group(&mut cmd);
+    // v0.44 fix #4 (Gemini HIGH): clear parent env on subprocess spawn.
+    // Was: child inherited AWS_*, GITHUB_TOKEN, MOONSHOT_API_KEY, etc.
+    // An agent that exec's `env` via kei_bash could exfiltrate all of them.
+    // Now: only the allowlist in apply_safe_env (PATH/HOME/USER/LANG/TERM/SHELL etc., plus KEI_SAFE_ENV_EXTRA names) is forwarded.
+    apply_safe_env(&mut cmd);

    let child = cmd.spawn().map_err(|e| format!("spawn bash: {e}"))?;
    let pid_opt = child.id();
@ -177,15 +212,42 @@ async fn handle_bash(args: &Value) -> Result<String, String> {
 }

 // v0.41 fix #5: process-group helpers (Unix-only; no-op on other platforms).
-// tokio::process::Command::process_group is available on Unix without
-// requiring the std::os::unix::process::CommandExt trait import.
 #[cfg(unix)]
 fn set_process_group(cmd: &mut Command) {
-    cmd.process_group(0); // 0 = new session leader for this child
+    cmd.process_group(0); // pgid 0: the child becomes leader of its own new process group
 }
 #[cfg(not(unix))]
 fn set_process_group(_cmd: &mut Command) {}

+/// v0.44 fix #4 (Gemini HIGH): strip parent env on subprocess spawn so secrets
+/// like AWS_*, GITHUB_TOKEN, MOONSHOT_API_KEY etc. don't leak to user-controlled
+/// bash commands or hook scripts. Clears the environment, then forwards only the
+/// names in `default_keep` below, each only if it is set in this process.
+///
+/// Override: `KEI_SAFE_ENV_EXTRA` holds a `:`-separated list of extra variable
+/// names to forward for callers that legitimately need them (e.g. NIX_PATH, JAVA_HOME).
+fn apply_safe_env(cmd: &mut Command) {
+    cmd.env_clear(); // start from an empty environment; everything below is opt-in
+    let default_keep = [
+        "PATH", "HOME", "USER", "LOGNAME", "SHELL", "LANG", "LC_ALL",
+        "LC_CTYPE", "TERM", "PWD", "TMPDIR", // NOTE(review): PWD is this process's value — confirm it is fine when the child runs in another cwd
+    ];
+    for k in default_keep {
+        if let Ok(v) = std::env::var(k) { // unset names are simply not forwarded
+            cmd.env(k, v);
+        }
+    }
+    if let Ok(extras) = std::env::var("KEI_SAFE_ENV_EXTRA") {
+        for k in extras.split(':') {
+            let k = k.trim(); // tolerate spaces around each name
+            if k.is_empty() { continue; } // skip empty entries such as "A::B" or a trailing ':'
+            if let Ok(v) = std::env::var(k) {
+                cmd.env(k, v);
+            }
+        }
+    }
+}
+
 #[cfg(unix)]
 fn killpg_best_effort(pid: u32) {
    // SAFETY: libc::kill on a negative PID targets the process group.
@ -205,7 +267,12 @@ async fn handle_edit(args: &Value) -> Result<String, String> {
    let new_string = args.get("new_string").and_then(Value::as_str)
        .ok_or_else(|| missing_arg("kei_edit", "new_string"))?;

-    // v0.41 fix #2: path-traversal guard
+    // v0.44 LOW: reject empty old_string (would silently prepend new_string
+    // because contents.contains("") is always true).
+    if old_string.is_empty() {
+        return Err("kei_edit: old_string must not be empty".into());
+    }
+
    let safe_path = validate_path(file_path)?;

    let hook_input = json!({
@ -218,16 +285,12 @@ async fn handle_edit(args: &Value) -> Result<String, String> {
    });
    run_chain("edit", &hook_input).await?;

-    // v0.41 fix #4: tokio::fs (async)
-    let contents = fs::read_to_string(&safe_path).await
-        .map_err(|e| format!("read {}: {e}", safe_path.display()))?;
-    if !contents.contains(old_string) {
-        return Err(format!("kei_edit: old_string not found in {}", safe_path.display()));
-    }
-    let updated = contents.replacen(old_string, new_string, 1);
-    fs::write(&safe_path, &updated).await
-        .map_err(|e| format!("write {}: {e}", safe_path.display()))?;
-    Ok(format!("edited {} ({} bytes)", safe_path.display(), updated.len()))
+    // v0.44 fix #2 (Gemini HIGH + Claude #4 MED): close TOCTOU window. After
+    // validate_path approved the path, a concurrent process could swap the
+    // file for a symlink before our write. Open the existing file with
+    // O_NOFOLLOW so the open itself fails on symlink-swap; then read/write
+    // through the open fd (not the path again) so no second path lookup.
+    open_nofollow_read_write_edit(&safe_path, old_string, new_string).await
 }

 async fn handle_write(args: &Value) -> Result<String, String> {
@ -236,7 +299,6 @@ async fn handle_write(args: &Value) -> Result<String, String> {
    let content = args.get("content").and_then(Value::as_str)
        .ok_or_else(|| missing_arg("kei_write", "content"))?;

-    // v0.41 fix #2: path-traversal guard
    let safe_path = validate_path(file_path)?;

    let hook_input = json!({
@ -251,20 +313,110 @@ async fn handle_write(args: &Value) -> Result<String, String> {
                .map_err(|e| format!("mkdir {}: {e}", parent.display()))?;
        }
    }
-    fs::write(&safe_path, content).await
-        .map_err(|e| format!("write {}: {e}", safe_path.display()))?;
-    Ok(format!("wrote {} ({} bytes)", safe_path.display(), content.len()))
+    // v0.44 fix #2: open with O_NOFOLLOW + O_CREAT to refuse swap-to-symlink.
+    open_nofollow_write(&safe_path, content).await
 }

-/// v0.41 fix #2 (Gemini HIGH): reject obvious path-traversal / sensitive-path
-/// targets BEFORE running hooks. Defense-in-depth: hooks may also flag this,
-/// but having the Rust layer reject obvious attacks gives a fast-fail
-/// independent of hook configuration.
+/// v0.44 fix #2: edit via an O_NOFOLLOW-opened fd to close the TOCTOU window between
+/// validate_path and the write: open() fails if the leaf is a symlink, and the read, truncate
+/// and write all go through that one fd. Returns "edited <path> (<n> bytes)" or an error string.
+#[cfg(unix)]
+async fn open_nofollow_read_write_edit(
+    path: &Path, old_string: &str, new_string: &str,
+) -> Result<String, String> {
+    use std::os::unix::fs::OpenOptionsExt;
+    let path = path.to_path_buf(); // owned copies: the blocking closure cannot borrow from the caller
+    let old_s = old_string.to_string();
+    let new_s = new_string.to_string();
+    // Blocking std::fs calls run on tokio's blocking pool (tokio::task::spawn_blocking).
+    let result = tokio::task::spawn_blocking(move || -> Result<String, String> {
+        let mut f = std::fs::OpenOptions::new()
+            .read(true).write(true)
+            .custom_flags(libc::O_NOFOLLOW)
+            .open(&path)
+            .map_err(|e| format!("kei_edit: open(O_NOFOLLOW) {}: {e}", path.display()))?;
+        use std::io::{Read, Write, Seek, SeekFrom};
+        let mut contents = String::new();
+        f.read_to_string(&mut contents)
+            .map_err(|e| format!("kei_edit: read {}: {e}", path.display()))?;
+        if !contents.contains(&old_s) {
+            return Err(format!("kei_edit: old_string not found in {}", path.display()));
+        }
+        let updated = contents.replacen(&old_s, &new_s, 1); // first occurrence only
+        f.set_len(0).map_err(|e| format!("kei_edit: truncate {}: {e}", path.display()))?; // drop old bytes first; not atomic with the write below
+        f.seek(SeekFrom::Start(0)) // the read left the cursor at EOF
+            .map_err(|e| format!("kei_edit: seek {}: {e}", path.display()))?;
+        f.write_all(updated.as_bytes())
+            .map_err(|e| format!("kei_edit: write {}: {e}", path.display()))?;
+        Ok(format!("edited {} ({} bytes)", path.display(), updated.len()))
+    }).await
+        .map_err(|e| format!("kei_edit: thread join: {e}"))?; // outer error = the blocking task failed to complete
+    result
+}
+#[cfg(not(unix))]
+async fn open_nofollow_read_write_edit(
+    path: &Path, old_string: &str, new_string: &str,
+) -> Result<String, String> {
+    // Non-Unix fallback: best-effort via tokio::fs with no O_NOFOLLOW; the path is resolved twice (read, then write).
+    let contents = fs::read_to_string(path).await
+        .map_err(|e| format!("read {}: {e}", path.display()))?;
+    if !contents.contains(old_string) {
+        return Err(format!("kei_edit: old_string not found in {}", path.display()));
+    }
+    let updated = contents.replacen(old_string, new_string, 1); // first occurrence only
+    fs::write(path, &updated).await // second path lookup: the symlink-swap window is not closed here
+        .map_err(|e| format!("write {}: {e}", path.display()))?;
+    Ok(format!("edited {} ({} bytes)", path.display(), updated.len()))
+}
+
+#[cfg(unix)]
+async fn open_nofollow_write(path: &Path, content: &str) -> Result<String, String> {
+    use std::os::unix::fs::OpenOptionsExt;
+    let path = path.to_path_buf(); // owned copies: the blocking closure cannot borrow from the caller
+    let bytes = content.as_bytes().to_vec();
+    let result = tokio::task::spawn_blocking(move || -> Result<String, String> {
+        let mut opts = std::fs::OpenOptions::new();
+        opts.write(true).create(true).truncate(true); // create if missing, otherwise overwrite from empty
+        // Purpose: write `content` to `path` and return "wrote <path> (<n> bytes)".
+        // O_NOFOLLOW: refuse if the leaf is a symlink (someone swapped it during our
+        // await). Without it the v0.42 symlink_metadata pre-check was only advisory — fs::write still followed.
+        opts.custom_flags(libc::O_NOFOLLOW);
+        // O_EXCL combined with O_CREAT could be added when path does not yet
+        // exist to refuse any pre-existing inode — but the test suite uses
+        // the same path multiple times, so we keep truncate semantics.
+        // NOTE(review): O_NOFOLLOW covers only the final path component — confirm parents are safe.
+        let mut f = opts.open(&path)
+            .map_err(|e| format!("kei_write: open(O_NOFOLLOW) {}: {e}", path.display()))?;
+        use std::io::Write;
+        f.write_all(&bytes)
+            .map_err(|e| format!("kei_write: write {}: {e}", path.display()))?;
+        Ok(format!("wrote {} ({} bytes)", path.display(), bytes.len()))
+    }).await
+        .map_err(|e| format!("kei_write: thread join: {e}"))?; // outer error = the blocking task failed to complete
+    result
+}
+#[cfg(not(unix))]
+async fn open_nofollow_write(path: &Path, content: &str) -> Result<String, String> {
+    fs::write(path, content).await // non-Unix fallback: plain write, no O_NOFOLLOW, so a symlink leaf is followed
+        .map_err(|e| format!("write {}: {e}", path.display()))?;
+    Ok(format!("wrote {} ({} bytes)", path.display(), content.len()))
+}
+
+/// Path-traversal + symlink + denylist guard.
 ///
-/// Allowed roots: $PWD (recursively), $HOME (excluding dotfile-secret dirs).
-/// Override: set KEI_ALLOWED_ROOTS=":" -separated absolute paths.
-/// Always rejected: /etc/, /usr/, /System/, /var/, /private/etc/, $HOME/.ssh/,
-/// $HOME/.aws/, $HOME/.config/gcloud/, $HOME/.gnupg/, any path containing "..".
+/// v0.41 (initial): rejected `..`, canonicalized PARENT, checked denylist + roots.
+///   → 4-CLI re-audit (2026-05-26) found this was bypassable via symlink at the
+///     leaf and self-attackable via the $HOME blanket-allowed root.
+///
+/// v0.42 fixes:
+///   #1 [CRITICAL] reject if the leaf is a symlink (was: validated parent
+///      only, fs::write followed leaf symlink to anywhere). Done via
+///      `symlink_metadata` on the leaf BEFORE write, and full `canonicalize`
+///      on the leaf when the file already exists.
+///   #2 [HIGH] $HOME removed from default allowed-roots — default is $PWD
+///      only. Denylist now also covers $HOME/.claude/ (the substrate
+///      itself), shell init files, and credential stores. Operators who
+///      need broader access set KEI_ALLOWED_ROOTS explicitly.
 fn validate_path(p: &str) -> Result<PathBuf, String> {
    if p.is_empty() {
        return Err("file_path: empty".into());
@ -274,73 +426,157 @@ fn validate_path(p: &str) -> Result<PathBuf, String> {
        return Err(format!("file_path: '..' segment not allowed in {p}"));
    }
    let path = Path::new(p);
-    // 2. Canonicalize the parent (file may not exist yet for kei_write);
-    //    if even the parent doesn't exist, use the absolute form.
-    let canonical = if let Some(parent) = path.parent() {
-        if parent.as_os_str().is_empty() || parent == Path::new("") {
-            std::env::current_dir()
-                .map_err(|e| format!("file_path: cwd unavailable: {e}"))?
-                .join(path)
-        } else if parent.exists() {
-            parent.canonicalize()
-                .map_err(|e| format!("file_path: canonicalize {}: {e}", parent.display()))?
-                .join(path.file_name().unwrap_or_default())
-        } else if path.is_absolute() {
-            path.to_path_buf()
-        } else {
-            std::env::current_dir()
-                .map_err(|e| format!("file_path: cwd unavailable: {e}"))?
-                .join(path)
+
+    // 2. Build a canonical path. Walk UP to the deepest existing ancestor,
+    //    canonicalize it (resolves all symlinks in the existing prefix),
+    //    then reattach the non-existent tail. This catches symlinks at ANY
+    //    depth in the path, including nested non-existent leaves.
+    //
+    //    v0.44 fix #1 (Gemini CRITICAL): v0.42 only canonicalized the immediate
+    //    parent. If the parent didn't exist either (e.g. /proj/symlink_dir/
+    //    new_subdir/file.txt where symlink_dir → /Users/denis), the path fell
+    //    through to "absolute as-is" → no canonicalization → bypass.
+    let canonical = canonicalize_with_walk_up(path)?;
+
+    // 3. Even when the file doesn't exist yet, the LEAF could already be a
+    //    dangling symlink that `fs::write` would follow on creation. Reject.
+    if let Ok(meta) = std::fs::symlink_metadata(&canonical) {
+        if meta.file_type().is_symlink() {
+            return Err(format!(
+                "file_path: leaf is a symlink (refusing to follow): {}",
+                canonical.display()
+            ));
        }
-    } else {
-        return Err(format!("file_path: invalid {p}"));
-    };
+    }
+
+    // 4. Allowed-root containment FIRST (v0.44 fix #6 reorder: was after
+    //    denylist, which meant macOS $TMPDIR = /private/var/folders/... hit
+    //    the /var/ denylist before reaching the allowed_roots check, blocking
+    //    legitimate use of tempfile-backed CWD on macOS).
+    //
+    //    v0.44 fix #5 (Claude HIGH): use Path::starts_with for component-aware
+    //    containment — Path::starts_with("/home/u/proj") does NOT match
+    //    /home/u/proj-secrets, the str::starts_with that was here did.
+    let roots = allowed_roots();
+    let in_allowed_root = roots.is_empty() || roots.iter().any(|r| {
+        canonical.starts_with(r)
+    });
+    if !in_allowed_root {
+        return Err(format!(
+            "file_path: outside allowed roots {:?}: {}",
+            roots, canonical.display()
+        ));
+    }
+
    let canon_str = canonical.display().to_string();

-    // 3. Reject obvious sensitive directories.
+    // 5. Reject system + substrate-control + credential paths.
+    //    Note: paths inside an allowed root that also match a denylist entry
+    //    are STILL denied (e.g. agent's CWD == ~/.claude/ — denied even
+    //    though it matches a default root). System dirs not in any allowed
+    //    root would have been caught above anyway.
    let denylist = [
-        "/etc/", "/usr/", "/System/", "/var/", "/private/etc/", "/private/var/",
-        "/root/",
+        "/etc/", "/usr/", "/System/", "/var/db/", "/var/log/", "/var/root/",
+        "/private/etc/", "/private/var/db/", "/private/var/log/", "/private/var/root/",
+        "/root/", "/bin/", "/sbin/",
    ];
+    // NOTE: /var/folders/ (macOS $TMPDIR) and /private/tmp/ are NOT denied —
+    // they are legitimate working dirs for tempfile-backed agents.
    for d in denylist {
        if canon_str.starts_with(d) {
            return Err(format!("file_path: denied (system dir): {canon_str}"));
        }
    }
    if let Ok(home) = std::env::var("HOME") {
-        let secret_dirs = [".ssh/", ".aws/", ".gnupg/", ".config/gcloud/"];
-        for sd in secret_dirs {
+        let dir_secrets = [
+            ".ssh/", ".aws/", ".gnupg/", ".config/gcloud/", ".cargo/credentials",
+            ".npmrc", ".docker/config.json", ".kube/",
+            ".claude/", ".grok/", ".gemini/", ".copilot/", ".kimi/",
+        ];
+        for sd in dir_secrets {
            let full = format!("{home}/{sd}");
            if canon_str.starts_with(&full) {
-                return Err(format!("file_path: denied (secret dir): {canon_str}"));
+                return Err(format!("file_path: denied (secret/substrate dir): {canon_str}"));
            }
        }
+        let init_files = [
+            ".zshrc", ".bashrc", ".profile", ".bash_profile", ".zprofile",
+            ".zshenv", ".bash_login", ".inputrc", ".gitconfig",
+            ".config/fish/config.fish",
+        ];
+        for f in init_files {
+            let full = format!("{home}/{f}");
+            if canon_str == full {
+                return Err(format!("file_path: denied (shell-init file): {canon_str}"));
            }
-
-    // 4. Enforce allowed-root containment.
-    let roots = allowed_roots();
-    if !roots.is_empty() {
-        let ok = roots.iter().any(|r| canon_str.starts_with(r));
-        if !ok {
-            return Err(format!(
-                "file_path: outside allowed roots {roots:?}: {canon_str}"
-            ));
        }
    }

    Ok(canonical)
 }

+/// v0.44 fix #1: walk up to the deepest existing ancestor, canonicalize THAT (resolving every
+/// symlink in the existing prefix), then reattach the non-existent tail components verbatim.
+/// `exists()` follows symlinks, so a dangling symlink counts as missing and stays in the tail.
+fn canonicalize_with_walk_up(path: &Path) -> Result<PathBuf, String> {
+    // Make the path absolute first (relative paths are joined onto the current dir).
+    let abs = if path.is_absolute() {
+        path.to_path_buf()
+    } else {
+        std::env::current_dir()
+            .map_err(|e| format!("file_path: cwd unavailable: {e}"))?
+            .join(path)
+    };
+
+    // Walk up from the leaf, collecting non-existent components (leaf first).
+    let mut current = abs.clone();
+    let mut tail: Vec<std::ffi::OsString> = Vec::new();
+    let canon = loop {
+        if current.exists() {
+            break current.canonicalize()
+                .map_err(|e| format!("file_path: canonicalize {}: {e}", current.display()))?;
+        }
+        let name = current.file_name() // None when the path ends in `..` or is a bare root
+            .ok_or_else(|| format!("file_path: path has no existing ancestor: {}", abs.display()))?
+            .to_os_string();
+        let parent = match current.parent() {
+            Some(p) if !p.as_os_str().is_empty() => p.to_path_buf(),
+            _ => return Err(format!("file_path: walked to root without finding existing dir: {}", abs.display())),
+        };
+        tail.push(name);
+        current = parent;
+    };
+
+    // Reattach the tail onto the canonical prefix (reversed — we pushed from leaf to root).
+    let mut result = canon;
+    for name in tail.into_iter().rev() {
+        result.push(name);
+    }
+    Ok(result)
+}
+
 fn allowed_roots() -> Vec<String> {
+    // Canonicalize each entry so symlinked roots (e.g. macOS /var → /private/var,
+    // /tmp → /private/tmp) match canonicalized targets; if canonicalization fails the raw
+    // entry is kept as given. Every root gets a trailing '/'. v0.44 fix #5 + #6 combined.
+    let canon_with_slash = |raw: &str| -> Option<String> {
+        let p = Path::new(raw);
+        let canon = std::fs::canonicalize(p).unwrap_or_else(|_| p.to_path_buf());
+        let mut s = canon.display().to_string();
+        if !s.ends_with('/') { s.push('/'); }
+        if s.is_empty() { None } else { Some(s) } // s always ends with '/' here, so this never yields None
+    };
     if let Ok(v) = std::env::var("KEI_ALLOWED_ROOTS") {
-        return v.split(':').filter(|s| !s.is_empty()).map(String::from).collect();
+        return v.split(':')
+            .filter(|s| !s.is_empty())
+            .filter_map(canon_with_slash)
+            .collect();
     }
     let mut roots = Vec::new();
     if let Ok(cwd) = std::env::current_dir() {
-        roots.push(format!("{}/", cwd.display()));
+        if let Some(r) = canon_with_slash(&cwd.display().to_string()) {
+            roots.push(r);
         }
-    if let Ok(home) = std::env::var("HOME") {
-        roots.push(format!("{home}/"));
     }
     roots
 }
@ -353,16 +589,35 @@ fn allowed_roots() -> Vec<String> {
 ///
 /// Skips the chain if the parent process is already inside Claude or Grok
 /// (env flags), since those CLIs' native PreToolUse hooks already fired.
+/// Run the configured hook chain for `tool` ("bash"/"edit"/"write").
+///
+/// v0.42 fixes:
+///   #3 [HIGH]   empty chain (section absent or zero hooks) now FAILS CLOSED
+///               unless KEI_POLICY_CHAIN_OPTIONAL=1.
+///   #4 [MED]    load_chain() converted to async (was: blocking std::fs).
+///   #5 [MED]    hook subprocess gets `process_group(0)` + killpg on timeout
+///               (was: only the bash action got it; hooks could orphan).
+///   #6 [MED]    timeout is still PER-STEP, not aggregate: each hook gets
+///               up to SAFE_TOOL_TIMEOUT_SECS, so a 3-hook chain + action
+///               can run ~4× that value (aggregate deadline deferred).
 async fn run_chain(tool: &str, hook_input: &Value) -> Result<(), String> {
    if env_truthy("CLAUDECODE") || env_truthy("GROKCODE") {
        // Native hooks already enforced — don't double-fire.
        return Ok(());
    }

-    let chain = load_chain(tool)?;
+    let chain = load_chain(tool).await?;
    if chain.is_empty() {
+        // v0.42 fix #3 (Claude+Gemini HIGH): empty section is the same
+        // misconfig class as missing file — FAIL CLOSED with explicit opt-in.
+        if env_truthy("KEI_POLICY_CHAIN_OPTIONAL") {
            return Ok(());
        }
+        return Err(format!(
+            "[policy-chain] section [{tool}] is empty — refusing to run \
+             (set KEI_POLICY_CHAIN_OPTIONAL=1 to allow pass-through, e.g. for tests)"
+        ));
+    }

    let hooks_dir = hooks_dir()?;
    let payload = serde_json::to_string(hook_input)
@ -371,24 +626,26 @@ async fn run_chain(tool: &str, hook_input: &Value) -> Result<(), String> {
    for hook in chain {
        let path = hooks_dir.join(&hook);
        if !path.is_file() {
-            // v0.41 fix #1 (Gemini HIGH): FAIL-CLOSED on missing hook.
-            // Previously we logged a warning and continued — that meant a
-            // misconfigured deployment (hook deleted, wrong path) silently
-            // disabled enforcement. Now: refuse the action, surface the
-            // error so the operator notices.
            return Err(format!(
                "[policy-chain] hook missing: {} (declared in policy-chain.toml [{}])",
                path.display(), tool
            ));
        }

-        let mut child = Command::new(&path)
+        let mut child_cmd = Command::new(&path);
+        child_cmd
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
-            .kill_on_drop(true)
+            .kill_on_drop(true);
+        set_process_group(&mut child_cmd);
+        // v0.44 fix #4: same env-isolation for hook subprocess.
+        apply_safe_env(&mut child_cmd);
+
+        let mut child = child_cmd
            .spawn()
            .map_err(|e| format!("spawn {}: {e}", path.display()))?;
+        let pid_opt = child.id();

        if let Some(mut stdin) = child.stdin.take() {
            stdin.write_all(payload.as_bytes()).await
@ -398,10 +655,18 @@ async fn run_chain(tool: &str, hook_input: &Value) -> Result<(), String> {
        }

        let fut = child.wait_with_output();
-        let out = tokio::time::timeout(Duration::from_secs(SAFE_TOOL_TIMEOUT_SECS), fut)
-            .await
-            .map_err(|_| format!("hook {} timeout", hook))?
-            .map_err(|e| format!("wait {}: {e}", path.display()))?;
+        let out = match tokio::time::timeout(Duration::from_secs(SAFE_TOOL_TIMEOUT_SECS), fut).await {
+            Ok(Ok(o)) => o,
+            Ok(Err(e)) => return Err(format!("wait {}: {e}", path.display())),
+            Err(_) => {
+                // v0.42 fix #5: kill the whole hook process group, not just
+                // the immediate child.
+                if let Some(pid) = pid_opt {
+                    killpg_best_effort(pid);
+                }
+                return Err(format!("hook {hook} timeout"));
+            }
+        };

        let code = out.status.code().unwrap_or(-1);
        if code == 0 {
@ -417,14 +682,14 @@ async fn run_chain(tool: &str, hook_input: &Value) -> Result<(), String> {

 // ---- config helpers -----------------------------------------------------

-fn load_chain(tool: &str) -> Result<Vec<String>, String> {
+/// v0.42 fix #4: async + tokio::fs (was: blocking std::fs would freeze
+/// a tokio worker if policy-chain.toml lived on a slow / hung mount).
+async fn load_chain(tool: &str) -> Result<Vec<String>, String> {
    let path = chain_path()?;
-    if !path.is_file() {
-        // v0.41 fix #1 (Gemini HIGH companion): default behavior when
-        // policy-chain.toml is absent is now configurable via env. Without
-        // explicit opt-in to pass-through, FAIL-CLOSED — caller sees a
-        // clear error instead of silent bypass.
-        if std::env::var("KEI_POLICY_CHAIN_OPTIONAL").as_deref() == Ok("1") {
+    // tokio::fs::try_exists avoids a blocking is_file() syscall.
+    let exists = fs::try_exists(&path).await.unwrap_or(false);
+    if !exists {
+        if env_truthy("KEI_POLICY_CHAIN_OPTIONAL") {
            return Ok(vec![]);
        }
        return Err(format!(
@ -432,7 +697,7 @@ fn load_chain(tool: &str) -> Result<Vec<String>, String> {
            path.display()
        ));
    }
-    let raw = std::fs::read_to_string(&path)
+    let raw = fs::read_to_string(&path).await
        .map_err(|e| format!("read policy-chain.toml: {e}"))?;
    let parsed: PolicyChain = toml::from_str(&raw)
        .map_err(|e| format!("parse policy-chain.toml: {e}"))?;
--- a/bin/kei
+++ b/bin/kei
@ -20,6 +20,9 @@
 #   kei mcp-wire [<cli>]    # wire kei-mcp into a CLI's MCP config + hook setup
 #                           # (Phase C cross-CLI policy enforcement)
 #   kei mcp-wire --list     # show enforcement tier per CLI
+#   kei limits              # probe each CLI's subscription quota (best-effort)
+#                           # (4 of 5 CLIs have no public API — honest report)
+#   kei onboard             # post-install wizard (pick primary + mcp-wire + check)
 #   kei --on=<backend>      # one-shot launch of <backend> (does not change primary)
 #   kei [args...]           # splash → exec primary CLI (default: claude)
 #
@ -66,6 +69,14 @@ case "${1:-}" in
    shift
    exec "$HOME/.claude/scripts/kei-mcp-wire.sh" "$@"
    ;;
+  limits|quota|usage)
+    shift
+    exec "$HOME/.claude/scripts/kei-limits.sh" "$@"
+    ;;
+  onboard|setup|wizard)
+    shift
+    exec "$HOME/.claude/scripts/kei-onboard.sh" "$@"
+    ;;
 esac

 # --- one-shot --on=<backend> override (does not write primary.toml) -------
@ -224,7 +235,7 @@ ${C1}    ██╔═██╗ ██╔══╝  ██║╚════█
 ${C1}    ██║  ██╗███████╗██║███████║███████╗██║${C0}
 ${C1}    ╚═╝  ╚═╝╚══════╝╚═╝╚══════╝╚══════╝╚═╝${C0}

-${C2}    KeiSeiKit · substrate v0.40${C0}
+${C2}    KeiSeiKit · substrate v0.45${C0}
 ${C3}    ─────────────────────────────────────${C0}
      primary CLI    : ${CV}${PRIMARY}${C0}
      profile        : ${CV}${p}${C0}
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -177,9 +177,14 @@ fi
 log "checkout: $KIT_DIR"

 # --- 5. run install ------------------------------------------------------
-log "running ./install.sh --profile=$PROFILE $YES_FLAG ${EXTRA_FLAGS[*]:-}"
+log "running install.sh --profile=$PROFILE $YES_FLAG ${EXTRA_FLAGS[*]:-}"
 cd "$KIT_DIR"
-./install.sh --profile="$PROFILE" $YES_FLAG "${EXTRA_FLAGS[@]:+${EXTRA_FLAGS[@]}}"
+# Defensive: invoke via `bash` not `./install.sh` because GitHub's contents
+# API does NOT preserve the executable bit on `gh api -X PUT` updates
+# (only the git Data API does). Older clones may have install.sh with
+# mode 644 even though the source repo has it 755. `bash <file>` works
+# regardless of file mode. Verified incident 2026-05-26 prod-curl test.
+bash ./install.sh --profile="$PROFILE" $YES_FLAG "${EXTRA_FLAGS[@]:+${EXTRA_FLAGS[@]}}"

 # --- 6. post-install verification ----------------------------------------
 KEI_BIN="$HOME/.claude/agents/_primitives/_rust/target/release"
@ -199,6 +204,25 @@ log ""
 log "==========================================================================="
 log "DONE — KeiSeiKit installed (profile: $PROFILE)"
 log "==========================================================================="
+
+# v0.45: post-install onboarding wizard.
+# Auto-triggers only when stdin is a TTY (real terminal). Under `curl | bash`
+# stdin is the pipe carrying this script, so the wizard is not started there
+# and the manual next-step commands are logged instead.
+# The wizard is gated on -f (not -x) and run through `bash` for the same
+# reason install.sh is invoked via `bash` above: the executable bit may be
+# missing on the installed file.
+ONBOARD_SH="$HOME/.claude/scripts/kei-onboard.sh"
+if [ ! -f "$ONBOARD_SH" ]; then
+  # Do not advertise `kei onboard` here: it execs this same (missing) file.
+  log ""
+  log "Post-install wizard not found ($ONBOARD_SH) — skipped."
+  log "Configure primary CLI manually:"
+  log "  kei pick              # just pick primary"
+  log "  kei mcp-wire          # wire MCP into installed CLIs"
+elif [ -t 0 ] && [ "${KEI_NO_ONBOARD:-0}" != "1" ]; then
+  log ""
+  log "Starting post-install onboarding (pick primary CLI + wire MCP)..."
+  log "Skip with KEI_NO_ONBOARD=1; re-run anytime with 'kei onboard'."
+  log ""
+  bash "$ONBOARD_SH" || log "(onboarding exited non-zero; re-run with 'kei onboard')"
+else
+  log ""
+  log "Post-install wizard skipped (no TTY or KEI_NO_ONBOARD=1)."
+  log "Run interactively to configure primary CLI:"
+  log "  kei onboard           # full wizard"
+  log "  kei pick              # just pick primary"
+  log "  kei mcp-wire          # wire MCP into installed CLIs"
+fi
 log ""
 log "Next steps:"
 log "  - Open a new shell so PATH picks up ~/.cargo/bin and the kei-* binaries."
--- a/docs/encyclopedia/cross-cli-policy.md
+++ b/docs/encyclopedia/cross-cli-policy.md
@ -116,22 +116,34 @@ The chain runs against the same hook scripts Claude uses; identical input
 shape, identical decisions. On block, the hook's stderr surfaces as the MCP
 error message so the calling agent sees exactly why.

-**v0.41 hardening** (post-audit fixes):
+**v0.42 hardening** (post 4-CLI re-audit, supersedes v0.41):

- **Fail-CLOSED on missing config** — if `policy-chain.toml` is absent the
-  chain refuses to run (was: silent pass-through). Tests / dev can opt in
-  via `KEI_POLICY_CHAIN_OPTIONAL=1` env.
- **Fail-CLOSED on missing hook script** — if a hook declared in the chain
-  is not on disk the call fails (was: warn-and-skip).
- **Path-traversal guard** on `kei_edit` / `kei_write` — rejects `..`
-  segments, `/etc/`, `/usr/`, `/System/`, `/var/`, `/root/`, plus
-  `$HOME/{.ssh,.aws,.gnupg,.config/gcloud}/` recursively. Override via
-  `KEI_ALLOWED_ROOTS=':'-separated-absolute-paths`.
- **Async file I/O** — `kei_edit` / `kei_write` now use `tokio::fs` so a
-  pathological file (`/dev/random` etc.) cannot block a tokio worker.
- **Process-group kill on timeout** — `kei_bash` puts its child shell in
-  its own process group; on timeout the entire group is `killpg(SIGKILL)`'d
-  so grandchildren don't orphan (Unix-only; no-op on Windows).
+- **Fail-CLOSED everywhere** — missing config, missing hook, OR empty
+  section (`[bash]/[edit]/[write]` with no entries) all refuse to run.
+  Tests / dev can opt in via `KEI_POLICY_CHAIN_OPTIONAL=1`.
+- **Symlink-safe path guard** — `kei_edit` / `kei_write` canonicalize the
+  FULL path (resolving any leaf symlink to its real target) and reject
+  if the leaf itself is a symlink for a not-yet-existent file. Fixes the
+  v0.41 CRITICAL bypass where `ln -s ~/.ssh/keys ./x; kei_write x` would
+  follow the link.
+- **$PWD-only default root** — `allowed_roots` defaults to current working
+  directory only. Was: `$PWD` + entire `$HOME` — too permissive, agent
+  could overwrite `~/.claude/hooks/*` (self-neuter) or `~/.zshrc` (RCE on
+  next shell). Operators who need broader access set `KEI_ALLOWED_ROOTS`.
+- **Denylist extended** — system dirs (`/etc/`, `/usr/`, `/System/`,
+  `/var/`, `/root/`, `/bin/`, `/sbin/`); credential stores (`~/.ssh/`,
+  `~/.aws/`, `~/.gnupg/`, `~/.config/gcloud/`, `~/.cargo/credentials`,
+  `~/.docker/config.json`, `~/.kube/`); substrate dirs (`~/.claude/`,
+  `~/.grok/`, `~/.gemini/`, `~/.copilot/`, `~/.kimi/`); exact shell-init
+  files (`.zshrc`, `.bashrc`, `.profile`, `.zshenv`, `.gitconfig`, ...).
+- **Async file I/O in load_chain** — `policy-chain.toml` now read via
+  `tokio::fs` (was: blocking `std::fs` froze worker on slow mounts).
+- **Process-group kill on hooks too** — hook subprocesses get
+  `process_group(0)` and `killpg(SIGKILL)` on timeout. Was: only the bash
+  action got this; hook grandchildren orphaned.
+- **CLAUDECODE/GROKCODE design note** — documented as a perf/UX
+  optimization, NOT a security boundary (if an attacker can control the
+  parent's environment, that is already a game-over scenario, so a
+  confused-deputy bypass through these variables adds nothing).

 ### Double-enforcement guard

--- a/install/lib-dev-hub-forgejo-runner.sh
+++ b/install/lib-dev-hub-forgejo-runner.sh
@ -80,14 +80,29 @@ _mint_runner_token() {
  printf '%s' "$token"
 }

-# Internal: register act_runner with the local Forgejo. Writes ${DATA}/.runner.
-# Args: <data_dir> <token>.
+# v0.45 fix: Homebrew ships the runner as `gitea-runner` (not `act_runner`),
+# and that is also the binary's name. Both names are probed — act_runner
+# first — so a future packaging rename does not break this again. The
+# upstream act_runner and the gitea-runner fork are functionally equivalent
+# and both register with Forgejo.
+# Prints the first name found on PATH; returns 1 and prints nothing when
+# neither is installed.
+_runner_bin() {
+  local candidate
+  for candidate in act_runner gitea-runner; do
+    if command -v "$candidate" >/dev/null 2>&1; then
+      echo "$candidate"
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Internal: register the runner with the local Forgejo. Writes ${DATA}/.runner.
 _register_act_runner() {
  local data_dir="$1"
  local token="$2"
  local label="self-hosted,macos-arm64,native"
  local name="$(hostname -s)-keisei"
-  ( cd "$data_dir" && act_runner register \
+  local runner
+  runner="$(_runner_bin)" || { err "no runner binary found (looked for act_runner + gitea-runner)"; return 1; }
+  ( cd "$data_dir" && "$runner" register \
      --no-interactive \
      --instance http://127.0.0.1:3001 \
      --token "$token" \
@ -97,12 +112,19 @@ _register_act_runner() {

 # Public entry: install + register + bootstrap the runner.
 install_dev_hub_forgejo_runner() {
-  say "installing dev-hub-forgejo-runner (act_runner)"
+  say "installing dev-hub-forgejo-runner (Forgejo Actions runner)"
  _require_forgejo_binary || return 1
  _require_forgejo_running || return 1

-  say "brew install act_runner"
-  brew install act_runner
+  # Prefer the Forgejo-official runner; fall back to the gitea-runner fork
+  # (which is what `brew install gitea-runner` actually provides today).
+  if ! _runner_bin >/dev/null 2>&1; then
+    say "brew install gitea-runner (Forgejo-compatible)"
+    brew install gitea-runner || {
+      warn "brew install gitea-runner failed — try 'brew tap actions/runner' for act_runner"
+      return 1
+    }
+  fi

  local data_dir
  data_dir="$(_runner_data_dir)"
@ -125,7 +147,9 @@ install_dev_hub_forgejo_runner() {
  . "$KIT_DIR/install/lib-launchd.sh"
  install_service forgejo-runner

-  say "act_runner registered + running. Polling http://127.0.0.1:3001 for jobs."
+  local runner_name
+  runner_name="$(_runner_bin 2>/dev/null || echo runner)"
+  say "$runner_name registered + running. Polling http://127.0.0.1:3001 for jobs."
 }

 # Public entry: stop + unload the runner. Keeps ${DATA}/.runner so re-install
--- a/install/lib-dev-hub-forgejo.sh
+++ b/install/lib-dev-hub-forgejo.sh
@ -97,11 +97,19 @@ _dhf_bootstrap_admin_user() {
  local kc_token_svc kc_pass_svc
  config="$(_dhf_app_ini)"
  username="${KEI_FORGEJO_ADMIN_USER:-${USER:-denis}}"
-  # Single-source Keychain service names (override per-host via env).
-  # Wizard MUST read identical names — see drive-import-wizard.sh.tmpl.
  kc_token_svc="${KEI_FORGEJO_KC_TOKEN_SERVICE:-forgejo-api-token}"
  kc_pass_svc="${KEI_FORGEJO_KC_PASS_SERVICE:-forgejo-admin-password}"
-  # Detection: any rows beyond header in `admin user list`?
+
+  # v0.45 fix: Forgejo on first install needs `migrate` to create the sqlite
+  # schema. Without it, `admin user create` fails with "no such table: user"
+  # (verified bug 2026-05-26 in prod curl|bash test). `migrate` is idempotent
+  # — safe to re-run.
+  if ! forgejo --config "$config" migrate 2>/dev/null; then
+    warn "  → forgejo migrate failed; daemon may need restart before admin create"
+  fi
+
+  # Detection: any rows beyond header in `admin user list`? Now safe to
+  # parse since migrate has ensured the user table exists.
  user_count="$(forgejo --config "$config" admin user list 2>/dev/null \
    | tail -n +2 | grep -cv '^$' || echo 0)"
  if [ "$user_count" -gt 0 ]; then
--- a/install/lib-dev-hub-zoekt.sh
+++ b/install/lib-dev-hub-zoekt.sh
@ -41,13 +41,38 @@ _dhz_check_go_runtime() {
  fi
 }

-# Step b — brew install zoekt (idempotent).
+# Step b — install zoekt. Zoekt is NOT in homebrew/core, so after trying the
+# core formula this walks the known taps, then falls back to `go install`
+# (if Go is installed). On total failure it returns 2, intended to be treated
+# by the caller as "skip zoekt" rather than aborting the whole install.
+# v0.45 fix: prior version errored hard ("No formula") and bailed the entire
+# dev-hub install. Now degrades gracefully.
+# Returns: 0 = zoekt installed (or already present); 2 = unavailable.
 _dhz_brew_install() {
-  say "installing zoekt via brew (idempotent)"
-  if ! brew install zoekt; then
-    err "brew install zoekt failed — see brew log above"
-    return 1
+  say "installing zoekt (idempotent)"
+  if command -v zoekt-webserver >/dev/null 2>&1 && command -v zoekt-index >/dev/null 2>&1; then
+    say "  → zoekt already installed; skipping"
+    return 0
   fi
+  if brew install zoekt 2>/dev/null; then
+    say "  → installed via brew core"
+    return 0
+  fi
+  if brew install sourcegraph/zoekt/zoekt 2>/dev/null \
+     || brew install hyperdiscovery/zoekt/zoekt 2>/dev/null; then
+    say "  → installed via tap"
+    return 0
+  fi
+  if command -v go >/dev/null 2>&1; then
+    say "  → falling back to 'go install' from sourcegraph/zoekt"
+    if go install github.com/sourcegraph/zoekt/cmd/zoekt-webserver@latest \
+       && go install github.com/sourcegraph/zoekt/cmd/zoekt-index@latest; then
+      say "  → installed via go"
+      # `go install` writes to $GOBIN (default: first GOPATH entry + /bin),
+      # which is frequently not on PATH. Surface that now, because later
+      # steps that call the binaries by name would otherwise not find them.
+      if ! command -v zoekt-webserver >/dev/null 2>&1 \
+         || ! command -v zoekt-index >/dev/null 2>&1; then
+        local gobin
+        gobin="$(go env GOBIN 2>/dev/null || true)"
+        if [ -z "$gobin" ]; then
+          gobin="$(go env GOPATH 2>/dev/null || true)"
+          gobin="${gobin%%:*}/bin"
+        fi
+        warn "  → zoekt binaries are in $gobin, which is not on PATH; add it to PATH"
+      fi
+      return 0
+    fi
+  fi
+  warn "zoekt unavailable: brew core/taps failed and 'go install' fallback missing or failed."
+  warn "Skipping zoekt service install. Other dev-hub services continue."
+  warn "To install later: brew install --HEAD sourcegraph/zoekt/zoekt"
+  return 2  # signal partial — caller treats as skip, not fatal
 }

 # Step c — ensure data dir tree (+ index dir).
--- a/plugin.json
+++ b/plugin.json
@ -3,7 +3,7 @@
  "name": "keisei",
  "displayName": "KeiSei",
  "description": "Constructor Pattern multi-LLM agent substrate — 38 agents, 69 skills, 54 hooks, 86 blocks. Cross-CLI policy enforcement (Claude/Grok/Copilot/Agy/Kimi) via kei-mcp + kei_bash/kei_edit/kei_write. Rust primitives via classic ./install.sh.",
-  "version": "0.40.0",
+  "version": "0.45.0",
  "homepage": "https://keisei.app",
  "repository": "https://github.com/KeiSeiLab/KeiSeiKit-1.0.git",
  "author": {
--- a/scripts/kei-limits.sh
+++ b/scripts/kei-limits.sh
@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+# kei-limits — probe each installed CLI's remaining quota / balance.
+#
+# Reality (research 2026-05-26):
+#   • claude  — no programmatic API. Headers per-API-call only. Admin API
+#               exists but needs a separate admin key. See dashboard.
+#   • grok    — same as claude. Headers per-API-call only. No file.
+#   • agy     — interactive /usage slash-cmd is broken (shows 100% always,
+#               forum-verified bug). No public API.
+#   • copilot — no public quota API. github.com/settings/billing only.
+#               Inline output during call shows usage but nothing exposed
+#               for poll.
+#   • kimi    — Moonshot API /v1/users/me/balance returns $ balance only
+#               (no session/weekly quota). Requires MOONSHOT_API_KEY.
+#
+# Output:
+#   stdout: human summary (default) OR JSON (--json)
+#   file:   ~/.claude/pet/limits-cache.json (always, for pet to read)
+#
+# Polling: NOT poll-friendly. Run on demand or via launchd at >5 min intervals.
+# Pet's job: read the cache; pet does NOT call this script.
+
+set -u
+
+# v0.43-fix #4: refuse to run without jq (convention with 40+ sibling
+# scripts) — the Kimi probe, the JSON validator and the cache assembler
+# below all shell out to it.
+if ! command -v jq >/dev/null 2>&1; then
+  echo "kei-limits: jq required (brew install jq / apt install jq)" >&2
+  exit 1
+fi
+
+# Cache file location (overridable via KEI_LIMITS_CACHE); make sure its
+# directory exists before anything writes there.
+CACHE="${KEI_LIMITS_CACHE:-$HOME/.claude/pet/limits-cache.json}"
+mkdir -p "$(dirname "$CACHE")"
+
+# Flag parsing. Unrecognised arguments are silently ignored.
+JSON_OUT=0
+QUIET=0
+for opt in "$@"; do
+  if [ "$opt" = "--json" ]; then
+    JSON_OUT=1
+  elif [ "$opt" = "--quiet" ]; then
+    QUIET=1
+  elif [ "$opt" = "-h" ] || [ "$opt" = "--help" ]; then
+    # Help text = lines 2-22 of this file with the leading "# " stripped.
+    sed -n '2,22p' "$0" | sed 's|^# \{0,1\}||'
+    exit 0
+  fi
+done
+
+# --- per-CLI probes (each returns one JSON value to stdout) ----------------
+probe_claude() {
+  # Static marker only: no public API is queried, so no live data is emitted.
+  local marker='{"status":"no-api","note":"see claude.ai/settings/usage","dashboard":"https://claude.ai/settings/usage"}'
+  printf '%s' "$marker"
+}
+
+probe_grok() {
+  # Static marker only; nothing is fetched.
+  local marker='{"status":"no-api","note":"headers-only per API call; see x.ai dashboard","dashboard":"https://x.ai"}'
+  printf '%s' "$marker"
+}
+
+probe_agy() {
+  # Static marker only; nothing is fetched.
+  local marker='{"status":"broken-api","note":"interactive /usage shows 100% (forum-verified bug); use Google Cloud Console","dashboard":"https://console.cloud.google.com/apis/api/generativelanguage.googleapis.com/quotas"}'
+  printf '%s' "$marker"
+}
+
+probe_copilot() {
+  # Placeholder: gh CLI GraphQL variants mostly do not expose Copilot billing
+  # publicly. If an endpoint is ever found, query it here; until then emit a
+  # static status marker.
+  local marker='{"status":"no-api","note":"see github.com/settings/billing → Copilot section","dashboard":"https://github.com/settings/billing"}'
+  printf '%s' "$marker"
+}
+
+# Probe the Moonshot balance endpoint and print exactly one JSON object.
+# Outcomes, checked in this order:
+#   need-key     — MOONSHOT_API_KEY unset/empty
+#   no-curl      — curl not on PATH
+#   probe-failed — key has unsafe characters, empty response, or the response
+#                  carries no .data.available_balance
+#   live         — balances extracted from the response
+probe_kimi() {
+  if [ -z "${MOONSHOT_API_KEY:-}" ]; then
+    printf '%s' '{"status":"need-key","note":"set MOONSHOT_API_KEY in env to fetch live balance","dashboard":"https://platform.kimi.ai"}'
+    return
+  fi
+  if ! command -v curl >/dev/null 2>&1; then
+    printf '%s' '{"status":"no-curl","note":"curl required for live probe"}'
+    return
+  fi
+  # v0.44 fix #3 (Gemini HIGH): sanitize MOONSHOT_API_KEY before formatting.
+  # Was: token injected into a curl --config line via printf 'header = "...%s..."';
+  # if the token contained a double-quote + newline + 'url = "attacker"',
+  # curl would parse the injected config option and redirect the request.
+  # Now: validate the key matches a known-safe charset; reject otherwise.
+  case "$MOONSHOT_API_KEY" in
+    *[!A-Za-z0-9_.\-]*)
+      printf '%s' '{"status":"probe-failed","note":"MOONSHOT_API_KEY contains unsafe chars; expected [A-Za-z0-9_.-]"}'
+      return
+      ;;
+  esac
+  # The Authorization header is fed to curl as a config line on stdin
+  # (--config -), which keeps the token off curl's command line. Any curl
+  # failure collapses to an empty response.
+  local resp
+  resp=$(printf 'header = "Authorization: Bearer %s"\n' "$MOONSHOT_API_KEY" \
+    | curl -sS --max-time 5 --config - \
+        "https://api.moonshot.ai/v1/users/me/balance" 2>/dev/null \
+    || echo '')
+  if [ -z "$resp" ]; then
+    printf '%s' '{"status":"probe-failed","note":"no response (network / wrong key)"}'
+    return
+  fi
+  # A response without .data.available_balance (e.g. an error body) is
+  # reported as a failed probe rather than as a zero balance.
+  local avail
+  avail=$(printf '%s' "$resp" | jq -r '.data.available_balance // empty' 2>/dev/null)
+  if [ -z "$avail" ]; then
+    printf '%s' '{"status":"probe-failed","note":"API returned non-balance response"}'
+    return
+  fi
+  # Cash / voucher balances default to 0 when absent from the response.
+  local cash voucher
+  cash=$(printf '%s'   "$resp" | jq -r '.data.cash_balance // 0'    2>/dev/null)
+  voucher=$(printf '%s' "$resp" | jq -r '.data.voucher_balance // 0' 2>/dev/null)
+  # v0.43-fix #2: tonumber? swallows parse errors (was: tonumber threw on
+  # any non-numeric balance, emitted empty JSON, poisoned the assembler
+  # --argjson → whole cache wiped). Non-numeric values now become 0.
+  jq -n --arg s "live" --arg a "$avail" --arg c "$cash" --arg v "$voucher" \
+    '{status:$s, available_balance_usd:($a|tonumber? // 0), cash_balance_usd:($c|tonumber? // 0), voucher_balance_usd:($v|tonumber? // 0), dashboard:"https://platform.kimi.ai"}'
+}
+
+# --- assemble cache JSON ---------------------------------------------------
+# v0.43-fix #1: atomic stage-and-rename. Was: `jq > "$CACHE"` truncated the
+# cache BEFORE jq ran — a transient failure permanently wiped the cache.
+# Now: build in tmpfile, validate non-empty, then atomic mv. Preserves
+# last-known-good across probe failures.
+# v0.43-fix #2 (defense-in-depth): if any individual probe returns empty
+# string, substitute a status marker so --argjson never sees invalid JSON.
+
+# Pass a probe result through unchanged when it is non-empty, parseable JSON;
+# otherwise substitute a status marker so the caller always receives valid
+# JSON. $1 = raw probe output.
+_safe_json() {
+  local candidate="$1"
+  # Empty output → explicit marker.
+  [ -n "$candidate" ] || {
+    printf '%s' '{"status":"probe-empty","note":"probe returned empty result"}'
+    return
+  }
+  # Echo it back only if jq can parse it; otherwise flag it as invalid.
+  if printf '%s' "$candidate" | jq empty 2>/dev/null; then
+    printf '%s' "$candidate"
+  else
+    printf '%s' '{"status":"probe-invalid","note":"probe returned non-JSON"}'
+  fi
+}
+
+# Run every probe once; _safe_json guarantees each P_* holds valid JSON.
+P_CLAUDE=$(_safe_json "$(probe_claude)")
+P_GROK=$(_safe_json "$(probe_grok)")
+P_AGY=$(_safe_json "$(probe_agy)")
+P_COPILOT=$(_safe_json "$(probe_copilot)")
+P_KIMI=$(_safe_json "$(probe_kimi)")
+
+# UTC timestamp stored as .ts in the cache.
+NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+# Stage next to the cache file (same directory) so the final mv is a rename
+# within one directory.
+TMP=$(mktemp "${CACHE}.XXXXXX")
+# Replace the cache only if jq succeeded AND produced a non-empty file.
+if jq -n \
+    --arg ts "$NOW" \
+    --argjson claude  "$P_CLAUDE" \
+    --argjson grok    "$P_GROK" \
+    --argjson agy     "$P_AGY" \
+    --argjson copilot "$P_COPILOT" \
+    --argjson kimi    "$P_KIMI" \
+    '{ts:$ts, claude:$claude, grok:$grok, agy:$agy, copilot:$copilot, kimi:$kimi}' \
+    > "$TMP" 2>/dev/null \
+   && [ -s "$TMP" ]; then
+  mv -f "$TMP" "$CACHE"
+else
+  # Assembly failed: drop the staging file and leave any existing cache as is.
+  rm -f "$TMP" 2>/dev/null
+  echo "kei-limits: cache refresh failed — keeping previous cache" >&2
+  if [ ! -f "$CACHE" ]; then
+    # v0.44 fix #9 (Claude MED): failure-fallback must carry the SAME schema
+    # as the success cache (ts + 5 per-CLI keys). Was: emitted only {ts,
+    # status} which broke pet's .kimi.available_balance_usd read and the
+    # script's own per-CLI render loop. Now: full shape, all 5 marked
+    # status="assembly-failed".
+    # If jq itself fails here, the printf fallback writes the same shape
+    # (without the note fields).
+    jq -n '{ts:"",
+            claude:{status:"assembly-failed",note:"see logs"},
+            grok:{status:"assembly-failed",note:"see logs"},
+            agy:{status:"assembly-failed",note:"see logs"},
+            copilot:{status:"assembly-failed",note:"see logs"},
+            kimi:{status:"assembly-failed",note:"see logs"}}' \
+      > "$CACHE" 2>/dev/null \
+      || printf '%s\n' '{"ts":"","claude":{"status":"assembly-failed"},"grok":{"status":"assembly-failed"},"agy":{"status":"assembly-failed"},"copilot":{"status":"assembly-failed"},"kimi":{"status":"assembly-failed"}}' > "$CACHE"
+  fi
+fi
+
+# --- output ----------------------------------------------------------------
+# --json takes precedence over --quiet: dump the cache file verbatim and stop.
+[ "$JSON_OUT" = "1" ] && { cat "$CACHE"; exit 0; }
+
+# --quiet: cache has been refreshed above; print nothing.
+[ "$QUIET" = "1" ] && exit 0
+
+# ANSI colours only when stdout is a terminal; otherwise all stay empty.
+C0='' CB='' CG='' CY='' CR='' CD=''
+if [ -t 1 ]; then
+  C0=$'\033[0m'
+  CB=$'\033[1;38;5;39m'
+  CG=$'\033[32m'
+  CY=$'\033[33m'
+  CR=$'\033[31m'
+  CD=$'\033[2m'
+fi
+
+# Print one human-readable row for a CLI.
+#   $1 label — name shown in the row
+#   $2 key   — accepted for call-site symmetry; not used in the body
+#   $3 data  — that CLI's JSON object from the cache
+# "live" rows show the balance, "broken-api" rows get a red ✗, and every
+# other status gets a yellow ? followed by the note.
+format_one() {
+  local label="$1" key="$2" data="$3"
+  local state detail
+  state=$(printf '%s' "$data" | jq -r '.status')
+  detail=$(printf '%s' "$data" | jq -r '.note // ""')
+  if [ "$state" = "live" ]; then
+    local balance
+    balance=$(printf '%s' "$data" | jq -r '.available_balance_usd // empty')
+    printf "  ${CG}✓${C0} %-8s \$%-8s ${CD}live (Moonshot balance)${C0}\n" "$label" "$balance"
+  elif [ "$state" = "broken-api" ]; then
+    printf "  ${CR}✗${C0} %-8s ${CD}%s${C0}\n" "$label" "$detail"
+  else
+    printf "  ${CY}?${C0}  %-8s ${CD}%s${C0}\n" "$label" "$detail"
+  fi
+}
+
+cat <<EOF
+
+${CB}╔════════════════════════════════════════════════════════════╗
+║  KeiSeiKit · CLI subscription limits                         ║
+╚════════════════════════════════════════════════════════════╝${C0}
+
+EOF
+
+# One row per CLI, read from the cache file as it now stands on disk.
+snapshot=$(cat "$CACHE")
+for backend in claude grok agy copilot kimi; do
+  format_one "$backend" "$backend" "$(printf '%s' "$snapshot" | jq -c ".$backend")"
+done
+
+# Footer: where the cache lives plus a reminder about what is (not) probed.
+printf '\n'
+printf '%s\n' "${CD}cached: $CACHE${C0}"
+printf '%s\n' "${CD}note:   no CLI exposes session/weekly quota in a poll-friendly way.${C0}"
+printf '%s\n' "${CD}        See dashboards via 'open <url>' from --json output.${C0}"
--- a/scripts/kei-onboard.sh
+++ b/scripts/kei-onboard.sh
@ -0,0 +1,191 @@
+#!/usr/bin/env bash
+# kei-onboard — post-install wizard.
+#
+# Runs after install.sh / bootstrap.sh to guide the user through:
+#   Step 1: pick the primary LLM orchestrator (default for `kei` no-args)
+#   Step 2: wire kei-mcp into the installed CLIs (cross-CLI policy + spawn_agent)
+#   Step 3: optional MOONSHOT_API_KEY hint for kei limits
+#   Step 4: quick health check
+#
+# Idempotent — safe to re-run anytime via `kei onboard`.
+# Honors TTY gate: non-interactive runs print summary + exit, no prompts.
+
+# -e: any unguarded command failure aborts the wizard; -u: unset vars are errors.
+set -eu
+
+# Where the chosen primary is persisted (overridable via KEI_PRIMARY_CFG).
+KEI_PRIMARY_CFG="${KEI_PRIMARY_CFG:-$HOME/.claude/config/primary.toml}"
+# NOTE(review): PICK_SH is not referenced anywhere else in this script.
+PICK_SH="$HOME/.claude/scripts/kei-pick.sh"
+# Script run in Step 2 to wire kei-mcp into the CLIs.
+WIRE_SH="$HOME/.claude/scripts/kei-mcp-wire.sh"
+
+# Colors only if stdout is a TTY (TTY-INTERACTIVITY-GATE: -t 1 for color is OK).
+# All colour variables default to empty so they are safe to expand under -u.
+C0= CB= CC= CG= CD= CR=
+if [ -t 1 ]; then
+  C0=$'\033[0m'
+  CB=$'\033[1;38;5;39m'    # blue
+  CC=$'\033[1;38;5;220m'   # gold
+  CG=$'\033[32m'           # green
+  CR=$'\033[31m'           # red
+  CD=$'\033[2m'            # dim
+fi
+
+# Non-interactive (no stdin TTY): print summary + exit.
+# Per tty-interactivity-gate.md: -t 0 not -t 1.
+if [ ! -t 0 ]; then
+  cat <<EOF
+
+${CB}KeiSeiKit · onboarding${C0} (non-interactive — wizard skipped)
+
+Next manual steps:
+  ${CC}kei onboard${C0}       run this wizard interactively
+  ${CC}kei pick${C0}          pick primary LLM CLI
+  ${CC}kei mcp-wire${C0}      wire kei-mcp into your CLIs
+  ${CC}kei limits${C0}        check subscription quotas (honest report)
+  ${CC}kei-doctor${C0}        substrate health diagnostic
+
+EOF
+  exit 0
+fi
+
+# Banner
+cat <<EOF
+
+${CB}╔═══════════════════════════════════════════════════════════════════╗
+║  KeiSeiKit · post-install onboarding                                ║
+╚═══════════════════════════════════════════════════════════════════╝${C0}
+
+The install put 38 agents, 54 hooks, and 60+ Rust primitives in place.
+Now let's wire up the LLM CLIs you'll actually use.
+
+EOF
+
+# ── Step 1: pick primary ───────────────────────────────────────────
+# Lists the known backends (marking which are on PATH), shows the currently
+# configured primary, and writes the user's pick to $KEI_PRIMARY_CFG.
+# Leaves the resulting primary in $primary_set for the closing summary.
+echo "${CB}── Step 1/4 — Pick your primary LLM orchestrator ──${C0}"
+echo
+echo "When you run ${CC}kei${C0} (no args) it launches your primary CLI."
+echo "Each agent's manifest can also declare a preferred provider (DNA)."
+echo
+
+declare -a BACKENDS=(claude grok agy copilot kimi)
+
+# Human-readable description for one backend name ($1); prints nothing for
+# an unknown name.
+# A case lookup is used instead of `declare -A`: associative arrays need
+# bash >= 4, while stock macOS ships bash 3.2 as /bin/bash, where `declare -A`
+# is an error that aborts the wizard under `set -e`.
+_backend_label() {
+  case "$1" in
+    claude)  printf '%s' "Claude Code       (Anthropic, full hook enforcement)" ;;
+    grok)    printf '%s' "Grok              (xAI, native --agent flag)" ;;
+    agy)     printf '%s' "Antigravity       (Google Gemini)" ;;
+    copilot) printf '%s' "GitHub Copilot    (Microsoft, MCP-wrapped)" ;;
+    kimi)    printf '%s' "Kimi              (Moonshot, TUI-primary)" ;;
+  esac
+}
+
+i=1
+for b in "${BACKENDS[@]}"; do
+  if command -v "$b" >/dev/null 2>&1; then
+    mark="${CG}✓${C0}"
+  else
+    mark="${CR}✗${C0} ${CD}(not installed)${C0}"
+  fi
+  printf "  ${CB}%d${C0}) %s %-20s %s\n" "$i" "$mark" "$b" "$(_backend_label "$b")"
+  i=$((i+1))
+done
+echo "  ${CB}s${C0}) skip — keep current primary (claude default)"
+echo
+
+# Current primary from the config file, if any. An awk failure (for example
+# an unreadable file) must not abort the wizard under `set -e`.
+current=""
+if [ -f "$KEI_PRIMARY_CFG" ]; then
+  current=$(awk -F'=' '/^provider/ {gsub(/[" ]/, "", $2); print $2; exit}' "$KEI_PRIMARY_CFG") || current=""
+fi
+printf "Current primary: ${CC}%s${C0}\n" "${current:-claude (default)}"
+printf "Pick [1-${#BACKENDS[@]}/s, default=s]: "
+# `read` returns non-zero at EOF (Ctrl-D); treat that as the default answer
+# instead of letting `set -e` kill the wizard.
+read -r choice || choice=""
+choice="${choice:-s}"
+
+primary_set=""
+case "$choice" in
+  s|S)
+    echo "  ${CD}— keeping ${current:-claude}${C0}"
+    primary_set="${current:-claude}"
+    ;;
+  [1-9])
+    idx=$((choice-1))
+    if [ "$idx" -ge "${#BACKENDS[@]}" ]; then
+      echo "  ${CR}invalid; keeping ${current:-claude}${C0}"
+      primary_set="${current:-claude}"
+    else
+      new="${BACKENDS[$idx]}"
+      mkdir -p "$(dirname "$KEI_PRIMARY_CFG")"
+      printf '# kei primary — written %s by onboarding\nprovider = "%s"\n' \
+        "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$new" > "$KEI_PRIMARY_CFG"
+      echo "  ${CG}✓${C0} primary set: ${CC}${new}${C0} → $KEI_PRIMARY_CFG"
+      primary_set="$new"
+    fi
+    ;;
+  *)
+    echo "  ${CR}invalid; keeping ${current:-claude}${C0}"
+    primary_set="${current:-claude}"
+    ;;
+esac
+
+# ── Step 2: mcp-wire ───────────────────────────────────────────────
+# Asks for confirmation, then runs $WIRE_SH with no arguments. A failing
+# wire run is reported but does not abort the wizard, so Steps 3-4 and the
+# closing summary still run.
+echo
+echo "${CB}── Step 2/4 — Wire kei-mcp into installed CLIs ──${C0}"
+echo
+echo "kei-mcp exposes ${CC}spawn_agent${C0} + ${CC}kei_bash/kei_edit/kei_write${C0} (with"
+echo "policy chain) to any MCP-capable CLI. Enables cross-CLI agent invocation"
+echo "AND hook enforcement on non-Claude backends."
+echo
+printf "Run ${CC}kei mcp-wire${C0} now (writes to ~/.grok/, ~/.copilot/, etc.)? [Y/n]: "
+# EOF (Ctrl-D) is not consent: treat it as "no" rather than letting `set -e`
+# kill the wizard or silently applying the Y default.
+read -r wire_ans || wire_ans="n"
+wire_ans="${wire_ans:-Y}"
+case "$wire_ans" in
+  y|Y|yes|Yes|YES)
+    if [ -x "$WIRE_SH" ]; then
+      "$WIRE_SH" || echo "  ${CR}— kei mcp-wire exited non-zero; re-run later: ${CC}kei mcp-wire${C0}"
+    else
+      echo "  ${CR}— $WIRE_SH not found; skip${C0}"
+    fi
+    ;;
+  *)
+    echo "  ${CD}— skipped. Run later: ${CC}kei mcp-wire${C0}${CD}${C0}"
+    ;;
+esac
+
+# ── Step 3: MOONSHOT key hint ──────────────────────────────────────
+# Informational only: inspects the current environment for MOONSHOT_API_KEY
+# and prints a hint; nothing is prompted for or written here.
+echo
+echo "${CB}── Step 3/4 — Live subscription limits (optional) ──${C0}"
+echo
+echo "${CC}kei limits${C0} probes each CLI's subscription quota. Research found that"
+echo "only Kimi exposes a public API; the others are dashboard-only."
+echo
+if [ -n "${MOONSHOT_API_KEY:-}" ]; then
+  echo "  ${CG}✓${C0} MOONSHOT_API_KEY is set — Kimi balance probing enabled"
+else
+  # Key absent: point at where to set it.
+  cat <<EOF
+  ${CD}Optional: set ${CC}MOONSHOT_API_KEY${CD} in ${CC}~/.claude/secrets/.env${CD} to enable
+  Kimi balance polling. Other CLIs: see dashboards via ${CC}kei limits${CD}.${C0}
+EOF
+fi
+
+# ── Step 4: health check ───────────────────────────────────────────
+# Shows the first 20 lines of kei-doctor output when the binary is on PATH;
+# a non-zero exit from that pipeline is tolerated.
+echo
+echo "${CB}── Step 4/4 — Health check ──${C0}"
+echo
+if ! command -v kei-doctor >/dev/null 2>&1; then
+  echo "  ${CD}— kei-doctor not on PATH yet. Open new shell + run: ${CC}kei-doctor${C0}"
+else
+  kei-doctor 2>&1 | head -20 || true
+fi
+
+# ── Done ───────────────────────────────────────────────────────────
+cat <<EOF
+
+${CB}╔═══════════════════════════════════════════════════════════════════╗
+║  Onboarding complete.                                              ║
+╚═══════════════════════════════════════════════════════════════════╝${C0}
+
+Quick-start:
+  ${CC}kei${C0}                              launch ${primary_set} (your primary)
+  ${CC}kei agent critic "..."${C0}           invoke an agent (DNA → primary)
+  ${CC}kei agent --on=grok critic "..."${C0} invoke on a specific backend
+  ${CC}kei mcp-wire --list${C0}              show enforcement tiers per CLI
+  ${CC}kei limits${C0}                       quota report (where APIs exist)
+  ${CC}kei pick${C0}                         re-pick primary anytime
+  ${CC}kei configure${C0}                    re-pick hook packs / stack profile
+
+Docs:  ${CD}~/.local/share/keisei/docs/encyclopedia/${C0}
+Logs:  ${CD}~/.keisei-install.log${C0}
+
+EOF
--- a/scripts/keisei-pet.sh
+++ b/scripts/keisei-pet.sh
@ -127,6 +127,37 @@ fi
 [ -n "$spend" ] && global+="${spend} "
 global="${global% }"

+# v0.43: CLI subscription limits (best-effort).
+# Pet does NOT poll — reads cache only. Cache populated by `kei limits`.
+# Reality: 4 of 5 CLIs have no programmatic limit API (see research). Pet
+# shows only what's actually available + how stale the cache is.
+# Honors KEI_LIMITS_CACHE, the same override `kei limits` uses when writing
+# the cache, so a relocated cache is still found.
+limits_cache="${KEI_LIMITS_CACHE:-${HOME}/.claude/pet/limits-cache.json}"
+limits=""
+if [ -f "$limits_cache" ]; then
+  # Cache age in seconds.
+  cache_ts=$(jq -r '.ts // empty' "$limits_cache" 2>/dev/null)
+  if [ -n "$cache_ts" ]; then
+    # Convert ISO8601 to epoch (macOS + Linux compatible); 0 = unparseable.
+    cache_epoch=$(
+      date -j -u -f "%Y-%m-%dT%H:%M:%SZ" "$cache_ts" "+%s" 2>/dev/null \
+      || date -u -d "$cache_ts" "+%s" 2>/dev/null \
+      || echo 0
+    )
+    cache_age=$(( now - cache_epoch ))
+    # Kimi balance (only CLI with live API). Show $X.XX if available.
+    kimi_avail=$(jq -r '.kimi | select(.status=="live") | .available_balance_usd' "$limits_cache" 2>/dev/null)
+    if [ -n "$kimi_avail" ] && [ "$kimi_avail" != "null" ]; then
+      limits+="K:\$$(printf '%.2f' "$kimi_avail" 2>/dev/null) "
+    fi
+    # Stale marker if older than 1h. Skipped when the timestamp could not be
+    # parsed (epoch 0), which would otherwise render a nonsensical age.
+    if [ "$cache_epoch" -gt 0 ] 2>/dev/null \
+       && [ "$cache_age" -gt 3600 ] 2>/dev/null && [ -n "$limits" ]; then
+      stale_min=$((cache_age / 60))
+      limits="${limits% }${dim}(${stale_min}m old)${reset} "
+    fi
+  fi
+fi
+limits="${limits% }"
+
 # ── THIS session: tokens + context% (from statusLine stdin) ─────────────────
 sess=""
 if [ -n "$SLINE" ]; then
@ -172,6 +203,7 @@ proj="${PWD##*/}"; [ -z "$proj" ] && proj="~"
 out=""
 [ -n "$sess" ]   && out+="${sess}  "
 [ -n "$global" ] && out+="${dim}${global}${reset}  "
+[ -n "$limits" ] && out+="${dim}${limits}${reset}  "
 [ -n "$plan" ]   && out+="${plan} "
 out+="${color}${face}${reset}"
 [ -n "$message" ] && out+=" ${dim}${message}${reset}"