KeiSeiKit-1.0/_primitives/tomd.sh
Parfii-bot 954b8c1f3e sanitize: remove patent-metadata from main tree (Tier 1+2+3)
Pre-public-launch cleanup. 17 files touched. Grep verification confirms
only Tier 4 (intentional GTM attribution) remains: README + docs/PHILOSOPHY
credit to Denis Parfionovich / KeiLab.

## Tier 1 — INFRA-LEAKS (4 targets, 1 file)
- _blocks/ci-forgejo-actions.md: Tailscale IPs 100.91.246.53 removed,
  kgl-runner-01 → my-runner-01, SSH fingerprint line deleted, Forgejo
  topology description generalised to "private interface"

## Tier 2 — PATENT-FLAG PROSE (4 files, ~10 edits)
- _manifests/kei-{modal-runner,ml-implementer,infra-implementer}.toml:
  "proprietary/non-public-deploy" → "private/non-public-deploy"
- _blocks/ci-forgejo-actions.md: RULE 0.1 sensitive IP references softened
  to generic "sensitive IP / compliance / air-gap" framing

## Tier 3 — INTERNAL PROJECT NAMES (8 files)
- kei-provision/tests/backend_smoke.rs: kgl-* fixtures → test-srv-*/test-vultr
- kei-auth/tests/integration.rs: project: "kgl" → "demo"
- kei-memory/src/coaccess.rs: "PROJECT-C/Genesis" origin → "in-house implementation"
- _primitives/{tomd.sh,README.md}: PROJECT-D provenance removed
- _bridges/README.md: PROJECT-D cross-ref line deleted
- skills/site-create/: keiagent/fal.ai → generic AI-asset generator
- skills/self-audit/: hardcoded project paths → ~/Projects/my-project
- skills/compose-solution/: hardcoded ~/Projects/PROJECT-E →
  ${KEISEI_BUNDLE_PATH:-} env-conditional lookup
- skills/sleep-setup/: forgejo.example.com → forgejo.example.com

## Phase 2 — Regenerated 3 root .md (Option B manual)
Assembler invocation blocked by sandbox; fell back to manual Edit on
kei-ml-implementer.md + kei-infra-implementer.md + kei-modal-runner.md
with same Tier-2 replacements as their source manifests.

## Known residual (Phase 3 pending user decision)
Git history still contains 619+ patent-term hits (pre-rewrite). Filter-repo
on /tmp/keisei-mirror.git prepared by separate agent; force-push
pending user approval because `genesis-scan` / `genesis-leak-guard` are
intentional kit features — naive rewrite would break them.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 19:20:16 +08:00

186 lines
5.9 KiB
Bash
Executable file

#!/usr/bin/env bash
# tomd — universal non-native-format → markdown converter.
# First-class primitive. Universal non-native-format → markdown converter.
# Install path: $HOME/.claude/agents/_primitives/tomd.sh.
# Deps: pandoc, python3, jq. Optional: pymupdf4llm, openpyxl, tesseract.
set -euo pipefail
check_deps() {
local missing=()
command -v python3 >/dev/null 2>&1 || missing+=("python3 (system)")
command -v jq >/dev/null 2>&1 || missing+=("jq (brew install jq)")
if [ "${#missing[@]}" -gt 0 ]; then
echo "[tomd] missing core prerequisites:" >&2
for m in "${missing[@]}"; do echo "[tomd] - $m" >&2; done
echo "[tomd] hint: brew install jq && pip3 install pymupdf4llm openpyxl" >&2
exit 1
fi
}
need_pandoc() {
command -v pandoc >/dev/null 2>&1 && return 0
echo "[tomd] pandoc required for this format. Install: brew install pandoc" >&2
exit 1
}
detect_format() {
local f="$1"
[ "$f" = "-" ] && { echo stdin; return; }
local l; l=$(printf '%s' "$f" | tr '[:upper:]' '[:lower:]')
case "$l" in
*.pdf) echo pdf ;; *.docx) echo docx ;; *.doc) echo doc ;;
*.html|*.htm) echo html ;; *.pptx) echo pptx ;;
*.xlsx) echo xlsx ;; *.csv) echo csv ;;
*.json) echo "fence:json" ;; *.yaml|*.yml) echo "fence:yaml" ;;
*.xml) echo "fence:xml" ;; *.toml) echo "fence:toml" ;; *.sql) echo "fence:sql" ;;
*.png|*.jpg|*.jpeg|*.gif|*.webp|*.svg) echo image ;;
*.py) echo "fence:python" ;; *.go) echo "fence:go" ;;
*.ts|*.tsx) echo "fence:typescript" ;; *.js|*.jsx) echo "fence:javascript" ;;
*.rs) echo "fence:rust" ;; *.c|*.h) echo "fence:c" ;; *.cpp|*.hpp) echo "fence:cpp" ;;
*.swift) echo "fence:swift" ;; *.sh|*.bash|*.zsh) echo "fence:bash" ;;
*.zig) echo "fence:zig" ;; *.md) echo md ;; *) echo text ;;
esac
}
convert_pdf() {
python3 - "$1" <<'PYEOF'
import sys
p=sys.argv[1]
try: import pymupdf4llm; print(pymupdf4llm.to_markdown(p))
except ImportError:
try:
import fitz; doc=fitz.open(p)
for page in doc: print(page.get_text("text")); print()
except ImportError:
sys.stderr.write("[tomd] pdf: pip3 install pymupdf4llm\n"); sys.exit(1)
PYEOF
}
convert_pandoc() {
need_pandoc
local from="${2:-}"
if [ -n "$from" ]; then pandoc -f "$from" -t markdown --wrap=none "$1"
else pandoc -t markdown --wrap=none "$1"; fi
}
convert_doc() {
if ! command -v textutil >/dev/null 2>&1; then
echo "[tomd] .doc: textutil not available (macOS only). Convert to .docx first." >&2
exit 1
fi
need_pandoc
local tmp; tmp=$(mktemp /tmp/tomd-XXXX.html)
textutil -convert html -output "$tmp" "$1"
pandoc -f html -t markdown --wrap=none "$tmp"; rm -f "$tmp"
}
convert_csv() {
python3 - "$1" <<'PYEOF'
import csv, sys
with open(sys.argv[1]) as f: rows=list(csv.reader(f))
if not rows: sys.exit(0)
print('| '+' | '.join(rows[0])+' |')
print('|'+'|'.join(['---']*len(rows[0]))+'|')
for r in rows[1:]: print('| '+' | '.join(r)+' |')
PYEOF
}
convert_xlsx() {
python3 - "$1" <<'PYEOF'
import sys
try: import openpyxl
except ImportError: sys.stderr.write("[tomd] xlsx: pip3 install openpyxl\n"); sys.exit(1)
wb=openpyxl.load_workbook(sys.argv[1], data_only=True)
for name in wb.sheetnames:
ws=wb[name]; print(f"## Sheet: {name}\n")
rows=list(ws.iter_rows(values_only=True))
if not rows: continue
h=[str(c) if c is not None else '' for c in rows[0]]
print('| '+' | '.join(h)+' |'); print('|'+'|'.join(['---']*len(h))+'|')
for r in rows[1:]:
print('| '+' | '.join(str(c) if c is not None else '' for c in r)+' |')
print()
PYEOF
}
fence() { echo "\`\`\`$1"; cat "$2"; echo '```'; }
convert_json() {
echo '```json'
if [ "$1" = "-" ]; then jq '.' 2>/dev/null || cat
else jq '.' "$1" 2>/dev/null || cat "$1"; fi
echo '```'
}
convert_image() {
echo "![$(basename "$1")]($1)"
if command -v tesseract >/dev/null 2>&1; then
echo; echo "**OCR:**"; echo '```'
tesseract "$1" stdout 2>/dev/null || echo "(failed)"
echo '```'
fi
}
convert_stdin() {
local c; c=$(cat)
if printf '%s' "$c" | jq '.' >/dev/null 2>&1; then
echo '```json'; printf '%s' "$c" | jq '.'; echo '```'
else printf '%s\n' "$c"; fi
}
convert_single() {
local f="$1" fmt; fmt=$(detect_format "$f")
case "$fmt" in
pdf) convert_pdf "$f" ;;
docx) convert_pandoc "$f" ;;
doc) convert_doc "$f" ;;
html) convert_pandoc "$f" html ;;
pptx) convert_pandoc "$f" ;;
xlsx) convert_xlsx "$f" ;;
csv) convert_csv "$f" ;;
fence:json) convert_json "$f" ;;
fence:*) fence "${fmt#fence:}" "$f" ;;
image) convert_image "$f" ;;
md|text) cat "$f" ;;
stdin) convert_stdin ;;
*) echo "[tomd] unknown format for $f" >&2; return 1 ;;
esac
}
convert_dir() {
local dir="${1:-.}" outdir="${1:-.}/_md" count=0
mkdir -p "$outdir"
while IFS= read -r -d '' file; do
local fmt; fmt=$(detect_format "$file")
{ [ "$fmt" = "text" ] || [ "$fmt" = "md" ]; } && continue
local out="$outdir/$(basename "${file%.*}").md"
echo "[tomd] $file -> $out" >&2
convert_single "$file" > "$out" 2>/dev/null && count=$((count+1)) || true
done < <(find "$dir" -maxdepth 2 -type f -not -path '*/_md/*' -not -path '*/.git/*' -print0)
echo "[tomd] converted $count files -> $outdir" >&2
}
usage() { cat <<'EOF'
Usage: tomd <file> [--output out.md]
tomd --dir <folder>
cat file | tomd -
Converts non-native formats to markdown (for LLM ingestion).
Formats: PDF DOCX DOC HTML PPTX XLSX CSV JSON YAML XML TOML SQL
images(+OCR) code(py/go/ts/rs/c/swift/sh/zig) text
EOF
}
check_deps
case "${1:-}" in
-h|--help|help) usage; exit 0 ;;
--dir) convert_dir "${2:-.}"; exit 0 ;;
"") usage; exit 1 ;;
esac
output=""; file="$1"; shift
while [ $# -gt 0 ]; do
case "$1" in --output) output="$2"; shift 2 ;; *) shift ;; esac
done
if [ -n "$output" ]; then convert_single "$file" > "$output"; echo "[tomd] wrote: $output" >&2
else convert_single "$file"; fi