KeiSeiKit-1.0/_primitives/_rust/kei-llm-llamacpp/src/generate.rs
Parfii-bot 0be354a920 KeiSeiKit-public — clean state
Single-commit clean baseline after security scrub of niche-tells,
project codenames, internal jargon, and contributor-email leaks.

Contents:
- 100 Rust crates (_primitives/_rust/)
- 37 agent manifests (_manifests/) + generated specs (_generated/)
- 67 user-invocable skills (skills/)
- 33 hooks (hooks/)
- Composition blocks (_blocks/)
- Documentation (docs/, README.md)
- TS adapter packages (_ts_packages/)
- Assembler (_assembler/)
- Roles (_roles/)
- Templates (_templates/)
- Forgejo CI (.forgejo/)

Author: Denis Parfionovich <info@greendragon.info>

License: see LICENSE.
2026-05-01 12:09:03 +08:00

136 lines
4.2 KiB
Rust

//! Generate — non-streaming `llama-cli` invocation.
//!
//! Shells out to `llama-cli -m <path> -p <prompt> -n <n> --no-display-prompt`
//! (plus optional `--temp <t>`) and parses the trailing timing footer. Output:
//! `Response { text, eval_tokens, eval_ms, tokens_per_sec }`.
//!
//! Footer format (llama.cpp >= b3000): line of the form
//! "llama_perf_context_print: eval time = 1234.56 ms / 12 runs ..."
//! We tolerate older builds that emit "llama_print_timings" with the
//! same fields.
use crate::error::{Error, Result};
use crate::runner::Runner;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::path::Path;
/// Tunable knobs for a single generate call.
///
/// The `Default` value is `max_tokens = 128` with no temperature override.
#[derive(Debug, Clone)]
pub struct GenerateOpts {
    /// Token budget; `build_args` forwards it as the value of `-n`.
    pub max_tokens: u32,
    /// Sampling temperature. When `Some`, `build_args` appends `--temp <t>`;
    /// when `None`, the flag is left off the command line.
    pub temperature: Option<f32>,
}

impl Default for GenerateOpts {
    /// 128 tokens, temperature left unset.
    fn default() -> Self {
        GenerateOpts {
            temperature: None,
            max_tokens: 128,
        }
    }
}
/// Parsed result from a non-streaming `llama-cli` run.
///
/// Built by `parse_stdout`; derives serde `Serialize`/`Deserialize` so it
/// can be passed across a serialization boundary.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Response {
/// Answer text: stdout up to the timing footer, with surrounding
/// whitespace trimmed.
pub text: String,
/// The run count captured from the `eval time = ... ms / N runs` footer.
pub eval_tokens: u32,
/// The eval time captured from the same footer line, in milliseconds.
pub eval_ms: f64,
/// `eval_tokens * 1000 / eval_ms`, or `0.0` when `eval_ms` is not positive.
pub tokens_per_sec: f64,
}
/// Assemble the `llama-cli` argument vector for a one-shot run.
///
/// Always emits `-m <model> -p <prompt> -n <max_tokens> --no-display-prompt`
/// in that order; `--temp <t>` is appended only when `opts.temperature` is
/// set. The model path is converted with `to_string_lossy`, so a path that
/// is not valid UTF-8 has the offending bytes replaced.
pub fn build_args(model: &Path, prompt: &str, opts: &GenerateOpts) -> Vec<String> {
    // 7 fixed arguments plus the optional `--temp <t>` pair.
    let mut argv: Vec<String> = Vec::with_capacity(9);
    argv.extend([String::from("-m"), model.to_string_lossy().into_owned()]);
    argv.extend([String::from("-p"), prompt.to_owned()]);
    argv.extend([String::from("-n"), opts.max_tokens.to_string()]);
    argv.push(String::from("--no-display-prompt"));
    if let Some(temp) = opts.temperature {
        argv.extend([String::from("--temp"), temp.to_string()]);
    }
    argv
}
/// Run a non-streaming generate via the supplied Runner.
pub async fn generate<R: Runner + ?Sized>(
runner: &R,
bin: &str,
model: &Path,
prompt: &str,
opts: &GenerateOpts,
) -> Result<Response> {
if !model.exists() {
return Err(Error::ModelNotFound { path: model.to_path_buf() });
}
let args = build_args(model, prompt, opts);
let out = runner.run(bin, &args).await?;
if out.code != 0 {
return Err(Error::NonZeroExit { code: out.code, stderr: out.stderr });
}
parse_stdout(&out.stdout, &out.stderr)
}
/// Turn captured `llama-cli` output into a `Response`.
///
/// The answer text is whatever precedes the timing footer on stdout,
/// trimmed. Timings are searched for in stdout followed by stderr, so the
/// footer is found whichever stream it was written to. Throughput is
/// `eval_tokens * 1000 / eval_ms`, reported as `0.0` when `eval_ms` is not
/// positive.
///
/// # Errors
/// Propagates the error from `parse_timings` when no timing footer can be
/// parsed out of the combined output.
pub fn parse_stdout(stdout: &str, stderr: &str) -> Result<Response> {
    let haystack = format!("{stdout}\n{stderr}");
    let (eval_tokens, eval_ms) = parse_timings(&haystack)?;

    // Only the answer half is needed here; the footer half is discarded.
    let answer = split_text_and_footer(stdout).0;

    // Guard the division so a zero-duration eval reports 0 tokens/sec.
    let tokens_per_sec = if eval_ms > 0.0 {
        f64::from(eval_tokens) * 1000.0 / eval_ms
    } else {
        0.0
    };

    Ok(Response {
        text: answer.trim().to_owned(),
        eval_tokens,
        eval_ms,
        tokens_per_sec,
    })
}
/// Partition stdout into `(answer_text, footer_block)`.
///
/// The footer begins at the first line that starts with `llama_perf` or
/// `llama_print_timings` and runs to the end of the input; everything
/// before it is answer text. Lines are re-joined with a single `\n` after
/// each one, so both halves end with a newline when non-empty and `\r\n`
/// endings are normalised to `\n`.
fn split_text_and_footer(stdout: &str) -> (String, String) {
    let lines: Vec<&str> = stdout.lines().collect();

    // Index of the first footer line, or "past the end" when there is none.
    let cut = lines
        .iter()
        .position(|line| line.starts_with("llama_perf") || line.starts_with("llama_print_timings"))
        .unwrap_or(lines.len());
    let (answer, footer) = lines.split_at(cut);

    let rejoin = |part: &[&str]| {
        part.iter().fold(String::new(), |mut acc, line| {
            acc.push_str(line);
            acc.push('\n');
            acc
        })
    };
    (rejoin(answer), rejoin(footer))
}
/// Pull `(eval_tokens, eval_ms)` out of the timing footer.
/// Format: `eval time = 1234.56 ms / 12 runs`.
fn parse_timings(combined: &str) -> Result<(u32, f64)> {
let re = Regex::new(r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*runs?").map_err(|e| {
Error::ParseFailed { reason: format!("regex compile: {e}") }
})?;
let cap = re
.captures(combined)
.ok_or_else(|| Error::ParseFailed { reason: "no eval-time footer".into() })?;
let ms: f64 = cap[1]
.parse()
.map_err(|e| Error::ParseFailed { reason: format!("ms parse: {e}") })?;
let tokens: u32 = cap[2]
.parse()
.map_err(|e| Error::ParseFailed { reason: format!("tokens parse: {e}") })?;
Ok((tokens, ms))
}