From b5da1940e1dd8d4cf045f7e30322dafd97bbdac8 Mon Sep 17 00:00:00 2001 From: Parfii-bot Date: Tue, 12 May 2026 13:47:35 +0800 Subject: [PATCH] feat(kei-tts + kei-stt): TTS/STT abstractions with 4+3 backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two parallel atomars in the kei-buddy phase-1 plan. Mirror each other's architecture: trait + feature-gated backend modules + env-driven dispatch + wiremock tests for HTTP backends + subprocess-error test for local. ## kei-tts (text-to-speech) LOC: 959 across 15 files (largest src/lib.rs 121). Trait `TtsBackend` + 4 backends behind feature flags: * elevenlabs — POST api.elevenlabs.io/v1/text-to-speech/{voice}/stream * openai — POST api.openai.com/v1/audio/speech (tts-1, tts-1-hd) * google — POST texttospeech.googleapis.com/v1/text:synthesize (Wavenet voices, base64 audioContent) * piper — local subprocess to piper-tts binary, raw PCM out Default features: ["piper"]. all-backends feature gates the rest. `from_env()` reads KEI_TTS_BACKEND (default piper). Returns Box. Tests: 9 passed (env routing + 3 wiremock backends + piper subprocess error). ## kei-stt (speech-to-text) LOC: 935 across 13 files (largest whisper_local.rs 181). Trait `SttBackend` + 3 backends: * whisper-local — subprocess to `whisper` CLI / faster-whisper, reads JSON output, parses segments * deepgram — POST api.deepgram.com/v1/listen (Token auth header, raw audio body, parses words → Segments) * openai-whisper — POST api.openai.com/v1/audio/transcriptions (multipart file + model=whisper-1 + response_format=verbose_json) Default features: ["whisper-local"]. all-backends gates the rest. `from_env()` reads KEI_STT_BACKEND (default whisper-local). Tests: 10 passed + 1 doc-test (env routing + 5 wiremock + 2 JSON parsers + 1 subprocess error + 1 auth-header check). 
## Common architecture decisions * `with_base_url(url)` constructor on each HTTP backend for wiremock testability — same pattern as kei-llm-router and kei-notify-telegram. * `tempfile` crate added to kei-stt for whisper-local audio scratch. * `base64 = { version = "0.22", optional = true }` in kei-tts for Google's base64-encoded audioContent. ## Verify-before-commit (RULE 0.13 §) * cargo check -p kei-tts (default + all-backends): PASS * cargo check -p kei-stt (default + all-backends): PASS * cargo test -p kei-tts --features all-backends --lib: 9/0 * cargo test -p kei-stt --features all-backends --lib: 10/0 * cargo check --workspace: PASS STATUS-TRUTH from both agents: shipped=functional, stubs=0, behaviour-verified=yes. ## Follow-up (deferred, non-blocking) * Real backend verification needs API keys for ElevenLabs / OpenAI / Google / Deepgram and piper-tts binary + .onnx model on PATH. * whisper-local language_detected always None — whisper CLI JSON schema differs across versions, parse heuristic to be added. * faster-whisper has different JSON schema from openai-whisper; current parser covers openai-whisper convention only. 
--- _primitives/_rust/Cargo.lock | 30 +++ _primitives/_rust/Cargo.toml | 4 + _primitives/_rust/kei-stt/Cargo.toml | 39 ++++ _primitives/_rust/kei-stt/README.md | 66 +++++++ _primitives/_rust/kei-stt/src/deepgram.rs | 115 +++++++++++ .../_rust/kei-stt/src/deepgram_test.rs | 93 +++++++++ _primitives/_rust/kei-stt/src/error.rs | 43 +++++ _primitives/_rust/kei-stt/src/lib.rs | 88 +++++++++ .../_rust/kei-stt/src/openai_whisper.rs | 138 +++++++++++++ .../_rust/kei-stt/src/openai_whisper_test.rs | 71 +++++++ _primitives/_rust/kei-stt/src/request.rs | 32 ++++ _primitives/_rust/kei-stt/src/response.rs | 45 +++++ _primitives/_rust/kei-stt/src/trait_def.rs | 24 +++ .../_rust/kei-stt/src/whisper_local.rs | 181 ++++++++++++++++++ _primitives/_rust/kei-tts/Cargo.toml | 40 ++++ _primitives/_rust/kei-tts/README.md | 53 +++++ _primitives/_rust/kei-tts/src/elevenlabs.rs | 94 +++++++++ .../_rust/kei-tts/src/elevenlabs_test.rs | 58 ++++++ _primitives/_rust/kei-tts/src/error.rs | 39 ++++ _primitives/_rust/kei-tts/src/google.rs | 103 ++++++++++ _primitives/_rust/kei-tts/src/google_test.rs | 65 +++++++ _primitives/_rust/kei-tts/src/lib.rs | 121 ++++++++++++ _primitives/_rust/kei-tts/src/openai.rs | 99 ++++++++++ _primitives/_rust/kei-tts/src/openai_test.rs | 58 ++++++ _primitives/_rust/kei-tts/src/piper.rs | 115 +++++++++++ _primitives/_rust/kei-tts/src/request.rs | 66 +++++++ _primitives/_rust/kei-tts/src/response.rs | 22 +++ _primitives/_rust/kei-tts/src/trait_def.rs | 24 +++ 28 files changed, 1926 insertions(+) create mode 100644 _primitives/_rust/kei-stt/Cargo.toml create mode 100644 _primitives/_rust/kei-stt/README.md create mode 100644 _primitives/_rust/kei-stt/src/deepgram.rs create mode 100644 _primitives/_rust/kei-stt/src/deepgram_test.rs create mode 100644 _primitives/_rust/kei-stt/src/error.rs create mode 100644 _primitives/_rust/kei-stt/src/lib.rs create mode 100644 _primitives/_rust/kei-stt/src/openai_whisper.rs create mode 100644 
_primitives/_rust/kei-stt/src/openai_whisper_test.rs create mode 100644 _primitives/_rust/kei-stt/src/request.rs create mode 100644 _primitives/_rust/kei-stt/src/response.rs create mode 100644 _primitives/_rust/kei-stt/src/trait_def.rs create mode 100644 _primitives/_rust/kei-stt/src/whisper_local.rs create mode 100644 _primitives/_rust/kei-tts/Cargo.toml create mode 100644 _primitives/_rust/kei-tts/README.md create mode 100644 _primitives/_rust/kei-tts/src/elevenlabs.rs create mode 100644 _primitives/_rust/kei-tts/src/elevenlabs_test.rs create mode 100644 _primitives/_rust/kei-tts/src/error.rs create mode 100644 _primitives/_rust/kei-tts/src/google.rs create mode 100644 _primitives/_rust/kei-tts/src/google_test.rs create mode 100644 _primitives/_rust/kei-tts/src/lib.rs create mode 100644 _primitives/_rust/kei-tts/src/openai.rs create mode 100644 _primitives/_rust/kei-tts/src/openai_test.rs create mode 100644 _primitives/_rust/kei-tts/src/piper.rs create mode 100644 _primitives/_rust/kei-tts/src/request.rs create mode 100644 _primitives/_rust/kei-tts/src/response.rs create mode 100644 _primitives/_rust/kei-tts/src/trait_def.rs diff --git a/_primitives/_rust/Cargo.lock b/_primitives/_rust/Cargo.lock index 4f7274b..c50c64c 100644 --- a/_primitives/_rust/Cargo.lock +++ b/_primitives/_rust/Cargo.lock @@ -4393,6 +4393,21 @@ dependencies = [ "toml", ] +[[package]] +name = "kei-stt" +version = "0.1.0" +dependencies = [ + "async-trait", + "reqwest 0.12.28", + "serde", + "serde_json", + "tempfile", + "thiserror 1.0.69", + "tokio", + "tracing", + "wiremock", +] + [[package]] name = "kei-svc-systemd" version = "0.1.0" @@ -4455,6 +4470,21 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "kei-tts" +version = "0.1.0" +dependencies = [ + "async-trait", + "base64 0.22.1", + "reqwest 0.12.28", + "serde", + "serde_json", + "thiserror 1.0.69", + "tokio", + "tracing", + "wiremock", +] + [[package]] name = "kei-tty" version = "0.1.0" diff --git 
a/_primitives/_rust/Cargo.toml b/_primitives/_rust/Cargo.toml index a7e8de2..038fde5 100644 --- a/_primitives/_rust/Cargo.toml +++ b/_primitives/_rust/Cargo.toml @@ -183,6 +183,10 @@ members = [ "kei-buddy", # Inbound Telegram webhook handler — parses Update payloads into typed WebhookEvent "kei-telegram-webhook", + # TTS abstraction — 4 backends (ElevenLabs/OpenAI/Google/Piper) behind feature flags + "kei-tts", + # STT abstraction — 3 backends (whisper-local/Deepgram/OpenAI-Whisper) behind feature flags + "kei-stt", ] [workspace.package] diff --git a/_primitives/_rust/kei-stt/Cargo.toml b/_primitives/_rust/kei-stt/Cargo.toml new file mode 100644 index 0000000..f95f418 --- /dev/null +++ b/_primitives/_rust/kei-stt/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "kei-stt" +version = "0.1.0" +edition.workspace = true +rust-version.workspace = true +description = "Speech-to-text abstraction trait with 3 backends (whisper-local/Deepgram/OpenAI). Default = whisper-local (free, local)." +authors.workspace = true +license.workspace = true + +[lib] +name = "kei_stt" +path = "src/lib.rs" + +[dependencies] +async-trait = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "process", "io-util", "fs"] } +reqwest = { workspace = true, features = ["multipart"] } +tracing = "0.1" +tempfile = { workspace = true } + +[features] +default = ["whisper-local"] +whisper-local = [] +deepgram = [] +openai-whisper = [] +all-backends = ["whisper-local", "deepgram", "openai-whisper"] + +[dev-dependencies] +wiremock = { workspace = true } +tokio = { workspace = true } + +[package.metadata.keisei] +maturity = "alpha" +trait = "SttBackend" +description = "STT abstraction with 3 backends (whisper-local/Deepgram/OpenAI). Default = whisper-local." 
+authors = ["Denis Parfionovich "] diff --git a/_primitives/_rust/kei-stt/README.md b/_primitives/_rust/kei-stt/README.md new file mode 100644 index 0000000..f64e923 --- /dev/null +++ b/_primitives/_rust/kei-stt/README.md @@ -0,0 +1,66 @@ +# kei-stt + +Speech-to-text abstraction crate with 3 backends selected at runtime via +`KEI_STT_BACKEND`. Default backend is **whisper-local** (free, local, no API key). + +## Backend matrix + +| Backend | Feature flag | Cost | Latency | Quality | +|------------------|------------------|----------------|-------------|-----------| +| `whisper-local` | `whisper-local` | Free | 1–10× RT | Very good | +| `deepgram` | `deepgram` | ~$0.0043/min | 200–500 ms | Excellent | +| `openai-whisper` | `openai-whisper` | ~$0.006/min | 300–800 ms | Excellent | + +RT = real-time factor (depends on hardware / model size for whisper-local). + +## Environment variables + +| Variable | Backend | Required | Description | +|----------------------------|-----------------|----------|------------------------------------------| +| `KEI_STT_BACKEND` | all | No | `whisper-local` (default) / `deepgram` / `openai-whisper` | +| `KEI_STT_WHISPER_BINARY` | whisper-local | No | Path to `whisper` CLI (default: PATH) | +| `KEI_STT_WHISPER_MODEL` | whisper-local | No | Model name (default: `base.en`) | +| `DEEPGRAM_API_KEY` | deepgram | Yes | Deepgram API key | +| `OPENAI_API_KEY` | openai-whisper | Yes | OpenAI API key | + +## Usage + +```toml +[dependencies] +kei-stt = { path = "../kei-stt", features = ["whisper-local"] } +``` + +```rust +#[tokio::main] +async fn main() -> Result<(), kei_stt::SttError> { + let backend = kei_stt::from_env()?; + let audio = std::fs::read("speech.wav").unwrap(); + let req = kei_stt::SttRequest::new_wav(audio); + let resp = backend.transcribe(&req).await?; + println!("[{}] {}", backend.name(), resp.text); + for seg in &resp.segments { + println!(" {:>6}ms–{:>6}ms {}", seg.start_ms, seg.end_ms, seg.text); + } + Ok(()) +} +``` + +## 
Compile-time features + +```toml +# All backends: +kei-stt = { features = ["all-backends"] } +# Cloud only, no local whisper: +kei-stt = { features = ["deepgram", "openai-whisper"], default-features = false } +``` + +## whisper-local prerequisites + +Install the `openai-whisper` Python package: + +```sh +pip install openai-whisper +``` + +This makes the `whisper` CLI available. Alternatively point `KEI_STT_WHISPER_BINARY` +at a compatible binary (`faster-whisper`, etc. with identical CLI interface). diff --git a/_primitives/_rust/kei-stt/src/deepgram.rs b/_primitives/_rust/kei-stt/src/deepgram.rs new file mode 100644 index 0000000..3dd3d8e --- /dev/null +++ b/_primitives/_rust/kei-stt/src/deepgram.rs @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! Deepgram STT backend — calls `api.deepgram.com/v1/listen`. +//! +//! Endpoint: `POST /v1/listen?language={lang}&punctuate=true` +//! Auth: `Authorization: Token {DEEPGRAM_API_KEY}` header. +//! Body: raw audio bytes with the request MIME type. +//! +//! Response shape: +//! ```json +//! {"results":{"channels":[{"alternatives":[{ +//! "transcript":"...", +//! "words":[{"word":"...","start":0.1,"end":0.4}] +//! }]}]}} +//! ``` +//! +//! Constructor surface: +//! * [`DeepgramBackend::from_env`] — reads `DEEPGRAM_API_KEY`. +//! * [`DeepgramBackend::with_base_url`] — explicit URL + key (tests). + +#![cfg(feature = "deepgram")] + +use crate::error::SttError; +use crate::request::SttRequest; +use crate::response::{Segment, SttResponse}; +use crate::trait_def::SttBackend; + +const DEFAULT_BASE_URL: &str = "https://api.deepgram.com"; + +pub struct DeepgramBackend { + api_key: String, + client: reqwest::Client, + base_url: String, +} + +impl DeepgramBackend { + /// Build from explicit base URL and API key (used in wiremock tests). 
+ pub fn with_base_url( + base_url: impl Into<String>, + api_key: impl Into<String>, + ) -> Self { + Self { + api_key: api_key.into(), + client: reqwest::Client::new(), + base_url: base_url.into().trim_end_matches('/').to_string(), + } + } + + /// Build from the `DEEPGRAM_API_KEY` env var; fails with `SttError::MissingEnv` when it cannot be read. + pub fn from_env() -> Result<Self, SttError> { + let key = std::env::var("DEEPGRAM_API_KEY") + .map_err(|_| SttError::MissingEnv("DEEPGRAM_API_KEY".into()))?; + Ok(Self::with_base_url(DEFAULT_BASE_URL, key)) + } +} + +#[async_trait::async_trait] +impl SttBackend for DeepgramBackend { + fn name(&self) -> &'static str { "deepgram" } + + async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError> { + let mut query = vec![("punctuate", "true")]; + if let Some(lang) = &req.language { + query.push(("language", lang.as_str())); + } + let resp = self.client + .post(format!("{}/v1/listen", self.base_url)) + .query(&query) // percent-encodes the language hint instead of splicing it into the URL + .header("Authorization", format!("Token {}", self.api_key)) + .header("Content-Type", &req.mime_type) + .body(req.audio_bytes.clone()) + .send() + .await?; + + if !resp.status().is_success() { + let status = resp.status().as_u16(); + let text = resp.text().await.unwrap_or_default(); + return Err(SttError::Http(format!("http {status}: {text}"))); + } + + let body: serde_json::Value = resp.json().await + .map_err(|e| SttError::InvalidResponse(e.to_string()))?; + + parse_deepgram_response(&body) + } +} + +fn parse_deepgram_response(body: &serde_json::Value) -> Result<SttResponse, SttError> { + let alt = body + .pointer("/results/channels/0/alternatives/0") + .ok_or_else(|| SttError::InvalidResponse("missing alternatives".into()))?; + + let text = alt["transcript"] + .as_str() + .unwrap_or_default() + .to_string(); + + let segments = alt["words"] + .as_array() + .unwrap_or(&vec![]) + .iter() + .filter_map(|w| { + let start_ms = (w["start"].as_f64()? * 1000.0).round() as u64; // round: 1.005 s is 1005 ms, truncation gives 1004 + let end_ms = (w["end"].as_f64()?
* 1000.0).round() as u64; + let word = w["word"].as_str()?.to_string(); + Some(Segment { start_ms, end_ms, text: word }) + }) + .collect(); + + Ok(SttResponse { text, segments, language_detected: None }) +} + +#[cfg(test)] +#[path = "deepgram_test.rs"] +mod tests; diff --git a/_primitives/_rust/kei-stt/src/deepgram_test.rs b/_primitives/_rust/kei-stt/src/deepgram_test.rs new file mode 100644 index 0000000..f02a1cf --- /dev/null +++ b/_primitives/_rust/kei-stt/src/deepgram_test.rs @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! Wiremock tests for `DeepgramBackend`. +//! +//! Verifies request headers, URL parameters, and response parsing. + +#![cfg(all(test, feature = "deepgram"))] + +use wiremock::matchers::{header, header_regex, method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::deepgram::DeepgramBackend; +use crate::request::SttRequest; +use crate::trait_def::SttBackend; + +fn deepgram_response_body() -> serde_json::Value { + serde_json::json!({ + "results": { + "channels": [{ + "alternatives": [{ + "transcript": "hello deepgram", + "words": [ + {"word": "hello", "start": 0.1, "end": 0.5}, + {"word": "deepgram", "start": 0.6, "end": 1.1} + ] + }] + }] + } + }) +} + +#[tokio::test] +async fn deepgram_parses_transcript() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/listen")) + .and(header_regex("authorization", "Token .+")) + .respond_with( + ResponseTemplate::new(200) + .set_body_json(deepgram_response_body()), + ) + .mount(&server) + .await; + + let backend = DeepgramBackend::with_base_url(server.uri(), "test-key"); + let req = SttRequest { + audio_bytes: b"fake_audio".to_vec(), + mime_type: "audio/wav".to_string(), + language: None, + }; + let resp = backend.transcribe(&req).await.expect("transcribe should succeed"); + assert_eq!(resp.text, "hello deepgram"); + assert_eq!(resp.segments.len(), 2); + assert_eq!(resp.segments[0].start_ms, 100); +
assert_eq!(resp.segments[1].end_ms, 1100); + assert!(resp.language_detected.is_none()); +} + +#[tokio::test] +async fn deepgram_sends_auth_header() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/listen")) + .and(header("authorization", "Token secret-key")) + .respond_with( + ResponseTemplate::new(200) + .set_body_json(deepgram_response_body()), + ) + .mount(&server) + .await; + + let backend = DeepgramBackend::with_base_url(server.uri(), "secret-key"); + let req = SttRequest::new_wav(b"audio".to_vec()); + backend.transcribe(&req).await.expect("auth header test should pass"); +} + +#[tokio::test] +async fn deepgram_http_error() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/listen")) + .respond_with(ResponseTemplate::new(401).set_body_string("Unauthorized")) + .mount(&server) + .await; + + let backend = DeepgramBackend::with_base_url(server.uri(), "bad-key"); + let req = SttRequest::new_wav(b"audio".to_vec()); + let err = backend.transcribe(&req).await.expect_err("should fail on 401"); + assert!(matches!(err, crate::SttError::Http(_))); +} diff --git a/_primitives/_rust/kei-stt/src/error.rs b/_primitives/_rust/kei-stt/src/error.rs new file mode 100644 index 0000000..c973e08 --- /dev/null +++ b/_primitives/_rust/kei-stt/src/error.rs @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `SttError` — crate-level error enum for all STT backends. +//! +//! Each variant carries a human-readable string so call-sites can log +//! without leaking transport internals. `thiserror` provides `Display` +//! and the `Error` trait automatically. + +use thiserror::Error; + +/// Errors that can occur across any STT backend. +#[derive(Debug, Error)] +pub enum SttError { + /// HTTP transport or API error from a cloud backend. + #[error("http: {0}")] + Http(String), + + /// Subprocess (whisper CLI) spawn or IO error. 
+ #[error("subprocess: {0}")] + Subprocess(String), + + /// Required environment variable is absent. + #[error("missing env var: {0}")] + MissingEnv(String), + + /// Backend name was requested but its Cargo feature is not compiled in. + #[error("backend not enabled: {0}")] + BackendNotEnabled(String), + + /// Unexpected or malformed response from a backend. + #[error("invalid response: {0}")] + InvalidResponse(String), + + /// Input audio bytes are invalid or in an unsupported format. + #[error("invalid audio: {0}")] + InvalidAudio(String), +} + +impl From<reqwest::Error> for SttError { + fn from(e: reqwest::Error) -> Self { + SttError::Http(e.to_string()) // lets HTTP backends use `?` on reqwest calls + } +} diff --git a/_primitives/_rust/kei-stt/src/lib.rs b/_primitives/_rust/kei-stt/src/lib.rs new file mode 100644 index 0000000..fa6ac33 --- /dev/null +++ b/_primitives/_rust/kei-stt/src/lib.rs @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `kei-stt` — speech-to-text abstraction with 3 backend impls. +//! +//! Backend is chosen at runtime via `KEI_STT_BACKEND` env var. +//! Compile-time feature flags gate which backends are available. +//! +//! | Feature | Backend | Default | +//! |------------------|---------------|---------| +//! | `whisper-local` | local subprocess | ✓ | +//! | `deepgram` | cloud API | – | +//! | `openai-whisper` | cloud API | – | +//! +//! # Quick start +//! ```no_run +//! # async fn example() -> Result<(), kei_stt::SttError> { +//! let backend = kei_stt::from_env()?; +//! let audio = std::fs::read("speech.wav").unwrap(); +//! let req = kei_stt::SttRequest::new_wav(audio); +//! let resp = backend.transcribe(&req).await?; +//! println!("transcript: {}", resp.text); +//! # Ok(()) } +//!
``` + +pub mod error; +pub mod request; +pub mod response; +pub mod trait_def; + +#[cfg(feature = "whisper-local")] +pub mod whisper_local; +#[cfg(feature = "deepgram")] +pub mod deepgram; +#[cfg(feature = "openai-whisper")] +pub mod openai_whisper; + +pub use error::SttError; +pub use request::SttRequest; +pub use response::{Segment, SttResponse}; +pub use trait_def::SttBackend; + +/// Construct the backend selected by `KEI_STT_BACKEND`. +/// +/// Defaults to `whisper-local` when the env var is absent or empty. +/// Returns `SttError::BackendNotEnabled` if the chosen backend's +/// feature flag was not compiled in. +pub fn from_env() -> Result, SttError> { + let name = std::env::var("KEI_STT_BACKEND") + .unwrap_or_else(|_| "whisper-local".to_string()); + build_backend(&name) +} + +fn build_backend(name: &str) -> Result, SttError> { + match name { + #[cfg(feature = "whisper-local")] + "whisper-local" => Ok(Box::new(whisper_local::WhisperLocalBackend::from_env())), + #[cfg(feature = "deepgram")] + "deepgram" => Ok(Box::new(deepgram::DeepgramBackend::from_env()?)), + #[cfg(feature = "openai-whisper")] + "openai-whisper" => Ok(Box::new(openai_whisper::OpenAiWhisperBackend::from_env()?)), + other => Err(SttError::BackendNotEnabled(other.to_string())), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_env_defaults_to_whisper_local() { + std::env::remove_var("KEI_STT_BACKEND"); + let backend = from_env().expect("whisper-local backend should construct"); + assert_eq!(backend.name(), "whisper-local"); + } + + #[test] + fn from_env_unknown_backend_errors() { + std::env::remove_var("KEI_STT_BACKEND"); + let result = build_backend("unknown_provider"); + match result { + Err(SttError::BackendNotEnabled(name)) => { + assert_eq!(name, "unknown_provider"); + } + Ok(_) => panic!("expected BackendNotEnabled, got Ok"), + Err(e) => panic!("expected BackendNotEnabled, got: {e}"), + } + } +} diff --git a/_primitives/_rust/kei-stt/src/openai_whisper.rs 
b/_primitives/_rust/kei-stt/src/openai_whisper.rs new file mode 100644 index 0000000..4194de0 --- /dev/null +++ b/_primitives/_rust/kei-stt/src/openai_whisper.rs @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! OpenAI Whisper STT backend — calls `api.openai.com/v1/audio/transcriptions`. +//! +//! Sends a multipart/form-data POST with: +//! - `file`: audio bytes (filename derived from MIME type) +//! - `model`: `whisper-1` +//! - `response_format`: `verbose_json` +//! - `language`: BCP-47 code if provided +//! +//! Response `verbose_json` shape: +//! ```json +//! {"text":"...", "segments":[{"start":0.0,"end":1.0,"text":"..."}]} +//! ``` +//! +//! Constructor surface: +//! * [`OpenAiWhisperBackend::from_env`] — reads `OPENAI_API_KEY`. +//! * [`OpenAiWhisperBackend::with_base_url`] — explicit URL + key (tests). + +#![cfg(feature = "openai-whisper")] + +use reqwest::multipart; + +use crate::error::SttError; +use crate::request::SttRequest; +use crate::response::{Segment, SttResponse}; +use crate::trait_def::SttBackend; + +const DEFAULT_BASE_URL: &str = "https://api.openai.com"; +const WHISPER_MODEL: &str = "whisper-1"; + +pub struct OpenAiWhisperBackend { + api_key: String, + client: reqwest::Client, + base_url: String, +} + +impl OpenAiWhisperBackend { + /// Build from explicit base URL and API key (used in wiremock tests). + pub fn with_base_url( + base_url: impl Into<String>, + api_key: impl Into<String>, + ) -> Self { + Self { + api_key: api_key.into(), + client: reqwest::Client::new(), + base_url: base_url.into().trim_end_matches('/').to_string(), + } + } + + /// Build from the `OPENAI_API_KEY` env var; fails with `SttError::MissingEnv` when it cannot be read.
+ pub fn from_env() -> Result<Self, SttError> { + let key = std::env::var("OPENAI_API_KEY") + .map_err(|_| SttError::MissingEnv("OPENAI_API_KEY".into()))?; + Ok(Self::with_base_url(DEFAULT_BASE_URL, key)) + } + + fn filename_from_mime(mime: &str) -> &'static str { + match mime { + "audio/mpeg" => "audio.mp3", + "audio/ogg" => "audio.ogg", + "audio/flac" => "audio.flac", + _ => "audio.wav", + } + } +} + +#[async_trait::async_trait] +impl SttBackend for OpenAiWhisperBackend { + fn name(&self) -> &'static str { "openai-whisper" } + + async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError> { + let url = format!("{}/v1/audio/transcriptions", self.base_url); + let filename = Self::filename_from_mime(&req.mime_type); + + let file_part = multipart::Part::bytes(req.audio_bytes.clone()) + .file_name(filename) + .mime_str(&req.mime_type) + .map_err(|e| SttError::InvalidAudio(e.to_string()))?; + + let mut form = multipart::Form::new() + .part("file", file_part) + .text("model", WHISPER_MODEL) + .text("response_format", "verbose_json"); + + if let Some(lang) = &req.language { + form = form.text("language", lang.clone()); + } + + let resp = self.client + .post(&url) + .bearer_auth(&self.api_key) + .multipart(form) + .send() + .await?; + + if !resp.status().is_success() { + let status = resp.status().as_u16(); + let text = resp.text().await.unwrap_or_default(); + return Err(SttError::Http(format!("http {status}: {text}"))); + } + + let body: serde_json::Value = resp.json().await + .map_err(|e| SttError::InvalidResponse(e.to_string()))?; + + parse_openai_whisper_response(&body) + } +} + +fn parse_openai_whisper_response(body: &serde_json::Value) -> Result<SttResponse, SttError> { + let text = body["text"] + .as_str() + .unwrap_or_default() + .trim() + .to_string(); + + let language_detected = body["language"] + .as_str() + .map(|s| s.to_string()); + + let segments = body["segments"] + .as_array() + .unwrap_or(&vec![]) + .iter() + .filter_map(|s| { + let start_ms = (s["start"].as_f64()?
* 1000.0).round() as u64; // round: 1.005 s is 1005 ms, truncation gives 1004 + let end_ms = (s["end"].as_f64()? * 1000.0).round() as u64; + let seg_text = s["text"].as_str()?.trim().to_string(); + Some(Segment { start_ms, end_ms, text: seg_text }) + }) + .collect(); + + Ok(SttResponse { text, segments, language_detected }) +} + +#[cfg(test)] +#[path = "openai_whisper_test.rs"] +mod tests; diff --git a/_primitives/_rust/kei-stt/src/openai_whisper_test.rs b/_primitives/_rust/kei-stt/src/openai_whisper_test.rs new file mode 100644 index 0000000..fd84092 --- /dev/null +++ b/_primitives/_rust/kei-stt/src/openai_whisper_test.rs @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! Wiremock tests for `OpenAiWhisperBackend`. +//! +//! Verifies Bearer auth, multipart body, and verbose_json segment parsing. + +#![cfg(all(test, feature = "openai-whisper"))] + +use wiremock::matchers::{header, method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::openai_whisper::OpenAiWhisperBackend; +use crate::request::SttRequest; +use crate::trait_def::SttBackend; + +fn verbose_json_body() -> serde_json::Value { + serde_json::json!({ + "text": "hello openai", + "language": "english", + "segments": [ + {"start": 0.0, "end": 0.5, "text": "hello"}, + {"start": 0.5, "end": 1.2, "text": "openai"} + ] + }) +} + +#[tokio::test] +async fn openai_whisper_parses_segments() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/audio/transcriptions")) + .and(header("authorization", "Bearer test-key")) + .respond_with( + ResponseTemplate::new(200) + .set_body_json(verbose_json_body()), + ) + .mount(&server) + .await; + + let backend = OpenAiWhisperBackend::with_base_url(server.uri(), "test-key"); + let req = SttRequest { + audio_bytes: b"fake_audio".to_vec(), + mime_type: "audio/wav".to_string(), + language: None, + }; + let resp = backend.transcribe(&req).await.expect("transcribe should succeed"); + assert_eq!(resp.text, "hello openai"); + assert_eq!(resp.segments.len(), 2);
+ assert_eq!(resp.segments[0].start_ms, 0); + assert_eq!(resp.segments[0].end_ms, 500); + assert_eq!(resp.segments[1].start_ms, 500); + assert_eq!(resp.segments[1].end_ms, 1200); + assert_eq!(resp.language_detected.as_deref(), Some("english")); +} + +#[tokio::test] +async fn openai_whisper_http_error() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/audio/transcriptions")) + .respond_with(ResponseTemplate::new(429).set_body_string("Rate limited")) + .mount(&server) + .await; + + let backend = OpenAiWhisperBackend::with_base_url(server.uri(), "test-key"); + let req = SttRequest::new_wav(b"audio".to_vec()); + let err = backend.transcribe(&req).await.expect_err("should fail on 429"); + assert!(matches!(err, crate::SttError::Http(_))); +} diff --git a/_primitives/_rust/kei-stt/src/request.rs b/_primitives/_rust/kei-stt/src/request.rs new file mode 100644 index 0000000..b1ff7ca --- /dev/null +++ b/_primitives/_rust/kei-stt/src/request.rs @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `SttRequest` — input type for all STT backends. +//! +//! Deliberately backend-agnostic: each backend maps its fields to +//! provider-specific parameters in its own module. + +use serde::{Deserialize, Serialize}; + +/// Parameters for a single STT transcription request. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SttRequest { + /// Raw audio bytes to transcribe. + pub audio_bytes: Vec<u8>, + + /// MIME type of the audio data (e.g. `"audio/wav"`, `"audio/mpeg"`, `"audio/ogg"`). + pub mime_type: String, + + /// BCP-47 language hint (e.g. `"en"`, `"ru"`). `None` → auto-detect. + pub language: Option<String>, +} + +impl SttRequest { + /// Convenience constructor for WAV audio with no language hint.
+ pub fn new_wav(audio_bytes: Vec<u8>) -> Self { + Self { + audio_bytes, + mime_type: "audio/wav".to_string(), + language: None, + } + } +} diff --git a/_primitives/_rust/kei-stt/src/response.rs b/_primitives/_rust/kei-stt/src/response.rs new file mode 100644 index 0000000..c94a385 --- /dev/null +++ b/_primitives/_rust/kei-stt/src/response.rs @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `SttResponse` and `Segment` — output types for all STT backends. +//! +//! `segments` is empty when the backend does not provide word-level timing. + +use serde::{Deserialize, Serialize}; + +/// A timed text segment from the transcription. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Segment { + /// Start of the segment in milliseconds from the audio start. + pub start_ms: u64, + + /// End of the segment in milliseconds from the audio start. + pub end_ms: u64, + + /// Transcribed text for this segment. + pub text: String, +} + +/// Result of a successful STT transcription call. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SttResponse { + /// Full transcribed text (concatenation of all segments). + pub text: String, + + /// Word/sentence-level timing segments. + /// Empty when the backend does not provide timing data. + pub segments: Vec<Segment>, + + /// Language as reported by the backend, verbatim (format is backend-specific, e.g. `"english"`). `None` if not reported. + pub language_detected: Option<String>, +} + +impl SttResponse { + /// Construct a minimal response with text only. + pub fn text_only(text: impl Into<String>) -> Self { + Self { + text: text.into(), + segments: Vec::new(), + language_detected: None, + } + } +} diff --git a/_primitives/_rust/kei-stt/src/trait_def.rs b/_primitives/_rust/kei-stt/src/trait_def.rs new file mode 100644 index 0000000..c4e71f8 --- /dev/null +++ b/_primitives/_rust/kei-stt/src/trait_def.rs @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `SttBackend` — the core async trait all backend impls satisfy. +//! +//!
Implementing this trait is sufficient to plug a new STT provider into
+//! the `from_env()` dispatch without modifying `lib.rs`. Each backend
+//! module is self-contained and feature-gated.
+
+use crate::error::SttError;
+use crate::request::SttRequest;
+use crate::response::SttResponse;
+
+/// Async STT transcription backend.
+///
+/// Implementations must be `Send + Sync` so they can be stored in a
+/// `Box<dyn SttBackend>` and shared across Tokio tasks.
+#[async_trait::async_trait]
+pub trait SttBackend: Send + Sync {
+    /// Transcribe the audio in `req` and return the text plus optional segments.
+    async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError>;
+
+    /// Short, stable identifier for this backend (e.g. `"whisper-local"`).
+    fn name(&self) -> &'static str;
+}
diff --git a/_primitives/_rust/kei-stt/src/whisper_local.rs b/_primitives/_rust/kei-stt/src/whisper_local.rs
new file mode 100644
index 0000000..9923c3c
--- /dev/null
+++ b/_primitives/_rust/kei-stt/src/whisper_local.rs
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright 2026
+//! Whisper-local STT backend — spawns the `whisper` CLI subprocess.
+//!
+//! Invocation:
+//!   `<binary> <audio_file> --model <model> --output_format json --output_dir <dir>`
+//!
+//! The openai-whisper Python package writes `<stem>.json` into `--output_dir`.
+//! We parse `text`, `language` and `segments[].{start,end,text}` from that JSON.
+//!
+//! Constructor surface:
+//! * [`WhisperLocalBackend::from_env`] — reads `KEI_STT_WHISPER_BINARY` +
+//!   `KEI_STT_WHISPER_MODEL` (default `base.en`).
+//! * [`WhisperLocalBackend::new`] — explicit binary + model overrides
+//!   for testability.
+
+#![cfg(feature = "whisper-local")]
+
+use std::path::PathBuf;
+
+use tokio::process::Command;
+
+use crate::error::SttError;
+use crate::request::SttRequest;
+use crate::response::{Segment, SttResponse};
+use crate::trait_def::SttBackend;
+
+const DEFAULT_BINARY: &str = "whisper";
+const DEFAULT_MODEL: &str = "base.en";
+
+/// STT backend that shells out to a local `whisper` executable.
+pub struct WhisperLocalBackend {
+    /// Executable to spawn (a bare name is resolved through `PATH`).
+    binary: PathBuf,
+    /// Value passed to `--model`.
+    model: String,
+}
+
+impl WhisperLocalBackend {
+    /// Build from explicit binary and model (used in tests).
+    pub fn new(binary: impl Into<PathBuf>, model: impl Into<String>) -> Self {
+        Self { binary: binary.into(), model: model.into() }
+    }
+
+    /// Build from env vars.
+    /// Optional: `KEI_STT_WHISPER_BINARY` (default `whisper`).
+    /// Optional: `KEI_STT_WHISPER_MODEL` (default `base.en`).
+    pub fn from_env() -> Self {
+        let binary = std::env::var("KEI_STT_WHISPER_BINARY")
+            .unwrap_or_else(|_| DEFAULT_BINARY.to_string());
+        let model = std::env::var("KEI_STT_WHISPER_MODEL")
+            .unwrap_or_else(|_| DEFAULT_MODEL.to_string());
+        Self::new(binary, model)
+    }
+
+    /// Pick the temp-file extension for a MIME type; unknown types fall back to `wav`.
+    fn ext_from_mime(mime: &str) -> &'static str {
+        match mime {
+            "audio/mpeg" => "mp3",
+            "audio/ogg" => "ogg",
+            "audio/flac" => "flac",
+            _ => "wav",
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl SttBackend for WhisperLocalBackend {
+    fn name(&self) -> &'static str { "whisper-local" }
+
+    async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError> {
+        let ext = Self::ext_from_mime(&req.mime_type);
+
+        // Write audio bytes to a named temp file. `audio_file` and `out_dir` are
+        // RAII guards: both are deleted when they drop at the end of this call.
+        let audio_file = tempfile::Builder::new()
+            .suffix(&format!(".{ext}"))
+            .tempfile()
+            .map_err(|e| SttError::Subprocess(e.to_string()))?;
+
+        let out_dir = tempfile::tempdir()
+            .map_err(|e| SttError::Subprocess(e.to_string()))?;
+
+        tokio::fs::write(audio_file.path(), &req.audio_bytes)
+            .await
+            .map_err(|e| SttError::Subprocess(e.to_string()))?;
+
+        // Build and run whisper CLI command.
+        let mut cmd = Command::new(&self.binary);
+        cmd.arg(audio_file.path())
+            .arg("--model").arg(&self.model)
+            .arg("--output_format").arg("json")
+            .arg("--output_dir").arg(out_dir.path());
+
+        if let Some(lang) = &req.language {
+            cmd.arg("--language").arg(lang);
+        }
+
+        let output = cmd.output().await
+            .map_err(|e| SttError::Subprocess(e.to_string()))?;
+
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
+            return Err(SttError::Subprocess(format!(
+                "whisper exited {}: {stderr}", output.status
+            )));
+        }
+
+        // Find the produced JSON file (stem of input + ".json").
+        let stem = audio_file.path()
+            .file_stem()
+            .and_then(|s| s.to_str())
+            .unwrap_or("audio");
+        let json_path = out_dir.path().join(format!("{stem}.json"));
+
+        let json_bytes = tokio::fs::read(&json_path).await
+            .map_err(|e| SttError::InvalidResponse(format!("json file: {e}")))?;
+
+        parse_whisper_json(&json_bytes)
+    }
+}
+
+/// Parse openai-whisper's JSON output into an [`SttResponse`].
+///
+/// Missing `text`, `language` or `segments` keys are tolerated (empty text,
+/// `None`, no segments); segments lacking `start`, `end` or `text` are skipped.
+/// Errors with `SttError::InvalidResponse` only when `bytes` is not valid JSON.
+fn parse_whisper_json(bytes: &[u8]) -> Result<SttResponse, SttError> {
+    let v: serde_json::Value = serde_json::from_slice(bytes)
+        .map_err(|e| SttError::InvalidResponse(e.to_string()))?;
+
+    let text = v["text"].as_str()
+        .unwrap_or_default()
+        .trim()
+        .to_string();
+
+    // Reported by openai-whisper as a short code such as "en"; absent → None.
+    let language_detected = v["language"].as_str().map(str::to_owned);
+
+    let segments = v["segments"]
+        .as_array()
+        .into_iter()
+        .flatten()
+        .filter_map(|s| {
+            // Round instead of truncating so float noise (e.g. 1004.999…) cannot drop a millisecond.
+            let start_ms = (s["start"].as_f64()? * 1000.0).round() as u64;
+            let end_ms = (s["end"].as_f64()? * 1000.0).round() as u64;
+            let seg_text = s["text"].as_str()?.trim().to_string();
+            Some(Segment { start_ms, end_ms, text: seg_text })
+        })
+        .collect();
+
+    Ok(SttResponse { text, segments, language_detected })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn whisper_local_subprocess_error_on_bad_binary() {
+        let backend = WhisperLocalBackend::new("/nonexistent/whisper", "base.en");
+        let req = SttRequest::new_wav(b"RIFF....".to_vec());
+        let err = backend.transcribe(&req).await
+            .expect_err("should fail with non-existent binary");
+        assert!(
+            matches!(err, SttError::Subprocess(_)),
+            "expected Subprocess error, got: {err:?}"
+        );
+    }
+
+    #[test]
+    fn parse_whisper_json_full() {
+        let json = br#"{
+            "text": " Hello world",
+            "language": "en",
+            "segments": [
+                {"start": 0.0, "end": 1.5, "text": " Hello"},
+                {"start": 1.5, "end": 2.0, "text": " world"}
+            ]
+        }"#;
+        let resp = parse_whisper_json(json).expect("parse should succeed");
+        assert_eq!(resp.text, "Hello world");
+        assert_eq!(resp.segments.len(), 2);
+        assert_eq!(resp.segments[0].end_ms, 1500);
+        assert_eq!(resp.segments[1].start_ms, 1500);
+        assert_eq!(resp.language_detected.as_deref(), Some("en"));
+    }
+
+    #[test]
+    fn parse_whisper_json_no_segments() {
+        let json = br#"{"text": " Hi"}"#;
+        let resp = parse_whisper_json(json).expect("parse should succeed");
+        assert_eq!(resp.text, "Hi");
+        assert!(resp.segments.is_empty());
+        assert!(resp.language_detected.is_none());
+    }
+}
diff --git a/_primitives/_rust/kei-tts/Cargo.toml b/_primitives/_rust/kei-tts/Cargo.toml
new file mode 100644
index 0000000..ea340f9
--- /dev/null
+++ b/_primitives/_rust/kei-tts/Cargo.toml
@@ -0,0 +1,40 @@
+[package]
+name = "kei-tts"
+version = "0.1.0"
+edition.workspace = true
+rust-version.workspace = true
+description = "Text-to-speech abstraction trait with 4 backends (ElevenLabs/OpenAI/Google/Piper). Default = piper (local, free, zero latency)."
+authors.workspace = true +license.workspace = true + +[lib] +name = "kei_tts" +path = "src/lib.rs" + +[dependencies] +async-trait = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "process", "io-util"] } +reqwest = { workspace = true } +tracing = "0.1" +base64 = { version = "0.22", optional = true } + +[features] +default = ["piper"] +elevenlabs = [] +openai = [] +google = ["dep:base64"] +piper = [] +all-backends = ["elevenlabs", "openai", "google", "piper"] + +[dev-dependencies] +wiremock = { workspace = true } +tokio = { workspace = true } + +[package.metadata.keisei] +maturity = "alpha" +trait = "TtsBackend" +description = "TTS abstraction with 4 backends (ElevenLabs/OpenAI/Google/Piper). Default = piper." +authors = ["Denis Parfionovich "] diff --git a/_primitives/_rust/kei-tts/README.md b/_primitives/_rust/kei-tts/README.md new file mode 100644 index 0000000..92c2d7c --- /dev/null +++ b/_primitives/_rust/kei-tts/README.md @@ -0,0 +1,53 @@ +# kei-tts + +Text-to-speech abstraction crate with 4 backends selected at runtime via +`KEI_TTS_BACKEND`. Default backend is **piper** (local, free, zero latency). 
+ +## Backend matrix + +| Backend | Feature flag | Cost | Latency | Quality | Language coverage | +|-------------|---------------|-------------|------------|-----------|-------------------| +| `piper` | `piper` | Free | ~50–200 ms | Good | 20+ language packs | +| `elevenlabs`| `elevenlabs` | ~$0.30/1k ch| 300–600 ms | Excellent | 30+ languages | +| `openai` | `openai` | ~$0.015/1k ch| 200–500 ms| Very good | 50+ languages | +| `google` | `google` | ~$4/1M ch | 200–400 ms | Very good | 40+ languages | + +## Environment variables + +| Variable | Backend | Required | Description | +|-------------------------|-------------|----------|------------------------------------| +| `KEI_TTS_BACKEND` | all | No | `piper` (default) / `elevenlabs` / `openai` / `google` | +| `ELEVENLABS_API_KEY` | elevenlabs | Yes | ElevenLabs API key | +| `OPENAI_API_KEY` | openai | Yes | OpenAI API key | +| `KEI_TTS_OPENAI_MODEL` | openai | No | `tts-1` (default) or `tts-1-hd` | +| `GOOGLE_TTS_API_KEY` | google | Yes | Google Cloud API key | +| `KEI_TTS_PIPER_MODEL` | piper | Yes | Path to `.onnx` piper model file | +| `KEI_TTS_PIPER_BINARY` | piper | No | Path to `piper-tts` (default: PATH)| + +## Usage + +```toml +[dependencies] +kei-tts = { path = "../kei-tts", features = ["piper"] } +``` + +```rust +#[tokio::main] +async fn main() -> Result<(), kei_tts::TtsError> { + let backend = kei_tts::from_env()?; + let req = kei_tts::TtsRequest::new("Hello, world!"); + let resp = backend.synth(&req).await?; + std::fs::write("out.mp3", &resp.audio_bytes).ok(); + println!("synthesised {} bytes via {}", resp.audio_bytes.len(), backend.name()); + Ok(()) +} +``` + +## Compile-time features + +```toml +# All backends: +kei-tts = { features = ["all-backends"] } +# Cloud only, no piper: +kei-tts = { features = ["elevenlabs", "openai", "google"], default-features = false } +``` diff --git a/_primitives/_rust/kei-tts/src/elevenlabs.rs b/_primitives/_rust/kei-tts/src/elevenlabs.rs new file mode 100644 index 
0000000..b56e22e
--- /dev/null
+++ b/_primitives/_rust/kei-tts/src/elevenlabs.rs
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright 2026
+//! ElevenLabs TTS backend — streams audio from `api.elevenlabs.io`.
+//!
+//! Endpoint: `POST /v1/text-to-speech/{voice_id}/stream?output_format={format}`
+//! Auth: `xi-api-key: {ELEVENLABS_API_KEY}` header.
+//! Response: raw audio bytes (format-specific Content-Type).
+//!
+//! Constructor surface:
+//! * [`ElevenLabsBackend::from_env`] — reads `ELEVENLABS_API_KEY`.
+//! * [`ElevenLabsBackend::with_base_url`] — explicit URL + key (tests).
+
+#![cfg(feature = "elevenlabs")]
+
+use crate::error::TtsError;
+use crate::request::{AudioFormat, TtsRequest};
+use crate::response::TtsResponse;
+use crate::trait_def::TtsBackend;
+
+const DEFAULT_BASE_URL: &str = "https://api.elevenlabs.io";
+const DEFAULT_VOICE_ID: &str = "21m00Tcm4TlvDq8ikWAM"; // Rachel
+
+/// ElevenLabs streaming text-to-speech backend.
+pub struct ElevenLabsBackend {
+    api_key: String,
+    client: reqwest::Client,
+    base_url: String,
+}
+
+impl ElevenLabsBackend {
+    /// Build from explicit base URL and API key (used in wiremock tests).
+    pub fn with_base_url(
+        base_url: impl Into<String>,
+        api_key: impl Into<String>,
+    ) -> Self {
+        Self {
+            api_key: api_key.into(),
+            client: reqwest::Client::new(),
+            base_url: base_url.into().trim_end_matches('/').to_string(),
+        }
+    }
+
+    /// Build from `ELEVENLABS_API_KEY` env var.
+    pub fn from_env() -> Result<Self, TtsError> {
+        let key = std::env::var("ELEVENLABS_API_KEY")
+            .map_err(|_| TtsError::MissingEnv("ELEVENLABS_API_KEY".into()))?;
+        Ok(Self::with_base_url(DEFAULT_BASE_URL, key))
+    }
+
+    /// Value for the endpoint's `output_format` query parameter.
+    // NOTE(review): `Ogg` used to map to `ogg_48000`, which is not among the
+    // documented values; `opus_48000_128` is assumed to be the Ogg/Opus
+    // equivalent — confirm against the current ElevenLabs API reference.
+    fn format_param(fmt: AudioFormat) -> &'static str {
+        match fmt {
+            AudioFormat::Mp3 => "mp3_44100_128",
+            AudioFormat::Ogg => "opus_48000_128",
+            AudioFormat::Wav | AudioFormat::Raw => "pcm_44100",
+        }
+    }
+
+    /// MIME type of the bytes actually returned for `fmt`.
+    ///
+    /// `Wav` is requested as headerless `pcm_44100` (see `format_param`), so
+    /// it is labelled `audio/pcm` rather than `audio/wav`.
+    fn response_mime(fmt: AudioFormat) -> &'static str {
+        match fmt {
+            AudioFormat::Wav | AudioFormat::Raw => "audio/pcm",
+            AudioFormat::Mp3 | AudioFormat::Ogg => fmt.mime_type(),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl TtsBackend for ElevenLabsBackend {
+    fn name(&self) -> &'static str { "elevenlabs" }
+
+    async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError> {
+        let voice = req.voice_id.as_deref().unwrap_or(DEFAULT_VOICE_ID);
+        let url = format!(
+            "{}/v1/text-to-speech/{}/stream",
+            self.base_url, voice
+        );
+        let body = serde_json::json!({ "text": req.text });
+        let resp = self.client
+            .post(&url)
+            // `output_format` is a query parameter of this endpoint; sent in
+            // the JSON body it is ignored and the API falls back to MP3.
+            .query(&[("output_format", Self::format_param(req.format))])
+            .header("xi-api-key", &self.api_key)
+            .json(&body)
+            .send()
+            .await?;
+        if !resp.status().is_success() {
+            let status = resp.status().as_u16();
+            let text = resp.text().await.unwrap_or_default();
+            return Err(TtsError::Http(format!("http {status}: {text}")));
+        }
+        let bytes = resp.bytes().await
+            .map_err(|e| TtsError::Http(e.to_string()))?
+            .to_vec();
+        Ok(TtsResponse::new(bytes, Self::response_mime(req.format)))
+    }
+}
+
+#[cfg(test)]
+#[path = "elevenlabs_test.rs"]
+mod tests;
diff --git a/_primitives/_rust/kei-tts/src/elevenlabs_test.rs b/_primitives/_rust/kei-tts/src/elevenlabs_test.rs
new file mode 100644
index 0000000..17a62f2
--- /dev/null
+++ b/_primitives/_rust/kei-tts/src/elevenlabs_test.rs
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright 2026
+//! Wiremock tests for `ElevenLabsBackend`.
+//!
+//! Verifies request shape (path, header) and response byte parsing.
+ +#![cfg(all(test, feature = "elevenlabs"))] + +use wiremock::matchers::{header, method, path_regex}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::elevenlabs::ElevenLabsBackend; +use crate::request::{AudioFormat, TtsRequest}; +use crate::trait_def::TtsBackend; + +#[tokio::test] +async fn elevenlabs_synth_ok() { + let server = MockServer::start().await; + let fake_audio = b"FAKE_AUDIO_BYTES".to_vec(); + + Mock::given(method("POST")) + .and(path_regex(r"/v1/text-to-speech/.+/stream")) + .and(header("xi-api-key", "test-key")) + .respond_with( + ResponseTemplate::new(200) + .set_body_bytes(fake_audio.clone()) + .append_header("Content-Type", "audio/mpeg"), + ) + .mount(&server) + .await; + + let backend = ElevenLabsBackend::with_base_url(server.uri(), "test-key"); + let req = TtsRequest { + text: "hello".into(), + voice_id: Some("voice123".into()), + language: None, + format: AudioFormat::Mp3, + }; + let resp = backend.synth(&req).await.expect("synth should succeed"); + assert_eq!(resp.audio_bytes, fake_audio); + assert_eq!(resp.mime_type, "audio/mpeg"); +} + +#[tokio::test] +async fn elevenlabs_synth_http_error() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path_regex(r"/v1/text-to-speech/.+/stream")) + .respond_with(ResponseTemplate::new(401).set_body_string("Unauthorized")) + .mount(&server) + .await; + + let backend = ElevenLabsBackend::with_base_url(server.uri(), "bad-key"); + let req = TtsRequest::new("hello"); + let err = backend.synth(&req).await.expect_err("should fail on 401"); + assert!(matches!(err, crate::TtsError::Http(_))); +} diff --git a/_primitives/_rust/kei-tts/src/error.rs b/_primitives/_rust/kei-tts/src/error.rs new file mode 100644 index 0000000..6e2bdab --- /dev/null +++ b/_primitives/_rust/kei-tts/src/error.rs @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `TtsError` — crate-level error enum for all TTS backends. +//! +//! 
Each variant carries a human-readable string so call-sites can log
+//! without leaking transport internals. `thiserror` provides `Display`
+//! and the `Error` trait automatically.
+
+use thiserror::Error;
+
+/// Errors that can occur across any TTS backend.
+#[derive(Debug, Error)]
+pub enum TtsError {
+    /// HTTP transport or API error from a cloud backend.
+    #[error("http: {0}")]
+    Http(String),
+
+    /// Subprocess (piper-tts) spawn or IO error.
+    #[error("subprocess: {0}")]
+    Subprocess(String),
+
+    /// Required environment variable is absent.
+    #[error("missing env var: {0}")]
+    MissingEnv(String),
+
+    /// Backend name was requested but its Cargo feature is not compiled in.
+    #[error("backend not enabled: {0}")]
+    BackendNotEnabled(String),
+
+    /// Unexpected or malformed response from a backend.
+    #[error("invalid response: {0}")]
+    InvalidResponse(String),
+}
+
+impl From<reqwest::Error> for TtsError {
+    fn from(e: reqwest::Error) -> Self {
+        TtsError::Http(e.to_string())
+    }
+}
diff --git a/_primitives/_rust/kei-tts/src/google.rs b/_primitives/_rust/kei-tts/src/google.rs
new file mode 100644
index 0000000..289885b
--- /dev/null
+++ b/_primitives/_rust/kei-tts/src/google.rs
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright 2026
+//! Google Cloud TTS backend — calls `texttospeech.googleapis.com`.
+//!
+//! Endpoint: `POST /v1/text:synthesize`
+//! Auth: `x-goog-api-key: {GOOGLE_TTS_API_KEY}` header.
+//! Response: JSON `{"audioContent": "<base64>"}`. Base64-decoded bytes
+//! are returned as `TtsResponse.audio_bytes`.
+//!
+//! Constructor surface:
+//! * [`GoogleBackend::from_env`] — reads `GOOGLE_TTS_API_KEY`.
+//! * [`GoogleBackend::with_base_url`] — explicit URL + key (tests).
+
+#![cfg(feature = "google")]
+
+use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
+
+use crate::error::TtsError;
+use crate::request::{AudioFormat, TtsRequest};
+use crate::response::TtsResponse;
+use crate::trait_def::TtsBackend;
+
+const DEFAULT_BASE_URL: &str = "https://texttospeech.googleapis.com";
+const DEFAULT_VOICE: &str = "en-US-Wavenet-D";
+const DEFAULT_LANG: &str = "en-US";
+
+/// Google Cloud Text-to-Speech backend authenticated with an API key.
+pub struct GoogleBackend {
+    api_key: String,
+    client: reqwest::Client,
+    base_url: String,
+}
+
+impl GoogleBackend {
+    /// Build from explicit parameters (used in wiremock tests).
+    pub fn with_base_url(
+        base_url: impl Into<String>,
+        api_key: impl Into<String>,
+    ) -> Self {
+        Self {
+            api_key: api_key.into(),
+            client: reqwest::Client::new(),
+            base_url: base_url.into().trim_end_matches('/').to_string(),
+        }
+    }
+
+    /// Build from `GOOGLE_TTS_API_KEY` env var.
+    pub fn from_env() -> Result<Self, TtsError> {
+        let key = std::env::var("GOOGLE_TTS_API_KEY")
+            .map_err(|_| TtsError::MissingEnv("GOOGLE_TTS_API_KEY".into()))?;
+        Ok(Self::with_base_url(DEFAULT_BASE_URL, key))
+    }
+
+    /// Map the requested format to Google's `audioEncoding` value.
+    // NOTE(review): `Raw` shares `LINEAR16` with `Wav`; Google documents that
+    // encoding as carrying a WAV header, so `Raw` may not be headerless — confirm.
+    fn encoding_str(fmt: AudioFormat) -> &'static str {
+        match fmt {
+            AudioFormat::Mp3 => "MP3",
+            AudioFormat::Ogg => "OGG_OPUS",
+            AudioFormat::Wav | AudioFormat::Raw => "LINEAR16",
+        }
+    }
+}
+
+/// Wire shape of a successful `text:synthesize` response.
+#[derive(serde::Deserialize)]
+struct GoogleResponse {
+    #[serde(rename = "audioContent")]
+    audio_content: String,
+}
+
+#[async_trait::async_trait]
+impl TtsBackend for GoogleBackend {
+    fn name(&self) -> &'static str { "google" }
+
+    /// Synthesise `req.text`, defaulting to voice `en-US-Wavenet-D` / `en-US`.
+    ///
+    /// Non-2xx replies become `TtsError::Http`; undecodable JSON or base64
+    /// becomes `TtsError::InvalidResponse`.
+    async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError> {
+        // The API key travels in the `x-goog-api-key` header, not the query
+        // string, so it cannot leak through URLs embedded in reqwest errors
+        // or proxy/access logs.
+        let url = format!("{}/v1/text:synthesize", self.base_url);
+        let voice_name = req.voice_id.as_deref().unwrap_or(DEFAULT_VOICE);
+        let lang = req.language.as_deref().unwrap_or(DEFAULT_LANG);
+        let body = serde_json::json!({
+            "input": { "text": req.text },
+            "voice": { "languageCode": lang, "name": voice_name },
+            "audioConfig": { "audioEncoding": Self::encoding_str(req.format) },
+        });
+        let resp = self.client
+            .post(&url)
+            .header("x-goog-api-key", &self.api_key)
+            .json(&body)
+            .send().await?;
+        if !resp.status().is_success() {
+            let status = resp.status().as_u16();
+            let text = resp.text().await.unwrap_or_default();
+            return Err(TtsError::Http(format!("http {status}: {text}")));
+        }
+        let parsed: GoogleResponse = resp.json().await
+            .map_err(|e| TtsError::InvalidResponse(e.to_string()))?;
+        let bytes = B64.decode(&parsed.audio_content)
+            .map_err(|e| TtsError::InvalidResponse(format!("base64: {e}")))?;
+        Ok(TtsResponse::new(bytes, req.format.mime_type().to_string()))
+    }
+}
+
+#[cfg(test)]
+#[path = "google_test.rs"]
+mod tests;
diff --git a/_primitives/_rust/kei-tts/src/google_test.rs b/_primitives/_rust/kei-tts/src/google_test.rs
new file mode 100644
index 0000000..1ab4c97
--- /dev/null
+++ b/_primitives/_rust/kei-tts/src/google_test.rs
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright 2026
+//! Wiremock tests for `GoogleBackend`.
+//!
+//! Verifies JSON request shape and base64 `audioContent` decoding.
+ +#![cfg(all(test, feature = "google"))] + +use base64::{engine::general_purpose::STANDARD as B64, Engine as _}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::google::GoogleBackend; +use crate::request::{AudioFormat, TtsRequest}; +use crate::trait_def::TtsBackend; + +#[tokio::test] +async fn google_synth_ok() { + let server = MockServer::start().await; + let fake_audio = b"GOOGLE_AUDIO".to_vec(); + let encoded = B64.encode(&fake_audio); + let body = serde_json::json!({ "audioContent": encoded }).to_string(); + + Mock::given(method("POST")) + .and(path("/v1/text:synthesize")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string(body) + .append_header("Content-Type", "application/json"), + ) + .mount(&server) + .await; + + let backend = GoogleBackend::with_base_url(server.uri(), "test-key"); + let req = TtsRequest { + text: "hello google".into(), + voice_id: None, + language: Some("en-US".into()), + format: AudioFormat::Mp3, + }; + let resp = backend.synth(&req).await.expect("synth should succeed"); + assert_eq!(resp.audio_bytes, fake_audio); + assert_eq!(resp.mime_type, "audio/mpeg"); +} + +#[tokio::test] +async fn google_synth_invalid_base64() { + let server = MockServer::start().await; + let body = serde_json::json!({ "audioContent": "!!!not_b64!!!" 
}).to_string(); + + Mock::given(method("POST")) + .and(path("/v1/text:synthesize")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string(body) + .append_header("Content-Type", "application/json"), + ) + .mount(&server) + .await; + + let backend = GoogleBackend::with_base_url(server.uri(), "test-key"); + let req = TtsRequest::new("hello"); + let err = backend.synth(&req).await.expect_err("should fail on bad b64"); + assert!(matches!(err, crate::TtsError::InvalidResponse(_))); +} diff --git a/_primitives/_rust/kei-tts/src/lib.rs b/_primitives/_rust/kei-tts/src/lib.rs new file mode 100644 index 0000000..da96573 --- /dev/null +++ b/_primitives/_rust/kei-tts/src/lib.rs @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `kei-tts` — text-to-speech abstraction with 4 backend impls. +//! +//! Backend is chosen at runtime via `KEI_TTS_BACKEND` env var. +//! Compile-time feature flags gate which backends are available. +//! +//! | Feature | Backend | Default | +//! |---------------|------------|---------| +//! | `piper` | local | ✓ | +//! | `elevenlabs` | cloud | – | +//! | `openai` | cloud | – | +//! | `google` | cloud | – | +//! +//! # Quick start +//! ```no_run +//! # async fn example() -> Result<(), kei_tts::TtsError> { +//! let backend = kei_tts::from_env()?; +//! let req = kei_tts::TtsRequest::new("Hello, world!"); +//! let resp = backend.synth(&req).await?; +//! std::fs::write("out.mp3", &resp.audio_bytes).ok(); +//! # Ok(()) } +//! ``` + +pub mod error; +pub mod request; +pub mod response; +pub mod trait_def; + +#[cfg(feature = "elevenlabs")] +pub mod elevenlabs; +#[cfg(feature = "google")] +pub mod google; +#[cfg(feature = "openai")] +pub mod openai; +#[cfg(feature = "piper")] +pub mod piper; + +pub use error::TtsError; +pub use request::{AudioFormat, TtsRequest}; +pub use response::TtsResponse; +pub use trait_def::TtsBackend; + +/// Construct the backend selected by `KEI_TTS_BACKEND`. 
+///
+/// Defaults to `piper` when the env var is absent, empty, or whitespace-only.
+/// Returns `TtsError::BackendNotEnabled` if the chosen backend's
+/// feature flag was not compiled in.
+pub fn from_env() -> Result<Box<dyn TtsBackend>, TtsError> {
+    // An unset (or non-UTF-8) variable collapses to "" and takes the default.
+    let name = std::env::var("KEI_TTS_BACKEND").unwrap_or_default();
+    build_backend(if name.trim().is_empty() { "piper" } else { name.trim() })
+}
+
+/// Map a backend name to its constructor; unknown or feature-disabled names are rejected.
+fn build_backend(name: &str) -> Result<Box<dyn TtsBackend>, TtsError> {
+    match name {
+        #[cfg(feature = "piper")]
+        "piper" => Ok(Box::new(piper::PiperBackend::from_env()?)),
+        #[cfg(feature = "elevenlabs")]
+        "elevenlabs" => Ok(Box::new(elevenlabs::ElevenLabsBackend::from_env()?)),
+        #[cfg(feature = "openai")]
+        "openai" => Ok(Box::new(openai::OpenAiBackend::from_env()?)),
+        #[cfg(feature = "google")]
+        "google" => Ok(Box::new(google::GoogleBackend::from_env()?)),
+        other => Err(TtsError::BackendNotEnabled(other.to_string())),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // NOTE: env-var tests are run with `-- --test-threads=1` to avoid
+    // races between tests that mutate process-global env.
+    #[cfg(feature = "piper")]
+    #[test]
+    fn from_env_defaults_to_piper() {
+        // Ensure a previous test has not leaked KEI_TTS_BACKEND.
+        std::env::remove_var("KEI_TTS_BACKEND");
+        // piper backend requires KEI_TTS_PIPER_MODEL — set a dummy path.
+        std::env::set_var("KEI_TTS_PIPER_MODEL", "/tmp/dummy.onnx");
+        let backend = from_env().expect("piper backend should construct from env");
+        assert_eq!(backend.name(), "piper");
+        std::env::remove_var("KEI_TTS_PIPER_MODEL");
+    }
+
+    #[test]
+    fn from_env_unknown_backend_errors() {
+        let result = build_backend("invalid_provider");
+        match result {
+            Err(TtsError::BackendNotEnabled(name)) => {
+                assert_eq!(name, "invalid_provider");
+            }
+            Ok(_) => panic!("expected BackendNotEnabled, got Ok"),
+            Err(e) => panic!("expected BackendNotEnabled, got different error: {e}"),
+        }
+    }
+
+    /// Verify piper backend propagates subprocess error on bad model path.
+ /// Skipped entirely when `piper-tts` binary is not on PATH. + #[cfg(feature = "piper")] + #[tokio::test] + async fn piper_subprocess_error_on_bad_model() { + let available = std::process::Command::new("piper-tts") + .arg("--help") + .output() + .is_ok(); + if !available { + eprintln!("piper-tts not on PATH — skipping binary test"); + return; + } + use crate::piper::PiperBackend; + use crate::trait_def::TtsBackend; + let backend = PiperBackend::new("piper-tts", "/nonexistent/model.onnx"); + let req = TtsRequest::new("hello"); + let err = backend.synth(&req).await + .expect_err("bad model path should fail"); + assert!(matches!(err, TtsError::Subprocess(_))); + } +} diff --git a/_primitives/_rust/kei-tts/src/openai.rs b/_primitives/_rust/kei-tts/src/openai.rs new file mode 100644 index 0000000..d96e132 --- /dev/null +++ b/_primitives/_rust/kei-tts/src/openai.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! OpenAI TTS backend — calls `api.openai.com/v1/audio/speech`. +//! +//! Supported models: `tts-1` (fast) and `tts-1-hd` (higher quality). +//! Default voice: `alloy`. Format negotiated via `response_format` field. +//! +//! Constructor surface: +//! * [`OpenAiBackend::from_env`] — reads `OPENAI_API_KEY`. +//! * [`OpenAiBackend::with_base_url`] — explicit URL + key + model (tests). + +#![cfg(feature = "openai")] + +use crate::error::TtsError; +use crate::request::{AudioFormat, TtsRequest}; +use crate::response::TtsResponse; +use crate::trait_def::TtsBackend; + +const DEFAULT_BASE_URL: &str = "https://api.openai.com"; +const DEFAULT_MODEL: &str = "tts-1"; +const DEFAULT_VOICE: &str = "alloy"; + +pub struct OpenAiBackend { + api_key: String, + model: String, + client: reqwest::Client, + base_url: String, +} + +impl OpenAiBackend { + /// Build from explicit parameters (used in wiremock tests). 
+ pub fn with_base_url( + base_url: impl Into, + api_key: impl Into, + model: impl Into, + ) -> Self { + Self { + api_key: api_key.into(), + model: model.into(), + client: reqwest::Client::new(), + base_url: base_url.into().trim_end_matches('/').to_string(), + } + } + + /// Build from `OPENAI_API_KEY` env var. Reads optional + /// `KEI_TTS_OPENAI_MODEL` (default `tts-1`). + pub fn from_env() -> Result { + let key = std::env::var("OPENAI_API_KEY") + .map_err(|_| TtsError::MissingEnv("OPENAI_API_KEY".into()))?; + let model = std::env::var("KEI_TTS_OPENAI_MODEL") + .unwrap_or_else(|_| DEFAULT_MODEL.to_string()); + Ok(Self::with_base_url(DEFAULT_BASE_URL, key, model)) + } + + fn format_str(fmt: AudioFormat) -> &'static str { + match fmt { + AudioFormat::Mp3 => "mp3", + AudioFormat::Ogg => "opus", + AudioFormat::Wav => "wav", + AudioFormat::Raw => "pcm", + } + } +} + +#[async_trait::async_trait] +impl TtsBackend for OpenAiBackend { + fn name(&self) -> &'static str { "openai" } + + async fn synth(&self, req: &TtsRequest) -> Result { + let url = format!("{}/v1/audio/speech", self.base_url); + let voice = req.voice_id.as_deref().unwrap_or(DEFAULT_VOICE); + let body = serde_json::json!({ + "model": self.model, + "input": req.text, + "voice": voice, + "response_format": Self::format_str(req.format), + }); + let resp = self.client + .post(&url) + .bearer_auth(&self.api_key) + .json(&body) + .send() + .await?; + if !resp.status().is_success() { + let status = resp.status().as_u16(); + let text = resp.text().await.unwrap_or_default(); + return Err(TtsError::Http(format!("http {status}: {text}"))); + } + let mime = req.format.mime_type().to_string(); + let bytes = resp.bytes().await + .map_err(|e| TtsError::Http(e.to_string()))? 
+ .to_vec(); + Ok(TtsResponse::new(bytes, mime)) + } +} + +#[cfg(test)] +#[path = "openai_test.rs"] +mod tests; diff --git a/_primitives/_rust/kei-tts/src/openai_test.rs b/_primitives/_rust/kei-tts/src/openai_test.rs new file mode 100644 index 0000000..a0815e4 --- /dev/null +++ b/_primitives/_rust/kei-tts/src/openai_test.rs @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! Wiremock tests for `OpenAiBackend`. +//! +//! Verifies request JSON shape, Bearer auth, and response byte parsing. + +#![cfg(all(test, feature = "openai"))] + +use wiremock::matchers::{header, method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::openai::OpenAiBackend; +use crate::request::{AudioFormat, TtsRequest}; +use crate::trait_def::TtsBackend; + +#[tokio::test] +async fn openai_synth_ok() { + let server = MockServer::start().await; + let fake_audio = b"OPENAI_AUDIO".to_vec(); + + Mock::given(method("POST")) + .and(path("/v1/audio/speech")) + .and(header("authorization", "Bearer test-key")) + .respond_with( + ResponseTemplate::new(200) + .set_body_bytes(fake_audio.clone()) + .append_header("Content-Type", "audio/mpeg"), + ) + .mount(&server) + .await; + + let backend = OpenAiBackend::with_base_url(server.uri(), "test-key", "tts-1"); + let req = TtsRequest { + text: "hello openai".into(), + voice_id: Some("nova".into()), + language: None, + format: AudioFormat::Mp3, + }; + let resp = backend.synth(&req).await.expect("synth should succeed"); + assert_eq!(resp.audio_bytes, fake_audio); + assert_eq!(resp.mime_type, "audio/mpeg"); +} + +#[tokio::test] +async fn openai_synth_http_error() { + let server = MockServer::start().await; + + Mock::given(method("POST")) + .and(path("/v1/audio/speech")) + .respond_with(ResponseTemplate::new(429).set_body_string("Rate limited")) + .mount(&server) + .await; + + let backend = OpenAiBackend::with_base_url(server.uri(), "test-key", "tts-1"); + let req = TtsRequest::new("hello"); + let err = 
backend.synth(&req).await.expect_err("should fail on 429"); + assert!(matches!(err, crate::TtsError::Http(_))); +} diff --git a/_primitives/_rust/kei-tts/src/piper.rs b/_primitives/_rust/kei-tts/src/piper.rs new file mode 100644 index 0000000..28083c3 --- /dev/null +++ b/_primitives/_rust/kei-tts/src/piper.rs @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! Piper TTS backend — spawns the `piper-tts` subprocess. +//! +//! Invocation: `piper-tts --model --output_raw` +//! Text is written to stdin; raw PCM bytes are read from stdout. +//! A minimal RIFF WAV header is prepended when `format = Wav`. +//! +//! Constructor surface: +//! * [`PiperBackend::from_env`] — reads `KEI_TTS_PIPER_MODEL` + +//! optional `KEI_TTS_PIPER_BINARY` (default `piper-tts`). +//! * [`PiperBackend::new`] — explicit binary + model paths (tests). + +#![cfg(feature = "piper")] + +use std::path::PathBuf; + +use tokio::io::AsyncWriteExt as _; +use tokio::process::Command; + +use crate::error::TtsError; +use crate::request::{AudioFormat, TtsRequest}; +use crate::response::TtsResponse; +use crate::trait_def::TtsBackend; + +const DEFAULT_BINARY: &str = "piper-tts"; +// PCM parameters piper-tts emits by default. +const SAMPLE_RATE: u32 = 22050; +const CHANNELS: u16 = 1; +const BITS_PER_SAMPLE: u16 = 16; + +pub struct PiperBackend { + binary: PathBuf, + model: PathBuf, +} + +impl PiperBackend { + /// Build from explicit binary path and model path. + pub fn new(binary: impl Into, model: impl Into) -> Self { + Self { binary: binary.into(), model: model.into() } + } + + /// Build from env vars. + /// Required: `KEI_TTS_PIPER_MODEL` (path to `.onnx` model file). + /// Optional: `KEI_TTS_PIPER_BINARY` (default `piper-tts`). 
+ pub fn from_env() -> Result { + let model = std::env::var("KEI_TTS_PIPER_MODEL") + .map_err(|_| TtsError::MissingEnv("KEI_TTS_PIPER_MODEL".into()))?; + let binary = std::env::var("KEI_TTS_PIPER_BINARY") + .unwrap_or_else(|_| DEFAULT_BINARY.to_string()); + Ok(Self::new(binary, model)) + } +} + +/// Build a minimal 44-byte RIFF WAV header for PCM data. +fn wav_header(data_len: u32) -> Vec { + let byte_rate = SAMPLE_RATE * u32::from(CHANNELS) * u32::from(BITS_PER_SAMPLE) / 8; + let block_align: u16 = CHANNELS * BITS_PER_SAMPLE / 8; + let mut h = Vec::with_capacity(44); + h.extend_from_slice(b"RIFF"); + h.extend_from_slice(&(36u32 + data_len).to_le_bytes()); + h.extend_from_slice(b"WAVE"); + h.extend_from_slice(b"fmt "); + h.extend_from_slice(&16u32.to_le_bytes()); // subchunk1 size + h.extend_from_slice(&1u16.to_le_bytes()); // PCM + h.extend_from_slice(&CHANNELS.to_le_bytes()); + h.extend_from_slice(&SAMPLE_RATE.to_le_bytes()); + h.extend_from_slice(&byte_rate.to_le_bytes()); + h.extend_from_slice(&block_align.to_le_bytes()); + h.extend_from_slice(&BITS_PER_SAMPLE.to_le_bytes()); + h.extend_from_slice(b"data"); + h.extend_from_slice(&data_len.to_le_bytes()); + h +} + +#[async_trait::async_trait] +impl TtsBackend for PiperBackend { + fn name(&self) -> &'static str { "piper" } + + async fn synth(&self, req: &TtsRequest) -> Result { + let mut child = Command::new(&self.binary) + .arg("--model") + .arg(&self.model) + .arg("--output_raw") + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::null()) + .spawn() + .map_err(|e| TtsError::Subprocess(e.to_string()))?; + + if let Some(mut stdin) = child.stdin.take() { + stdin.write_all(req.text.as_bytes()).await + .map_err(|e| TtsError::Subprocess(e.to_string()))?; + } + + let output = child.wait_with_output().await + .map_err(|e| TtsError::Subprocess(e.to_string()))?; + + if !output.status.success() { + return Err(TtsError::Subprocess(format!( + "piper-tts exited with 
{}", output.status + ))); + } + + let pcm = output.stdout; + match req.format { + AudioFormat::Wav => { + let mut wav = wav_header(pcm.len() as u32); + wav.extend_from_slice(&pcm); + Ok(TtsResponse::new(wav, "audio/wav")) + } + _ => Ok(TtsResponse::new(pcm, "audio/pcm")), + } + } +} diff --git a/_primitives/_rust/kei-tts/src/request.rs b/_primitives/_rust/kei-tts/src/request.rs new file mode 100644 index 0000000..f457e7b --- /dev/null +++ b/_primitives/_rust/kei-tts/src/request.rs @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `TtsRequest` and `AudioFormat` — input types for all TTS backends. +//! +//! `TtsRequest` is deliberately backend-agnostic: each backend maps its +//! fields to provider-specific parameters in its own module. + +use serde::{Deserialize, Serialize}; + +/// Output audio encoding requested from the backend. +/// +/// Not every backend supports every format; unsupported formats result +/// in `TtsError::InvalidResponse`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum AudioFormat { + /// MPEG Layer 3 — the most widely supported lossy format. + Mp3, + /// Ogg Vorbis — open, patent-free lossy format. + Ogg, + /// RIFF WAVE — uncompressed PCM container. + Wav, + /// Raw PCM bytes with no container header (piper default). + Raw, +} + +impl AudioFormat { + /// Returns the MIME type string for the format. + pub fn mime_type(self) -> &'static str { + match self { + AudioFormat::Mp3 => "audio/mpeg", + AudioFormat::Ogg => "audio/ogg", + AudioFormat::Wav => "audio/wav", + AudioFormat::Raw => "audio/pcm", + } + } +} + +/// Parameters for a single TTS synthesis request. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TtsRequest { + /// The text to synthesise. + pub text: String, + + /// Backend-specific voice identifier. `None` lets the backend use + /// its own default voice. + pub voice_id: Option, + + /// BCP-47 language tag (e.g. 
`"ru"`, `"en-US"`). `None` → auto. + pub language: Option, + + /// Desired output audio encoding. + pub format: AudioFormat, +} + +impl TtsRequest { + /// Convenience constructor for plain text with backend defaults. + pub fn new(text: impl Into) -> Self { + Self { + text: text.into(), + voice_id: None, + language: None, + format: AudioFormat::Mp3, + } + } +} diff --git a/_primitives/_rust/kei-tts/src/response.rs b/_primitives/_rust/kei-tts/src/response.rs new file mode 100644 index 0000000..f513a53 --- /dev/null +++ b/_primitives/_rust/kei-tts/src/response.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `TtsResponse` — output of a successful TTS synthesis call. + +use serde::{Deserialize, Serialize}; + +/// Audio data returned by a TTS backend. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TtsResponse { + /// Raw bytes of the synthesised audio file. + pub audio_bytes: Vec, + + /// MIME type of the audio data (e.g. `"audio/mpeg"`, `"audio/wav"`). + pub mime_type: String, +} + +impl TtsResponse { + /// Construct a response with explicit audio data and MIME type. + pub fn new(audio_bytes: Vec, mime_type: impl Into) -> Self { + Self { audio_bytes, mime_type: mime_type.into() } + } +} diff --git a/_primitives/_rust/kei-tts/src/trait_def.rs b/_primitives/_rust/kei-tts/src/trait_def.rs new file mode 100644 index 0000000..055b572 --- /dev/null +++ b/_primitives/_rust/kei-tts/src/trait_def.rs @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 +//! `TtsBackend` — the core async trait all backend impls satisfy. +//! +//! Implementing this trait is sufficient to plug a new TTS provider into +//! the `from_env()` dispatch without modifying `lib.rs`. Each backend +//! module is self-contained and feature-gated. + +use crate::error::TtsError; +use crate::request::TtsRequest; +use crate::response::TtsResponse; + +/// Async TTS synthesis backend. 
+/// +/// Implementations must be `Send + Sync` so they can be stored in a +/// `Box` and shared across Tokio tasks. +#[async_trait::async_trait] +pub trait TtsBackend: Send + Sync { + /// Synthesise `req.text` and return the audio bytes. + async fn synth(&self, req: &TtsRequest) -> Result; + + /// Short, stable identifier for this backend (e.g. `"piper"`). + fn name(&self) -> &'static str; +}