feat(kei-tts + kei-stt): TTS/STT abstractions with 4+3 backends
Two parallel atomars in the kei-buddy phase-1 plan. Mirror each other's
architecture: trait + feature-gated backend modules + env-driven dispatch
+ wiremock tests for HTTP backends + subprocess-error test for local.
## kei-tts (text-to-speech)
LOC: 959 across 15 files (largest src/lib.rs 121).
Trait `TtsBackend` + 4 backends behind feature flags:
* elevenlabs — POST api.elevenlabs.io/v1/text-to-speech/{voice}/stream
* openai — POST api.openai.com/v1/audio/speech (tts-1, tts-1-hd)
* google — POST texttospeech.googleapis.com/v1/text:synthesize
(Wavenet voices, base64 audioContent)
* piper — local subprocess to piper-tts binary, raw PCM out
Default features: ["piper"]. all-backends feature gates the rest.
`from_env()` reads KEI_TTS_BACKEND (default piper). Returns Box<dyn TtsBackend>.
Tests: 9 passed (env routing + 3 wiremock backends + piper subprocess error).
## kei-stt (speech-to-text)
LOC: 935 across 13 files (largest whisper_local.rs 181).
Trait `SttBackend` + 3 backends:
* whisper-local — subprocess to `whisper` CLI / faster-whisper,
reads JSON output, parses segments
* deepgram — POST api.deepgram.com/v1/listen (Token auth header,
raw audio body, parses words → Segments)
* openai-whisper — POST api.openai.com/v1/audio/transcriptions
(multipart file + model=whisper-1 +
response_format=verbose_json)
Default features: ["whisper-local"]. all-backends gates the rest.
`from_env()` reads KEI_STT_BACKEND (default whisper-local).
Tests: 10 passed + 1 doc-test (env routing + 5 wiremock + 2 JSON parsers
+ 1 subprocess error + 1 auth-header check).
## Common architecture decisions
* `with_base_url(url)` constructor on each HTTP backend for wiremock
testability — same pattern as kei-llm-router and kei-notify-telegram.
* `tempfile` crate added to kei-stt for whisper-local audio scratch.
* `base64 = { version = "0.22", optional = true }` in kei-tts for
Google's base64-encoded audioContent.
## Verify-before-commit (RULE 0.13 §)
* cargo check -p kei-tts (default + all-backends): PASS
* cargo check -p kei-stt (default + all-backends): PASS
* cargo test -p kei-tts --features all-backends --lib: 9/0
* cargo test -p kei-stt --features all-backends --lib: 10/0
* cargo check --workspace: PASS
STATUS-TRUTH from both agents: shipped=functional, stubs=0,
behaviour-verified=yes.
## Follow-up (deferred, non-blocking)
* Real backend verification needs API keys for ElevenLabs / OpenAI /
Google / Deepgram and piper-tts binary + .onnx model on PATH.
* whisper-local language_detected always None — whisper CLI JSON
schema differs across versions, parse heuristic to be added.
* faster-whisper has different JSON schema from openai-whisper;
current parser covers openai-whisper convention only.
This commit is contained in:
parent
0267311087
commit
b5da1940e1
28 changed files with 1926 additions and 0 deletions
30
_primitives/_rust/Cargo.lock
generated
30
_primitives/_rust/Cargo.lock
generated
|
|
@ -4393,6 +4393,21 @@ dependencies = [
|
|||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kei-stt"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest 0.12.28",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wiremock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kei-svc-systemd"
|
||||
version = "0.1.0"
|
||||
|
|
@ -4455,6 +4470,21 @@ dependencies = [
|
|||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kei-tts"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.22.1",
|
||||
"reqwest 0.12.28",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wiremock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kei-tty"
|
||||
version = "0.1.0"
|
||||
|
|
|
|||
|
|
@ -183,6 +183,10 @@ members = [
|
|||
"kei-buddy",
|
||||
# Inbound Telegram webhook handler — parses Update payloads into typed WebhookEvent
|
||||
"kei-telegram-webhook",
|
||||
# TTS abstraction — 4 backends (ElevenLabs/OpenAI/Google/Piper) behind feature flags
|
||||
"kei-tts",
|
||||
# STT abstraction — 3 backends (whisper-local/Deepgram/OpenAI-Whisper) behind feature flags
|
||||
"kei-stt",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
|
|
|||
39
_primitives/_rust/kei-stt/Cargo.toml
Normal file
39
_primitives/_rust/kei-stt/Cargo.toml
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
[package]
|
||||
name = "kei-stt"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
description = "Speech-to-text abstraction trait with 3 backends (whisper-local/Deepgram/OpenAI). Default = whisper-local (free, local)."
|
||||
authors.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[lib]
|
||||
name = "kei_stt"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
async-trait = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
serde_json = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tokio = { workspace = true, features = ["macros", "rt-multi-thread", "process", "io-util", "fs"] }
|
||||
reqwest = { workspace = true, features = ["multipart"] }
|
||||
tracing = "0.1"
|
||||
tempfile = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = ["whisper-local"]
|
||||
whisper-local = []
|
||||
deepgram = []
|
||||
openai-whisper = []
|
||||
all-backends = ["whisper-local", "deepgram", "openai-whisper"]
|
||||
|
||||
[dev-dependencies]
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
|
||||
[package.metadata.keisei]
|
||||
maturity = "alpha"
|
||||
trait = "SttBackend"
|
||||
description = "STT abstraction with 3 backends (whisper-local/Deepgram/OpenAI). Default = whisper-local."
|
||||
authors = ["Denis Parfionovich <parfionovich@keilab.io>"]
|
||||
66
_primitives/_rust/kei-stt/README.md
Normal file
66
_primitives/_rust/kei-stt/README.md
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
# kei-stt
|
||||
|
||||
Speech-to-text abstraction crate with 3 backends selected at runtime via
|
||||
`KEI_STT_BACKEND`. Default backend is **whisper-local** (free, local, no API key).
|
||||
|
||||
## Backend matrix
|
||||
|
||||
| Backend | Feature flag | Cost | Latency | Quality |
|
||||
|------------------|------------------|----------------|-------------|-----------|
|
||||
| `whisper-local` | `whisper-local` | Free | 1–10× RT | Very good |
|
||||
| `deepgram` | `deepgram` | ~$0.0043/min | 200–500 ms | Excellent |
|
||||
| `openai-whisper` | `openai-whisper` | ~$0.006/min | 300–800 ms | Excellent |
|
||||
|
||||
RT = real-time factor (depends on hardware / model size for whisper-local).
|
||||
|
||||
## Environment variables
|
||||
|
||||
| Variable | Backend | Required | Description |
|
||||
|----------------------------|-----------------|----------|------------------------------------------|
|
||||
| `KEI_STT_BACKEND` | all | No | `whisper-local` (default) / `deepgram` / `openai-whisper` |
|
||||
| `KEI_STT_WHISPER_BINARY` | whisper-local | No | Path to `whisper` CLI (default: PATH) |
|
||||
| `KEI_STT_WHISPER_MODEL` | whisper-local | No | Model name (default: `base.en`) |
|
||||
| `DEEPGRAM_API_KEY` | deepgram | Yes | Deepgram API key |
|
||||
| `OPENAI_API_KEY` | openai-whisper | Yes | OpenAI API key |
|
||||
|
||||
## Usage
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kei-stt = { path = "../kei-stt", features = ["whisper-local"] }
|
||||
```
|
||||
|
||||
```rust
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), kei_stt::SttError> {
|
||||
let backend = kei_stt::from_env()?;
|
||||
let audio = std::fs::read("speech.wav").unwrap();
|
||||
let req = kei_stt::SttRequest::new_wav(audio);
|
||||
let resp = backend.transcribe(&req).await?;
|
||||
println!("[{}] {}", backend.name(), resp.text);
|
||||
for seg in &resp.segments {
|
||||
println!(" {:>6}ms–{:>6}ms {}", seg.start_ms, seg.end_ms, seg.text);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
## Compile-time features
|
||||
|
||||
```toml
|
||||
# All backends:
|
||||
kei-stt = { features = ["all-backends"] }
|
||||
# Cloud only, no local whisper:
|
||||
kei-stt = { features = ["deepgram", "openai-whisper"], default-features = false }
|
||||
```
|
||||
|
||||
## whisper-local prerequisites
|
||||
|
||||
Install the `openai-whisper` Python package:
|
||||
|
||||
```sh
|
||||
pip install openai-whisper
|
||||
```
|
||||
|
||||
This makes the `whisper` CLI available. Alternatively point `KEI_STT_WHISPER_BINARY`
|
||||
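at a binary with an identical CLI interface that also writes JSON in the openai-whisper schema (`faster-whisper` uses a different JSON schema, which the current parser does not cover).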
|
||||
115
_primitives/_rust/kei-stt/src/deepgram.rs
Normal file
115
_primitives/_rust/kei-stt/src/deepgram.rs
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Deepgram STT backend — calls `api.deepgram.com/v1/listen`.
|
||||
//!
|
||||
//! Endpoint: `POST /v1/listen?language={lang}&punctuate=true`
|
||||
//! Auth: `Authorization: Token {DEEPGRAM_API_KEY}` header.
|
||||
//! Body: raw audio bytes with the request MIME type.
|
||||
//!
|
||||
//! Response shape:
|
||||
//! ```json
|
||||
//! {"results":{"channels":[{"alternatives":[{
|
||||
//! "transcript":"...",
|
||||
//! "words":[{"word":"...","start":0.1,"end":0.4}]
|
||||
//! }]}]}}
|
||||
//! ```
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`DeepgramBackend::from_env`] — reads `DEEPGRAM_API_KEY`.
|
||||
//! * [`DeepgramBackend::with_base_url`] — explicit URL + key (tests).
|
||||
|
||||
#![cfg(feature = "deepgram")]
|
||||
|
||||
use crate::error::SttError;
|
||||
use crate::request::SttRequest;
|
||||
use crate::response::{Segment, SttResponse};
|
||||
use crate::trait_def::SttBackend;
|
||||
|
||||
const DEFAULT_BASE_URL: &str = "https://api.deepgram.com";
|
||||
|
||||
pub struct DeepgramBackend {
|
||||
api_key: String,
|
||||
client: reqwest::Client,
|
||||
base_url: String,
|
||||
}
|
||||
|
||||
impl DeepgramBackend {
|
||||
/// Build from explicit base URL and API key (used in wiremock tests).
|
||||
pub fn with_base_url(
|
||||
base_url: impl Into<String>,
|
||||
api_key: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
api_key: api_key.into(),
|
||||
client: reqwest::Client::new(),
|
||||
base_url: base_url.into().trim_end_matches('/').to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build from `DEEPGRAM_API_KEY` env var.
|
||||
pub fn from_env() -> Result<Self, SttError> {
|
||||
let key = std::env::var("DEEPGRAM_API_KEY")
|
||||
.map_err(|_| SttError::MissingEnv("DEEPGRAM_API_KEY".into()))?;
|
||||
Ok(Self::with_base_url(DEFAULT_BASE_URL, key))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SttBackend for DeepgramBackend {
|
||||
fn name(&self) -> &'static str { "deepgram" }
|
||||
|
||||
async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError> {
|
||||
let mut url = format!("{}/v1/listen?punctuate=true", self.base_url);
|
||||
if let Some(lang) = &req.language {
|
||||
url.push_str(&format!("&language={lang}"));
|
||||
}
|
||||
|
||||
let resp = self.client
|
||||
.post(&url)
|
||||
.header("Authorization", format!("Token {}", self.api_key))
|
||||
.header("Content-Type", &req.mime_type)
|
||||
.body(req.audio_bytes.clone())
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status().as_u16();
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(SttError::Http(format!("http {status}: {text}")));
|
||||
}
|
||||
|
||||
let body: serde_json::Value = resp.json().await
|
||||
.map_err(|e| SttError::InvalidResponse(e.to_string()))?;
|
||||
|
||||
parse_deepgram_response(&body)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_deepgram_response(body: &serde_json::Value) -> Result<SttResponse, SttError> {
|
||||
let alt = body
|
||||
.pointer("/results/channels/0/alternatives/0")
|
||||
.ok_or_else(|| SttError::InvalidResponse("missing alternatives".into()))?;
|
||||
|
||||
let text = alt["transcript"]
|
||||
.as_str()
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
|
||||
let segments = alt["words"]
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
.filter_map(|w| {
|
||||
let start_ms = (w["start"].as_f64()? * 1000.0) as u64;
|
||||
let end_ms = (w["end"].as_f64()? * 1000.0) as u64;
|
||||
let word = w["word"].as_str()?.to_string();
|
||||
Some(Segment { start_ms, end_ms, text: word })
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(SttResponse { text, segments, language_detected: None })
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "deepgram_test.rs"]
|
||||
mod tests;
|
||||
93
_primitives/_rust/kei-stt/src/deepgram_test.rs
Normal file
93
_primitives/_rust/kei-stt/src/deepgram_test.rs
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Wiremock tests for `DeepgramBackend`.
|
||||
//!
|
||||
//! Verifies the auth header, response parsing, and HTTP error mapping.
|
||||
|
||||
#![cfg(all(test, feature = "deepgram"))]
|
||||
|
||||
use wiremock::matchers::{header, header_regex, method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::deepgram::DeepgramBackend;
|
||||
use crate::request::SttRequest;
|
||||
use crate::trait_def::SttBackend;
|
||||
|
||||
/// Canned Deepgram success payload: one channel, one alternative, two words.
fn deepgram_response_body() -> serde_json::Value {
    let words = serde_json::json!([
        {"word": "hello", "start": 0.1, "end": 0.5},
        {"word": "deepgram", "start": 0.6, "end": 1.1}
    ]);
    let alternative = serde_json::json!({
        "transcript": "hello deepgram",
        "words": words
    });
    serde_json::json!({
        "results": {
            "channels": [{ "alternatives": [alternative] }]
        }
    })
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn deepgram_parses_transcript() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/listen"))
|
||||
.and(header_regex("authorization", "Token .+"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_json(deepgram_response_body()),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = DeepgramBackend::with_base_url(server.uri(), "test-key");
|
||||
let req = SttRequest {
|
||||
audio_bytes: b"fake_audio".to_vec(),
|
||||
mime_type: "audio/wav".to_string(),
|
||||
language: None,
|
||||
};
|
||||
let resp = backend.transcribe(&req).await.expect("transcribe should succeed");
|
||||
assert_eq!(resp.text, "hello deepgram");
|
||||
assert_eq!(resp.segments.len(), 2);
|
||||
assert_eq!(resp.segments[0].start_ms, 100);
|
||||
assert_eq!(resp.segments[1].end_ms, 1100);
|
||||
assert!(resp.language_detected.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn deepgram_sends_auth_header() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/listen"))
|
||||
.and(header("authorization", "Token secret-key"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_json(deepgram_response_body()),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = DeepgramBackend::with_base_url(server.uri(), "secret-key");
|
||||
let req = SttRequest::new_wav(b"audio".to_vec());
|
||||
backend.transcribe(&req).await.expect("auth header test should pass");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn deepgram_http_error() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/listen"))
|
||||
.respond_with(ResponseTemplate::new(401).set_body_string("Unauthorized"))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = DeepgramBackend::with_base_url(server.uri(), "bad-key");
|
||||
let req = SttRequest::new_wav(b"audio".to_vec());
|
||||
let err = backend.transcribe(&req).await.expect_err("should fail on 401");
|
||||
assert!(matches!(err, crate::SttError::Http(_)));
|
||||
}
|
||||
43
_primitives/_rust/kei-stt/src/error.rs
Normal file
43
_primitives/_rust/kei-stt/src/error.rs
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `SttError` — crate-level error enum for all STT backends.
|
||||
//!
|
||||
//! Each variant carries a human-readable string so call-sites can log
|
||||
//! without leaking transport internals. `thiserror` provides `Display`
|
||||
//! and the `Error` trait automatically.
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
/// Errors that can occur across any STT backend.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum SttError {
|
||||
/// HTTP transport or API error from a cloud backend.
|
||||
#[error("http: {0}")]
|
||||
Http(String),
|
||||
|
||||
/// Subprocess (whisper CLI) spawn or IO error.
|
||||
#[error("subprocess: {0}")]
|
||||
Subprocess(String),
|
||||
|
||||
/// Required environment variable is absent.
|
||||
#[error("missing env var: {0}")]
|
||||
MissingEnv(String),
|
||||
|
||||
/// Backend name was requested but its Cargo feature is not compiled in.
|
||||
#[error("backend not enabled: {0}")]
|
||||
BackendNotEnabled(String),
|
||||
|
||||
/// Unexpected or malformed response from a backend.
|
||||
#[error("invalid response: {0}")]
|
||||
InvalidResponse(String),
|
||||
|
||||
/// Input audio bytes are invalid or in an unsupported format.
|
||||
#[error("invalid audio: {0}")]
|
||||
InvalidAudio(String),
|
||||
}
|
||||
|
||||
impl From<reqwest::Error> for SttError {
|
||||
fn from(e: reqwest::Error) -> Self {
|
||||
SttError::Http(e.to_string())
|
||||
}
|
||||
}
|
||||
88
_primitives/_rust/kei-stt/src/lib.rs
Normal file
88
_primitives/_rust/kei-stt/src/lib.rs
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `kei-stt` — speech-to-text abstraction with 3 backend impls.
|
||||
//!
|
||||
//! Backend is chosen at runtime via `KEI_STT_BACKEND` env var.
|
||||
//! Compile-time feature flags gate which backends are available.
|
||||
//!
|
||||
//! | Feature | Backend | Default |
|
||||
//! |------------------|---------------|---------|
|
||||
//! | `whisper-local` | local subprocess | ✓ |
|
||||
//! | `deepgram` | cloud API | – |
|
||||
//! | `openai-whisper` | cloud API | – |
|
||||
//!
|
||||
//! # Quick start
|
||||
//! ```no_run
|
||||
//! # async fn example() -> Result<(), kei_stt::SttError> {
|
||||
//! let backend = kei_stt::from_env()?;
|
||||
//! let audio = std::fs::read("speech.wav").unwrap();
|
||||
//! let req = kei_stt::SttRequest::new_wav(audio);
|
||||
//! let resp = backend.transcribe(&req).await?;
|
||||
//! println!("transcript: {}", resp.text);
|
||||
//! # Ok(()) }
|
||||
//! ```
|
||||
|
||||
pub mod error;
|
||||
pub mod request;
|
||||
pub mod response;
|
||||
pub mod trait_def;
|
||||
|
||||
#[cfg(feature = "whisper-local")]
|
||||
pub mod whisper_local;
|
||||
#[cfg(feature = "deepgram")]
|
||||
pub mod deepgram;
|
||||
#[cfg(feature = "openai-whisper")]
|
||||
pub mod openai_whisper;
|
||||
|
||||
pub use error::SttError;
|
||||
pub use request::SttRequest;
|
||||
pub use response::{Segment, SttResponse};
|
||||
pub use trait_def::SttBackend;
|
||||
|
||||
/// Construct the backend selected by `KEI_STT_BACKEND`.
|
||||
///
|
||||
/// Defaults to `whisper-local` when the env var is absent or empty.
|
||||
/// Returns `SttError::BackendNotEnabled` if the chosen backend's
|
||||
/// feature flag was not compiled in.
|
||||
pub fn from_env() -> Result<Box<dyn SttBackend>, SttError> {
|
||||
let name = std::env::var("KEI_STT_BACKEND")
|
||||
.unwrap_or_else(|_| "whisper-local".to_string());
|
||||
build_backend(&name)
|
||||
}
|
||||
|
||||
fn build_backend(name: &str) -> Result<Box<dyn SttBackend>, SttError> {
|
||||
match name {
|
||||
#[cfg(feature = "whisper-local")]
|
||||
"whisper-local" => Ok(Box::new(whisper_local::WhisperLocalBackend::from_env())),
|
||||
#[cfg(feature = "deepgram")]
|
||||
"deepgram" => Ok(Box::new(deepgram::DeepgramBackend::from_env()?)),
|
||||
#[cfg(feature = "openai-whisper")]
|
||||
"openai-whisper" => Ok(Box::new(openai_whisper::OpenAiWhisperBackend::from_env()?)),
|
||||
other => Err(SttError::BackendNotEnabled(other.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// With `KEI_STT_BACKEND` unset, dispatch falls back to whisper-local.
    ///
    /// Gated on the feature: in a `--no-default-features` build the default
    /// backend is not compiled in and `from_env()` returns
    /// `BackendNotEnabled`, which would make this test fail.
    #[cfg(feature = "whisper-local")]
    #[test]
    fn from_env_defaults_to_whisper_local() {
        std::env::remove_var("KEI_STT_BACKEND");
        let backend = from_env().expect("whisper-local backend should construct");
        assert_eq!(backend.name(), "whisper-local");
    }

    /// An unrecognised name is rejected and echoed back in the error.
    ///
    /// Calls `build_backend` directly, so no process environment is touched
    /// (env mutation here would only race with the test above).
    #[test]
    fn from_env_unknown_backend_errors() {
        match build_backend("unknown_provider") {
            Err(SttError::BackendNotEnabled(name)) => {
                assert_eq!(name, "unknown_provider");
            }
            Ok(_) => panic!("expected BackendNotEnabled, got Ok"),
            Err(e) => panic!("expected BackendNotEnabled, got: {e}"),
        }
    }
}
|
||||
138
_primitives/_rust/kei-stt/src/openai_whisper.rs
Normal file
138
_primitives/_rust/kei-stt/src/openai_whisper.rs
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! OpenAI Whisper STT backend — calls `api.openai.com/v1/audio/transcriptions`.
|
||||
//!
|
||||
//! Sends a multipart/form-data POST with:
|
||||
//! - `file`: audio bytes (filename derived from MIME type)
|
||||
//! - `model`: `whisper-1`
|
||||
//! - `response_format`: `verbose_json`
|
||||
//! - `language`: BCP-47 code if provided
|
||||
//!
|
||||
//! Response `verbose_json` shape:
|
||||
//! ```json
|
||||
//! {"text":"...", "segments":[{"start":0.0,"end":1.0,"text":"..."}]}
|
||||
//! ```
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`OpenAiWhisperBackend::from_env`] — reads `OPENAI_API_KEY`.
|
||||
//! * [`OpenAiWhisperBackend::with_base_url`] — explicit URL + key (tests).
|
||||
|
||||
#![cfg(feature = "openai-whisper")]
|
||||
|
||||
use reqwest::multipart;
|
||||
|
||||
use crate::error::SttError;
|
||||
use crate::request::SttRequest;
|
||||
use crate::response::{Segment, SttResponse};
|
||||
use crate::trait_def::SttBackend;
|
||||
|
||||
const DEFAULT_BASE_URL: &str = "https://api.openai.com";
|
||||
const WHISPER_MODEL: &str = "whisper-1";
|
||||
|
||||
pub struct OpenAiWhisperBackend {
|
||||
api_key: String,
|
||||
client: reqwest::Client,
|
||||
base_url: String,
|
||||
}
|
||||
|
||||
impl OpenAiWhisperBackend {
|
||||
/// Build from explicit base URL and API key (used in wiremock tests).
|
||||
pub fn with_base_url(
|
||||
base_url: impl Into<String>,
|
||||
api_key: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
api_key: api_key.into(),
|
||||
client: reqwest::Client::new(),
|
||||
base_url: base_url.into().trim_end_matches('/').to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build from `OPENAI_API_KEY` env var.
|
||||
pub fn from_env() -> Result<Self, SttError> {
|
||||
let key = std::env::var("OPENAI_API_KEY")
|
||||
.map_err(|_| SttError::MissingEnv("OPENAI_API_KEY".into()))?;
|
||||
Ok(Self::with_base_url(DEFAULT_BASE_URL, key))
|
||||
}
|
||||
|
||||
fn filename_from_mime(mime: &str) -> &'static str {
|
||||
match mime {
|
||||
"audio/mpeg" => "audio.mp3",
|
||||
"audio/ogg" => "audio.ogg",
|
||||
"audio/flac" => "audio.flac",
|
||||
_ => "audio.wav",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SttBackend for OpenAiWhisperBackend {
|
||||
fn name(&self) -> &'static str { "openai-whisper" }
|
||||
|
||||
async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError> {
|
||||
let url = format!("{}/v1/audio/transcriptions", self.base_url);
|
||||
let filename = Self::filename_from_mime(&req.mime_type);
|
||||
|
||||
let file_part = multipart::Part::bytes(req.audio_bytes.clone())
|
||||
.file_name(filename)
|
||||
.mime_str(&req.mime_type)
|
||||
.map_err(|e| SttError::InvalidAudio(e.to_string()))?;
|
||||
|
||||
let mut form = multipart::Form::new()
|
||||
.part("file", file_part)
|
||||
.text("model", WHISPER_MODEL)
|
||||
.text("response_format", "verbose_json");
|
||||
|
||||
if let Some(lang) = &req.language {
|
||||
form = form.text("language", lang.clone());
|
||||
}
|
||||
|
||||
let resp = self.client
|
||||
.post(&url)
|
||||
.bearer_auth(&self.api_key)
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status().as_u16();
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(SttError::Http(format!("http {status}: {text}")));
|
||||
}
|
||||
|
||||
let body: serde_json::Value = resp.json().await
|
||||
.map_err(|e| SttError::InvalidResponse(e.to_string()))?;
|
||||
|
||||
parse_openai_whisper_response(&body)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_openai_whisper_response(body: &serde_json::Value) -> Result<SttResponse, SttError> {
|
||||
let text = body["text"]
|
||||
.as_str()
|
||||
.unwrap_or_default()
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let language_detected = body["language"]
|
||||
.as_str()
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let segments = body["segments"]
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
.filter_map(|s| {
|
||||
let start_ms = (s["start"].as_f64()? * 1000.0) as u64;
|
||||
let end_ms = (s["end"].as_f64()? * 1000.0) as u64;
|
||||
let seg_text = s["text"].as_str()?.trim().to_string();
|
||||
Some(Segment { start_ms, end_ms, text: seg_text })
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(SttResponse { text, segments, language_detected })
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "openai_whisper_test.rs"]
|
||||
mod tests;
|
||||
71
_primitives/_rust/kei-stt/src/openai_whisper_test.rs
Normal file
71
_primitives/_rust/kei-stt/src/openai_whisper_test.rs
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Wiremock tests for `OpenAiWhisperBackend`.
|
||||
//!
|
||||
//! Verifies Bearer auth, verbose_json segment parsing, and HTTP error mapping.
|
||||
|
||||
#![cfg(all(test, feature = "openai-whisper"))]
|
||||
|
||||
use wiremock::matchers::{header, method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::openai_whisper::OpenAiWhisperBackend;
|
||||
use crate::request::SttRequest;
|
||||
use crate::trait_def::SttBackend;
|
||||
|
||||
/// Canned `verbose_json` success payload with two segments.
fn verbose_json_body() -> serde_json::Value {
    let segments = serde_json::json!([
        {"start": 0.0, "end": 0.5, "text": "hello"},
        {"start": 0.5, "end": 1.2, "text": "openai"}
    ]);
    serde_json::json!({
        "text": "hello openai",
        "language": "english",
        "segments": segments
    })
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn openai_whisper_parses_segments() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/audio/transcriptions"))
|
||||
.and(header("authorization", "Bearer test-key"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_json(verbose_json_body()),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = OpenAiWhisperBackend::with_base_url(server.uri(), "test-key");
|
||||
let req = SttRequest {
|
||||
audio_bytes: b"fake_audio".to_vec(),
|
||||
mime_type: "audio/wav".to_string(),
|
||||
language: None,
|
||||
};
|
||||
let resp = backend.transcribe(&req).await.expect("transcribe should succeed");
|
||||
assert_eq!(resp.text, "hello openai");
|
||||
assert_eq!(resp.segments.len(), 2);
|
||||
assert_eq!(resp.segments[0].start_ms, 0);
|
||||
assert_eq!(resp.segments[0].end_ms, 500);
|
||||
assert_eq!(resp.segments[1].start_ms, 500);
|
||||
assert_eq!(resp.segments[1].end_ms, 1200);
|
||||
assert_eq!(resp.language_detected.as_deref(), Some("english"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn openai_whisper_http_error() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/audio/transcriptions"))
|
||||
.respond_with(ResponseTemplate::new(429).set_body_string("Rate limited"))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = OpenAiWhisperBackend::with_base_url(server.uri(), "test-key");
|
||||
let req = SttRequest::new_wav(b"audio".to_vec());
|
||||
let err = backend.transcribe(&req).await.expect_err("should fail on 429");
|
||||
assert!(matches!(err, crate::SttError::Http(_)));
|
||||
}
|
||||
32
_primitives/_rust/kei-stt/src/request.rs
Normal file
32
_primitives/_rust/kei-stt/src/request.rs
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `SttRequest` — input type for all STT backends.
|
||||
//!
|
||||
//! Deliberately backend-agnostic: each backend maps its fields to
|
||||
//! provider-specific parameters in its own module.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Parameters for a single STT transcription request.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SttRequest {
|
||||
/// Raw audio bytes to transcribe.
|
||||
pub audio_bytes: Vec<u8>,
|
||||
|
||||
/// MIME type of the audio data (e.g. `"audio/wav"`, `"audio/mpeg"`, `"audio/ogg"`).
|
||||
pub mime_type: String,
|
||||
|
||||
/// BCP-47 language hint (e.g. `"en"`, `"ru"`). `None` → auto-detect.
|
||||
pub language: Option<String>,
|
||||
}
|
||||
|
||||
impl SttRequest {
|
||||
/// Convenience constructor for WAV audio with no language hint.
|
||||
pub fn new_wav(audio_bytes: Vec<u8>) -> Self {
|
||||
Self {
|
||||
audio_bytes,
|
||||
mime_type: "audio/wav".to_string(),
|
||||
language: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
45
_primitives/_rust/kei-stt/src/response.rs
Normal file
45
_primitives/_rust/kei-stt/src/response.rs
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `SttResponse` and `Segment` — output types for all STT backends.
|
||||
//!
|
||||
//! `segments` is empty when the backend does not provide word-level timing.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A timed text segment from the transcription.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Segment {
|
||||
/// Start of the segment in milliseconds from the audio start.
|
||||
pub start_ms: u64,
|
||||
|
||||
/// End of the segment in milliseconds from the audio start.
|
||||
pub end_ms: u64,
|
||||
|
||||
/// Transcribed text for this segment.
|
||||
pub text: String,
|
||||
}
|
||||
|
||||
/// Result of a successful STT transcription call.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SttResponse {
|
||||
/// Full transcribed text (concatenation of all segments).
|
||||
pub text: String,
|
||||
|
||||
/// Word/sentence-level timing segments.
|
||||
/// Empty when the backend does not provide timing data.
|
||||
pub segments: Vec<Segment>,
|
||||
|
||||
/// BCP-47 language code detected by the backend. `None` if not reported.
|
||||
pub language_detected: Option<String>,
|
||||
}
|
||||
|
||||
impl SttResponse {
|
||||
/// Construct a minimal response with text only.
|
||||
pub fn text_only(text: impl Into<String>) -> Self {
|
||||
Self {
|
||||
text: text.into(),
|
||||
segments: Vec::new(),
|
||||
language_detected: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
24
_primitives/_rust/kei-stt/src/trait_def.rs
Normal file
24
_primitives/_rust/kei-stt/src/trait_def.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `SttBackend` — the core async trait all backend impls satisfy.
|
||||
//!
|
||||
//! Implementing this trait is the first step in adding a new STT provider;
|
||||
//! `from_env()` only dispatches to it once `lib.rs` gains a matching arm. Each backend
|
||||
//! module is self-contained and feature-gated.
|
||||
|
||||
use crate::error::SttError;
|
||||
use crate::request::SttRequest;
|
||||
use crate::response::SttResponse;
|
||||
|
||||
/// Async STT transcription backend.
|
||||
///
|
||||
/// Implementations must be `Send + Sync` so they can be stored in a
|
||||
/// `Box<dyn SttBackend>` and shared across Tokio tasks.
|
||||
#[async_trait::async_trait]
|
||||
pub trait SttBackend: Send + Sync {
|
||||
/// Transcribe the audio in `req` and return the text plus optional segments.
|
||||
async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError>;
|
||||
|
||||
/// Short, stable identifier for this backend (e.g. `"whisper-local"`).
|
||||
fn name(&self) -> &'static str;
|
||||
}
|
||||
181
_primitives/_rust/kei-stt/src/whisper_local.rs
Normal file
181
_primitives/_rust/kei-stt/src/whisper_local.rs
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Whisper-local STT backend — spawns the `whisper` CLI subprocess.
|
||||
//!
|
||||
//! Invocation:
|
||||
//! `<binary> <audio_file> --model <model> --output_format json --output_dir <dir>`
|
||||
//!
|
||||
//! The openai-whisper Python package writes `<stem>.json` into `--output_dir`.
|
||||
//! We parse `text` + `segments[].{start,end,text}` from that JSON.
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`WhisperLocalBackend::from_env`] — reads `KEI_STT_WHISPER_BINARY` +
|
||||
//! `KEI_STT_WHISPER_MODEL` (default `base.en`).
|
||||
//! * [`WhisperLocalBackend::new`] — explicit binary + model overrides for testability.
|
||||
|
||||
#![cfg(feature = "whisper-local")]
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use tokio::process::Command;
|
||||
|
||||
use crate::error::SttError;
|
||||
use crate::request::SttRequest;
|
||||
use crate::response::{Segment, SttResponse};
|
||||
use crate::trait_def::SttBackend;
|
||||
|
||||
/// Program name used when `KEI_STT_WHISPER_BINARY` is unset or blank.
const DEFAULT_BINARY: &str = "whisper";
/// Model name used when `KEI_STT_WHISPER_MODEL` is unset or blank.
const DEFAULT_MODEL: &str = "base.en";

/// STT backend that shells out to a locally installed `whisper` CLI.
#[derive(Debug, Clone)]
pub struct WhisperLocalBackend {
    /// Program to spawn: a bare name or a full path.
    binary: PathBuf,
    /// Value passed to the CLI's `--model` flag.
    model: String,
}

impl WhisperLocalBackend {
    /// Build from explicit binary and model (used in tests).
    pub fn new(binary: impl Into<PathBuf>, model: impl Into<String>) -> Self {
        Self {
            binary: binary.into(),
            model: model.into(),
        }
    }

    /// Build from env vars.
    ///
    /// * `KEI_STT_WHISPER_BINARY` — optional, default `whisper`.
    /// * `KEI_STT_WHISPER_MODEL` — optional, default `base.en`.
    ///
    /// A variable that is set but blank is treated as unset, so an empty
    /// export cannot produce an empty program name or model.
    pub fn from_env() -> Self {
        Self::new(
            Self::env_or("KEI_STT_WHISPER_BINARY", DEFAULT_BINARY),
            Self::env_or("KEI_STT_WHISPER_MODEL", DEFAULT_MODEL),
        )
    }

    /// Read `var`, falling back to `default` when it is absent, not valid
    /// Unicode, or contains only whitespace.
    fn env_or(var: &str, default: &str) -> String {
        match std::env::var(var) {
            Ok(value) if !value.trim().is_empty() => value,
            _ => default.to_string(),
        }
    }

    /// Map an audio MIME type to the file extension used for the scratch
    /// input file.
    ///
    /// Parameters after `;` (e.g. `audio/ogg; codecs=opus`) and letter case
    /// are ignored. Anything unrecognised maps to `wav`.
    fn ext_from_mime(mime: &str) -> &'static str {
        let essence = mime
            .split(';')
            .next()
            .unwrap_or(mime)
            .trim()
            .to_ascii_lowercase();
        match essence.as_str() {
            "audio/mpeg" | "audio/mp3" => "mp3",
            "audio/ogg" => "ogg",
            "audio/flac" | "audio/x-flac" => "flac",
            _ => "wav",
        }
    }
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SttBackend for WhisperLocalBackend {
|
||||
fn name(&self) -> &'static str { "whisper-local" }
|
||||
|
||||
async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse, SttError> {
|
||||
let ext = Self::ext_from_mime(&req.mime_type);
|
||||
|
||||
// Write audio bytes to a named temp file.
|
||||
let audio_file = tempfile::Builder::new()
|
||||
.suffix(&format!(".{ext}"))
|
||||
.tempfile()
|
||||
.map_err(|e| SttError::Subprocess(e.to_string()))?;
|
||||
|
||||
let out_dir = tempfile::tempdir()
|
||||
.map_err(|e| SttError::Subprocess(e.to_string()))?;
|
||||
|
||||
tokio::fs::write(audio_file.path(), &req.audio_bytes)
|
||||
.await
|
||||
.map_err(|e| SttError::Subprocess(e.to_string()))?;
|
||||
|
||||
// Build and run whisper CLI command.
|
||||
let mut cmd = Command::new(&self.binary);
|
||||
cmd.arg(audio_file.path())
|
||||
.arg("--model").arg(&self.model)
|
||||
.arg("--output_format").arg("json")
|
||||
.arg("--output_dir").arg(out_dir.path());
|
||||
|
||||
if let Some(lang) = &req.language {
|
||||
cmd.arg("--language").arg(lang);
|
||||
}
|
||||
|
||||
let output = cmd.output().await
|
||||
.map_err(|e| SttError::Subprocess(e.to_string()))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
|
||||
return Err(SttError::Subprocess(format!(
|
||||
"whisper exited {}: {stderr}", output.status
|
||||
)));
|
||||
}
|
||||
|
||||
// Find the produced JSON file (stem of input + ".json").
|
||||
let stem = audio_file.path()
|
||||
.file_stem()
|
||||
.and_then(|s| s.to_str())
|
||||
.unwrap_or("audio");
|
||||
let json_path = out_dir.path().join(format!("{stem}.json"));
|
||||
|
||||
let json_bytes = tokio::fs::read(&json_path).await
|
||||
.map_err(|e| SttError::InvalidResponse(format!("json file: {e}")))?;
|
||||
|
||||
parse_whisper_json(&json_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_whisper_json(bytes: &[u8]) -> Result<SttResponse, SttError> {
|
||||
let v: serde_json::Value = serde_json::from_slice(bytes)
|
||||
.map_err(|e| SttError::InvalidResponse(e.to_string()))?;
|
||||
|
||||
let text = v["text"].as_str()
|
||||
.unwrap_or_default()
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let segments = v["segments"]
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
.filter_map(|s| {
|
||||
let start_ms = (s["start"].as_f64()? * 1000.0) as u64;
|
||||
let end_ms = (s["end"].as_f64()? * 1000.0) as u64;
|
||||
let seg_text = s["text"].as_str()?.trim().to_string();
|
||||
Some(Segment { start_ms, end_ms, text: seg_text })
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(SttResponse { text, segments, language_detected: None })
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A binary that does not exist must surface as `SttError::Subprocess`.
    #[tokio::test]
    async fn whisper_local_subprocess_error_on_bad_binary() {
        let req = SttRequest::new_wav(b"RIFF....".to_vec());
        let backend = WhisperLocalBackend::new("/nonexistent/whisper", "base.en");

        let err = backend
            .transcribe(&req)
            .await
            .expect_err("should fail with non-existent binary");

        assert!(
            matches!(err, SttError::Subprocess(_)),
            "expected Subprocess error, got: {err:?}"
        );
    }

    /// Text is trimmed and second-based offsets are converted to ms.
    #[test]
    fn parse_whisper_json_full() {
        let json = br#"{
            "text": " Hello world",
            "segments": [
                {"start": 0.0, "end": 1.5, "text": " Hello"},
                {"start": 1.5, "end": 2.0, "text": " world"}
            ]
        }"#;

        let resp = parse_whisper_json(json).expect("parse should succeed");

        assert_eq!(resp.text, "Hello world");
        assert_eq!(resp.segments.len(), 2);
        assert_eq!(resp.segments[0].end_ms, 1500);
        assert_eq!(resp.segments[1].start_ms, 1500);
    }

    /// A document without `segments` parses to an empty segment list.
    #[test]
    fn parse_whisper_json_no_segments() {
        let resp =
            parse_whisper_json(br#"{"text": " Hi"}"#).expect("parse should succeed");

        assert_eq!(resp.text, "Hi");
        assert!(resp.segments.is_empty());
    }
}
|
||||
40
_primitives/_rust/kei-tts/Cargo.toml
Normal file
40
_primitives/_rust/kei-tts/Cargo.toml
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
[package]
|
||||
name = "kei-tts"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
description = "Text-to-speech abstraction trait with 4 backends (ElevenLabs/OpenAI/Google/Piper). Default = piper (local, free, zero latency)."
|
||||
authors.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[lib]
|
||||
name = "kei_tts"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
async-trait = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
serde_json = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tokio = { workspace = true, features = ["macros", "rt-multi-thread", "process", "io-util"] }
|
||||
reqwest = { workspace = true }
|
||||
tracing = "0.1"
|
||||
base64 = { version = "0.22", optional = true }
|
||||
|
||||
[features]
|
||||
default = ["piper"]
|
||||
elevenlabs = []
|
||||
openai = []
|
||||
google = ["dep:base64"]
|
||||
piper = []
|
||||
all-backends = ["elevenlabs", "openai", "google", "piper"]
|
||||
|
||||
[dev-dependencies]
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
|
||||
[package.metadata.keisei]
|
||||
maturity = "alpha"
|
||||
trait = "TtsBackend"
|
||||
description = "TTS abstraction with 4 backends (ElevenLabs/OpenAI/Google/Piper). Default = piper."
|
||||
authors = ["Denis Parfionovich <parfionovich@keilab.io>"]
|
||||
53
_primitives/_rust/kei-tts/README.md
Normal file
53
_primitives/_rust/kei-tts/README.md
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# kei-tts
|
||||
|
||||
Text-to-speech abstraction crate with 4 backends selected at runtime via
|
||||
`KEI_TTS_BACKEND`. Default backend is **piper** (local, free, no network round-trip).
|
||||
|
||||
## Backend matrix
|
||||
|
||||
| Backend | Feature flag | Cost | Latency | Quality | Language coverage |
|
||||
|-------------|---------------|-------------|------------|-----------|-------------------|
|
||||
| `piper` | `piper` | Free | ~50–200 ms | Good | 20+ language packs |
|
||||
| `elevenlabs`| `elevenlabs` | ~$0.30/1k ch| 300–600 ms | Excellent | 30+ languages |
|
||||
| `openai` | `openai` | ~$0.015/1k ch| 200–500 ms| Very good | 50+ languages |
|
||||
| `google` | `google` | ~$4/1M ch | 200–400 ms | Very good | 40+ languages |
|
||||
|
||||
## Environment variables
|
||||
|
||||
| Variable | Backend | Required | Description |
|
||||
|-------------------------|-------------|----------|------------------------------------|
|
||||
| `KEI_TTS_BACKEND` | all | No | `piper` (default) / `elevenlabs` / `openai` / `google` |
|
||||
| `ELEVENLABS_API_KEY` | elevenlabs | Yes | ElevenLabs API key |
|
||||
| `OPENAI_API_KEY` | openai | Yes | OpenAI API key |
|
||||
| `KEI_TTS_OPENAI_MODEL` | openai | No | `tts-1` (default) or `tts-1-hd` |
|
||||
| `GOOGLE_TTS_API_KEY` | google | Yes | Google Cloud API key |
|
||||
| `KEI_TTS_PIPER_MODEL` | piper | Yes | Path to `.onnx` piper model file |
|
||||
| `KEI_TTS_PIPER_BINARY` | piper | No | Path to `piper-tts` (default: PATH)|
|
||||
|
||||
## Usage
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kei-tts = { path = "../kei-tts", features = ["piper"] }
|
||||
```
|
||||
|
||||
```rust
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), kei_tts::TtsError> {
|
||||
let backend = kei_tts::from_env()?;
|
||||
let req = kei_tts::TtsRequest::new("Hello, world!");
|
||||
let resp = backend.synth(&req).await?;
|
||||
std::fs::write("out.mp3", &resp.audio_bytes).ok();
|
||||
println!("synthesised {} bytes via {}", resp.audio_bytes.len(), backend.name());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
## Compile-time features
|
||||
|
||||
```toml
|
||||
# All backends:
kei-tts = { path = "../kei-tts", features = ["all-backends"] }
# Cloud only, no piper:
kei-tts = { path = "../kei-tts", default-features = false, features = ["elevenlabs", "openai", "google"] }
|
||||
```
|
||||
94
_primitives/_rust/kei-tts/src/elevenlabs.rs
Normal file
94
_primitives/_rust/kei-tts/src/elevenlabs.rs
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! ElevenLabs TTS backend — streams audio from `api.elevenlabs.io`.
|
||||
//!
|
||||
//! Endpoint: `POST /v1/text-to-speech/{voice_id}/stream`
|
||||
//! Auth: `xi-api-key: {ELEVENLABS_API_KEY}` header.
|
||||
//! Response: raw audio bytes (format-specific Content-Type).
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`ElevenLabsBackend::from_env`] — reads `ELEVENLABS_API_KEY`.
|
||||
//! * [`ElevenLabsBackend::with_base_url`] — explicit URL + key (tests).
|
||||
|
||||
#![cfg(feature = "elevenlabs")]
|
||||
|
||||
use crate::error::TtsError;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::response::TtsResponse;
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
const DEFAULT_BASE_URL: &str = "https://api.elevenlabs.io";
|
||||
const DEFAULT_VOICE_ID: &str = "21m00Tcm4TlvDq8ikWAM"; // Rachel
|
||||
|
||||
pub struct ElevenLabsBackend {
|
||||
api_key: String,
|
||||
client: reqwest::Client,
|
||||
base_url: String,
|
||||
}
|
||||
|
||||
impl ElevenLabsBackend {
|
||||
/// Build from explicit base URL and API key (used in wiremock tests).
|
||||
pub fn with_base_url(
|
||||
base_url: impl Into<String>,
|
||||
api_key: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
api_key: api_key.into(),
|
||||
client: reqwest::Client::new(),
|
||||
base_url: base_url.into().trim_end_matches('/').to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build from `ELEVENLABS_API_KEY` env var.
|
||||
pub fn from_env() -> Result<Self, TtsError> {
|
||||
let key = std::env::var("ELEVENLABS_API_KEY")
|
||||
.map_err(|_| TtsError::MissingEnv("ELEVENLABS_API_KEY".into()))?;
|
||||
Ok(Self::with_base_url(DEFAULT_BASE_URL, key))
|
||||
}
|
||||
|
||||
fn format_param(fmt: AudioFormat) -> &'static str {
|
||||
match fmt {
|
||||
AudioFormat::Mp3 => "mp3_44100_128",
|
||||
AudioFormat::Ogg => "ogg_48000",
|
||||
AudioFormat::Wav | AudioFormat::Raw => "pcm_44100",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl TtsBackend for ElevenLabsBackend {
|
||||
fn name(&self) -> &'static str { "elevenlabs" }
|
||||
|
||||
async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError> {
|
||||
let voice = req.voice_id.as_deref().unwrap_or(DEFAULT_VOICE_ID);
|
||||
let url = format!(
|
||||
"{}/v1/text-to-speech/{}/stream",
|
||||
self.base_url, voice
|
||||
);
|
||||
let body = serde_json::json!({
|
||||
"text": req.text,
|
||||
"output_format": Self::format_param(req.format),
|
||||
});
|
||||
let resp = self.client
|
||||
.post(&url)
|
||||
.header("xi-api-key", &self.api_key)
|
||||
.header("Content-Type", "application/json")
|
||||
.json(&body)
|
||||
.send()
|
||||
.await?;
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status().as_u16();
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(TtsError::Http(format!("http {status}: {text}")));
|
||||
}
|
||||
let mime = req.format.mime_type().to_string();
|
||||
let bytes = resp.bytes().await
|
||||
.map_err(|e| TtsError::Http(e.to_string()))?
|
||||
.to_vec();
|
||||
Ok(TtsResponse::new(bytes, mime))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "elevenlabs_test.rs"]
|
||||
mod tests;
|
||||
58
_primitives/_rust/kei-tts/src/elevenlabs_test.rs
Normal file
58
_primitives/_rust/kei-tts/src/elevenlabs_test.rs
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Wiremock tests for `ElevenLabsBackend`.
|
||||
//!
|
||||
//! Verifies request shape (path, header) and response byte parsing.
|
||||
|
||||
#![cfg(all(test, feature = "elevenlabs"))]
|
||||
|
||||
use wiremock::matchers::{header, method, path_regex};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::elevenlabs::ElevenLabsBackend;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
#[tokio::test]
|
||||
async fn elevenlabs_synth_ok() {
|
||||
let server = MockServer::start().await;
|
||||
let fake_audio = b"FAKE_AUDIO_BYTES".to_vec();
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path_regex(r"/v1/text-to-speech/.+/stream"))
|
||||
.and(header("xi-api-key", "test-key"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_bytes(fake_audio.clone())
|
||||
.append_header("Content-Type", "audio/mpeg"),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = ElevenLabsBackend::with_base_url(server.uri(), "test-key");
|
||||
let req = TtsRequest {
|
||||
text: "hello".into(),
|
||||
voice_id: Some("voice123".into()),
|
||||
language: None,
|
||||
format: AudioFormat::Mp3,
|
||||
};
|
||||
let resp = backend.synth(&req).await.expect("synth should succeed");
|
||||
assert_eq!(resp.audio_bytes, fake_audio);
|
||||
assert_eq!(resp.mime_type, "audio/mpeg");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn elevenlabs_synth_http_error() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path_regex(r"/v1/text-to-speech/.+/stream"))
|
||||
.respond_with(ResponseTemplate::new(401).set_body_string("Unauthorized"))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = ElevenLabsBackend::with_base_url(server.uri(), "bad-key");
|
||||
let req = TtsRequest::new("hello");
|
||||
let err = backend.synth(&req).await.expect_err("should fail on 401");
|
||||
assert!(matches!(err, crate::TtsError::Http(_)));
|
||||
}
|
||||
39
_primitives/_rust/kei-tts/src/error.rs
Normal file
39
_primitives/_rust/kei-tts/src/error.rs
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `TtsError` — crate-level error enum for all TTS backends.
|
||||
//!
|
||||
//! Each variant carries a human-readable string so call-sites can log
|
||||
//! without leaking transport internals. `thiserror` provides `Display`
|
||||
//! and the `Error` trait automatically.
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
/// Errors that can occur across any TTS backend.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum TtsError {
|
||||
/// HTTP transport or API error from a cloud backend.
|
||||
#[error("http: {0}")]
|
||||
Http(String),
|
||||
|
||||
/// Subprocess (piper-tts) spawn or IO error.
|
||||
#[error("subprocess: {0}")]
|
||||
Subprocess(String),
|
||||
|
||||
/// Required environment variable is absent.
|
||||
#[error("missing env var: {0}")]
|
||||
MissingEnv(String),
|
||||
|
||||
/// Backend name was requested but its Cargo feature is not compiled in.
|
||||
#[error("backend not enabled: {0}")]
|
||||
BackendNotEnabled(String),
|
||||
|
||||
/// Unexpected or malformed response from a backend.
|
||||
#[error("invalid response: {0}")]
|
||||
InvalidResponse(String),
|
||||
}
|
||||
|
||||
impl From<reqwest::Error> for TtsError {
|
||||
fn from(e: reqwest::Error) -> Self {
|
||||
TtsError::Http(e.to_string())
|
||||
}
|
||||
}
|
||||
103
_primitives/_rust/kei-tts/src/google.rs
Normal file
103
_primitives/_rust/kei-tts/src/google.rs
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Google Cloud TTS backend — calls `texttospeech.googleapis.com`.
|
||||
//!
|
||||
//! Endpoint: `POST /v1/text:synthesize?key={api_key}`
|
||||
//! Response: JSON `{"audioContent": "<base64>"}`. Base64-decoded bytes
|
||||
//! are returned as `TtsResponse.audio_bytes`.
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`GoogleBackend::from_env`] — reads `GOOGLE_TTS_API_KEY`.
|
||||
//! * [`GoogleBackend::with_base_url`] — explicit URL + key (tests).
|
||||
|
||||
#![cfg(feature = "google")]
|
||||
|
||||
use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
|
||||
|
||||
use crate::error::TtsError;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::response::TtsResponse;
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
const DEFAULT_BASE_URL: &str = "https://texttospeech.googleapis.com";
|
||||
const DEFAULT_VOICE: &str = "en-US-Wavenet-D";
|
||||
const DEFAULT_LANG: &str = "en-US";
|
||||
|
||||
pub struct GoogleBackend {
|
||||
api_key: String,
|
||||
client: reqwest::Client,
|
||||
base_url: String,
|
||||
}
|
||||
|
||||
impl GoogleBackend {
|
||||
/// Build from explicit parameters (used in wiremock tests).
|
||||
pub fn with_base_url(
|
||||
base_url: impl Into<String>,
|
||||
api_key: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
api_key: api_key.into(),
|
||||
client: reqwest::Client::new(),
|
||||
base_url: base_url.into().trim_end_matches('/').to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build from `GOOGLE_TTS_API_KEY` env var.
|
||||
pub fn from_env() -> Result<Self, TtsError> {
|
||||
let key = std::env::var("GOOGLE_TTS_API_KEY")
|
||||
.map_err(|_| TtsError::MissingEnv("GOOGLE_TTS_API_KEY".into()))?;
|
||||
Ok(Self::with_base_url(DEFAULT_BASE_URL, key))
|
||||
}
|
||||
|
||||
fn encoding_str(fmt: AudioFormat) -> &'static str {
|
||||
match fmt {
|
||||
AudioFormat::Mp3 => "MP3",
|
||||
AudioFormat::Ogg => "OGG_OPUS",
|
||||
AudioFormat::Wav | AudioFormat::Raw => "LINEAR16",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct GoogleResponse {
|
||||
#[serde(rename = "audioContent")]
|
||||
audio_content: String,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl TtsBackend for GoogleBackend {
|
||||
fn name(&self) -> &'static str { "google" }
|
||||
|
||||
async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError> {
|
||||
let url = format!(
|
||||
"{}/v1/text:synthesize?key={}",
|
||||
self.base_url, self.api_key
|
||||
);
|
||||
let voice_name = req.voice_id.as_deref().unwrap_or(DEFAULT_VOICE);
|
||||
let lang = req.language.as_deref().unwrap_or(DEFAULT_LANG);
|
||||
let body = serde_json::json!({
|
||||
"input": { "text": req.text },
|
||||
"voice": { "languageCode": lang, "name": voice_name },
|
||||
"audioConfig": { "audioEncoding": Self::encoding_str(req.format) },
|
||||
});
|
||||
let resp = self.client
|
||||
.post(&url)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await?;
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status().as_u16();
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(TtsError::Http(format!("http {status}: {text}")));
|
||||
}
|
||||
let parsed: GoogleResponse = resp.json().await
|
||||
.map_err(|e| TtsError::InvalidResponse(e.to_string()))?;
|
||||
let bytes = B64.decode(&parsed.audio_content)
|
||||
.map_err(|e| TtsError::InvalidResponse(format!("base64: {e}")))?;
|
||||
Ok(TtsResponse::new(bytes, req.format.mime_type().to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "google_test.rs"]
|
||||
mod tests;
|
||||
65
_primitives/_rust/kei-tts/src/google_test.rs
Normal file
65
_primitives/_rust/kei-tts/src/google_test.rs
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Wiremock tests for `GoogleBackend`.
|
||||
//!
|
||||
//! Verifies JSON request shape and base64 `audioContent` decoding.
|
||||
|
||||
#![cfg(all(test, feature = "google"))]
|
||||
|
||||
use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
|
||||
use wiremock::matchers::{method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::google::GoogleBackend;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
#[tokio::test]
|
||||
async fn google_synth_ok() {
|
||||
let server = MockServer::start().await;
|
||||
let fake_audio = b"GOOGLE_AUDIO".to_vec();
|
||||
let encoded = B64.encode(&fake_audio);
|
||||
let body = serde_json::json!({ "audioContent": encoded }).to_string();
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/text:synthesize"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_string(body)
|
||||
.append_header("Content-Type", "application/json"),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = GoogleBackend::with_base_url(server.uri(), "test-key");
|
||||
let req = TtsRequest {
|
||||
text: "hello google".into(),
|
||||
voice_id: None,
|
||||
language: Some("en-US".into()),
|
||||
format: AudioFormat::Mp3,
|
||||
};
|
||||
let resp = backend.synth(&req).await.expect("synth should succeed");
|
||||
assert_eq!(resp.audio_bytes, fake_audio);
|
||||
assert_eq!(resp.mime_type, "audio/mpeg");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn google_synth_invalid_base64() {
|
||||
let server = MockServer::start().await;
|
||||
let body = serde_json::json!({ "audioContent": "!!!not_b64!!!" }).to_string();
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/text:synthesize"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_string(body)
|
||||
.append_header("Content-Type", "application/json"),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = GoogleBackend::with_base_url(server.uri(), "test-key");
|
||||
let req = TtsRequest::new("hello");
|
||||
let err = backend.synth(&req).await.expect_err("should fail on bad b64");
|
||||
assert!(matches!(err, crate::TtsError::InvalidResponse(_)));
|
||||
}
|
||||
121
_primitives/_rust/kei-tts/src/lib.rs
Normal file
121
_primitives/_rust/kei-tts/src/lib.rs
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `kei-tts` — text-to-speech abstraction with 4 backend impls.
|
||||
//!
|
||||
//! Backend is chosen at runtime via `KEI_TTS_BACKEND` env var.
|
||||
//! Compile-time feature flags gate which backends are available.
|
||||
//!
|
||||
//! | Feature | Backend | Default |
|
||||
//! |---------------|------------|---------|
|
||||
//! | `piper` | local | ✓ |
|
||||
//! | `elevenlabs` | cloud | – |
|
||||
//! | `openai` | cloud | – |
|
||||
//! | `google` | cloud | – |
|
||||
//!
|
||||
//! # Quick start
|
||||
//! ```no_run
|
||||
//! # async fn example() -> Result<(), kei_tts::TtsError> {
|
||||
//! let backend = kei_tts::from_env()?;
|
||||
//! let req = kei_tts::TtsRequest::new("Hello, world!");
|
||||
//! let resp = backend.synth(&req).await?;
|
||||
//! std::fs::write("out.mp3", &resp.audio_bytes).ok();
|
||||
//! # Ok(()) }
|
||||
//! ```
|
||||
|
||||
pub mod error;
|
||||
pub mod request;
|
||||
pub mod response;
|
||||
pub mod trait_def;
|
||||
|
||||
#[cfg(feature = "elevenlabs")]
|
||||
pub mod elevenlabs;
|
||||
#[cfg(feature = "google")]
|
||||
pub mod google;
|
||||
#[cfg(feature = "openai")]
|
||||
pub mod openai;
|
||||
#[cfg(feature = "piper")]
|
||||
pub mod piper;
|
||||
|
||||
pub use error::TtsError;
|
||||
pub use request::{AudioFormat, TtsRequest};
|
||||
pub use response::TtsResponse;
|
||||
pub use trait_def::TtsBackend;
|
||||
|
||||
/// Construct the backend selected by `KEI_TTS_BACKEND`.
|
||||
///
|
||||
/// Defaults to `piper` when the env var is absent or empty.
|
||||
/// Returns `TtsError::BackendNotEnabled` if the chosen backend's
|
||||
/// feature flag was not compiled in.
|
||||
pub fn from_env() -> Result<Box<dyn TtsBackend>, TtsError> {
|
||||
let name = std::env::var("KEI_TTS_BACKEND")
|
||||
.unwrap_or_else(|_| "piper".to_string());
|
||||
build_backend(&name)
|
||||
}
|
||||
|
||||
fn build_backend(name: &str) -> Result<Box<dyn TtsBackend>, TtsError> {
|
||||
match name {
|
||||
#[cfg(feature = "piper")]
|
||||
"piper" => Ok(Box::new(piper::PiperBackend::from_env()?)),
|
||||
#[cfg(feature = "elevenlabs")]
|
||||
"elevenlabs" => Ok(Box::new(elevenlabs::ElevenLabsBackend::from_env()?)),
|
||||
#[cfg(feature = "openai")]
|
||||
"openai" => Ok(Box::new(openai::OpenAiBackend::from_env()?)),
|
||||
#[cfg(feature = "google")]
|
||||
"google" => Ok(Box::new(google::GoogleBackend::from_env()?)),
|
||||
other => Err(TtsError::BackendNotEnabled(other.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: env-var tests are run with `-- --test-threads=1` to avoid
    // races between tests that mutate process-global env.

    /// With `KEI_TTS_BACKEND` unset, dispatch must pick piper.
    #[test]
    fn from_env_defaults_to_piper() {
        // Ensure a previous test has not leaked KEI_TTS_BACKEND.
        std::env::remove_var("KEI_TTS_BACKEND");
        // piper backend requires KEI_TTS_PIPER_MODEL — set a dummy path.
        std::env::set_var("KEI_TTS_PIPER_MODEL", "/tmp/dummy.onnx");

        let backend = from_env().expect("piper backend should construct from env");
        assert_eq!(backend.name(), "piper");

        std::env::remove_var("KEI_TTS_PIPER_MODEL");
    }

    /// An unrecognised name must come back inside `BackendNotEnabled`.
    #[test]
    fn from_env_unknown_backend_errors() {
        std::env::remove_var("KEI_TTS_BACKEND");

        match build_backend("invalid_provider") {
            Err(TtsError::BackendNotEnabled(name)) => {
                assert_eq!(name, "invalid_provider");
            }
            Err(e) => panic!("expected BackendNotEnabled, got different error: {e}"),
            Ok(_) => panic!("expected BackendNotEnabled, got Ok"),
        }
    }

    /// Verify piper backend propagates subprocess error on bad model path.
    /// Returns early (without asserting) when `piper-tts` cannot be run.
    #[cfg(feature = "piper")]
    #[tokio::test]
    async fn piper_subprocess_error_on_bad_model() {
        use crate::piper::PiperBackend;
        use crate::trait_def::TtsBackend;

        let probe = std::process::Command::new("piper-tts")
            .arg("--help")
            .output();
        if probe.is_err() {
            eprintln!("piper-tts not on PATH — skipping binary test");
            return;
        }

        let backend = PiperBackend::new("piper-tts", "/nonexistent/model.onnx");
        let req = TtsRequest::new("hello");
        let err = backend
            .synth(&req)
            .await
            .expect_err("bad model path should fail");
        assert!(matches!(err, TtsError::Subprocess(_)));
    }
}
|
||||
99
_primitives/_rust/kei-tts/src/openai.rs
Normal file
99
_primitives/_rust/kei-tts/src/openai.rs
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! OpenAI TTS backend — calls `api.openai.com/v1/audio/speech`.
|
||||
//!
|
||||
//! Supported models: `tts-1` (fast) and `tts-1-hd` (higher quality).
|
||||
//! Default voice: `alloy`. Format negotiated via `response_format` field.
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`OpenAiBackend::from_env`] — reads `OPENAI_API_KEY`.
|
||||
//! * [`OpenAiBackend::with_base_url`] — explicit URL + key + model (tests).
|
||||
|
||||
#![cfg(feature = "openai")]
|
||||
|
||||
use crate::error::TtsError;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::response::TtsResponse;
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
const DEFAULT_BASE_URL: &str = "https://api.openai.com";
|
||||
const DEFAULT_MODEL: &str = "tts-1";
|
||||
const DEFAULT_VOICE: &str = "alloy";
|
||||
|
||||
pub struct OpenAiBackend {
|
||||
api_key: String,
|
||||
model: String,
|
||||
client: reqwest::Client,
|
||||
base_url: String,
|
||||
}
|
||||
|
||||
impl OpenAiBackend {
|
||||
/// Build from explicit parameters (used in wiremock tests).
|
||||
pub fn with_base_url(
|
||||
base_url: impl Into<String>,
|
||||
api_key: impl Into<String>,
|
||||
model: impl Into<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
api_key: api_key.into(),
|
||||
model: model.into(),
|
||||
client: reqwest::Client::new(),
|
||||
base_url: base_url.into().trim_end_matches('/').to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build from `OPENAI_API_KEY` env var. Reads optional
|
||||
/// `KEI_TTS_OPENAI_MODEL` (default `tts-1`).
|
||||
pub fn from_env() -> Result<Self, TtsError> {
|
||||
let key = std::env::var("OPENAI_API_KEY")
|
||||
.map_err(|_| TtsError::MissingEnv("OPENAI_API_KEY".into()))?;
|
||||
let model = std::env::var("KEI_TTS_OPENAI_MODEL")
|
||||
.unwrap_or_else(|_| DEFAULT_MODEL.to_string());
|
||||
Ok(Self::with_base_url(DEFAULT_BASE_URL, key, model))
|
||||
}
|
||||
|
||||
fn format_str(fmt: AudioFormat) -> &'static str {
|
||||
match fmt {
|
||||
AudioFormat::Mp3 => "mp3",
|
||||
AudioFormat::Ogg => "opus",
|
||||
AudioFormat::Wav => "wav",
|
||||
AudioFormat::Raw => "pcm",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl TtsBackend for OpenAiBackend {
|
||||
fn name(&self) -> &'static str { "openai" }
|
||||
|
||||
async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError> {
|
||||
let url = format!("{}/v1/audio/speech", self.base_url);
|
||||
let voice = req.voice_id.as_deref().unwrap_or(DEFAULT_VOICE);
|
||||
let body = serde_json::json!({
|
||||
"model": self.model,
|
||||
"input": req.text,
|
||||
"voice": voice,
|
||||
"response_format": Self::format_str(req.format),
|
||||
});
|
||||
let resp = self.client
|
||||
.post(&url)
|
||||
.bearer_auth(&self.api_key)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await?;
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status().as_u16();
|
||||
let text = resp.text().await.unwrap_or_default();
|
||||
return Err(TtsError::Http(format!("http {status}: {text}")));
|
||||
}
|
||||
let mime = req.format.mime_type().to_string();
|
||||
let bytes = resp.bytes().await
|
||||
.map_err(|e| TtsError::Http(e.to_string()))?
|
||||
.to_vec();
|
||||
Ok(TtsResponse::new(bytes, mime))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "openai_test.rs"]
|
||||
mod tests;
|
||||
58
_primitives/_rust/kei-tts/src/openai_test.rs
Normal file
58
_primitives/_rust/kei-tts/src/openai_test.rs
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Wiremock tests for `OpenAiBackend`.
|
||||
//!
|
||||
//! Verifies request JSON shape, Bearer auth, and response byte parsing.
|
||||
|
||||
#![cfg(all(test, feature = "openai"))]
|
||||
|
||||
use wiremock::matchers::{header, method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::openai::OpenAiBackend;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
#[tokio::test]
|
||||
async fn openai_synth_ok() {
|
||||
let server = MockServer::start().await;
|
||||
let fake_audio = b"OPENAI_AUDIO".to_vec();
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/audio/speech"))
|
||||
.and(header("authorization", "Bearer test-key"))
|
||||
.respond_with(
|
||||
ResponseTemplate::new(200)
|
||||
.set_body_bytes(fake_audio.clone())
|
||||
.append_header("Content-Type", "audio/mpeg"),
|
||||
)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = OpenAiBackend::with_base_url(server.uri(), "test-key", "tts-1");
|
||||
let req = TtsRequest {
|
||||
text: "hello openai".into(),
|
||||
voice_id: Some("nova".into()),
|
||||
language: None,
|
||||
format: AudioFormat::Mp3,
|
||||
};
|
||||
let resp = backend.synth(&req).await.expect("synth should succeed");
|
||||
assert_eq!(resp.audio_bytes, fake_audio);
|
||||
assert_eq!(resp.mime_type, "audio/mpeg");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn openai_synth_http_error() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/v1/audio/speech"))
|
||||
.respond_with(ResponseTemplate::new(429).set_body_string("Rate limited"))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let backend = OpenAiBackend::with_base_url(server.uri(), "test-key", "tts-1");
|
||||
let req = TtsRequest::new("hello");
|
||||
let err = backend.synth(&req).await.expect_err("should fail on 429");
|
||||
assert!(matches!(err, crate::TtsError::Http(_)));
|
||||
}
|
||||
115
_primitives/_rust/kei-tts/src/piper.rs
Normal file
115
_primitives/_rust/kei-tts/src/piper.rs
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! Piper TTS backend — spawns the `piper-tts` subprocess.
|
||||
//!
|
||||
//! Invocation: `piper-tts --model <model> --output_raw`
|
||||
//! Text is written to stdin; raw PCM bytes are read from stdout.
|
||||
//! A minimal RIFF WAV header is prepended when `format = Wav`; any other
//! requested format yields the headerless PCM stream tagged `audio/pcm`.
|
||||
//!
|
||||
//! Constructor surface:
|
||||
//! * [`PiperBackend::from_env`] — reads `KEI_TTS_PIPER_MODEL` +
|
||||
//! optional `KEI_TTS_PIPER_BINARY` (default `piper-tts`).
|
||||
//! * [`PiperBackend::new`] — explicit binary + model paths (tests).
|
||||
|
||||
#![cfg(feature = "piper")]
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use tokio::io::AsyncWriteExt as _;
|
||||
use tokio::process::Command;
|
||||
|
||||
use crate::error::TtsError;
|
||||
use crate::request::{AudioFormat, TtsRequest};
|
||||
use crate::response::TtsResponse;
|
||||
use crate::trait_def::TtsBackend;
|
||||
|
||||
/// Executable name used when the environment does not name one.
const DEFAULT_BINARY: &str = "piper-tts";

// PCM parameters piper-tts emits by default; they are written verbatim into
// the WAV header.
// NOTE(review): the sample rate is assumed, not read from the model — confirm
// it matches the sample rate of the voice models actually deployed.

/// Samples per second.
const SAMPLE_RATE: u32 = 22050;

/// Number of interleaved channels (mono).
const CHANNELS: u16 = 1;

/// Bit depth of one sample.
const BITS_PER_SAMPLE: u16 = 16;
|
||||
|
||||
/// TTS backend that shells out to a local Piper executable.
///
/// Holds only two paths, so it is cheap to clone and safe to print.
#[derive(Debug, Clone)]
pub struct PiperBackend {
    /// Executable that is spawned for every synthesis call.
    binary: PathBuf,
    /// Value passed to the executable's `--model` argument.
    model: PathBuf,
}
|
||||
|
||||
impl PiperBackend {
|
||||
/// Build from explicit binary path and model path.
|
||||
pub fn new(binary: impl Into<PathBuf>, model: impl Into<PathBuf>) -> Self {
|
||||
Self { binary: binary.into(), model: model.into() }
|
||||
}
|
||||
|
||||
/// Build from env vars.
|
||||
/// Required: `KEI_TTS_PIPER_MODEL` (path to `.onnx` model file).
|
||||
/// Optional: `KEI_TTS_PIPER_BINARY` (default `piper-tts`).
|
||||
pub fn from_env() -> Result<Self, TtsError> {
|
||||
let model = std::env::var("KEI_TTS_PIPER_MODEL")
|
||||
.map_err(|_| TtsError::MissingEnv("KEI_TTS_PIPER_MODEL".into()))?;
|
||||
let binary = std::env::var("KEI_TTS_PIPER_BINARY")
|
||||
.unwrap_or_else(|_| DEFAULT_BINARY.to_string());
|
||||
Ok(Self::new(binary, model))
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a minimal 44-byte RIFF WAV header for PCM data.
|
||||
fn wav_header(data_len: u32) -> Vec<u8> {
|
||||
let byte_rate = SAMPLE_RATE * u32::from(CHANNELS) * u32::from(BITS_PER_SAMPLE) / 8;
|
||||
let block_align: u16 = CHANNELS * BITS_PER_SAMPLE / 8;
|
||||
let mut h = Vec::with_capacity(44);
|
||||
h.extend_from_slice(b"RIFF");
|
||||
h.extend_from_slice(&(36u32 + data_len).to_le_bytes());
|
||||
h.extend_from_slice(b"WAVE");
|
||||
h.extend_from_slice(b"fmt ");
|
||||
h.extend_from_slice(&16u32.to_le_bytes()); // subchunk1 size
|
||||
h.extend_from_slice(&1u16.to_le_bytes()); // PCM
|
||||
h.extend_from_slice(&CHANNELS.to_le_bytes());
|
||||
h.extend_from_slice(&SAMPLE_RATE.to_le_bytes());
|
||||
h.extend_from_slice(&byte_rate.to_le_bytes());
|
||||
h.extend_from_slice(&block_align.to_le_bytes());
|
||||
h.extend_from_slice(&BITS_PER_SAMPLE.to_le_bytes());
|
||||
h.extend_from_slice(b"data");
|
||||
h.extend_from_slice(&data_len.to_le_bytes());
|
||||
h
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl TtsBackend for PiperBackend {
|
||||
fn name(&self) -> &'static str { "piper" }
|
||||
|
||||
async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError> {
|
||||
let mut child = Command::new(&self.binary)
|
||||
.arg("--model")
|
||||
.arg(&self.model)
|
||||
.arg("--output_raw")
|
||||
.stdin(std::process::Stdio::piped())
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::null())
|
||||
.spawn()
|
||||
.map_err(|e| TtsError::Subprocess(e.to_string()))?;
|
||||
|
||||
if let Some(mut stdin) = child.stdin.take() {
|
||||
stdin.write_all(req.text.as_bytes()).await
|
||||
.map_err(|e| TtsError::Subprocess(e.to_string()))?;
|
||||
}
|
||||
|
||||
let output = child.wait_with_output().await
|
||||
.map_err(|e| TtsError::Subprocess(e.to_string()))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(TtsError::Subprocess(format!(
|
||||
"piper-tts exited with {}", output.status
|
||||
)));
|
||||
}
|
||||
|
||||
let pcm = output.stdout;
|
||||
match req.format {
|
||||
AudioFormat::Wav => {
|
||||
let mut wav = wav_header(pcm.len() as u32);
|
||||
wav.extend_from_slice(&pcm);
|
||||
Ok(TtsResponse::new(wav, "audio/wav"))
|
||||
}
|
||||
_ => Ok(TtsResponse::new(pcm, "audio/pcm")),
|
||||
}
|
||||
}
|
||||
}
|
||||
66
_primitives/_rust/kei-tts/src/request.rs
Normal file
66
_primitives/_rust/kei-tts/src/request.rs
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `TtsRequest` and `AudioFormat` — input types for all TTS backends.
|
||||
//!
|
||||
//! `TtsRequest` is deliberately backend-agnostic: each backend maps its
|
||||
//! fields to provider-specific parameters in its own module.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Output audio encoding requested from the backend.
|
||||
///
|
||||
/// Not every backend supports every format; unsupported formats result
|
||||
/// in `TtsError::InvalidResponse`.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum AudioFormat {
|
||||
/// MPEG Layer 3 — the most widely supported lossy format.
|
||||
Mp3,
|
||||
/// Ogg Vorbis — open, patent-free lossy format.
|
||||
Ogg,
|
||||
/// RIFF WAVE — uncompressed PCM container.
|
||||
Wav,
|
||||
/// Raw PCM bytes with no container header (piper default).
|
||||
Raw,
|
||||
}
|
||||
|
||||
impl AudioFormat {
|
||||
/// Returns the MIME type string for the format.
|
||||
pub fn mime_type(self) -> &'static str {
|
||||
match self {
|
||||
AudioFormat::Mp3 => "audio/mpeg",
|
||||
AudioFormat::Ogg => "audio/ogg",
|
||||
AudioFormat::Wav => "audio/wav",
|
||||
AudioFormat::Raw => "audio/pcm",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parameters for a single TTS synthesis request.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TtsRequest {
|
||||
/// The text to synthesise.
|
||||
pub text: String,
|
||||
|
||||
/// Backend-specific voice identifier. `None` lets the backend use
|
||||
/// its own default voice.
|
||||
pub voice_id: Option<String>,
|
||||
|
||||
/// BCP-47 language tag (e.g. `"ru"`, `"en-US"`). `None` → auto.
|
||||
pub language: Option<String>,
|
||||
|
||||
/// Desired output audio encoding.
|
||||
pub format: AudioFormat,
|
||||
}
|
||||
|
||||
impl TtsRequest {
|
||||
/// Convenience constructor for plain text with backend defaults.
|
||||
pub fn new(text: impl Into<String>) -> Self {
|
||||
Self {
|
||||
text: text.into(),
|
||||
voice_id: None,
|
||||
language: None,
|
||||
format: AudioFormat::Mp3,
|
||||
}
|
||||
}
|
||||
}
|
||||
22
_primitives/_rust/kei-tts/src/response.rs
Normal file
22
_primitives/_rust/kei-tts/src/response.rs
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `TtsResponse` — output of a successful TTS synthesis call.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Audio data returned by a TTS backend.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TtsResponse {
|
||||
/// Raw bytes of the synthesised audio file.
|
||||
pub audio_bytes: Vec<u8>,
|
||||
|
||||
/// MIME type of the audio data (e.g. `"audio/mpeg"`, `"audio/wav"`).
|
||||
pub mime_type: String,
|
||||
}
|
||||
|
||||
impl TtsResponse {
|
||||
/// Construct a response with explicit audio data and MIME type.
|
||||
pub fn new(audio_bytes: Vec<u8>, mime_type: impl Into<String>) -> Self {
|
||||
Self { audio_bytes, mime_type: mime_type.into() }
|
||||
}
|
||||
}
|
||||
24
_primitives/_rust/kei-tts/src/trait_def.rs
Normal file
24
_primitives/_rust/kei-tts/src/trait_def.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Copyright 2026 <author org>
|
||||
//! `TtsBackend` — the core async trait all backend impls satisfy.
|
||||
//!
|
||||
//! Implementing this trait is sufficient to plug a new TTS provider into
|
||||
//! the `from_env()` dispatch without modifying `lib.rs`. Each backend
|
||||
//! module is self-contained and feature-gated.
|
||||
|
||||
use crate::error::TtsError;
|
||||
use crate::request::TtsRequest;
|
||||
use crate::response::TtsResponse;
|
||||
|
||||
/// Async TTS synthesis backend.
|
||||
///
|
||||
/// Implementations must be `Send + Sync` so they can be stored in a
|
||||
/// `Box<dyn TtsBackend>` and shared across Tokio tasks.
|
||||
#[async_trait::async_trait]
|
||||
pub trait TtsBackend: Send + Sync {
|
||||
/// Synthesise `req.text` and return the audio bytes.
|
||||
async fn synth(&self, req: &TtsRequest) -> Result<TtsResponse, TtsError>;
|
||||
|
||||
/// Short, stable identifier for this backend (e.g. `"piper"`).
|
||||
fn name(&self) -> &'static str;
|
||||
}
|
||||
Loading…
Reference in a new issue