457 lines
16 KiB
JavaScript
457 lines
16 KiB
JavaScript
/**
|
|
* Local Piper TTS Skill for OpenClaw
|
|
*
|
|
* Provides offline text-to-speech using Piper TTS with automatic language detection.
|
|
* Includes self-contained setup: creates an isolated Python venv and installs piper-tts.
|
|
* Language routing and voice selection are handled by the bundled piper-tts.sh script —
|
|
* add more languages by installing .onnx models and the heuristics update automatically.
|
|
*/
|
|
|
|
const { execFile } = require('child_process');
|
|
const fs = require('fs');
|
|
const https = require('https');
|
|
const path = require('path');
|
|
const { promisify } = require('util');
|
|
|
|
const execFileAsync = promisify(execFile);
|
|
|
|
// Skill directory is the self-contained root: venv, voice models, and piper-tts.sh all live here
|
|
const PIPER_DIR = __dirname;
|
|
const PIPER_SCRIPT = path.join(PIPER_DIR, 'piper-tts.sh');
|
|
const CONFIG_FILE = path.join(PIPER_DIR, 'config.json');
|
|
|
|
/**
|
|
* Load persisted skill config from config.json in the skill directory.
|
|
* Returns {} if the file doesn't exist yet.
|
|
* @returns {{ lengthScale?: number }}
|
|
*/
|
|
function loadConfig() {
|
|
try {
|
|
return JSON.parse(fs.readFileSync(CONFIG_FILE, 'utf8'));
|
|
} catch (err) {
|
|
if (err.code === 'ENOENT') return {};
|
|
// File exists but is not valid JSON — warn so corrupted config doesn't silently vanish
|
|
console.error(`Warning: failed to parse ${CONFIG_FILE}: ${err.message}`);
|
|
return {};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Save one or more config values to config.json in the skill directory.
|
|
* Merges with existing config — does not overwrite unrelated keys.
|
|
* @param {Object} updates - e.g. { lengthScale: 0.8 }
|
|
*/
|
|
function saveConfig(updates) {
|
|
const current = loadConfig();
|
|
const next = { ...current, ...updates };
|
|
fs.writeFileSync(CONFIG_FILE, JSON.stringify(next, null, 2) + '\n', 'utf8');
|
|
}
|
|
|
|
// Output directory
|
|
const WORKSPACE_DIR = process.env.OPENCLAW_WORKSPACE || path.join(process.env.HOME, '.openclaw', 'workspace');
|
|
const OUTPUT_DIR = path.join(WORKSPACE_DIR, 'tts');
|
|
|
|
/**
|
|
* Resolve a voice stem to a safe, absolute .onnx path within PIPER_DIR.
|
|
* Prevents path traversal: only models inside PIPER_DIR are accepted.
|
|
* @param {string} voice - e.g. "en_US-ryan-high" or "en_US-ryan-high.onnx"
|
|
* @returns {string} absolute path to the .onnx file
|
|
*/
|
|
function resolveVoice(voice) {
|
|
// path.basename strips all directory components, preventing traversal
|
|
const stem = path.basename(voice).replace(/\.onnx$/, '');
|
|
const resolved = path.join(PIPER_DIR, stem + '.onnx');
|
|
// Explicit bounds check as defence-in-depth
|
|
if (!resolved.startsWith(PIPER_DIR + path.sep)) {
|
|
throw new Error(`Invalid voice name: ${voice}`);
|
|
}
|
|
if (!fs.existsSync(resolved)) {
|
|
throw new Error(`Voice model not found: ${stem} (expected at ${resolved})`);
|
|
}
|
|
return resolved;
|
|
}
|
|
|
|
/**
|
|
* Download a single file from a URL, following redirects.
|
|
* @param {string} url
|
|
* @param {string} dest - absolute path to write to
|
|
* @param {number} redirects - redirect budget
|
|
* @returns {Promise<void>}
|
|
*/
|
|
function downloadFile(url, dest, redirects = 10) {
|
|
return new Promise((resolve, reject) => {
|
|
if (redirects === 0) { return reject(new Error('Too many redirects')); }
|
|
if (!url.startsWith('https:')) { return reject(new Error(`Refusing non-HTTPS URL: ${url}`)); }
|
|
const file = fs.createWriteStream(dest);
|
|
let settled = false;
|
|
const done = (err) => {
|
|
if (settled) return;
|
|
settled = true;
|
|
file.close();
|
|
if (err) { try { fs.unlinkSync(dest); } catch (_) {} reject(err); }
|
|
else resolve();
|
|
};
|
|
const req = https.get(url, (res) => {
|
|
if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
|
|
file.destroy();
|
|
try { fs.unlinkSync(dest); } catch (_) {}
|
|
settled = true;
|
|
downloadFile(res.headers.location, dest, redirects - 1).then(resolve, reject);
|
|
return;
|
|
}
|
|
if (res.statusCode !== 200) {
|
|
return done(new Error(`HTTP ${res.statusCode} for ${url}`));
|
|
}
|
|
res.pipe(file);
|
|
file.on('finish', () => done(null));
|
|
file.on('error', done);
|
|
res.on('error', done);
|
|
});
|
|
req.on('error', done);
|
|
req.setTimeout(300000, () => { req.destroy(); done(new Error('Download timed out')); });
|
|
});
|
|
}
|
|
|
|
/**
|
|
* List voice model stems installed in PIPER_DIR.
|
|
* Each stem can be passed directly as the `voice` parameter to tts().
|
|
* @returns {string[]} sorted list of voice stems, e.g. ["en_US-ryan-high", "pl_PL-gosia-medium"]
|
|
*/
|
|
function listVoices() {
|
|
if (!fs.existsSync(PIPER_DIR)) return [];
|
|
return fs.readdirSync(PIPER_DIR)
|
|
.filter(f => f.endsWith('.onnx') && !f.endsWith('.onnx.tmp'))
|
|
.map(f => f.replace(/\.onnx$/, ''))
|
|
.sort();
|
|
}
|
|
|
|
/**
|
|
* Return detailed status of the Piper installation.
|
|
* Use this before any tts() call to determine what action is needed.
|
|
* @returns {Promise<{ ready: boolean, stage: string, message: string, voices?: string[] }>}
|
|
*/
|
|
async function status() {
|
|
// piper-tts.sh is always present (bundled); check venv to detect missing setup
|
|
const venvPath = path.join(PIPER_DIR, 'venv');
|
|
if (!fs.existsSync(venvPath)) {
|
|
return {
|
|
ready: false,
|
|
stage: 'not-setup',
|
|
message: 'Piper is not set up. Ask the user for confirmation, then call setup().'
|
|
};
|
|
}
|
|
|
|
const venvPiper = path.join(PIPER_DIR, 'venv', 'bin', 'piper');
|
|
if (!fs.existsSync(venvPiper)) {
|
|
return {
|
|
ready: false,
|
|
stage: 'no-piper',
|
|
message: 'piper binary missing from venv. Ask the user for confirmation, then call setup() to reinstall.'
|
|
};
|
|
}
|
|
|
|
const voices = listVoices();
|
|
if (voices.length === 0) {
|
|
return {
|
|
ready: false,
|
|
stage: 'no-model',
|
|
message: `No voice models installed in ${PIPER_DIR}. Ask the user which language/voice they want, then download the .onnx + .onnx.json from https://github.com/rhasspy/piper/blob/master/VOICES.md`
|
|
};
|
|
}
|
|
|
|
return { ready: true, stage: 'ready', voices };
|
|
}
|
|
|
|
/**
|
|
* Check if Piper TTS is fully ready to use.
|
|
* @returns {Promise<boolean>}
|
|
*/
|
|
async function isAvailable() {
|
|
const s = await status();
|
|
return s.ready;
|
|
}
|
|
|
|
/**
|
|
* Set up the Piper TTS environment.
|
|
* Creates an isolated Python venv inside the skill directory and installs piper-tts.
|
|
* Everything stays self-contained — nothing is written outside the skill directory.
|
|
*
|
|
* IMPORTANT: Always ask the user for confirmation before calling this.
|
|
* It installs piper-tts from PyPI into a venv inside the skill directory.
|
|
*
|
|
* @returns {Promise<{ success: boolean, steps: string[] }>}
|
|
*/
|
|
async function setup() {
|
|
const steps = [];
|
|
|
|
// 1. Verify Python 3 is available
|
|
try {
|
|
await execFileAsync('python3', ['--version'], { timeout: 10000 });
|
|
steps.push('Python 3 found');
|
|
} catch {
|
|
throw new Error('Python 3 is required but not found on PATH. Please install Python 3.8+ first.');
|
|
}
|
|
|
|
// 2. Create venv inside skill directory (skip if already exists)
|
|
const venvPath = path.join(PIPER_DIR, 'venv');
|
|
if (!fs.existsSync(venvPath)) {
|
|
await execFileAsync('python3', ['-m', 'venv', venvPath], { timeout: 60000 });
|
|
steps.push('Python virtual environment created');
|
|
} else {
|
|
steps.push('Python virtual environment already exists');
|
|
}
|
|
|
|
// 3. Verify pip is present in venv
|
|
const pipPath = path.join(venvPath, 'bin', 'pip');
|
|
if (!fs.existsSync(pipPath)) {
|
|
throw new Error('pip not found in venv — venv creation may have failed. Check your Python installation.');
|
|
}
|
|
|
|
// 4. Install piper-tts and its dependencies into the isolated venv (not system Python)
|
|
// pathvalidate is listed as a piper-tts dependency but is occasionally missed by pip
|
|
await execFileAsync(pipPath, ['install', '--quiet', 'piper-tts', 'pathvalidate'], { timeout: 300000 });
|
|
steps.push('piper-tts installed into venv');
|
|
|
|
// 5. Ensure piper-tts.sh is executable (may be lost on cp-based installs)
|
|
if (!fs.existsSync(PIPER_SCRIPT)) {
|
|
throw new Error('piper-tts.sh not found in skill directory — skill installation may be incomplete.');
|
|
}
|
|
fs.chmodSync(PIPER_SCRIPT, 0o755);
|
|
steps.push('piper-tts.sh marked executable');
|
|
|
|
// 6. Check for espeak-ng (required by Piper for phonemization, but varies by platform)
|
|
let hasEspeak = false;
|
|
for (const bin of ['espeak-ng', 'espeak']) {
|
|
try {
|
|
await execFileAsync('which', [bin], { timeout: 5000 });
|
|
hasEspeak = true;
|
|
break;
|
|
} catch (_) {}
|
|
}
|
|
if (!hasEspeak) {
|
|
steps.push('WARNING: espeak-ng not found — Piper requires it for phonemization. Install it: sudo apt install espeak-ng (Debian/Ubuntu), sudo dnf install espeak-ng (Fedora), brew install espeak (macOS)');
|
|
} else {
|
|
steps.push('espeak-ng found');
|
|
}
|
|
|
|
return { success: true, steps };
|
|
}
|
|
|
|
/**
|
|
* Generate speech using local Piper TTS.
|
|
* @param {string} text - Text to synthesize
|
|
* @param {string|null} outputFilename - Optional output filename
|
|
* @param {string|null} voice - Optional voice stem, e.g. "en_US-amy-medium"
|
|
* @returns {Promise<string>} - Path to generated WAV file
|
|
*/
|
|
async function synthesize(text, outputFilename = null, voice = null, lengthScale = null) {
|
|
if (!fs.existsSync(OUTPUT_DIR)) {
|
|
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
|
}
|
|
|
|
// path.basename prevents directory traversal in user-supplied filenames
|
|
const safeBase = outputFilename || `piper-${Date.now()}.wav`;
|
|
const filename = path.basename(safeBase);
|
|
const outputPath = path.join(OUTPUT_DIR, filename);
|
|
|
|
if (!fs.existsSync(PIPER_SCRIPT)) {
|
|
throw new Error('Piper TTS not set up. Call setup() first.');
|
|
}
|
|
|
|
// Inherit env; optionally pin a specific voice model and speed
|
|
const env = { ...process.env };
|
|
if (voice) {
|
|
env.PIPER_VOICE_MODEL = resolveVoice(voice);
|
|
}
|
|
if (lengthScale !== null) {
|
|
env.PIPER_LENGTH_SCALE = String(lengthScale);
|
|
}
|
|
|
|
try {
|
|
await execFileAsync(PIPER_SCRIPT, [text, outputPath],
|
|
{ timeout: 30000, maxBuffer: 1024 * 1024, env }
|
|
);
|
|
|
|
if (!fs.existsSync(outputPath)) {
|
|
throw new Error('Piper TTS failed to create output file');
|
|
}
|
|
|
|
return outputPath;
|
|
} catch (error) {
|
|
throw new Error(`Piper TTS synthesis failed: ${error.message}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Remove a voice model from the skill directory.
|
|
* Deletes both the .onnx and .onnx.json files for the given stem.
|
|
* @param {string} stem - Voice stem to remove, e.g. "en_US-ryan-medium"
|
|
* @returns {{ removed: string, filesDeleted: string[] }}
|
|
*/
|
|
function removeVoice(stem) {
|
|
const safeStem = path.basename(stem).replace(/\.onnx$/, '');
|
|
const onnxPath = path.join(PIPER_DIR, safeStem + '.onnx');
|
|
const jsonPath = path.join(PIPER_DIR, safeStem + '.onnx.json');
|
|
|
|
if (!onnxPath.startsWith(PIPER_DIR + path.sep)) {
|
|
throw new Error(`Invalid voice name: ${stem}`);
|
|
}
|
|
if (!fs.existsSync(onnxPath)) {
|
|
throw new Error(`Voice not installed: ${safeStem}. Use listVoices() to see installed voices.`);
|
|
}
|
|
|
|
const deleted = [];
|
|
fs.unlinkSync(onnxPath);
|
|
deleted.push(safeStem + '.onnx');
|
|
try { fs.unlinkSync(jsonPath); deleted.push(safeStem + '.onnx.json'); } catch (_) {}
|
|
|
|
return { removed: safeStem, filesDeleted: deleted };
|
|
}
|
|
|
|
/**
|
|
* Convert WAV to OGG/Opus format.
|
|
* @param {string} inputPath - Path to WAV file
|
|
* @param {string|null} outputPath - Path for OGG output
|
|
* @returns {Promise<string>} - Path to OGG file
|
|
*/
|
|
async function convertToOgg(inputPath, outputPath = null) {
|
|
if (!outputPath) {
|
|
outputPath = inputPath.replace(/\.wav$/, '.ogg');
|
|
}
|
|
|
|
try {
|
|
await execFileAsync('ffmpeg', ['-y', '-i', inputPath, '-c:a', 'libopus', '-b:a', '64k', outputPath],
|
|
{ timeout: 30000 }
|
|
);
|
|
|
|
if (!fs.existsSync(outputPath)) {
|
|
throw new Error('FFmpeg conversion failed');
|
|
}
|
|
|
|
return outputPath;
|
|
} catch (error) {
|
|
throw new Error(`OGG conversion failed: ${error.message}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Download Piper voice models from HuggingFace (rhasspy/piper-voices).
|
|
* Downloads both .onnx and .onnx.json for each requested stem.
|
|
*
|
|
* Stem format: {lang_region}-{name}-{quality}
|
|
* Examples: "en_US-ryan-medium", "en_US-amy-medium", "pl_PL-gosia-medium"
|
|
*
|
|
* @param {string[]} voices - Voice stems to download
|
|
* @returns {Promise<{ downloaded: string[], failed: Array<{stem: string, error: string}> }>}
|
|
*/
|
|
async function downloadVoices(voices) {
|
|
if (!fs.existsSync(PIPER_DIR)) {
|
|
fs.mkdirSync(PIPER_DIR, { recursive: true });
|
|
}
|
|
|
|
const downloaded = [];
|
|
const failed = [];
|
|
|
|
for (const stem of voices) {
|
|
// Strip directory components — prevents path traversal via crafted stems
|
|
const safeStem = path.basename(stem);
|
|
|
|
// Parse stem: first segment is lang_region (e.g. en_US), last is quality, middle is name
|
|
const parts = safeStem.split('-');
|
|
if (parts.length < 3) {
|
|
failed.push({ stem, error: `Invalid voice stem format: ${stem}` });
|
|
continue;
|
|
}
|
|
const lang_region = parts[0]; // e.g. "en_US"
|
|
const lang = lang_region.split('_')[0]; // e.g. "en"
|
|
const quality = parts[parts.length - 1]; // e.g. "medium"
|
|
const name = parts.slice(1, -1).join('-'); // e.g. "ryan" (handles hyphenated names)
|
|
|
|
// Validate URL path components to prevent traversal in the constructed URL
|
|
if (!/^[a-z]{2}_[A-Z]{2}$/.test(lang_region) ||
|
|
!/^[a-z]{2}$/.test(lang) ||
|
|
!/^[a-zA-Z0-9_-]+$/.test(name) ||
|
|
!/^[a-z]+$/.test(quality)) {
|
|
failed.push({ stem, error: `Voice stem contains invalid characters: ${stem}` });
|
|
continue;
|
|
}
|
|
|
|
const base = `https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/${lang}/${lang_region}/${name}/${quality}/${safeStem}`;
|
|
const onnxDest = path.join(PIPER_DIR, `${safeStem}.onnx`);
|
|
const jsonDest = path.join(PIPER_DIR, `${safeStem}.onnx.json`);
|
|
const onnxTmp = onnxDest + '.tmp';
|
|
const jsonTmp = jsonDest + '.tmp';
|
|
|
|
try {
|
|
// Download to .tmp files first — rename on success for crash safety
|
|
await downloadFile(`${base}.onnx`, onnxTmp);
|
|
await downloadFile(`${base}.onnx.json`, jsonTmp);
|
|
fs.renameSync(onnxTmp, onnxDest);
|
|
fs.renameSync(jsonTmp, jsonDest);
|
|
downloaded.push(safeStem);
|
|
} catch (err) {
|
|
// Clean up partial / temp downloads
|
|
try { fs.unlinkSync(onnxTmp); } catch (_) {}
|
|
try { fs.unlinkSync(jsonTmp); } catch (_) {}
|
|
try { fs.unlinkSync(onnxDest); } catch (_) {}
|
|
try { fs.unlinkSync(jsonDest); } catch (_) {}
|
|
failed.push({ stem, error: err.message });
|
|
}
|
|
}
|
|
|
|
return { downloaded, failed };
|
|
}
|
|
|
|
/**
|
|
* Main TTS function for OpenClaw integration.
|
|
* @param {Object} options
|
|
* @param {string} options.text - Text to synthesize
|
|
* @param {string} [options.format='ogg'] - Output format: 'ogg' or 'wav'
|
|
* @param {string} [options.voice] - Voice stem, e.g. "en_US-amy-medium". Omit for auto-detect.
|
|
* @param {number} [options.lengthScale] - Speech speed. 1.0 = normal, <1.0 = faster, >1.0 = slower. Default: 1.0.
|
|
* @returns {Promise<{ path: string, format: string, size: number }>}
|
|
*/
|
|
async function tts(options) {
|
|
const { text, format = 'ogg', voice = null } = options;
|
|
const lengthScale = options.lengthScale ?? loadConfig().lengthScale ?? null;
|
|
|
|
if (!text || text.trim().length === 0) {
|
|
throw new Error('No text provided for TTS');
|
|
}
|
|
|
|
const wavPath = await synthesize(text, null, voice, lengthScale);
|
|
|
|
if (format === 'ogg') {
|
|
const oggPath = await convertToOgg(wavPath);
|
|
// Remove intermediate WAV — only the OGG is needed
|
|
try { fs.unlinkSync(wavPath); } catch (_) {}
|
|
const stats = fs.statSync(oggPath);
|
|
return { path: oggPath, format: 'ogg', size: stats.size };
|
|
}
|
|
|
|
const stats = fs.statSync(wavPath);
|
|
return { path: wavPath, format: 'wav', size: stats.size };
|
|
}
|
|
|
|
module.exports = {
|
|
tts,
|
|
synthesize,
|
|
convertToOgg,
|
|
isAvailable,
|
|
listVoices,
|
|
status,
|
|
setup,
|
|
downloadVoices,
|
|
removeVoice,
|
|
loadConfig,
|
|
saveConfig,
|
|
|
|
meta: {
|
|
name: 'local-piper-tts-multilang-secure',
|
|
version: '1.1.0',
|
|
description: 'Local offline Piper TTS with self-contained setup, automatic language detection, and per-call voice selection. Add languages by installing .onnx models.',
|
|
license: 'MIT',
|
|
features: ['offline', 'multilingual', 'auto-detect', 'voice-select', 'self-setup', 'workspace-output']
|
|
}
|
|
};
|