/** * Local Piper TTS Skill for OpenClaw * * Provides offline text-to-speech using Piper TTS with automatic language detection. * Includes self-contained setup: creates an isolated Python venv and installs piper-tts. * Language routing and voice selection are handled by the bundled piper-tts.sh script — * add more languages by installing .onnx models and the heuristics update automatically. */ const { execFile } = require('child_process'); const fs = require('fs'); const https = require('https'); const path = require('path'); const { promisify } = require('util'); const execFileAsync = promisify(execFile); // Skill directory is the self-contained root: venv, voice models, and piper-tts.sh all live here const PIPER_DIR = __dirname; const PIPER_SCRIPT = path.join(PIPER_DIR, 'piper-tts.sh'); const CONFIG_FILE = path.join(PIPER_DIR, 'config.json'); /** * Load persisted skill config from config.json in the skill directory. * Returns {} if the file doesn't exist yet. * @returns {{ lengthScale?: number }} */ function loadConfig() { try { return JSON.parse(fs.readFileSync(CONFIG_FILE, 'utf8')); } catch (err) { if (err.code === 'ENOENT') return {}; // File exists but is not valid JSON — warn so corrupted config doesn't silently vanish console.error(`Warning: failed to parse ${CONFIG_FILE}: ${err.message}`); return {}; } } /** * Save one or more config values to config.json in the skill directory. * Merges with existing config — does not overwrite unrelated keys. * @param {Object} updates - e.g. { lengthScale: 0.8 } */ function saveConfig(updates) { const current = loadConfig(); const next = { ...current, ...updates }; fs.writeFileSync(CONFIG_FILE, JSON.stringify(next, null, 2) + '\n', 'utf8'); } // Output directory const WORKSPACE_DIR = process.env.OPENCLAW_WORKSPACE || path.join(process.env.HOME, '.openclaw', 'workspace'); const OUTPUT_DIR = path.join(WORKSPACE_DIR, 'tts'); /** * Resolve a voice stem to a safe, absolute .onnx path within PIPER_DIR. * Prevents path traversal: only models inside PIPER_DIR are accepted. * @param {string} voice - e.g. "en_US-ryan-high" or "en_US-ryan-high.onnx" * @returns {string} absolute path to the .onnx file */ function resolveVoice(voice) { // path.basename strips all directory components, preventing traversal const stem = path.basename(voice).replace(/\.onnx$/, ''); const resolved = path.join(PIPER_DIR, stem + '.onnx'); // Explicit bounds check as defence-in-depth if (!resolved.startsWith(PIPER_DIR + path.sep)) { throw new Error(`Invalid voice name: ${voice}`); } if (!fs.existsSync(resolved)) { throw new Error(`Voice model not found: ${stem} (expected at ${resolved})`); } return resolved; } /** * Download a single file from a URL, following redirects. * @param {string} url * @param {string} dest - absolute path to write to * @param {number} redirects - redirect budget * @returns {Promise} */ function downloadFile(url, dest, redirects = 10) { return new Promise((resolve, reject) => { if (redirects === 0) { return reject(new Error('Too many redirects')); } if (!url.startsWith('https:')) { return reject(new Error(`Refusing non-HTTPS URL: ${url}`)); } const file = fs.createWriteStream(dest); let settled = false; const done = (err) => { if (settled) return; settled = true; file.close(); if (err) { try { fs.unlinkSync(dest); } catch (_) {} reject(err); } else resolve(); }; const req = https.get(url, (res) => { if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { file.destroy(); try { fs.unlinkSync(dest); } catch (_) {} settled = true; downloadFile(res.headers.location, dest, redirects - 1).then(resolve, reject); return; } if (res.statusCode !== 200) { return done(new Error(`HTTP ${res.statusCode} for ${url}`)); } res.pipe(file); file.on('finish', () => done(null)); file.on('error', done); res.on('error', done); }); req.on('error', done); req.setTimeout(300000, () => { req.destroy(); done(new Error('Download timed out')); }); }); } /** * List voice model stems installed in PIPER_DIR. * Each stem can be passed directly as the `voice` parameter to tts(). * @returns {string[]} sorted list of voice stems, e.g. ["en_US-ryan-high", "pl_PL-gosia-medium"] */ function listVoices() { if (!fs.existsSync(PIPER_DIR)) return []; return fs.readdirSync(PIPER_DIR) .filter(f => f.endsWith('.onnx') && !f.endsWith('.onnx.tmp')) .map(f => f.replace(/\.onnx$/, '')) .sort(); } /** * Return detailed status of the Piper installation. * Use this before any tts() call to determine what action is needed. * @returns {Promise<{ ready: boolean, stage: string, message: string, voices?: string[] }>} */ async function status() { // piper-tts.sh is always present (bundled); check venv to detect missing setup const venvPath = path.join(PIPER_DIR, 'venv'); if (!fs.existsSync(venvPath)) { return { ready: false, stage: 'not-setup', message: 'Piper is not set up. Ask the user for confirmation, then call setup().' }; } const venvPiper = path.join(PIPER_DIR, 'venv', 'bin', 'piper'); if (!fs.existsSync(venvPiper)) { return { ready: false, stage: 'no-piper', message: 'piper binary missing from venv. Ask the user for confirmation, then call setup() to reinstall.' }; } const voices = listVoices(); if (voices.length === 0) { return { ready: false, stage: 'no-model', message: `No voice models installed in ${PIPER_DIR}. Ask the user which language/voice they want, then download the .onnx + .onnx.json from https://github.com/rhasspy/piper/blob/master/VOICES.md` }; } return { ready: true, stage: 'ready', voices }; } /** * Check if Piper TTS is fully ready to use. * @returns {Promise} */ async function isAvailable() { const s = await status(); return s.ready; } /** * Set up the Piper TTS environment. * Creates an isolated Python venv inside the skill directory and installs piper-tts. * Everything stays self-contained — nothing is written outside the skill directory. * * IMPORTANT: Always ask the user for confirmation before calling this. * It installs piper-tts from PyPI into a venv inside the skill directory. * * @returns {Promise<{ success: boolean, steps: string[] }>} */ async function setup() { const steps = []; // 1. Verify Python 3 is available try { await execFileAsync('python3', ['--version'], { timeout: 10000 }); steps.push('Python 3 found'); } catch { throw new Error('Python 3 is required but not found on PATH. Please install Python 3.8+ first.'); } // 2. Create venv inside skill directory (skip if already exists) const venvPath = path.join(PIPER_DIR, 'venv'); if (!fs.existsSync(venvPath)) { await execFileAsync('python3', ['-m', 'venv', venvPath], { timeout: 60000 }); steps.push('Python virtual environment created'); } else { steps.push('Python virtual environment already exists'); } // 3. Verify pip is present in venv const pipPath = path.join(venvPath, 'bin', 'pip'); if (!fs.existsSync(pipPath)) { throw new Error('pip not found in venv — venv creation may have failed. Check your Python installation.'); } // 4. Install piper-tts and its dependencies into the isolated venv (not system Python) // pathvalidate is listed as a piper-tts dependency but is occasionally missed by pip await execFileAsync(pipPath, ['install', '--quiet', 'piper-tts', 'pathvalidate'], { timeout: 300000 }); steps.push('piper-tts installed into venv'); // 5. Ensure piper-tts.sh is executable (may be lost on cp-based installs) if (!fs.existsSync(PIPER_SCRIPT)) { throw new Error('piper-tts.sh not found in skill directory — skill installation may be incomplete.'); } fs.chmodSync(PIPER_SCRIPT, 0o755); steps.push('piper-tts.sh marked executable'); // 6. Check for espeak-ng (required by Piper for phonemization, but varies by platform) let hasEspeak = false; for (const bin of ['espeak-ng', 'espeak']) { try { await execFileAsync('which', [bin], { timeout: 5000 }); hasEspeak = true; break; } catch (_) {} } if (!hasEspeak) { steps.push('WARNING: espeak-ng not found — Piper requires it for phonemization. Install it: sudo apt install espeak-ng (Debian/Ubuntu), sudo dnf install espeak-ng (Fedora), brew install espeak (macOS)'); } else { steps.push('espeak-ng found'); } return { success: true, steps }; } /** * Generate speech using local Piper TTS. * @param {string} text - Text to synthesize * @param {string|null} outputFilename - Optional output filename * @param {string|null} voice - Optional voice stem, e.g. "en_US-amy-medium" * @returns {Promise} - Path to generated WAV file */ async function synthesize(text, outputFilename = null, voice = null, lengthScale = null) { if (!fs.existsSync(OUTPUT_DIR)) { fs.mkdirSync(OUTPUT_DIR, { recursive: true }); } // path.basename prevents directory traversal in user-supplied filenames const safeBase = outputFilename || `piper-${Date.now()}.wav`; const filename = path.basename(safeBase); const outputPath = path.join(OUTPUT_DIR, filename); if (!fs.existsSync(PIPER_SCRIPT)) { throw new Error('Piper TTS not set up. Call setup() first.'); } // Inherit env; optionally pin a specific voice model and speed const env = { ...process.env }; if (voice) { env.PIPER_VOICE_MODEL = resolveVoice(voice); } if (lengthScale !== null) { env.PIPER_LENGTH_SCALE = String(lengthScale); } try { await execFileAsync(PIPER_SCRIPT, [text, outputPath], { timeout: 30000, maxBuffer: 1024 * 1024, env } ); if (!fs.existsSync(outputPath)) { throw new Error('Piper TTS failed to create output file'); } return outputPath; } catch (error) { throw new Error(`Piper TTS synthesis failed: ${error.message}`); } } /** * Remove a voice model from the skill directory. * Deletes both the .onnx and .onnx.json files for the given stem. * @param {string} stem - Voice stem to remove, e.g. "en_US-ryan-medium" * @returns {{ removed: string, filesDeleted: string[] }} */ function removeVoice(stem) { const safeStem = path.basename(stem).replace(/\.onnx$/, ''); const onnxPath = path.join(PIPER_DIR, safeStem + '.onnx'); const jsonPath = path.join(PIPER_DIR, safeStem + '.onnx.json'); if (!onnxPath.startsWith(PIPER_DIR + path.sep)) { throw new Error(`Invalid voice name: ${stem}`); } if (!fs.existsSync(onnxPath)) { throw new Error(`Voice not installed: ${safeStem}. Use listVoices() to see installed voices.`); } const deleted = []; fs.unlinkSync(onnxPath); deleted.push(safeStem + '.onnx'); try { fs.unlinkSync(jsonPath); deleted.push(safeStem + '.onnx.json'); } catch (_) {} return { removed: safeStem, filesDeleted: deleted }; } /** * Convert WAV to OGG/Opus format. * @param {string} inputPath - Path to WAV file * @param {string|null} outputPath - Path for OGG output * @returns {Promise} - Path to OGG file */ async function convertToOgg(inputPath, outputPath = null) { if (!outputPath) { outputPath = inputPath.replace(/\.wav$/, '.ogg'); } try { await execFileAsync('ffmpeg', ['-y', '-i', inputPath, '-c:a', 'libopus', '-b:a', '64k', outputPath], { timeout: 30000 } ); if (!fs.existsSync(outputPath)) { throw new Error('FFmpeg conversion failed'); } return outputPath; } catch (error) { throw new Error(`OGG conversion failed: ${error.message}`); } } /** * Download Piper voice models from HuggingFace (rhasspy/piper-voices). * Downloads both .onnx and .onnx.json for each requested stem. * * Stem format: {lang_region}-{name}-{quality} * Examples: "en_US-ryan-medium", "en_US-amy-medium", "pl_PL-gosia-medium" * * @param {string[]} voices - Voice stems to download * @returns {Promise<{ downloaded: string[], failed: Array<{stem: string, error: string}> }>} */ async function downloadVoices(voices) { if (!fs.existsSync(PIPER_DIR)) { fs.mkdirSync(PIPER_DIR, { recursive: true }); } const downloaded = []; const failed = []; for (const stem of voices) { // Strip directory components — prevents path traversal via crafted stems const safeStem = path.basename(stem); // Parse stem: first segment is lang_region (e.g. en_US), last is quality, middle is name const parts = safeStem.split('-'); if (parts.length < 3) { failed.push({ stem, error: `Invalid voice stem format: ${stem}` }); continue; } const lang_region = parts[0]; // e.g. "en_US" const lang = lang_region.split('_')[0]; // e.g. "en" const quality = parts[parts.length - 1]; // e.g. "medium" const name = parts.slice(1, -1).join('-'); // e.g. "ryan" (handles hyphenated names) // Validate URL path components to prevent traversal in the constructed URL if (!/^[a-z]{2}_[A-Z]{2}$/.test(lang_region) || !/^[a-z]{2}$/.test(lang) || !/^[a-zA-Z0-9_-]+$/.test(name) || !/^[a-z]+$/.test(quality)) { failed.push({ stem, error: `Voice stem contains invalid characters: ${stem}` }); continue; } const base = `https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/${lang}/${lang_region}/${name}/${quality}/${safeStem}`; const onnxDest = path.join(PIPER_DIR, `${safeStem}.onnx`); const jsonDest = path.join(PIPER_DIR, `${safeStem}.onnx.json`); const onnxTmp = onnxDest + '.tmp'; const jsonTmp = jsonDest + '.tmp'; try { // Download to .tmp files first — rename on success for crash safety await downloadFile(`${base}.onnx`, onnxTmp); await downloadFile(`${base}.onnx.json`, jsonTmp); fs.renameSync(onnxTmp, onnxDest); fs.renameSync(jsonTmp, jsonDest); downloaded.push(safeStem); } catch (err) { // Clean up partial / temp downloads try { fs.unlinkSync(onnxTmp); } catch (_) {} try { fs.unlinkSync(jsonTmp); } catch (_) {} try { fs.unlinkSync(onnxDest); } catch (_) {} try { fs.unlinkSync(jsonDest); } catch (_) {} failed.push({ stem, error: err.message }); } } return { downloaded, failed }; } /** * Main TTS function for OpenClaw integration. * @param {Object} options * @param {string} options.text - Text to synthesize * @param {string} [options.format='ogg'] - Output format: 'ogg' or 'wav' * @param {string} [options.voice] - Voice stem, e.g. "en_US-amy-medium". Omit for auto-detect. * @param {number} [options.lengthScale] - Speech speed. 1.0 = normal, <1.0 = faster, >1.0 = slower. Default: 1.0. * @returns {Promise<{ path: string, format: string, size: number }>} */ async function tts(options) { const { text, format = 'ogg', voice = null } = options; const lengthScale = options.lengthScale ?? loadConfig().lengthScale ?? null; if (!text || text.trim().length === 0) { throw new Error('No text provided for TTS'); } const wavPath = await synthesize(text, null, voice, lengthScale); if (format === 'ogg') { const oggPath = await convertToOgg(wavPath); // Remove intermediate WAV — only the OGG is needed try { fs.unlinkSync(wavPath); } catch (_) {} const stats = fs.statSync(oggPath); return { path: oggPath, format: 'ogg', size: stats.size }; } const stats = fs.statSync(wavPath); return { path: wavPath, format: 'wav', size: stats.size }; } module.exports = { tts, synthesize, convertToOgg, isAvailable, listVoices, status, setup, downloadVoices, removeVoice, loadConfig, saveConfig, meta: { name: 'local-piper-tts-multilang-secure', version: '1.1.0', description: 'Local offline Piper TTS with self-contained setup, automatic language detection, and per-call voice selection. Add languages by installing .onnx models.', license: 'MIT', features: ['offline', 'multilingual', 'auto-detect', 'voice-select', 'self-setup', 'workspace-output'] } };