voice/examples/client_browser.html at 2bff5aa1f671efb665fe1455e279c788b65ea29f

Fork: 0
root / voice
Find file
Newer
Older
voice / examples / client_browser.html
Eugene Sukhodolskiy 22 days ago 9 KB fix: word-by-word payload merging and browser client sample rate
Raw Blame History
<!DOCTYPE html>
<html lang="ru">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Voice TTS WebSocket Client</title>
  <style>
    /* Page-wide typography and a centered, narrow content column. */
    :root { font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; }
    body { max-width: 720px; margin: 2rem auto; padding: 0 1rem; line-height: 1.5; }
    /* Form controls: labels sit on their own line above full-width inputs. */
    label { display: block; margin-top: 1rem; font-weight: 600; }
    input, textarea, select, button { font: inherit; padding: 0.5rem; margin-top: 0.25rem; }
    input[type="text"], input[type="number"], select { width: 100%; box-sizing: border-box; }
    textarea { width: 100%; height: 6rem; box-sizing: border-box; }
    /* .row lays its children out side by side with equal widths. */
    .row { display: flex; gap: 1rem; align-items: end; }
    .row > * { flex: 1; }
    /* Buttons: blue by default, dimmed when disabled, red for the .stop variant. */
    button { cursor: pointer; background: #2563eb; color: white; border: none; border-radius: 0.375rem; }
    button:disabled { opacity: 0.5; cursor: not-allowed; }
    .stop { background: #dc2626; }
    /* On-page log panel: monospace, preserves the newlines written by the script. */
    #log { margin-top: 1rem; padding: 0.75rem; min-height: 8rem; background: #f3f4f6; border-radius: 0.375rem; white-space: pre-wrap; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; font-size: 0.875rem; }
    .status { color: #4b5563; }
  </style>
</head>
<body>
  <h1>Voice TTS WebSocket Client</h1>

  <!-- Connection and synthesis settings. Every label is bound to its control
       through for/id so that clicking the label focuses the field and
       assistive technology can announce the field's name. -->
  <label for="uri">Server URI</label>
  <input id="uri" type="text" value="ws://localhost:8765/ws" />

  <div class="row">
    <div>
      <label for="language">Language</label>
      <select id="language">
        <option value="ru">Russian</option>
        <option value="en">English</option>
      </select>
    </div>
    <div>
      <label for="speed">Speed</label>
      <input id="speed" type="number" min="0.5" max="2" step="0.1" value="1.0" />
    </div>
    <div>
      <label for="emotion">Emotion</label>
      <input id="emotion" type="text" value="neutral" />
    </div>
  </div>

  <label for="voiceRef">Voice reference path</label>
  <input id="voiceRef" type="text" value="voices/self_ref_clean.wav" />

  <label for="text">Text to speak</label>
  <textarea id="text">Привет. Это тестовый запуск из браузера.</textarea>

  <div class="row" style="margin-top: 1rem;">
    <button id="connect">Connect</button>
    <button id="testAudio" disabled>Test audio</button>
    <button id="speak" disabled>Speak streaming</button>
    <button id="stop" class="stop" disabled>Stop</button>
  </div>

  <div id="log"></div>

  <script>
    // Shorthand for document.getElementById: looks up a page element by its id.
    const $ = (elementId) => {
      return document.getElementById(elementId);
    };
    // Appends one line to the #log panel, prefixed with the local wall-clock time.
    const log = (msg) => {
      const timestamp = new Date().toLocaleTimeString();
      const panel = $('log');
      panel.textContent = `${panel.textContent}[${timestamp}] ${msg}\n`;
    };

    // Shared client state, mutated by the handlers below.
    let ws = null;          // current WebSocket, or null while disconnected
    let audioCtx = null;    // AudioContext, created on first use
    let nextStartTime = 0;  // AudioContext time at which the next audio chunk should start
    let seq = 0;            // last sequence number handed out by nextSeq()

    // Returns the next outgoing message sequence number (1, 2, 3, ...).
    const nextSeq = () => {
      seq += 1;
      return seq;
    };

    /**
     * Makes sure the shared `audioCtx` exists and tries to bring it to the
     * 'running' state.
     *
     * A 44100 Hz context is created when there is none yet or the previous one
     * was closed. A suspended context is resumed and then polled (at most
     * 20 times, 25 ms apart) until it reports 'running'. The function does not
     * throw if the context is still not running afterwards; callers that need
     * a running context check `audioCtx.state` themselves.
     */
    const ensureAudioContext = async () => {
      const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
      const needsNewContext = !audioCtx || audioCtx.state === 'closed';
      if (needsNewContext) {
        audioCtx = new AudioContextCtor({ sampleRate: 44100 });
      }
      if (audioCtx.state !== 'suspended') {
        return;
      }

      log(`Resuming AudioContext (state=${audioCtx.state}) ...`);
      await audioCtx.resume();
      // Some browsers need a moment before currentTime starts advancing.
      for (let attempts = 0; audioCtx.state !== 'running' && attempts < 20; attempts++) {
        await new Promise((resolve) => setTimeout(resolve, 25));
      }
      log(`AudioContext state after resume: ${audioCtx.state}`);
    };

    // Sample rate (Hz) used for the test tone and as the default rate for incoming PCM chunks.
    const sampleRate = 44100;

    /**
     * Plays a sine test tone whose volume fades linearly to zero.
     *
     * @param {number} freq - Tone frequency in Hz.
     * @param {number} duration - Tone length in seconds.
     * @param {number} amplitude - Peak amplitude at the start of the tone.
     */
    const playTone = async (freq = 440, duration = 0.5, amplitude = 0.5) => {
      await ensureAudioContext();

      const frameCount = Math.ceil(sampleRate * duration);
      const toneBuffer = audioCtx.createBuffer(1, frameCount, sampleRate);
      const samples = toneBuffer.getChannelData(0);
      samples.forEach((_, index) => {
        const t = index / sampleRate;
        const envelope = 1 - t / duration; // linear fade-out from 1 down to 0
        samples[index] = amplitude * Math.sin(2 * Math.PI * freq * t) * envelope;
      });

      const toneSource = audioCtx.createBufferSource();
      toneSource.buffer = toneBuffer;
      toneSource.connect(audioCtx.destination);
      // Start slightly in the future rather than exactly "now".
      toneSource.start(audioCtx.currentTime + 0.02);
      log(`Playing test tone at ${freq} Hz for ${duration}s`);
    };

    // Decodes a base64 string into a Uint8Array holding the raw bytes.
    const base64ToBytes = (base64) =>
      Uint8Array.from(atob(base64), (char) => char.charCodeAt(0));

    /**
     * Decodes one base64 chunk of mono, little-endian PCM16 audio and schedules
     * it to play directly after the previously queued chunk.
     *
     * @param {string} base64Data - Base64-encoded PCM16 samples.
     * @param {*} seq - Sequence number of the chunk; only used in log lines.
     * @param {number} [serverSampleRate=sampleRate] - Sample rate of the chunk in Hz.
     *   Values that are not a positive finite number (for example `null` or 0
     *   from a server message) fall back to the client default `sampleRate`.
     * @throws {Error} If the AudioContext is not running, the data is empty, or
     *   the byte count is odd (not a whole number of 16-bit samples).
     */
    const playPcm16 = async (base64Data, seq, serverSampleRate = sampleRate) => {
      await ensureAudioContext();
      if (audioCtx.state !== 'running') {
        throw new Error(`AudioContext not running (state=${audioCtx.state})`);
      }

      // The rate comes straight from a server message. The default parameter only
      // covers `undefined`, so null/0/NaN would otherwise make createBuffer throw.
      const requestedRate = Number(serverSampleRate);
      const bufferRate = Number.isFinite(requestedRate) && requestedRate > 0
        ? requestedRate
        : sampleRate;

      const bytes = base64ToBytes(base64Data);
      if (bytes.length === 0) {
        throw new Error('Empty audio data');
      }
      if (bytes.length % 2 !== 0) {
        throw new Error(`Odd raw audio length: ${bytes.length}`);
      }
      const sampleCount = bytes.length / 2;
      // Honour the view's offset/length so this stays correct even if `bytes`
      // is a window into a larger ArrayBuffer.
      const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
      const buffer = audioCtx.createBuffer(1, sampleCount, bufferRate);
      const channel = buffer.getChannelData(0);
      for (let i = 0; i < sampleCount; i++) {
        // little-endian PCM16 -> float32 in [-1, 1]
        channel[i] = view.getInt16(i * 2, true) / 32768.0;
      }

      const source = audioCtx.createBufferSource();
      source.buffer = buffer;
      source.connect(audioCtx.destination);
      // Attach the handler before starting so the end of playback is always reported.
      source.onended = () => {
        log(`audio ended seq=${seq}`);
      };

      // Queue back-to-back; if the queue has run dry, restart from the current time.
      const now = audioCtx.currentTime;
      if (nextStartTime < now) {
        nextStartTime = now;
      }
      const startAt = nextStartTime;
      source.start(startAt);
      nextStartTime += buffer.duration;
      log(`audio queued seq=${seq} raw=${bytes.length} samples=${sampleCount} duration=${buffer.duration.toFixed(2)}s startAt=${startAt.toFixed(3)} ctxState=${audioCtx.state}`);
    };

    // Connect button: opens the WebSocket, wires its handlers and, once the
    // socket is open, sends the `init` message built from the form fields.
    $('connect').onclick = () => {
      if (ws) return;
      const uri = $('uri').value;
      log(`Connecting to ${uri} ...`);

      // Handlers below use this local reference: the shared `ws` can be reset
      // to null by onclose while an async handler is still awaiting.
      let socket;
      try {
        socket = new WebSocket(uri);
      } catch (err) {
        log(`Connection failed: ${err.message}`);
        return;
      }
      ws = socket;

      socket.onopen = async () => {
        try {
          await ensureAudioContext();
        } catch (err) {
          // Audio setup is retried on every playback call, so a failure here must
          // not leave the UI stuck with an open socket and disabled buttons.
          log(`audio setup failed: ${err.message}`);
        }
        // The socket may have closed while the audio context was being resumed.
        if (socket.readyState !== WebSocket.OPEN) return;

        log('Connected');
        $('connect').disabled = true;
        $('testAudio').disabled = false;
        $('speak').disabled = false;
        $('stop').disabled = false;

        const init = {
          type: 'init',
          seq: nextSeq(),
          session_id: 'browser-client',
          language: $('language').value,
          speed: parseFloat($('speed').value),
          emotion: $('emotion').value,
        };
        // voice_ref is only included when the field is non-blank.
        const voiceRef = $('voiceRef').value.trim();
        if (voiceRef) {
          init.voice_ref = voiceRef;
        }
        socket.send(JSON.stringify(init));
        log('Sent init');
      };

      socket.onmessage = (event) => {
        let msg;
        try {
          msg = JSON.parse(event.data);
        } catch (err) {
          log(`malformed message: ${err.message}`);
          return;
        }
        // Valid JSON that is not an object (e.g. `null`) has no `type` to dispatch on.
        if (msg === null || typeof msg !== 'object') {
          log('malformed message: not a JSON object');
          return;
        }
        if (msg.type === 'audio') {
          // Playback is async; errors are reported in the log instead of escaping the handler.
          (async () => {
            try {
              log(`received audio seq=${msg.seq} base64=${String(msg.data).length} format=${msg.format} sr=${msg.sample_rate} ch=${msg.channels}`);
              if (!msg.data || String(msg.data).trim() === '') {
                throw new Error('Server sent empty audio data');
              }
              await playPcm16(msg.data, msg.seq, msg.sample_rate);
            } catch (err) {
              log(`audio playback error: ${err.message}`);
            }
          })();
        } else if (msg.type === 'status') {
          log(`status ${msg.event} seq=${msg.seq}`);
        } else if (msg.type === 'error') {
          log(`error: ${msg.message}`);
        } else {
          log(`unknown message type=${msg.type}`);
        }
      };

      socket.onclose = () => {
        log('Disconnected');
        if (ws === socket) {
          ws = null;
        }
        $('connect').disabled = false;
        $('testAudio').disabled = true;
        $('speak').disabled = true;
        $('stop').disabled = true;
      };

      // WebSocket error events are plain Events without a message, so
      // interpolating the event itself would only print "[object Event]".
      socket.onerror = (event) => log(`WebSocket error: ${event.message || 'connection error'}`);
    };

    // Test audio button: plays a 440 Hz tone for 0.6 s at amplitude 0.5 so the
    // user can confirm that sound output works.
    $('testAudio').onclick = async () => {
      try {
        await playTone(440, 0.6, 0.5);
      } catch (err) {
        // Report failures in the on-page log rather than as an unhandled rejection.
        log(`test tone failed: ${err.message}`);
      }
    };

    // Speak button: streams the text to the server one word at a time
    // (120 ms apart) as `text` messages, then sends `flush`.
    $('speak').onclick = async () => {
      // Keep a local reference: the shared `ws` becomes null if the socket
      // closes during one of the awaits below.
      const socket = ws;
      if (!socket || socket.readyState !== WebSocket.OPEN) {
        log('Not connected');
        return;
      }

      const text = $('text').value.trim();
      if (text === '') {
        // ''.split(/\s+/) yields [''], which would send one empty payload.
        log('Nothing to speak');
        return;
      }
      const words = text.split(/\s+/);

      // Block re-entry so a second click cannot interleave two word streams.
      $('speak').disabled = true;
      try {
        await ensureAudioContext();
        nextStartTime = audioCtx.currentTime;
        log(`Speak start, ctxState=${audioCtx.state}, nextStartTime=${nextStartTime.toFixed(3)}`);
        log(`Streaming ${words.length} words ...`);

        for (let i = 0; i < words.length; i++) {
          if (socket.readyState !== WebSocket.OPEN) {
            log('Connection closed while streaming');
            return;
          }
          // Every word except the last carries a trailing space.
          const payload = words[i] + (i < words.length - 1 ? ' ' : '');
          socket.send(JSON.stringify({ type: 'text', payload, seq: nextSeq() }));
          await new Promise((resolve) => setTimeout(resolve, 120));
        }

        if (socket.readyState !== WebSocket.OPEN) {
          log('Connection closed while streaming');
          return;
        }
        socket.send(JSON.stringify({ type: 'flush', seq: nextSeq() }));
        log('Sent flush');
      } catch (err) {
        log(`speak failed: ${err.message}`);
      } finally {
        // Re-enable only if there is still an open connection to speak on.
        $('speak').disabled = !(ws && ws.readyState === WebSocket.OPEN);
      }
    };

    // Stop button: tells the server to stop and silences audio that is already
    // queued locally.
    $('stop').onclick = () => {
      if (ws && ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ type: 'stop', reason: 'user-interrupt', seq: nextSeq() }));
        log('Sent stop');
      }

      // Resetting the schedule alone does not silence chunks that were already
      // started on the context. Closing the context discards them, and
      // ensureAudioContext() creates a fresh context on the next playback.
      nextStartTime = 0;
      if (audioCtx && audioCtx.state !== 'closed') {
        audioCtx.close().catch((err) => log(`audio stop failed: ${err.message}`));
      }
    };
  </script>
</body>
</html>