Improve the FMOD Whisper bridge and the radio button that uses it

2026-02-21 20:25:12 +02:00
parent c968e6bed4
commit 1a29f785b8
4 changed files with 204 additions and 86 deletions
@@ -5,18 +5,19 @@ using UnityEngine;
 using FMOD;
 using FMODUnity;
 using Whisper;          // WhisperManager, WhisperStream, WhisperResult
-using Whisper.Utils;    // AudioChunk
+using Whisper.Utils;    // AudioChunk
+using Debug = UnityEngine.Debug;    // UnityEngine and FMOD are both imported; pin Debug to Unity's

 /// <summary>
-/// Capture microphone with FMOD and feed chunks to WhisperStream (no Unity Microphone).
-/// Also (optionally) plays the recorded sound back via FMOD loopback.
+/// FMOD mic is initialized once (Start) and runs continuously in a ring buffer.
+/// The Whisper stream is created when ActivateRecording() is called and is stopped and released by DeactivateRecording().
+/// Optional loopback can be paused/resumed instead of starting/stopping to avoid stalls.
 /// </summary>
-
 public class FMODWhisperBridge : MonoBehaviour
 {
    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper;        // assign in Inspector
-    [SerializeField] private bool useVadInStream = false;   // let WhisperStream do VAD or not
+    [SerializeField] private bool useVadInStream = true;   // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
@@ -29,6 +30,8 @@ public class FMODWhisperBridge : MonoBehaviour

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
+    [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")]
+    public bool loopbackOnlyWhenActive = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
@@ -55,7 +58,9 @@ public class FMODWhisperBridge : MonoBehaviour

    // temp conversion buffer
    private float[] _floatTmp = new float[0];
+    private short[] _shortOverlay;

+    // activation flag
    private bool isRecordingActivated = false;

    private void Awake()
@@ -66,85 +71,180 @@ public class FMODWhisperBridge : MonoBehaviour

    private async void Start()
    {
-        // Query device info to get native rate/channels.
-        // (FMOD: getRecordDriverInfo gives you system rate & speaker mode)
+        // -------------- FMOD initialize ONCE --------------
+        // Query device info
        string name;
        Guid guid;
        SPEAKERMODE sm;
        int smChannels;
        DRIVER_STATE driverState;
-        // signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState)
-        _core.getRecordDriverInfo(recordDriverId, out name, 256, out guid, out _nativeRate, out sm, out smChannels, out driverState);
-        _nativeChannels = channels > 0 ? channels : smChannels;
-        UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\"  rate={_nativeRate} ch={_nativeChannels}");

-        // Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL).
+        var res = _core.getRecordDriverInfo(
+            recordDriverId,
+            out name, 256,
+            out guid,
+            out _nativeRate,
+            out sm,
+            out smChannels,
+            out driverState
+        );
+
+        if (res != RESULT.OK)
+        {
+            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
+            return;
+        }
+
+        _nativeChannels = channels > 0 ? channels : smChannels;
+        int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
+        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\"  rate={rate} ch={_nativeChannels}");
+
+        // Build user sound (ring buffer) sized to hold bufferLengthSec seconds of audio
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
-            defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
+            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
-            length = (uint)(((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels) * sizeof(short)) // seconds=1 (we loop)
+            length = (uint)(rate * _nativeChannels * sizeof(short) * bufferLengthSec)
        };

-        _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
+        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
+        if (res != RESULT.OK)
+        {
+            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
+            return;
+        }
+
        _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);

-        // Start FMOD recording into that sound (looping ring buffer).
-        _core.recordStart(recordDriverId, _recSound, true);
-        UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");
+        // Start recording (looping)
+        res = _core.recordStart(recordDriverId, _recSound, true);
+        if (res != RESULT.OK)
+        {
+            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
+            _recSound.release();
+            _recSound.clearHandle();
+            return;
+        }

-        // Optional loopback playback using FMOD (plays same sound ring buffer).
+        // Initialize record position to avoid a huge first delta
+        _core.getRecordPosition(recordDriverId, out _lastRecordPos);
+        Debug.Log("[FMOD→Whisper] Recording started.");
+
+        // Loopback channel (optional). Start once; pause when inactive if desired.
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
-            _core.playSound(_recSound, _masterGroup, false, out _playChannel);
-            _playChannel.setMode(MODE._2D);
-            _playChannel.setVolume(loopbackVolume);
-            UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
+            res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
+            if (res == RESULT.OK && _playChannel.hasHandle())
+            {
+                _playChannel.setMode(MODE._2D);
+                _playChannel.setVolume(loopbackVolume);
+                if (loopbackOnlyWhenActive) _playChannel.setPaused(true); // keep paused until ActivateRecording()
+                Debug.Log("[FMOD→Whisper] Loopback playback ready.");
+            }
+            else
+            {
+                Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
+            }
        }

-        // Create Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels).
-        // We'll push AudioChunk manually.
-        // NOTE: WhisperStream’s sliding window is governed by manager’s stepSec/keepSec/lengthSec.
-        _stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels);
-        _stream.OnResultUpdated += (txt) =>
+        // No Whisper stream here. It will be created on ActivateRecording().
+        await System.Threading.Tasks.Task.Yield();
+    }
+
+    /// <summary>
+    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
+    /// </summary>
+    public async void ActivateRecording()
+    {
+        if (isRecordingActivated)
        {
-            //OnWhisperResultProcessed?.Invoke(txt);
-        };
+            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
+            return;
+        }
+
+        if (!_recSound.hasHandle())
+        {
+            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
+            return;
+        }
+
+        int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
+
+        try
+        {
+            _stream = await whisper.CreateStream(rate, _nativeChannels);
+        }
+        catch (Exception e)
+        {
+            Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
+            _stream = null;
+            _streamStarted = false;
+            return;
+        }
+
+        // Wire events
        _stream.OnSegmentUpdated += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
-            {
                OnWhisperSegmentUpdated?.Invoke(seg.Result);
-            }
        };
        _stream.OnSegmentFinished += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
-            {
                OnWhisperSegmentFinished?.Invoke(seg.Result);
-            }
        };

-        // If you want Whisper to respect VAD, enable in manager or set useVad (manager controls stream params).
        whisper.useVad = useVadInStream;

        _stream.StartStream();
        _streamStarted = true;

-        // prepare temp arrays roughly 100ms of audio
-        EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels);
+        // Unpause loopback if it's meant to play only while active
+        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
+            _playChannel.setPaused(false);
+
+        // Prepare temp arrays roughly 100ms of audio
+        EnsureTmpCapacity((rate / 10) * _nativeChannels);
+
+        isRecordingActivated = true;
+        Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording).");
+    }
+
+    /// <summary>
+    /// Stops and disposes the Whisper stream only. FMOD keeps recording.
+    /// </summary>
+    public void DeactivateRecording()
+    {
+        if (!isRecordingActivated && !_streamStarted)
+            return;
+
+        isRecordingActivated = false;
+
+        // Pause loopback if it should only be active during recording
+        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
+            _playChannel.setPaused(true);
+
+        // Tear down Whisper stream
+        if (_streamStarted)
+        {
+            try { _stream.StopStream(); } catch { /* ignore */ }
+            _streamStarted = false;
+        }
+        _stream = null;
+
+        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }

    private void Update()
    {
-        if (!isRecordingActivated) return;
+        // Always tick FMOD
        if (_core.handle != IntPtr.Zero) _core.update();
-        if (!_streamStarted || !_recSound.hasHandle()) return;
+        if (!_recSound.hasHandle()) return;

-        // How many samples recorded since last frame?
+        // Compute how many samples recorded since last frame.
        uint recPos;
        _core.getRecordPosition(recordDriverId, out recPos);

@@ -152,23 +252,39 @@ public class FMODWhisperBridge : MonoBehaviour
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);

-        if (deltaSamples == 0) return;
+        if (deltaSamples == 0)
+        {
+            // Even if 0, keep last pos
+            _lastRecordPos = recPos;
+            return;
+        }

-        // We’ll read that region (16-bit) and convert to float[] [-1..1].
-        // Calculate byte range to lock in sound buffer
-        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 16-bit = 2 bytes
+        // If not active, we *still* advance the ring (so we don't backlog data),
+        // but we *don't* push chunks to Whisper.
+        bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null;
+
+        // Calculate byte range to lock (16-bit)
+        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 2 bytes per sample
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;

        IntPtr p1, p2;
        uint len1, len2;
-        // Lock can wrap — FMOD splits into p1/p2.
-        _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
+        var r = _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
+        if (r != RESULT.OK)
+        {
+            // If lock fails, still advance last position to avoid spin
+            _lastRecordPos = recPos;
+            return;
+        }

        try
        {
-            // Convert both parts to float and push to Whisper
-            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
-            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
+            if (shouldFeed)
+            {
+                if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
+                if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
+            }
+            // else: just discard; we’re only keeping the ring fresh.
        }
        finally
        {
@@ -178,19 +294,11 @@ public class FMODWhisperBridge : MonoBehaviour
        _lastRecordPos = recPos;
    }

-    public void ActivateRecording()
-    {
-        isRecordingActivated = true;
-    }
-
-    public void DeactivateRecording()
-    {
-        isRecordingActivated = false;
-    }
-
    private bool IsSpeechMeaningful(string userText)
    {
-        return !string.IsNullOrEmpty(userText) && !userText.Contains("BLANK_AUDIO") && !userText.Trim().Equals("[ Silence ]");
+        return !string.IsNullOrEmpty(userText)
+               && !userText.Contains("BLANK_AUDIO")
+               && !userText.Trim().Equals("[ Silence ]");
    }

    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
@@ -198,26 +306,20 @@ public class FMODWhisperBridge : MonoBehaviour
        int samples = (int)(byteLen / 2); // 2 bytes per sample
        EnsureTmpCapacity(samples);

-        // Marshal the 16-bit PCM into managed space
-        // We pin a short[] overlay to avoid copying twice
-        int shorts = samples;
-        int byteCount = (int)byteLen;
+        EnsureShortOverlay(samples, out short[] sBuf);
+        Marshal.Copy(src, sBuf, 0, samples);

-        // Use Marshal.Copy into a short[] then convert to float[-1..1]
-        // (You can also unsafe copy for speed if needed.)
-        EnsureShortOverlay(shorts, out short[] sBuf);
-        Marshal.Copy(src, sBuf, 0, shorts);
-
-        for (int i = 0; i < shorts; i++)
+        // Convert to float [-1..1]; samples are converted one-to-one, with no downmix
+        for (int i = 0; i < samples; i++)
        {
-            // 32768f avoids clipping at -32768
            _floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f);
        }

-        // Build a chunk for WhisperStream; with VAD off, IsVoiceDetected=true is fine.
+        // TODO (optional): downmix to mono and/or run a light gate before feeding.
+        // For now the converted samples are passed through unchanged:
        var chunk = new AudioChunk
        {
-            Data = _floatTmp.AsSpan(0, shorts).ToArray(),
+            Data = _floatTmp.AsSpan(0, samples).ToArray(),
            Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
@@ -226,7 +328,6 @@ public class FMODWhisperBridge : MonoBehaviour
        _stream.AddToStream(chunk);
    }

-    private short[] _shortOverlay;
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
@@ -242,13 +343,20 @@ public class FMODWhisperBridge : MonoBehaviour

    private void OnDisable()
    {
-        if (_streamStarted)
-        {
-            _stream.StopStream();
-            _streamStarted = false;
-        }
+        // Stop Whisper (if active)
+        DeactivateRecording();

-        if (_playChannel.hasHandle()) { _playChannel.stop(); _playChannel.clearHandle(); }
-        if (_recSound.hasHandle()) { _core.recordStop(recordDriverId); _recSound.release(); _recSound.clearHandle(); }
+        // Stop and release FMOD resources (this runs when the component is disabled)
+        if (_playChannel.hasHandle())
+        {
+            try { _playChannel.stop(); } catch { /* ignore */ }
+            _playChannel.clearHandle();
+        }
+        if (_recSound.hasHandle())
+        {
+            try { _core.recordStop(recordDriverId); } catch { /* ignore */ }
+            try { _recSound.release(); } catch { /* ignore */ }
+            _recSound.clearHandle();
+        }
    }
 }