Improve the FMOD Whisper bridge and the radio button that uses it

2026-02-21 20:25:12 +02:00
parent c968e6bed4
commit 1a29f785b8
4 changed files with 204 additions and 86 deletions
@@ -5,18 +5,19 @@ using UnityEngine;
 using FMOD;
 using FMODUnity;
 using Whisper;          // WhisperManager, WhisperStream, WhisperResult
-using Whisper.Utils;    // AudioChunk
+using Whisper.Utils;
 using Debug = UnityEngine.Debug;    // AudioChunk
 /// <summary>
-/// Capture microphone with FMOD and feed chunks to WhisperStream (no Unity Microphone).
+/// FMOD mic is initialized once (Start) and runs continuously in a ring buffer.
-/// Also (optionally) plays the recorded sound back via FMOD loopback.
+/// Whisper stream is created when ActivateRecording() is called and disposed on DeactivateRecording().
 /// Optional loopback can be paused/resumed instead of starting/stopping to avoid stalls.
 /// </summary>
 public class FMODWhisperBridge : MonoBehaviour
 {
    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper;        // assign in Inspector
-    [SerializeField] private bool useVadInStream = false;   // let WhisperStream do VAD or not
+    [SerializeField] private bool useVadInStream = true;   // let WhisperStream do VAD or not
    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
@@ -29,6 +30,8 @@ public class FMODWhisperBridge : MonoBehaviour
    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
    [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")]
    public bool loopbackOnlyWhenActive = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
@@ -55,7 +58,9 @@ public class FMODWhisperBridge : MonoBehaviour
    // temp conversion buffer
    private float[] _floatTmp = new float[0];
    private short[] _shortOverlay;
    // activation flag
    private bool isRecordingActivated = false;
    private void Awake()
@@ -66,85 +71,180 @@ public class FMODWhisperBridge : MonoBehaviour
    private async void Start()
    {
-        // Query device info to get native rate/channels.
+        // -------------- FMOD initialize ONCE --------------
-        // (FMOD: getRecordDriverInfo gives you system rate & speaker mode)
+        // Query device info
        string name;
        Guid guid;
        SPEAKERMODE sm;
        int smChannels;
        DRIVER_STATE driverState;
        // signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState)
        _core.getRecordDriverInfo(recordDriverId, out name, 256, out guid, out _nativeRate, out sm, out smChannels, out driverState);
        _nativeChannels = channels > 0 ? channels : smChannels;
        UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\"  rate={_nativeRate} ch={_nativeChannels}");
-        // Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL).
+        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out name, 256,
            out guid,
            out _nativeRate,
            out sm,
            out smChannels,
            out driverState
        );
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }
        _nativeChannels = channels > 0 ? channels : smChannels;
        int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\"  rate={rate} ch={_nativeChannels}");
        // Build user sound (ring buffer) — multiple seconds
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
-            defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
+            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
-            length = (uint)(((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels) * sizeof(short)) // seconds=1 (we loop)
+            length = (uint)(rate * _nativeChannels * sizeof(short) * bufferLengthSec)
        };
-        _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
+        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            return;
        }
        _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
-        // Start FMOD recording into that sound (looping ring buffer).
+        // Start recording (looping)
-        _core.recordStart(recordDriverId, _recSound, true);
+        res = _core.recordStart(recordDriverId, _recSound, true);
-        UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");
+        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }
-        // Optional loopback playback using FMOD (plays same sound ring buffer).
+        // Initialize record position to avoid a huge first delta
        _core.getRecordPosition(recordDriverId, out _lastRecordPos);
        Debug.Log("[FMOD→Whisper] Recording started.");
        // Loopback channel (optional). Start once; pause when inactive if desired.
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
-            _core.playSound(_recSound, _masterGroup, false, out _playChannel);
+            res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
-            _playChannel.setMode(MODE._2D);
+            if (res == RESULT.OK && _playChannel.hasHandle())
-            _playChannel.setVolume(loopbackVolume);
+            {
-            UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
+                _playChannel.setMode(MODE._2D);
                _playChannel.setVolume(loopbackVolume);
                if (loopbackOnlyWhenActive) _playChannel.setPaused(true); // keep muted until Activate
                Debug.Log("[FMOD→Whisper] Loopback playback ready.");
            }
            else
            {
                Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
            }
        }
-        // Create Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels).
+        // No Whisper stream here. It will be created on ActivateRecording().
-        // We'll push AudioChunk manually.
+        await System.Threading.Tasks.Task.Yield();
-        // NOTE: WhisperStream’s sliding window is governed by manager’s stepSec/keepSec/lengthSec.
+    }
-        _stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels);
+
-        _stream.OnResultUpdated += (txt) =>
+    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
    /// </summary>
    public async void ActivateRecording()
    {
        if (isRecordingActivated)
        {
-            //OnWhisperResultProcessed?.Invoke(txt);
+            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
-        };
+            return;
        }
        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }
        int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
        try
        {
            _stream = await whisper.CreateStream(rate, _nativeChannels);
        }
        catch (Exception e)
        {
            Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
            _stream = null;
            _streamStarted = false;
            return;
        }
        // Wire events
        _stream.OnSegmentUpdated += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
            {
                OnWhisperSegmentUpdated?.Invoke(seg.Result);
            }
        };
        _stream.OnSegmentFinished += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
            {
                OnWhisperSegmentFinished?.Invoke(seg.Result);
            }
        };
        // If you want Whisper to respect VAD, enable in manager or set useVad (manager controls stream params).
        whisper.useVad = useVadInStream;
        _stream.StartStream();
        _streamStarted = true;
-        // prepare temp arrays roughly 100ms of audio
+        // Unpause loopback if it's meant to play only while active
-        EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels);
+        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
            _playChannel.setPaused(false);
        // Prepare temp arrays roughly 100ms of audio
        EnsureTmpCapacity((rate / 10) * _nativeChannels);
        isRecordingActivated = true;
        Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording).");
    }
    /// <summary>
    /// Stops and drops the current Whisper stream (if one was started) and, when loopback is
    /// configured to play only while active, pauses the loopback channel.
    /// This method does not stop FMOD recording or release the record sound.
    /// Calling it when nothing is active is a no-op.
    /// </summary>
    public void DeactivateRecording()
    {
        // Nothing to tear down: neither the activation flag nor the stream flag is set.
        if (!isRecordingActivated && !_streamStarted)
        {
            return;
        }

        // Clear the flag first so Update() stops feeding chunks before the stream goes away.
        isRecordingActivated = false;

        // Pause loopback if it should only be audible during recording.
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
        {
            RESULT pauseResult = _playChannel.setPaused(true);
            if (pauseResult != RESULT.OK)
            {
                Debug.LogWarning($"[FMOD→Whisper] Failed to pause loopback: {pauseResult}");
            }
        }

        // Tear down the Whisper stream. This stays best-effort (a failure must not prevent
        // cleanup), but the failure is reported instead of being silently swallowed.
        if (_streamStarted)
        {
            try
            {
                _stream?.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed during deactivation: {e}");
            }
            finally
            {
                _streamStarted = false;
            }
        }

        _stream = null;
        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }
    private void Update()
    {
-        if (!isRecordingActivated) return;
+        // Always tick FMOD
        if (_core.handle != IntPtr.Zero) _core.update();
-        if (!_streamStarted || !_recSound.hasHandle()) return;
+        if (!_recSound.hasHandle()) return;
-        // How many samples recorded since last frame?
+        // Compute how many samples recorded since last frame.
        uint recPos;
        _core.getRecordPosition(recordDriverId, out recPos);
@@ -152,23 +252,39 @@ public class FMODWhisperBridge : MonoBehaviour
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);
-        if (deltaSamples == 0) return;
+        if (deltaSamples == 0)
        {
            // Even if 0, keep last pos
            _lastRecordPos = recPos;
            return;
        }
-        // We’ll read that region (16-bit) and convert to float[] [-1..1].
+        // If not active, we *still* advance the ring (so we don't backlog data),
-        // Calculate byte range to lock in sound buffer
+        // but we *don't* push chunks to Whisper.
-        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 16-bit = 2 bytes
+        bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null;
        // Calculate byte range to lock (16-bit)
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 2 bytes per sample
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;
        IntPtr p1, p2;
        uint len1, len2;
-        // Lock can wrap — FMOD splits into p1/p2.
+        var r = _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
-        _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
+        if (r != RESULT.OK)
        {
            // If lock fails, still advance last position to avoid spin
            _lastRecordPos = recPos;
            return;
        }
        try
        {
-            // Convert both parts to float and push to Whisper
+            if (shouldFeed)
-            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
+            {
-            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
+                if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
                if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
            }
            // else: just discard; we’re only keeping the ring fresh.
        }
        finally
        {
@@ -178,19 +294,11 @@ public class FMODWhisperBridge : MonoBehaviour
        _lastRecordPos = recPos;
    }
    public void ActivateRecording()
    {
        isRecordingActivated = true;
    }
    public void DeactivateRecording()
    {
        isRecordingActivated = false;
    }
    private bool IsSpeechMeaningful(string userText)
    {
-        return !string.IsNullOrEmpty(userText) && !userText.Contains("BLANK_AUDIO") && !userText.Trim().Equals("[ Silence ]");
+        return !string.IsNullOrEmpty(userText)
               && !userText.Contains("BLANK_AUDIO")
               && !userText.Trim().Equals("[ Silence ]");
    }
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
@@ -198,26 +306,20 @@ public class FMODWhisperBridge : MonoBehaviour
        int samples = (int)(byteLen / 2); // 2 bytes per sample
        EnsureTmpCapacity(samples);
-        // Marshal the 16-bit PCM into managed space
+        EnsureShortOverlay(samples, out short[] sBuf);
-        // We pin a short[] overlay to avoid copying twice
+        Marshal.Copy(src, sBuf, 0, samples);
        int shorts = samples;
        int byteCount = (int)byteLen;
-        // Use Marshal.Copy into a short[] then convert to float[-1..1]
+        // Convert to float [-1..1] (no downmix change from your original)
-        // (You can also unsafe copy for speed if needed.)
+        for (int i = 0; i < samples; i++)
        EnsureShortOverlay(shorts, out short[] sBuf);
        Marshal.Copy(src, sBuf, 0, shorts);
        for (int i = 0; i < shorts; i++)
        {
            // 32768f avoids clipping at -32768
            _floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f);
        }
-        // Build a chunk for WhisperStream; with VAD off, IsVoiceDetected=true is fine.
+        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        // For now we keep your original behavior:
        var chunk = new AudioChunk
        {
-            Data = _floatTmp.AsSpan(0, shorts).ToArray(),
+            Data = _floatTmp.AsSpan(0, samples).ToArray(),
            Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
@@ -226,7 +328,6 @@ public class FMODWhisperBridge : MonoBehaviour
        _stream.AddToStream(chunk);
    }
    private short[] _shortOverlay;
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
@@ -242,13 +343,20 @@ public class FMODWhisperBridge : MonoBehaviour
    private void OnDisable()
    {
-        if (_streamStarted)
+        // Stop Whisper (if active)
-        {
+        DeactivateRecording();
            _stream.StopStream();
            _streamStarted = false;
        }
-        if (_playChannel.hasHandle()) { _playChannel.stop(); _playChannel.clearHandle(); }
+        // Stop/purge FMOD resources (since object is going away)
-        if (_recSound.hasHandle()) { _core.recordStop(recordDriverId); _recSound.release(); _recSound.clearHandle(); }
+        if (_playChannel.hasHandle())
        {
            try { _playChannel.stop(); } catch { /* ignore */ }
            _playChannel.clearHandle();
        }
        if (_recSound.hasHandle())
        {
            try { _core.recordStop(recordDriverId); } catch { /* ignore */ }
            try { _recSound.release(); } catch { /* ignore */ }
            _recSound.clearHandle();
        }
    }
 }
@@ -5,6 +5,8 @@ public class ReleasableButton : MonoBehaviour
 {
    public delegate void OnButtonPressedDelegate();
    public event OnButtonPressedDelegate OnButtonPressed;
    public delegate void OnButtonReleasedDelegate();
    public event OnButtonReleasedDelegate OnButtonReleased;
    public Transform movableParts;
    public float moveDuration = 0.25f;
@@ -37,7 +39,6 @@ public class ReleasableButton : MonoBehaviour
    {
        if (!isButtonDown && collider.gameObject.tag.EndsWith("Hand"))
        {
            Debug.Log("collided with: " + collider.gameObject.name);
            Activate();
            OnButtonPressed?.Invoke();
        }
@@ -47,8 +48,8 @@ public class ReleasableButton : MonoBehaviour
    {
        if (isButtonDown && collider.gameObject.tag.EndsWith("Hand"))
        {
            Debug.Log("collider exited: " + collider.gameObject.name);
            Deactivate();
            OnButtonReleased?.Invoke();
        }
    }
@@ -18,6 +18,7 @@ public class RadioTransmitter : XRGrabInteractable
    // Hooks this transmitter up to its radio button: press and release are each routed
    // to their own handler on this class.
    void Start()
    {
        radioButton.OnButtonPressed += OnRadioButtonPressed;
        radioButton.OnButtonReleased += OnRadioButtonReleased;
    }
    // Update is called once per frame
@@ -38,12 +39,20 @@ public class RadioTransmitter : XRGrabInteractable
        fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished;
        AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject);
        fmodWhisperBridge.ActivateRecording();
-        // TODO: deactivate when button is released
+    }
    // Handler for the radio button's release event: detaches the speech callbacks from the
    // bridge, plays the radio-button sound attached to this object, then deactivates recording.
    private void OnRadioButtonReleased()
    {
        // NOTE(review): the handlers are removed BEFORE DeactivateRecording() runs. If stopping
        // the stream emits a final segment, it would no longer reach OnPlayerSpeechFinished —
        // confirm against the stream's stop behavior.
        fmodWhisperBridge.OnWhisperSegmentUpdated -= OnPlayerSpeechUpdated;
        fmodWhisperBridge.OnWhisperSegmentFinished -= OnPlayerSpeechFinished;
        AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject);
        fmodWhisperBridge.DeactivateRecording();
    }
    // Callback for segment updates from the bridge: shows the latest text on the
    // computer screen and raises OnPlayerFinishedSpeaking.
    private void OnPlayerSpeechUpdated(string text)
    {
        computerScreen.text = text;
        // NOTE(review): this raises the "finished speaking" event on every update,
        // not only when a segment is finished — confirm this is intended.
        OnPlayerFinishedSpeaking?.Invoke();
    }
    private void OnPlayerSpeechFinished(string playerText)