From 1a29f785b8f415746fb2f83f34983c91bf476318 Mon Sep 17 00:00:00 2001 From: henrisel Date: Sat, 21 Feb 2026 20:25:12 +0200 Subject: [PATCH] improve fmod whisper bridge and radio button using it --- .../_PROJECT/Scenes/DeltaBuilding_base.unity | 4 +- .../ModeGeneration/FMODWhisperBridge.cs | 270 ++++++++++++------ .../ModeGeneration/ReleasableButton.cs | 5 +- .../ShapeDetection/RadioTransmitter.cs | 11 +- 4 files changed, 204 insertions(+), 86 deletions(-) diff --git a/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity b/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity index 653ed15f..e59fbf7c 100644 --- a/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity +++ b/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0631fb43a46acd8f647502cd91a9d48c6df78c38b3f8e0a6727dad534486e4bf -size 68525490 +oid sha256:5f4fd0fdace577985445de3829614fe92721750ede29834c23c8c30d1a3f4b7d +size 68526229 diff --git a/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs b/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs index c86c8e32..c7ec4473 100644 --- a/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs +++ b/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs @@ -5,18 +5,19 @@ using UnityEngine; using FMOD; using FMODUnity; using Whisper; // WhisperManager, WhisperStream, WhisperResult -using Whisper.Utils; // AudioChunk +using Whisper.Utils; +using Debug = UnityEngine.Debug; // AudioChunk /// -/// Capture microphone with FMOD and feed chunks to WhisperStream (no Unity Microphone). -/// Also (optionally) plays the recorded sound back via FMOD loopback. +/// FMOD mic is initialized once (Start) and runs continuously in a ring buffer. +/// Whisper stream is created when ActivateRecording() is called and disposed on DeactivateRecording(). +/// Optional loopback can be paused/resumed instead of starting/stopping to avoid stalls. /// - public class FMODWhisperBridge : MonoBehaviour { [Header("Whisper")] [SerializeField] private WhisperManager whisper; // assign in Inspector - [SerializeField] private bool useVadInStream = false; // let WhisperStream do VAD or not + [SerializeField] private bool useVadInStream = true; // let WhisperStream do VAD or not [Header("FMOD capture")] [Tooltip("Recording device index (0 = default)")] @@ -29,6 +30,8 @@ public class FMODWhisperBridge : MonoBehaviour [Header("Loopback (monitor your voice)")] public bool playLoopback = true; + [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")] + public bool loopbackOnlyWhenActive = true; [Range(0f, 2f)] public float loopbackVolume = 1.0f; public delegate void OnWhisperSegmentUpdatedDelegate(string result); @@ -55,7 +58,9 @@ public class FMODWhisperBridge : MonoBehaviour // temp conversion buffer private float[] _floatTmp = new float[0]; + private short[] _shortOverlay; + // activation flag private bool isRecordingActivated = false; private void Awake() @@ -66,85 +71,180 @@ public class FMODWhisperBridge : MonoBehaviour private async void Start() { - // Query device info to get native rate/channels. - // (FMOD: getRecordDriverInfo gives you system rate & speaker mode) + // -------------- FMOD initialize ONCE -------------- + // Query device info string name; Guid guid; SPEAKERMODE sm; int smChannels; DRIVER_STATE driverState; - // signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState) - _core.getRecordDriverInfo(recordDriverId, out name, 256, out guid, out _nativeRate, out sm, out smChannels, out driverState); - _nativeChannels = channels > 0 ? channels : smChannels; - UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={_nativeRate} ch={_nativeChannels}"); - // Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL). + var res = _core.getRecordDriverInfo( + recordDriverId, + out name, 256, + out guid, + out _nativeRate, + out sm, + out smChannels, + out driverState + ); + + if (res != RESULT.OK) + { + Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}"); + return; + } + + _nativeChannels = channels > 0 ? channels : smChannels; + int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate; + Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={rate} ch={_nativeChannels}"); + + // Build user sound (ring buffer) — multiple seconds CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO { cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)), numchannels = _nativeChannels, - defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate, + defaultfrequency = rate, format = SOUND_FORMAT.PCM16, - length = (uint)(((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels) * sizeof(short)) // seconds=1 (we loop) + length = (uint)(rate * _nativeChannels * sizeof(short) * bufferLengthSec) }; - _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound); + res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound); + if (res != RESULT.OK) + { + Debug.LogError($"[FMOD→Whisper] createSound failed: {res}"); + return; + } + _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM); - // Start FMOD recording into that sound (looping ring buffer). - _core.recordStart(recordDriverId, _recSound, true); - UnityEngine.Debug.Log("[FMOD→Whisper] Recording started."); + // Start recording (looping) + res = _core.recordStart(recordDriverId, _recSound, true); + if (res != RESULT.OK) + { + Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}"); + _recSound.release(); + _recSound.clearHandle(); + return; + } - // Optional loopback playback using FMOD (plays same sound ring buffer). + // Initialize record position to avoid a huge first delta + _core.getRecordPosition(recordDriverId, out _lastRecordPos); + Debug.Log("[FMOD→Whisper] Recording started."); + + // Loopback channel (optional). Start once; pause when inactive if desired. _core.getMasterChannelGroup(out _masterGroup); if (playLoopback) { - _core.playSound(_recSound, _masterGroup, false, out _playChannel); - _playChannel.setMode(MODE._2D); - _playChannel.setVolume(loopbackVolume); - UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started."); + res = _core.playSound(_recSound, _masterGroup, false, out _playChannel); + if (res == RESULT.OK && _playChannel.hasHandle()) + { + _playChannel.setMode(MODE._2D); + _playChannel.setVolume(loopbackVolume); + if (loopbackOnlyWhenActive) _playChannel.setPaused(true); // keep muted until Activate + Debug.Log("[FMOD→Whisper] Loopback playback ready."); + } + else + { + Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}"); + } } - // Create Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels). - // We'll push AudioChunk manually. - // NOTE: WhisperStream’s sliding window is governed by manager’s stepSec/keepSec/lengthSec. - _stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels); - _stream.OnResultUpdated += (txt) => + // No Whisper stream here. It will be created on ActivateRecording(). + await System.Threading.Tasks.Task.Yield(); + } + + /// + /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording). + /// + public async void ActivateRecording() + { + if (isRecordingActivated) { - //OnWhisperResultProcessed?.Invoke(txt); - }; + Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active)."); + return; + } + + if (!_recSound.hasHandle()) + { + Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running."); + return; + } + + int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate; + + try + { + _stream = await whisper.CreateStream(rate, _nativeChannels); + } + catch (Exception e) + { + Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}"); + _stream = null; + _streamStarted = false; + return; + } + + // Wire events _stream.OnSegmentUpdated += (seg) => { if (IsSpeechMeaningful(seg.Result)) - { OnWhisperSegmentUpdated?.Invoke(seg.Result); - } }; _stream.OnSegmentFinished += (seg) => { if (IsSpeechMeaningful(seg.Result)) - { OnWhisperSegmentFinished?.Invoke(seg.Result); - } }; - // If you want Whisper to respect VAD, enable in manager or set useVad (manager controls stream params). whisper.useVad = useVadInStream; _stream.StartStream(); _streamStarted = true; - // prepare temp arrays roughly 100ms of audio - EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels); + // Unpause loopback if it's meant to play only while active + if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle()) + _playChannel.setPaused(false); + + // Prepare temp arrays roughly 100ms of audio + EnsureTmpCapacity((rate / 10) * _nativeChannels); + + isRecordingActivated = true; + Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording)."); + } + + /// + /// Stops and disposes the Whisper stream only. FMOD keeps recording. + /// + public void DeactivateRecording() + { + if (!isRecordingActivated && !_streamStarted) + return; + + isRecordingActivated = false; + + // Pause loopback if it should only be active during recording + if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle()) + _playChannel.setPaused(true); + + // Tear down Whisper stream + if (_streamStarted) + { + try { _stream.StopStream(); } catch { /* ignore */ } + _streamStarted = false; + } + _stream = null; + + Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording)."); } private void Update() { - if (!isRecordingActivated) return; + // Always tick FMOD if (_core.handle != IntPtr.Zero) _core.update(); - if (!_streamStarted || !_recSound.hasHandle()) return; + if (!_recSound.hasHandle()) return; - // How many samples recorded since last frame? + // Compute how many samples recorded since last frame. uint recPos; _core.getRecordPosition(recordDriverId, out recPos); @@ -152,23 +252,39 @@ public class FMODWhisperBridge : MonoBehaviour ? (recPos - _lastRecordPos) : (recPos + _soundPcmLength - _lastRecordPos); - if (deltaSamples == 0) return; + if (deltaSamples == 0) + { + // Even if 0, keep last pos + _lastRecordPos = recPos; + return; + } - // We’ll read that region (16-bit) and convert to float[] [-1..1]. - // Calculate byte range to lock in sound buffer - uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 16-bit = 2 bytes + // If not active, we *still* advance the ring (so we don't backlog data), + // but we *don't* push chunks to Whisper. + bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null; + + // Calculate byte range to lock (16-bit) + uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 2 bytes per sample uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2; IntPtr p1, p2; uint len1, len2; - // Lock can wrap — FMOD splits into p1/p2. - _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2); + var r = _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2); + if (r != RESULT.OK) + { + // If lock fails, still advance last position to avoid spin + _lastRecordPos = recPos; + return; + } try { - // Convert both parts to float and push to Whisper - if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1); - if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2); + if (shouldFeed) + { + if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1); + if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2); + } + // else: just discard; we’re only keeping the ring fresh. } finally { @@ -178,19 +294,11 @@ public class FMODWhisperBridge : MonoBehaviour _lastRecordPos = recPos; } - public void ActivateRecording() - { - isRecordingActivated = true; - } - - public void DeactivateRecording() - { - isRecordingActivated = false; - } - private bool IsSpeechMeaningful(string userText) { - return !string.IsNullOrEmpty(userText) && !userText.Contains("BLANK_AUDIO") && !userText.Trim().Equals("[ Silence ]"); + return !string.IsNullOrEmpty(userText) + && !userText.Contains("BLANK_AUDIO") + && !userText.Trim().Equals("[ Silence ]"); } private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen) @@ -198,26 +306,20 @@ public class FMODWhisperBridge : MonoBehaviour int samples = (int)(byteLen / 2); // 2 bytes per sample EnsureTmpCapacity(samples); - // Marshal the 16-bit PCM into managed space - // We pin a short[] overlay to avoid copying twice - int shorts = samples; - int byteCount = (int)byteLen; + EnsureShortOverlay(samples, out short[] sBuf); + Marshal.Copy(src, sBuf, 0, samples); - // Use Marshal.Copy into a short[] then convert to float[-1..1] - // (You can also unsafe copy for speed if needed.) - EnsureShortOverlay(shorts, out short[] sBuf); - Marshal.Copy(src, sBuf, 0, shorts); - - for (int i = 0; i < shorts; i++) + // Convert to float [-1..1] (no downmix change from your original) + for (int i = 0; i < samples; i++) { - // 32768f avoids clipping at -32768 _floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f); } - // Build a chunk for WhisperStream; with VAD off, IsVoiceDetected=true is fine. + // TODO (optional): downmix to mono and/or run a light gate before feeding. + // For now we keep your original behavior: var chunk = new AudioChunk { - Data = _floatTmp.AsSpan(0, shorts).ToArray(), + Data = _floatTmp.AsSpan(0, samples).ToArray(), Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate, Channels = _nativeChannels, IsVoiceDetected = true @@ -226,7 +328,6 @@ public class FMODWhisperBridge : MonoBehaviour _stream.AddToStream(chunk); } - private short[] _shortOverlay; private void EnsureShortOverlay(int samples, out short[] buf) { if (_shortOverlay == null || _shortOverlay.Length < samples) @@ -242,13 +343,20 @@ public class FMODWhisperBridge : MonoBehaviour private void OnDisable() { - if (_streamStarted) - { - _stream.StopStream(); - _streamStarted = false; - } + // Stop Whisper (if active) + DeactivateRecording(); - if (_playChannel.hasHandle()) { _playChannel.stop(); _playChannel.clearHandle(); } - if (_recSound.hasHandle()) { _core.recordStop(recordDriverId); _recSound.release(); _recSound.clearHandle(); } + // Stop/purge FMOD resources (since object is going away) + if (_playChannel.hasHandle()) + { + try { _playChannel.stop(); } catch { /* ignore */ } + _playChannel.clearHandle(); + } + if (_recSound.hasHandle()) + { + try { _core.recordStop(recordDriverId); } catch { /* ignore */ } + try { _recSound.release(); } catch { /* ignore */ } + _recSound.clearHandle(); + } } } diff --git a/Assets/_PROJECT/Scripts/ModeGeneration/ReleasableButton.cs b/Assets/_PROJECT/Scripts/ModeGeneration/ReleasableButton.cs index 832ee6ac..678eb5d9 100644 --- a/Assets/_PROJECT/Scripts/ModeGeneration/ReleasableButton.cs +++ b/Assets/_PROJECT/Scripts/ModeGeneration/ReleasableButton.cs @@ -5,6 +5,8 @@ public class ReleasableButton : MonoBehaviour { public delegate void OnButtonPressedDelegate(); public event OnButtonPressedDelegate OnButtonPressed; + public delegate void OnButtonReleasedDelegate(); + public event OnButtonReleasedDelegate OnButtonReleased; public Transform movableParts; public float moveDuration = 0.25f; @@ -37,7 +39,6 @@ public class ReleasableButton : MonoBehaviour { if (!isButtonDown && collider.gameObject.tag.EndsWith("Hand")) { - Debug.Log("collided with: " + collider.gameObject.name); Activate(); OnButtonPressed?.Invoke(); } @@ -47,8 +48,8 @@ public class ReleasableButton : MonoBehaviour { if (isButtonDown && collider.gameObject.tag.EndsWith("Hand")) { - Debug.Log("collider exited: " + collider.gameObject.name); Deactivate(); + OnButtonReleased?.Invoke(); } } diff --git a/Assets/_PROJECT/Scripts/ModeGeneration/ShapeDetection/RadioTransmitter.cs b/Assets/_PROJECT/Scripts/ModeGeneration/ShapeDetection/RadioTransmitter.cs index 6c647a3a..8e06528f 100644 --- a/Assets/_PROJECT/Scripts/ModeGeneration/ShapeDetection/RadioTransmitter.cs +++ b/Assets/_PROJECT/Scripts/ModeGeneration/ShapeDetection/RadioTransmitter.cs @@ -18,6 +18,7 @@ public class RadioTransmitter : XRGrabInteractable void Start() { radioButton.OnButtonPressed += OnRadioButtonPressed; + radioButton.OnButtonReleased += OnRadioButtonReleased; } // Update is called once per frame @@ -38,12 +39,20 @@ public class RadioTransmitter : XRGrabInteractable fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished; AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject); fmodWhisperBridge.ActivateRecording(); - // TODO: deactivate when button is released + } + + private void OnRadioButtonReleased() + { + fmodWhisperBridge.OnWhisperSegmentUpdated -= OnPlayerSpeechUpdated; + fmodWhisperBridge.OnWhisperSegmentFinished -= OnPlayerSpeechFinished; + AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject); + fmodWhisperBridge.DeactivateRecording(); } private void OnPlayerSpeechUpdated(string text) { computerScreen.text = text; + OnPlayerFinishedSpeaking?.Invoke(); } private void OnPlayerSpeechFinished(string playerText)