using System; using System.Runtime.InteropServices; using UnityEngine; using FMOD; using FMODUnity; using Whisper; // WhisperManager, WhisperStream, WhisperResult using Whisper.Utils; using Debug = UnityEngine.Debug; // AudioChunk ///

/// <summary>
/// FMOD mic is initialized once (Start) and runs continuously in a ring buffer.
/// Whisper stream is created when ActivateRecording() is called and disposed on DeactivateRecording().
/// Optional loopback can be paused/resumed instead of starting/stopping to avoid stalls.
/// </summary>

public class FMODWhisperBridge : MonoBehaviour
{
    // The capture sound is created as PCM16, so every sample is one short.
    private const int BytesPerSample = sizeof(short);

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper; // assign in Inspector
    [SerializeField] private bool useVadInStream = true; // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;

    [Tooltip("Fallback sample rate, used only when the device does not report its own rate (48000 suits Quest)")]
    public int desiredSampleRate = 48000;

    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;

    [Range(1, 10)]
    public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;

    [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")]
    public bool loopbackOnlyWhenActive = true;

    [Range(0f, 2f)]
    public float loopbackVolume = 1.0f;

    /// <summary>Signature of the handler for in-progress segment text.</summary>
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);

    /// <summary>Raised when the Whisper stream updates a segment whose text passes the blank/silence filter.</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    /// <summary>Signature of the handler for finished segment text.</summary>
    public delegate void OnWhisperSegmentFinishedDelegate(string result);

    /// <summary>Raised when the Whisper stream finishes a segment whose text passes the blank/silence filter.</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength; // in samples
    private int _nativeRate;
    private int _nativeChannels;

    // Values frozen at initialization so later Inspector edits cannot drift away
    // from what FMOD was actually started with.
    private int _captureRate;
    private int _activeDriverId;

    // ring-buffer tracking
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // Reusable scratch buffer for the PCM16 samples copied out of FMOD.
    private short[] _shortOverlay;

    // activation state
    private bool _isRecordingActivated = false;
    private bool _isActivating = false; // true while CreateStream is being awaited
    private int _activationId = 0;      // bumped to invalidate an in-flight activation
    private bool _hasStarted = false;   // true once Start has run

    private void Awake()
    {
        if (!whisper)
        {
            whisper = FindObjectOfType<WhisperManager>();
        }

        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    private void Start()
    {
        _hasStarted = true;
        InitializeCapture();
        // No Whisper stream here. It is created on ActivateRecording().
    }

    private void OnEnable()
    {
        // OnDisable releases the FMOD resources; bring them back when the component is
        // re-enabled. The very first initialization is still left to Start.
        if (_hasStarted && !_recSound.hasHandle())
        {
            InitializeCapture();
        }
    }

    /// <summary>
    /// Creates the looping FMOD record sound, starts recording into it and, when enabled,
    /// prepares the loopback channel. Does nothing if recording is already set up.
    /// </summary>
    private void InitializeCapture()
    {
        if (_recSound.hasHandle())
        {
            return;
        }

        _activeDriverId = recordDriverId;

        // Query device info
        RESULT res = _core.getRecordDriverInfo(
            _activeDriverId,
            out string deviceName,
            256,
            out Guid deviceGuid,
            out _nativeRate,
            out SPEAKERMODE speakerMode,
            out int speakerModeChannels,
            out DRIVER_STATE driverState);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        _nativeChannels = Mathf.Max(1, channels > 0 ? channels : speakerModeChannels);
        _captureRate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
        if (_captureRate <= 0)
        {
            Debug.LogError($"[FMOD→Whisper] Invalid sample rate: {_captureRate}");
            return;
        }

        Debug.Log($"[FMOD→Whisper] Using input device #{_activeDriverId}: \"{deviceName}\" rate={_captureRate} ch={_nativeChannels}");

        // Build user sound (ring buffer) — multiple seconds. 'length' is in bytes.
        int seconds = Mathf.Max(1, bufferLengthSec);
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = _captureRate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(_captureRate * _nativeChannels * BytesPerSample * seconds)
        };

        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            _recSound.clearHandle();
            return;
        }

        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Start recording (looping)
        res = _core.recordStart(_activeDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Initialize record position to avoid a huge first delta
        if (_core.getRecordPosition(_activeDriverId, out _lastRecordPos) != RESULT.OK)
        {
            _lastRecordPos = 0;
        }

        Debug.Log("[FMOD→Whisper] Recording started.");

        // Loopback channel (optional). Start once; pause when inactive if desired.
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
            // Start paused so nothing is audible before mode/volume are applied.
            res = _core.playSound(_recSound, _masterGroup, true, out _playChannel);
            if (res == RESULT.OK && _playChannel.hasHandle())
            {
                _playChannel.setMode(MODE._2D);
                _playChannel.setVolume(loopbackVolume);
                _playChannel.setPaused(loopbackOnlyWhenActive); // keep muted until Activate
                Debug.Log("[FMOD→Whisper] Loopback playback ready.");
            }
            else
            {
                Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
            }
        }
    }

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
    /// Calls made while already active, or while a previous activation is still pending,
    /// are ignored. A DeactivateRecording() issued while the stream is being created
    /// cancels the activation.
    /// </summary>
    public async void ActivateRecording()
    {
        if (_isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }

        if (_isActivating)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (activation already in progress).");
            return;
        }

        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }

        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found in the scene.");
            return;
        }

        _isActivating = true;
        int requestId = ++_activationId;

        // async void: nothing can observe an exception that escapes, so handle all of them here.
        try
        {
            // Set before creating the stream so the new stream is created with this value.
            whisper.useVad = useVadInStream;

            WhisperStream created = await whisper.CreateStream(_captureRate, _nativeChannels);

            // The world may have changed while awaiting: deactivated, disabled or torn down.
            if (requestId != _activationId || !_recSound.hasHandle())
            {
                Debug.Log("[FMOD→Whisper] Activation cancelled before the stream was ready.");
                return;
            }

            if (created == null)
            {
                Debug.LogError("[FMOD→Whisper] CreateStream returned no stream.");
                return;
            }

            // Wire events
            created.OnSegmentUpdated += (seg) =>
            {
                if (IsSpeechMeaningful(seg.Result))
                {
                    OnWhisperSegmentUpdated?.Invoke(seg.Result);
                }
            };
            created.OnSegmentFinished += (seg) =>
            {
                if (IsSpeechMeaningful(seg.Result))
                {
                    OnWhisperSegmentFinished?.Invoke(seg.Result);
                }
            };

            created.StartStream();
            _stream = created;
            _streamStarted = true;

            // Unpause loopback if it's meant to play only while active
            if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
            {
                _playChannel.setPaused(false);
            }

            _isRecordingActivated = true;
            Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording).");
        }
        catch (Exception e)
        {
            Debug.LogError($"[FMOD→Whisper] Stream activation failed: {e}");
            if (requestId == _activationId)
            {
                _stream = null;
                _streamStarted = false;
                _isRecordingActivated = false;
            }
        }
        finally
        {
            // Only clear the flag if no newer request has taken over in the meantime.
            if (requestId == _activationId)
            {
                _isActivating = false;
            }
        }
    }

    /// <summary>
    /// Stops and disposes the Whisper stream only. FMOD keeps recording.
    /// Also cancels an activation that is still waiting for its stream.
    /// </summary>
    public void DeactivateRecording()
    {
        if (_isActivating)
        {
            // Invalidate the pending ActivateRecording continuation.
            _activationId++;
            _isActivating = false;
        }

        if (!_isRecordingActivated && !_streamStarted)
        {
            return;
        }

        _isRecordingActivated = false;

        // Pause loopback if it should only be active during recording
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
        {
            _playChannel.setPaused(true);
        }

        // Tear down Whisper stream
        if (_streamStarted)
        {
            try
            {
                _stream?.StopStream();
            }
            catch (Exception e)
            {
                // Best effort: a failing stop must not prevent the rest of the teardown.
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed: {e.Message}");
            }

            _streamStarted = false;
        }

        _stream = null;
        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }

    private void Update()
    {
        // Always tick FMOD.
        // NOTE(review): RuntimeManager may already update the core system each frame — confirm this extra tick is wanted.
        if (_core.handle != IntPtr.Zero)
        {
            _core.update();
        }

        if (!_recSound.hasHandle())
        {
            return;
        }

        // Compute how many samples were recorded since last frame.
        if (_core.getRecordPosition(_activeDriverId, out uint recPos) != RESULT.OK)
        {
            // Position unknown this frame; try again next frame without moving the cursor.
            return;
        }

        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos); // write cursor wrapped around the ring

        if (deltaSamples == 0)
        {
            return;
        }

        // If not active, we *still* advance the ring (so we don't backlog data),
        // but we *don't* push chunks to Whisper. Capture the reference once so a
        // deactivation triggered from a callback cannot null it between segments.
        WhisperStream target = (_isRecordingActivated && _streamStarted) ? _stream : null;

        // Byte range to lock (16-bit samples, interleaved channels)
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * BytesPerSample;
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * BytesPerSample;

        RESULT lockResult = _recSound.@lock(startBytes, bytesToRead, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (lockResult != RESULT.OK)
        {
            // If lock fails, still advance last position to avoid spin
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            if (target != null)
            {
                // The locked range comes back as up to two segments.
                CopyPcm16ToFloatAndFeed(target, p1, len1);
                CopyPcm16ToFloatAndFeed(target, p2, len2);
            }
            // else: just discard; we're only keeping the ring fresh.
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _lastRecordPos = recPos;
    }

    /// <summary>
    /// Returns true when the text is non-empty, does not contain "BLANK_AUDIO"
    /// and is not the "[ Silence ]" marker.
    /// </summary>
    private static bool IsSpeechMeaningful(string userText)
    {
        return !string.IsNullOrEmpty(userText)
               && !userText.Contains("BLANK_AUDIO")
               && !userText.Trim().Equals("[ Silence ]");
    }

    /// <summary>
    /// Copies PCM16 samples from native memory, converts them to floats in [-1, 1)
    /// and appends them to the given Whisper stream as one chunk.
    /// </summary>
    /// <param name="stream">Stream that receives the chunk.</param>
    /// <param name="src">Pointer to the first PCM16 sample.</param>
    /// <param name="byteLen">Number of bytes available at <paramref name="src"/>.</param>
    private void CopyPcm16ToFloatAndFeed(WhisperStream stream, IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (src == IntPtr.Zero || samples <= 0)
        {
            return;
        }

        if (_shortOverlay == null || _shortOverlay.Length < samples)
        {
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        }

        Marshal.Copy(src, _shortOverlay, 0, samples);

        // Each chunk gets its own array because it is handed over to the stream.
        // short / 32768 is always within [-1, 1), so no clamping is required.
        float[] data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = _shortOverlay[i] / 32768f;
        }

        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = _captureRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };
        stream.AddToStream(chunk);
    }

    private void OnDisable()
    {
        // Stop Whisper (if active, or cancel a pending activation)
        DeactivateRecording();

        // Stop/purge FMOD resources; OnEnable re-creates them if the component comes back.
        ReleaseCapture();
    }

    /// <summary>
    /// Stops loopback playback and recording and releases the record sound. Best effort:
    /// failures are logged and never thrown.
    /// </summary>
    private void ReleaseCapture()
    {
        try
        {
            if (_playChannel.hasHandle())
            {
                _playChannel.stop();
                _playChannel.clearHandle();
            }

            if (_recSound.hasHandle())
            {
                RESULT res = _core.recordStop(_activeDriverId);
                if (res != RESULT.OK)
                {
                    Debug.LogWarning($"[FMOD→Whisper] recordStop failed: {res}");
                }

                res = _recSound.release();
                if (res != RESULT.OK)
                {
                    Debug.LogWarning($"[FMOD→Whisper] Sound release failed: {res}");
                }

                _recSound.clearHandle();
            }
        }
        catch (Exception e)
        {
            Debug.LogWarning($"[FMOD→Whisper] FMOD teardown failed: {e.Message}");
            _playChannel.clearHandle();
            _recSound.clearHandle();
        }
    }
}