using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper;
using Whisper.Utils;
using Debug = UnityEngine.Debug;

/// <summary>
/// Bridges an FMOD microphone capture to a Whisper stream.
/// The FMOD mic is initialized once (in Start) and records continuously into a looping ring buffer.
/// A Whisper stream is created when <see cref="ActivateRecording"/> is called; after
/// <see cref="DeactivateRecording"/> the stream is kept and fed silence until it is replaced by the
/// next activation or stopped in OnDisable.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    /// <summary>Size of one PCM16 sample in bytes.</summary>
    private const int BytesPerSample = sizeof(short);

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper;      // assign in Inspector
    [SerializeField] private bool useVadInStream = true;  // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;

    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;

    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;

    [Range(1, 10)]
    public int bufferLengthSec = 5;

    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    public delegate void OnWhisperSegmentFinishedDelegate(string result);
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;   // never assigned in this file; only stopped in OnDisable if it has a handle
    private uint _soundPcmLength;   // ring-buffer length, in PCM samples per channel
    private int _nativeRate;        // rate reported by the record driver
    private int _nativeChannels;    // channel count used for the capture sound and the Whisper stream
    private int _captureRate;       // rate the capture sound was actually created with (fixed in Start)

    // Ring-buffer tracking: last record position already consumed by Update()
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;

    // Scratch buffer reused for native PCM16 -> managed copies
    private short[] _shortOverlay;

    // Activation state
    private bool _isRecordingActivated = false;
    private bool _isActivating = false;            // true while ActivateRecording awaits CreateStream
    private bool _cancelPendingActivation = false; // set when deactivation is requested mid-activation
    private bool _skipOneFeedFrame = false;

    // --- Speech volume measurement ---
    private float _currentVolumeRms = 0f;   // smoothed RMS, exposed through the volume getters
    private float _volumeSmoothing = 0.15f; // how fast the meter reacts (0.1-0.3 works well)

    private void Awake()
    {
        if (!whisper)
        {
            whisper = FindObjectOfType<WhisperManager>();
        }

        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    /// <summary>
    /// Initializes FMOD capture once: queries the record driver, creates the looping user sound used as
    /// ring buffer and starts recording into it. No Whisper stream is created here.
    /// </summary>
    private void Start()
    {
        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out string deviceName,
            256,
            out Guid _,
            out _nativeRate,
            out SPEAKERMODE _,
            out int speakerModeChannels,
            out DRIVER_STATE _);

        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        // Prefer the configured channel count; fall back to the driver's, never below 1.
        _nativeChannels = Mathf.Max(1, channels > 0 ? channels : speakerModeChannels);
        _captureRate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;

        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{deviceName}\" rate={_captureRate} ch={_nativeChannels}");

        // Build user sound (ring buffer) holding bufferLengthSec seconds of PCM16.
        var exInfo = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = _captureRate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(_captureRate * _nativeChannels * BytesPerSample * bufferLengthSec)
        };

        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref exInfo, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            _recSound.clearHandle();
            return;
        }

        // The wrap-around arithmetic in Update() depends on a valid, non-zero length.
        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed: {res}");
            ReleaseRecSound();
            return;
        }

        // Start recording (looping)
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            ReleaseRecSound();
            return;
        }

        // Initialize record position to avoid a huge first delta
        if (_core.getRecordPosition(recordDriverId, out _lastRecordPos) != RESULT.OK)
        {
            _lastRecordPos = 0;
        }

        Debug.Log("[FMOD→Whisper] Recording started.");
    }

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding it microphone audio (FMOD is already recording).
    /// Calls made while already active, or while a previous activation is still awaiting stream creation,
    /// are ignored.
    /// </summary>
    public async void ActivateRecording()
    {
        if (_isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }

        if (_isActivating)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (activation in progress).");
            return;
        }

        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }

        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found in scene.");
            return;
        }

        _isActivating = true;
        _cancelPendingActivation = false;

        try
        {
            // Retire the previous stream before creating its replacement, so Update()
            // does not keep feeding a stopped stream while we await.
            if (_stream != null)
            {
                _stream.StopStream();
                _stream = null;
            }

            // Set before CreateStream: presumably the manager snapshots its settings when the
            // stream is created, so assigning afterwards would not affect this stream.
            whisper.useVad = useVadInStream;

            WhisperStream stream;
            try
            {
                stream = await whisper.CreateStream(_captureRate, _nativeChannels);
            }
            catch (Exception e)
            {
                Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
                return;
            }

            if (stream == null)
            {
                // NOTE(review): CreateStream is expected to return null when the model is not loaded.
                Debug.LogError("[FMOD→Whisper] CreateStream returned no stream (model not loaded?).");
                return;
            }

            // State may have changed while awaiting: deactivation requested or FMOD torn down.
            if (_cancelPendingActivation || !_recSound.hasHandle())
            {
                Debug.Log("[FMOD→Whisper] Activation cancelled before stream start.");
                return;
            }

            // Wire events
            stream.OnSegmentUpdated += (seg) =>
            {
                string cleanedText = PostProcessInput(seg.Result);
                if (!string.IsNullOrEmpty(cleanedText))
                {
                    OnWhisperSegmentUpdated?.Invoke(cleanedText);
                }
            };
            stream.OnSegmentFinished += (seg) =>
            {
                string cleanedText = PostProcessInput(seg.Result);
                if (!string.IsNullOrEmpty(cleanedText))
                {
                    OnWhisperSegmentFinished?.Invoke(cleanedText);
                }
            };

            stream.StartStream();
            _stream = stream;

            // Reset our read pointer to the current write head
            _core.getRecordPosition(recordDriverId, out _lastRecordPos);

            // Feed silence for one frame so no stale audio leaks into the new stream
            _skipOneFeedFrame = true;
            _isRecordingActivated = true;

            Debug.Log("[FMOD→Whisper] Stream activated (buffer cleared; reading from current head).");
        }
        finally
        {
            _isActivating = false;
        }
    }

    /// <summary>
    /// Stops feeding microphone audio to Whisper. FMOD keeps recording, and the existing stream is
    /// neither stopped nor disposed here: Update() feeds it silence until the next
    /// <see cref="ActivateRecording"/> replaces it or OnDisable stops it.
    /// If an activation is still awaiting stream creation, that activation is cancelled.
    /// </summary>
    public void DeactivateRecording()
    {
        if (_isActivating)
        {
            _cancelPendingActivation = true;
        }

        if (!_isRecordingActivated)
        {
            return;
        }

        _isRecordingActivated = false;
    }

    /// <summary>
    /// Returns the current smoothed microphone level in dBFS (decibels relative to full scale).
    /// 0 dBFS = digital clipping; the RMS is floored at 1e-7 (-140 dBFS) to avoid log(0).
    /// </summary>
    public float GetCurrentVolumeDb()
    {
        const float minRms = 1e-7f;
        float rms = Mathf.Clamp(_currentVolumeRms, minRms, 1f);
        return 20f * Mathf.Log10(rms);
    }

    /// <summary>
    /// Returns a UI-friendly 0..1 loudness by mapping the current dBFS value from [-60, -15] to [0, 1].
    /// Adjust the dB range to your content/environment if needed.
    /// </summary>
    public float GetNormalizedVolume01()
    {
        float db = GetCurrentVolumeDb();
        return Mathf.Clamp01(Mathf.InverseLerp(-60f, -15f, db));
    }

    /// <summary>
    /// Per-frame pump: reads the samples recorded since the previous frame from the ring buffer,
    /// updates the smoothed RMS level and forwards the audio (or same-sized silence) to the stream.
    /// </summary>
    private void Update()
    {
        // Always tick FMOD
        if (_core.handle != IntPtr.Zero)
        {
            _core.update();
        }

        if (!_recSound.hasHandle())
        {
            return;
        }

        // Without a valid position we cannot compute a delta; retry next frame from the old position.
        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK)
        {
            return;
        }

        // Samples recorded since last frame, accounting for ring-buffer wrap-around.
        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);

        if (deltaSamples == 0)
        {
            _lastRecordPos = recPos;
            return;
        }

        // Byte range to lock (interleaved PCM16)
        uint frameBytes = (uint)_nativeChannels * BytesPerSample;
        uint bytesToRead = deltaSamples * frameBytes;
        uint startBytes = _lastRecordPos * frameBytes;

        var lockResult = _recSound.@lock(startBytes, bytesToRead, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (lockResult != RESULT.OK)
        {
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            UpdateVolume(p1, len1, p2, len2);
            FeedStream(p1, len1, p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _skipOneFeedFrame = false;
        _lastRecordPos = recPos;
    }

    /// <summary>Folds the RMS of the locked region(s) into the smoothed volume value.</summary>
    private void UpdateVolume(IntPtr p1, uint len1, IntPtr p2, uint len2)
    {
        float squareSum = 0f;
        int sampleCount = 0;

        if (len1 > 0)
        {
            ComputeRmsFromPcm16(p1, len1, ref squareSum, ref sampleCount);
        }
        if (len2 > 0)
        {
            ComputeRmsFromPcm16(p2, len2, ref squareSum, ref sampleCount);
        }

        if (sampleCount <= 0)
        {
            return;
        }

        float rms = Mathf.Sqrt(squareSum / sampleCount);

        // Frame-rate independent smoothing: _volumeSmoothing is the blend factor at 60 fps.
        float blend = 1f - Mathf.Pow(1f - _volumeSmoothing, Time.deltaTime * 60f);
        _currentVolumeRms = Mathf.Lerp(_currentVolumeRms, rms, blend);
    }

    /// <summary>
    /// Sends the locked region(s) to the Whisper stream: real microphone audio while recording is
    /// activated (and not on the first frame after activation), same-sized silence otherwise.
    /// </summary>
    private void FeedStream(IntPtr p1, uint len1, IntPtr p2, uint len2)
    {
        if (_stream == null)
        {
            return;
        }

        if (_isRecordingActivated && !_skipOneFeedFrame)
        {
            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
        }
        else
        {
            if (len1 > 0) FeedSilentChunk(len1);
            if (len2 > 0) FeedSilentChunk(len2);
        }
    }

    /// <summary>
    /// Strips known non-speech markers and square brackets from a transcription and trims it.
    /// Returns an empty string for null or empty input.
    /// </summary>
    private string PostProcessInput(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return string.Empty;
        }

        return input
            .Replace("[silence]", "")
            .Replace("[ Silence ]", "")
            .Replace("BLANK_AUDIO", "")
            .Replace("[", "")
            .Replace("]", "")
            .Replace("END PLAYBACK", "")
            .Trim();
    }

    /// <summary>Feeds a chunk of silence covering <paramref name="byteLen"/> bytes of PCM16.</summary>
    private void FeedSilentChunk(uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (samples <= 0)
        {
            return;
        }

        var chunk = new AudioChunk
        {
            Data = new float[samples], // freshly allocated arrays are already zero-filled
            Frequency = _captureRate,
            Channels = _nativeChannels,
            IsVoiceDetected = false
        };
        _stream.AddToStream(chunk);
    }

    /// <summary>
    /// Converts <paramref name="byteLen"/> bytes of native PCM16 at <paramref name="src"/> to floats
    /// and feeds them to the stream. Channel layout is passed through unchanged (no downmix).
    /// </summary>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (samples <= 0)
        {
            return;
        }

        short[] pcm = EnsureShortOverlay(samples);
        Marshal.Copy(src, pcm, 0, samples);

        // Each chunk gets its own array because the stream may hold on to it after this call.
        var data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            // short / 32768 always lies in [-1, 1), so no clamp is required.
            data[i] = pcm[i] / 32768f;
        }

        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = _captureRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };
        _stream.AddToStream(chunk);
    }

    /// <summary>Returns the shared PCM16 scratch buffer, growing it to hold at least <paramref name="samples"/>.</summary>
    private short[] EnsureShortOverlay(int samples)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
        {
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        }

        return _shortOverlay;
    }

    /// <summary>Releases the capture sound (best effort) and clears its handle.</summary>
    private void ReleaseRecSound()
    {
        try
        {
            _recSound.release();
        }
        catch (Exception e)
        {
            Debug.LogWarning($"[FMOD→Whisper] Sound release failed: {e.Message}");
        }

        _recSound.clearHandle();
    }

    /// <summary>
    /// Tears everything down: stops feeding, stops the Whisper stream and releases the FMOD resources.
    /// Capture is only set up in Start, so it is not restored if the component is re-enabled.
    /// </summary>
    private void OnDisable()
    {
        DeactivateRecording();

        // Stop the Whisper stream so it does not outlive this component.
        if (_stream != null)
        {
            try
            {
                _stream.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed: {e.Message}");
            }

            _stream = null;
        }

        // Stop/purge FMOD resources (best effort; the FMOD runtime may already be shutting down)
        if (_playChannel.hasHandle())
        {
            try
            {
                _playChannel.stop();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Channel stop failed: {e.Message}");
            }

            _playChannel.clearHandle();
        }

        if (_recSound.hasHandle())
        {
            try
            {
                _core.recordStop(recordDriverId);
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] recordStop failed: {e.Message}");
            }

            ReleaseRecSound();
        }
    }

    /// <summary>
    /// Accumulates the sum of squared, normalized PCM16 samples from a native block using only safe code.
    /// Uses the shared scratch buffer (no per-call allocation once it has grown).
    /// The caller derives the RMS as sqrt(accumulator / sampleCount).
    /// </summary>
    private void ComputeRmsFromPcm16(IntPtr src, uint byteLen, ref float accumulator, ref int sampleCount)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (samples <= 0)
        {
            return;
        }

        short[] pcm = EnsureShortOverlay(samples);
        Marshal.Copy(src, pcm, 0, samples);

        for (int i = 0; i < samples; i++)
        {
            float v = pcm[i] / 32768f; // normalize to [-1..1)
            accumulator += v * v;
        }

        sampleCount += samples;
    }
}