using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper;        // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils;  // AudioChunk

/// <summary>
/// Capture microphone with FMOD and feed chunks to WhisperStream (no Unity Microphone).
/// Also (optionally) plays the recorded sound back via FMOD loopback.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper;       // assign in Inspector
    [SerializeField] private bool useVadInStream = false;  // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;

    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;

    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;

    [Range(1, 10)]
    public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;

    [Range(0f, 2f)]
    public float loopbackVolume = 1.0f;

    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    public delegate void OnWhisperSegmentFinishedDelegate(string result);
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength;   // in samples
    private int _nativeRate;
    private int _nativeChannels;

    // ring-buffer tracking
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // temp conversion buffer
    private float[] _floatTmp = new float[0];

    private bool isRecordingActivated = false;

    private void Awake()
    {
        if (!whisper)
            whisper = FindObjectOfType<WhisperManager>();

        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    private async void Start()
    {
        // Query device info to get the native rate/channels.
        // (FMOD: getRecordDriverInfo gives you the system rate & speaker mode.)
        string name;
        Guid guid;
        SPEAKERMODE sm;
        int smChannels;
        DRIVER_STATE driverState;

        // signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState)
        _core.getRecordDriverInfo(recordDriverId, out name, 256, out guid,
            out _nativeRate, out sm, out smChannels, out driverState);

        _nativeChannels = channels > 0 ? channels : smChannels;

        UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={_nativeRate} ch={_nativeChannels}");

        // Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL).
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            format = SOUND_FORMAT.PCM16,
            // bufferLengthSec seconds of 16-bit PCM (we loop over this ring buffer)
            length = (uint)((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels * sizeof(short) * bufferLengthSec)
        };

        _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);

        // Start FMOD recording into that sound (looping ring buffer).
        _core.recordStart(recordDriverId, _recSound, true);
        UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");

        // Optional loopback playback using FMOD (plays the same sound ring buffer).
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
            _core.playSound(_recSound, _masterGroup, false, out _playChannel);
            _playChannel.setMode(MODE._2D);
            _playChannel.setVolume(loopbackVolume);
            UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
        }

        // Create a Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels);
        // we push AudioChunk instances manually.
        // NOTE: WhisperStream's sliding window is governed by the manager's stepSec/keepSec/lengthSec.
        _stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels);

        _stream.OnResultUpdated += (txt) =>
        {
            //OnWhisperResultProcessed?.Invoke(txt);
            //UnityEngine.Debug.Log($"[Whisper] result updated: {txt}");
        };
        _stream.OnSegmentUpdated += (seg) =>
        {
            OnWhisperSegmentUpdated?.Invoke(seg.Result);
            //UnityEngine.Debug.Log($"[Whisper] Seg updated: {seg.Result}");
        };
        _stream.OnSegmentFinished += (seg) =>
        {
            OnWhisperSegmentFinished?.Invoke(seg.Result);
            //UnityEngine.Debug.Log($"[Whisper] Seg finished: {seg.Result}");
        };

        // If you want Whisper to respect VAD, enable it on the manager (the manager controls stream params).
        whisper.useVad = useVadInStream;

        _stream.StartStream();
        _streamStarted = true;

        // Pre-warm the temp buffers with roughly 100 ms of audio.
        EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels);
    }

    private void Update()
    {
        if (!isRecordingActivated)
            return;

        if (_core.handle != IntPtr.Zero)
            _core.update();

        if (!_streamStarted || !_recSound.hasHandle())
            return;

        // How many samples have been recorded since last frame?
        uint recPos;
        _core.getRecordPosition(recordDriverId, out recPos);

        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);

        if (deltaSamples == 0)
            return;

        // Read that region (16-bit PCM) and convert it to float [-1..1].
        // Calculate the byte range to lock in the sound buffer.
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2;  // 16-bit = 2 bytes
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;

        IntPtr p1, p2;
        uint len1, len2;

        // The lock can wrap around the ring buffer; FMOD splits it into p1/p2.
        _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
        try
        {
            // Convert both parts to float and push them to Whisper.
            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _lastRecordPos = recPos;
    }

    public void ActivateRecording()
    {
        // Re-sync to the current record cursor so stale ring-buffer audio
        // captured while deactivated is not fed to Whisper.
        if (_recSound.hasHandle())
            _core.getRecordPosition(recordDriverId, out _lastRecordPos);

        isRecordingActivated = true;
    }

    public void DeactivateRecording()
    {
        isRecordingActivated = false;
    }

    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / 2); // 2 bytes per 16-bit sample
        EnsureTmpCapacity(samples);

        // Marshal the 16-bit PCM into a reusable managed short[] and convert to float [-1..1].
        // (You can also use an unsafe copy for speed if needed.)
        EnsureShortOverlay(samples, out short[] sBuf);
        Marshal.Copy(src, sBuf, 0, samples);

        for (int i = 0; i < samples; i++)
        {
            // Dividing by 32768f avoids clipping at -32768.
            _floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f);
        }

        // Build a chunk for WhisperStream; with VAD off, IsVoiceDetected = true is fine.
        var chunk = new AudioChunk
        {
            Data = _floatTmp.AsSpan(0, samples).ToArray(),
            Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };
        _stream.AddToStream(chunk);
    }

    private short[] _shortOverlay;

    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        buf = _shortOverlay;
    }

    private void EnsureTmpCapacity(int samples)
    {
        if (_floatTmp == null || _floatTmp.Length < samples)
            _floatTmp = new float[Mathf.NextPowerOfTwo(samples)];
    }

    private void OnDisable()
    {
        if (_streamStarted)
        {
            _stream.StopStream();
            _streamStarted = false;
        }

        if (_playChannel.hasHandle())
        {
            _playChannel.stop();
            _playChannel.clearHandle();
        }

        if (_recSound.hasHandle())
        {
            _core.recordStop(recordDriverId);
            _recSound.release();
            _recSound.clearHandle();
        }
    }
}
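
// --- Example usage (sketch) -------------------------------------------------
// A minimal consumer component, assuming the bridge sits on the same GameObject
// (or is wired up in the Inspector). The class name, the Space-key push-to-talk
// binding, and the log format are illustrative only and not part of the bridge.
// To use it, move it into its own WhisperTranscriptLogger.cs file (with
// `using UnityEngine;`) so Unity can attach it.
public class WhisperTranscriptLogger : MonoBehaviour
{
    [SerializeField] private FMODWhisperBridge bridge; // assign in Inspector or leave empty to auto-find

    private void Awake()
    {
        if (!bridge)
            bridge = GetComponent<FMODWhisperBridge>();
    }

    private void OnEnable()
    {
        bridge.OnWhisperSegmentUpdated += HandleSegmentUpdated;
        bridge.OnWhisperSegmentFinished += HandleSegmentFinished;
    }

    private void OnDisable()
    {
        bridge.OnWhisperSegmentUpdated -= HandleSegmentUpdated;
        bridge.OnWhisperSegmentFinished -= HandleSegmentFinished;
    }

    private void Update()
    {
        // Push-to-talk: feed audio to Whisper only while Space is held.
        // Swap for your own input (e.g. a controller button on Quest).
        if (Input.GetKeyDown(KeyCode.Space)) bridge.ActivateRecording();
        if (Input.GetKeyUp(KeyCode.Space)) bridge.DeactivateRecording();
    }

    private void HandleSegmentUpdated(string text)
    {
        UnityEngine.Debug.Log($"[Whisper] partial segment: {text}");
    }

    private void HandleSegmentFinished(string text)
    {
        UnityEngine.Debug.Log($"[Whisper] final segment: {text}");
    }
}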