using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper; // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils;
using Debug = UnityEngine.Debug; // AudioChunk
/// <summary>
/// Bridges an FMOD microphone capture to a Whisper speech-to-text stream.
/// The FMOD mic is initialized when the component starts (and again whenever it is re-enabled)
/// and records continuously into a looping ring buffer.
/// A Whisper stream is created when <see cref="ActivateRecording"/> is called and dropped on
/// <see cref="DeactivateRecording"/>.
/// Optional loopback is paused/resumed instead of started/stopped to avoid stalls.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    /// <summary>Size in bytes of one PCM16 sample.</summary>
    private const int BytesPerSample = sizeof(short);

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper; // assign in Inspector (falls back to a scene lookup in Awake)
    // NOTE(review): every chunk fed from this class is flagged IsVoiceDetected = true, so the
    // stream-side VAD presumably never sees silence — confirm against WhisperStream.
    [SerializeField] private bool useVadInStream = true; // copied to WhisperManager.useVad before each stream is created

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    // NOTE(review): the code prefers the rate reported by the record driver and only uses this
    // value when the driver reports none, which is the reverse of what the tooltip suggests.
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
    [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")]
    public bool loopbackOnlyWhenActive = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    /// <summary>Signature of <see cref="OnWhisperSegmentUpdated"/> handlers.</summary>
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);

    /// <summary>Raised with the stream's updated segment text when it passes the "meaningful speech" filter.</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    /// <summary>Signature of <see cref="OnWhisperSegmentFinished"/> handlers.</summary>
    public delegate void OnWhisperSegmentFinishedDelegate(string result);

    /// <summary>Raised with the stream's finished segment text when it passes the "meaningful speech" filter.</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength; // ring-buffer length, in PCM samples per channel
    private int _nativeRate;
    private int _nativeChannels;

    // Ring-buffer tracking: record position seen on the previous Update.
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // Reusable scratch buffer for the raw PCM16 samples copied out of FMOD.
    private short[] _shortOverlay;

    // Activation state.
    private bool _isRecordingActivated = false;
    private bool _activationInFlight;  // a CreateStream call is pending and has not been cancelled
    private int _activationTicket;     // bumped on every activate/deactivate to invalidate stale awaits
    private bool _hasStarted;          // true once Start has run; lets OnEnable re-initialize after a disable

    /// <summary>Sample rate actually used for capture: the driver's rate, or <see cref="desiredSampleRate"/> if the driver reports none.</summary>
    private int EffectiveSampleRate => (_nativeRate > 0) ? _nativeRate : desiredSampleRate;

    private void Awake()
    {
        if (!whisper)
        {
            whisper = FindObjectOfType<WhisperManager>();
        }
        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    private void Start()
    {
        _hasStarted = true;
        InitializeCapture();
    }

    private void OnEnable()
    {
        // Unity calls OnEnable before the first Start; the initial setup is left to Start.
        // On later re-enables OnDisable has already released the FMOD resources, so rebuild them.
        if (_hasStarted)
        {
            InitializeCapture();
        }
    }

    /// <summary>
    /// Creates the looping PCM16 user sound, starts recording into it and (optionally) starts
    /// the loopback channel. Does nothing if a capture sound already exists.
    /// </summary>
    private void InitializeCapture()
    {
        if (_recSound.hasHandle())
        {
            return;
        }

        // Query device info
        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out string deviceName, 256,
            out _,
            out _nativeRate,
            out _,
            out int speakerModeChannels,
            out _
        );
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        _nativeChannels = channels > 0 ? channels : speakerModeChannels;
        int rate = EffectiveSampleRate;
        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{deviceName}\" rate={rate} ch={_nativeChannels}");

        // Build user sound (ring buffer) — multiple seconds
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(rate * _nativeChannels * BytesPerSample * bufferLengthSec)
        };
        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            return;
        }

        // The wrap-around arithmetic in Update depends on this length, so refuse to run without it.
        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed or returned 0: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Start recording (looping)
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Initialize record position to avoid a huge first delta
        _core.getRecordPosition(recordDriverId, out _lastRecordPos);
        Debug.Log("[FMOD→Whisper] Recording started.");

        // Loopback channel (optional). Start once; pause when inactive if desired.
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
            res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
            if (res == RESULT.OK && _playChannel.hasHandle())
            {
                _playChannel.setMode(MODE._2D);
                _playChannel.setVolume(loopbackVolume);
                if (loopbackOnlyWhenActive)
                {
                    _playChannel.setPaused(true); // keep muted until Activate
                }
                Debug.Log("[FMOD→Whisper] Loopback playback ready.");
            }
            else
            {
                Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
            }
        }
        // No Whisper stream here. It will be created on ActivateRecording().
    }

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
    /// Fire-and-forget: failures are logged rather than thrown. Calls made while the stream is
    /// already active, or while a previous activation is still pending, are ignored. If
    /// <see cref="DeactivateRecording"/> runs (or the component is disabled) before the stream
    /// is ready, the pending activation is abandoned.
    /// </summary>
    public async void ActivateRecording()
    {
        if (_isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }
        if (_activationInFlight)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (activation already in progress).");
            return;
        }
        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }
        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found in the scene.");
            return;
        }

        int rate = EffectiveSampleRate;
        int ticket = ++_activationTicket;
        _activationInFlight = true;

        WhisperStream created = null;
        bool createThrew = false;
        try
        {
            // Set before CreateStream: the manager is assumed to capture its settings when the
            // stream is created, so assigning afterwards would only affect the next stream.
            whisper.useVad = useVadInStream;
            created = await whisper.CreateStream(rate, _nativeChannels);
        }
        catch (Exception e)
        {
            Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
            createThrew = true;
        }

        // DeactivateRecording/OnDisable bump the ticket; a mismatch means this request is stale.
        if (ticket != _activationTicket)
        {
            Debug.Log("[FMOD→Whisper] Activation cancelled before the Whisper stream was ready.");
            return;
        }
        _activationInFlight = false;

        if (created == null)
        {
            if (!createThrew)
            {
                Debug.LogError("[FMOD→Whisper] CreateStream returned null (is the Whisper model loaded?).");
            }
            _stream = null;
            _streamStarted = false;
            return;
        }
        _stream = created;

        // Wire events
        _stream.OnSegmentUpdated += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
            {
                OnWhisperSegmentUpdated?.Invoke(seg.Result);
            }
        };
        _stream.OnSegmentFinished += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
            {
                OnWhisperSegmentFinished?.Invoke(seg.Result);
            }
        };

        _stream.StartStream();
        _streamStarted = true;

        // Unpause loopback if it's meant to play only while active
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
        {
            _playChannel.setPaused(false);
        }

        // Pre-size the PCM scratch buffer for roughly 100ms of audio
        EnsureShortOverlay((rate / 10) * _nativeChannels);

        _isRecordingActivated = true;
        Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording).");
    }

    /// <summary>
    /// Stops and drops the Whisper stream only. FMOD keeps recording.
    /// Also cancels an activation that is still waiting for its stream.
    /// </summary>
    public void DeactivateRecording()
    {
        // Invalidate any ActivateRecording call that is still awaiting CreateStream.
        _activationTicket++;
        _activationInFlight = false;

        if (!_isRecordingActivated && !_streamStarted)
        {
            return;
        }
        _isRecordingActivated = false;

        // Pause loopback if it should only be active during recording
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
        {
            _playChannel.setPaused(true);
        }

        // Tear down Whisper stream (best effort: a failing StopStream must not block deactivation)
        if (_streamStarted)
        {
            try
            {
                _stream?.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed: {e.Message}");
            }
            _streamStarted = false;
        }
        _stream = null;
        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }

    /// <summary>
    /// Per-frame pump: reads whatever FMOD recorded since the previous frame out of the ring
    /// buffer and, while recording is activated, forwards it to the Whisper stream.
    /// </summary>
    private void Update()
    {
        // Always tick FMOD.
        // NOTE(review): RuntimeManager normally drives FMOD updates itself; confirm this extra
        // core update is intended.
        if (_core.handle != IntPtr.Zero)
        {
            _core.update();
        }
        if (!_recSound.hasHandle() || _soundPcmLength == 0)
        {
            return;
        }

        // Compute how many samples were recorded since last frame.
        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK)
        {
            // Position unknown this frame; keep the old cursor and retry next frame.
            return;
        }

        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos); // record cursor wrapped around the ring
        if (deltaSamples == 0)
        {
            return;
        }

        // If not active, we *still* advance the ring (so we don't backlog data),
        // but we *don't* push chunks to Whisper.
        bool shouldFeed = _isRecordingActivated && _streamStarted && _stream != null;

        // Calculate byte range to lock (16-bit)
        uint bytesPerFrame = (uint)_nativeChannels * BytesPerSample;
        uint bytesToRead = deltaSamples * bytesPerFrame;
        uint startBytes = _lastRecordPos * bytesPerFrame;

        var lockResult = _recSound.@lock(startBytes, bytesToRead,
            out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (lockResult != RESULT.OK)
        {
            // If lock fails, still advance last position to avoid spin
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            if (shouldFeed)
            {
                // The locked range comes back as up to two spans; feed both in order.
                if (len1 > 0)
                {
                    CopyPcm16ToFloatAndFeed(p1, len1);
                }
                if (len2 > 0)
                {
                    CopyPcm16ToFloatAndFeed(p2, len2);
                }
            }
            // else: just discard; we're only keeping the ring fresh.
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }
        _lastRecordPos = recPos;
    }

    /// <summary>
    /// Filters out empty text and the placeholder outputs "BLANK_AUDIO" and "[ Silence ]".
    /// </summary>
    /// <param name="userText">Segment text to test.</param>
    /// <returns><c>true</c> when the text should be forwarded to listeners.</returns>
    private static bool IsSpeechMeaningful(string userText)
    {
        return !string.IsNullOrEmpty(userText)
               && !userText.Contains("BLANK_AUDIO")
               && !userText.Trim().Equals("[ Silence ]");
    }

    /// <summary>
    /// Copies <paramref name="byteLen"/> bytes of interleaved PCM16 from native memory, converts
    /// them to floats and hands them to the active Whisper stream as one chunk.
    /// </summary>
    /// <param name="src">Pointer to the locked FMOD sample data.</param>
    /// <param name="byteLen">Number of bytes available at <paramref name="src"/>.</param>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        WhisperStream stream = _stream;
        int samples = (int)(byteLen / BytesPerSample);
        if (stream == null || src == IntPtr.Zero || samples <= 0)
        {
            return;
        }

        short[] pcm = EnsureShortOverlay(samples);
        Marshal.Copy(src, pcm, 0, samples);

        // Each chunk gets its own array because it is handed off to the stream.
        // short / 32768f already lies within [-1, 1), so no clamping is required.
        var data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = pcm[i] / 32768f;
        }

        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = EffectiveSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };
        stream.AddToStream(chunk);
    }

    /// <summary>
    /// Returns the shared PCM16 scratch buffer, growing it (to a power of two) if it holds
    /// fewer than <paramref name="samples"/> elements.
    /// </summary>
    private short[] EnsureShortOverlay(int samples)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
        {
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        }
        return _shortOverlay;
    }

    private void OnDisable()
    {
        // Stop Whisper (if active) and cancel any pending activation
        DeactivateRecording();

        // Stop/purge FMOD resources. InitializeCapture rebuilds them if the component is re-enabled.
        if (_playChannel.hasHandle())
        {
            try
            {
                _playChannel.stop();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Stopping loopback channel failed: {e.Message}");
            }
            _playChannel.clearHandle();
        }
        if (_recSound.hasHandle())
        {
            try
            {
                _core.recordStop(recordDriverId);
                _recSound.release();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Releasing FMOD recording failed: {e.Message}");
            }
            _recSound.clearHandle();
        }
        _lastRecordPos = 0;
    }
}