1
0
forked from cgvr/DeltaVR

improve fmod whisper bridge and radio button using it

This commit is contained in:
2026-02-21 20:25:12 +02:00
parent c968e6bed4
commit 1a29f785b8
4 changed files with 204 additions and 86 deletions

View File

@@ -5,18 +5,19 @@ using UnityEngine;
using FMOD; using FMOD;
using FMODUnity; using FMODUnity;
using Whisper; // WhisperManager, WhisperStream, WhisperResult using Whisper; // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils; // AudioChunk using Whisper.Utils;
using Debug = UnityEngine.Debug; // AudioChunk
/// <summary> /// <summary>
/// Capture microphone with FMOD and feed chunks to WhisperStream (no Unity Microphone). /// FMOD mic is initialized once (Start) and runs continuously in a ring buffer.
/// Also (optionally) plays the recorded sound back via FMOD loopback. /// Whisper stream is created when ActivateRecording() is called and disposed on DeactivateRecording().
/// Optional loopback can be paused/resumed instead of starting/stopping to avoid stalls.
/// </summary> /// </summary>
public class FMODWhisperBridge : MonoBehaviour public class FMODWhisperBridge : MonoBehaviour
{ {
[Header("Whisper")] [Header("Whisper")]
[SerializeField] private WhisperManager whisper; // assign in Inspector [SerializeField] private WhisperManager whisper; // assign in Inspector
[SerializeField] private bool useVadInStream = false; // let WhisperStream do VAD or not [SerializeField] private bool useVadInStream = true; // let WhisperStream do VAD or not
[Header("FMOD capture")] [Header("FMOD capture")]
[Tooltip("Recording device index (0 = default)")] [Tooltip("Recording device index (0 = default)")]
@@ -29,6 +30,8 @@ public class FMODWhisperBridge : MonoBehaviour
[Header("Loopback (monitor your voice)")] [Header("Loopback (monitor your voice)")]
public bool playLoopback = true; public bool playLoopback = true;
[Tooltip("If true, loopback plays only while active; otherwise its always on.")]
public bool loopbackOnlyWhenActive = true;
[Range(0f, 2f)] public float loopbackVolume = 1.0f; [Range(0f, 2f)] public float loopbackVolume = 1.0f;
public delegate void OnWhisperSegmentUpdatedDelegate(string result); public delegate void OnWhisperSegmentUpdatedDelegate(string result);
@@ -55,7 +58,9 @@ public class FMODWhisperBridge : MonoBehaviour
// temp conversion buffer // temp conversion buffer
private float[] _floatTmp = new float[0]; private float[] _floatTmp = new float[0];
private short[] _shortOverlay;
// activation flag
private bool isRecordingActivated = false; private bool isRecordingActivated = false;
private void Awake() private void Awake()
@@ -66,85 +71,180 @@ public class FMODWhisperBridge : MonoBehaviour
private async void Start() private async void Start()
{ {
// Query device info to get native rate/channels. // -------------- FMOD initialize ONCE --------------
// (FMOD: getRecordDriverInfo gives you system rate & speaker mode) // Query device info
string name; string name;
Guid guid; Guid guid;
SPEAKERMODE sm; SPEAKERMODE sm;
int smChannels; int smChannels;
DRIVER_STATE driverState; DRIVER_STATE driverState;
// signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState)
_core.getRecordDriverInfo(recordDriverId, out name, 256, out guid, out _nativeRate, out sm, out smChannels, out driverState);
_nativeChannels = channels > 0 ? channels : smChannels;
UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={_nativeRate} ch={_nativeChannels}");
// Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL). var res = _core.getRecordDriverInfo(
recordDriverId,
out name, 256,
out guid,
out _nativeRate,
out sm,
out smChannels,
out driverState
);
if (res != RESULT.OK)
{
Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
return;
}
_nativeChannels = channels > 0 ? channels : smChannels;
int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={rate} ch={_nativeChannels}");
// Build user sound (ring buffer) — multiple seconds
CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
{ {
cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)), cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
numchannels = _nativeChannels, numchannels = _nativeChannels,
defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate, defaultfrequency = rate,
format = SOUND_FORMAT.PCM16, format = SOUND_FORMAT.PCM16,
length = (uint)(((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels) * sizeof(short)) // seconds=1 (we loop) length = (uint)(rate * _nativeChannels * sizeof(short) * bufferLengthSec)
}; };
_core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound); res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
if (res != RESULT.OK)
{
Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
return;
}
_recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM); _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
// Start FMOD recording into that sound (looping ring buffer). // Start recording (looping)
_core.recordStart(recordDriverId, _recSound, true); res = _core.recordStart(recordDriverId, _recSound, true);
UnityEngine.Debug.Log("[FMOD→Whisper] Recording started."); if (res != RESULT.OK)
{
Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
_recSound.release();
_recSound.clearHandle();
return;
}
// Optional loopback playback using FMOD (plays same sound ring buffer). // Initialize record position to avoid a huge first delta
_core.getRecordPosition(recordDriverId, out _lastRecordPos);
Debug.Log("[FMOD→Whisper] Recording started.");
// Loopback channel (optional). Start once; pause when inactive if desired.
_core.getMasterChannelGroup(out _masterGroup); _core.getMasterChannelGroup(out _masterGroup);
if (playLoopback) if (playLoopback)
{ {
_core.playSound(_recSound, _masterGroup, false, out _playChannel); res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
_playChannel.setMode(MODE._2D); if (res == RESULT.OK && _playChannel.hasHandle())
_playChannel.setVolume(loopbackVolume); {
UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started."); _playChannel.setMode(MODE._2D);
_playChannel.setVolume(loopbackVolume);
if (loopbackOnlyWhenActive) _playChannel.setPaused(true); // keep muted until Activate
Debug.Log("[FMOD→Whisper] Loopback playback ready.");
}
else
{
Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
}
} }
// Create Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels). // No Whisper stream here. It will be created on ActivateRecording().
// We'll push AudioChunk manually. await System.Threading.Tasks.Task.Yield();
// NOTE: WhisperStream's sliding window is governed by manager's stepSec/keepSec/lengthSec. }
_stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels);
_stream.OnResultUpdated += (txt) => /// <summary>
/// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
/// </summary>
public async void ActivateRecording()
{
if (isRecordingActivated)
{ {
//OnWhisperResultProcessed?.Invoke(txt); Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
}; return;
}
if (!_recSound.hasHandle())
{
Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
return;
}
int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
try
{
_stream = await whisper.CreateStream(rate, _nativeChannels);
}
catch (Exception e)
{
Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
_stream = null;
_streamStarted = false;
return;
}
// Wire events
_stream.OnSegmentUpdated += (seg) => _stream.OnSegmentUpdated += (seg) =>
{ {
if (IsSpeechMeaningful(seg.Result)) if (IsSpeechMeaningful(seg.Result))
{
OnWhisperSegmentUpdated?.Invoke(seg.Result); OnWhisperSegmentUpdated?.Invoke(seg.Result);
}
}; };
_stream.OnSegmentFinished += (seg) => _stream.OnSegmentFinished += (seg) =>
{ {
if (IsSpeechMeaningful(seg.Result)) if (IsSpeechMeaningful(seg.Result))
{
OnWhisperSegmentFinished?.Invoke(seg.Result); OnWhisperSegmentFinished?.Invoke(seg.Result);
}
}; };
// If you want Whisper to respect VAD, enable in manager or set useVad (manager controls stream params).
whisper.useVad = useVadInStream; whisper.useVad = useVadInStream;
_stream.StartStream(); _stream.StartStream();
_streamStarted = true; _streamStarted = true;
// prepare temp arrays roughly 100ms of audio // Unpause loopback if it's meant to play only while active
EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels); if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
_playChannel.setPaused(false);
// Prepare temp arrays roughly 100ms of audio
EnsureTmpCapacity((rate / 10) * _nativeChannels);
isRecordingActivated = true;
Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording).");
}
/// <summary>
/// Stops and disposes the Whisper stream only. FMOD keeps recording.
/// </summary>
public void DeactivateRecording()
{
if (!isRecordingActivated && !_streamStarted)
return;
isRecordingActivated = false;
// Pause loopback if it should only be active during recording
if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
_playChannel.setPaused(true);
// Tear down Whisper stream
if (_streamStarted)
{
try { _stream.StopStream(); } catch { /* ignore */ }
_streamStarted = false;
}
_stream = null;
Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
} }
private void Update() private void Update()
{ {
if (!isRecordingActivated) return; // Always tick FMOD
if (_core.handle != IntPtr.Zero) _core.update(); if (_core.handle != IntPtr.Zero) _core.update();
if (!_streamStarted || !_recSound.hasHandle()) return; if (!_recSound.hasHandle()) return;
// How many samples recorded since last frame? // Compute how many samples recorded since last frame.
uint recPos; uint recPos;
_core.getRecordPosition(recordDriverId, out recPos); _core.getRecordPosition(recordDriverId, out recPos);
@@ -152,23 +252,39 @@ public class FMODWhisperBridge : MonoBehaviour
? (recPos - _lastRecordPos) ? (recPos - _lastRecordPos)
: (recPos + _soundPcmLength - _lastRecordPos); : (recPos + _soundPcmLength - _lastRecordPos);
if (deltaSamples == 0) return; if (deltaSamples == 0)
{
// Even if 0, keep last pos
_lastRecordPos = recPos;
return;
}
// We'll read that region (16-bit) and convert to float[] [-1..1]. // If not active, we *still* advance the ring (so we don't backlog data),
// Calculate byte range to lock in sound buffer // but we *don't* push chunks to Whisper.
uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 16-bit = 2 bytes bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null;
// Calculate byte range to lock (16-bit)
uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 2 bytes per sample
uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2; uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;
IntPtr p1, p2; IntPtr p1, p2;
uint len1, len2; uint len1, len2;
// Lock can wrap — FMOD splits into p1/p2. var r = _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
_recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2); if (r != RESULT.OK)
{
// If lock fails, still advance last position to avoid spin
_lastRecordPos = recPos;
return;
}
try try
{ {
// Convert both parts to float and push to Whisper if (shouldFeed)
if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1); {
if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2); if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
}
// else: just discard; we're only keeping the ring fresh.
} }
finally finally
{ {
@@ -178,19 +294,11 @@ public class FMODWhisperBridge : MonoBehaviour
_lastRecordPos = recPos; _lastRecordPos = recPos;
} }
public void ActivateRecording()
{
isRecordingActivated = true;
}
public void DeactivateRecording()
{
isRecordingActivated = false;
}
private bool IsSpeechMeaningful(string userText) private bool IsSpeechMeaningful(string userText)
{ {
return !string.IsNullOrEmpty(userText) && !userText.Contains("BLANK_AUDIO") && !userText.Trim().Equals("[ Silence ]"); return !string.IsNullOrEmpty(userText)
&& !userText.Contains("BLANK_AUDIO")
&& !userText.Trim().Equals("[ Silence ]");
} }
private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen) private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
@@ -198,26 +306,20 @@ public class FMODWhisperBridge : MonoBehaviour
int samples = (int)(byteLen / 2); // 2 bytes per sample int samples = (int)(byteLen / 2); // 2 bytes per sample
EnsureTmpCapacity(samples); EnsureTmpCapacity(samples);
// Marshal the 16-bit PCM into managed space EnsureShortOverlay(samples, out short[] sBuf);
// We pin a short[] overlay to avoid copying twice Marshal.Copy(src, sBuf, 0, samples);
int shorts = samples;
int byteCount = (int)byteLen;
// Use Marshal.Copy into a short[] then convert to float[-1..1] // Convert to float [-1..1] (no downmix change from your original)
// (You can also unsafe copy for speed if needed.) for (int i = 0; i < samples; i++)
EnsureShortOverlay(shorts, out short[] sBuf);
Marshal.Copy(src, sBuf, 0, shorts);
for (int i = 0; i < shorts; i++)
{ {
// 32768f avoids clipping at -32768
_floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f); _floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f);
} }
// Build a chunk for WhisperStream; with VAD off, IsVoiceDetected=true is fine. // TODO (optional): downmix to mono and/or run a light gate before feeding.
// For now we keep your original behavior:
var chunk = new AudioChunk var chunk = new AudioChunk
{ {
Data = _floatTmp.AsSpan(0, shorts).ToArray(), Data = _floatTmp.AsSpan(0, samples).ToArray(),
Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate, Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
Channels = _nativeChannels, Channels = _nativeChannels,
IsVoiceDetected = true IsVoiceDetected = true
@@ -226,7 +328,6 @@ public class FMODWhisperBridge : MonoBehaviour
_stream.AddToStream(chunk); _stream.AddToStream(chunk);
} }
private short[] _shortOverlay;
private void EnsureShortOverlay(int samples, out short[] buf) private void EnsureShortOverlay(int samples, out short[] buf)
{ {
if (_shortOverlay == null || _shortOverlay.Length < samples) if (_shortOverlay == null || _shortOverlay.Length < samples)
@@ -242,13 +343,20 @@ public class FMODWhisperBridge : MonoBehaviour
private void OnDisable() private void OnDisable()
{ {
if (_streamStarted) // Stop Whisper (if active)
{ DeactivateRecording();
_stream.StopStream();
_streamStarted = false;
}
if (_playChannel.hasHandle()) { _playChannel.stop(); _playChannel.clearHandle(); } // Stop/purge FMOD resources (since object is going away)
if (_recSound.hasHandle()) { _core.recordStop(recordDriverId); _recSound.release(); _recSound.clearHandle(); } if (_playChannel.hasHandle())
{
try { _playChannel.stop(); } catch { /* ignore */ }
_playChannel.clearHandle();
}
if (_recSound.hasHandle())
{
try { _core.recordStop(recordDriverId); } catch { /* ignore */ }
try { _recSound.release(); } catch { /* ignore */ }
_recSound.clearHandle();
}
} }
} }

View File

@@ -5,6 +5,8 @@ public class ReleasableButton : MonoBehaviour
{ {
public delegate void OnButtonPressedDelegate(); public delegate void OnButtonPressedDelegate();
public event OnButtonPressedDelegate OnButtonPressed; public event OnButtonPressedDelegate OnButtonPressed;
public delegate void OnButtonReleasedDelegate();
public event OnButtonReleasedDelegate OnButtonReleased;
public Transform movableParts; public Transform movableParts;
public float moveDuration = 0.25f; public float moveDuration = 0.25f;
@@ -37,7 +39,6 @@ public class ReleasableButton : MonoBehaviour
{ {
if (!isButtonDown && collider.gameObject.tag.EndsWith("Hand")) if (!isButtonDown && collider.gameObject.tag.EndsWith("Hand"))
{ {
Debug.Log("collided with: " + collider.gameObject.name);
Activate(); Activate();
OnButtonPressed?.Invoke(); OnButtonPressed?.Invoke();
} }
@@ -47,8 +48,8 @@ public class ReleasableButton : MonoBehaviour
{ {
if (isButtonDown && collider.gameObject.tag.EndsWith("Hand")) if (isButtonDown && collider.gameObject.tag.EndsWith("Hand"))
{ {
Debug.Log("collider exited: " + collider.gameObject.name);
Deactivate(); Deactivate();
OnButtonReleased?.Invoke();
} }
} }

View File

@@ -18,6 +18,7 @@ public class RadioTransmitter : XRGrabInteractable
void Start() void Start()
{ {
radioButton.OnButtonPressed += OnRadioButtonPressed; radioButton.OnButtonPressed += OnRadioButtonPressed;
radioButton.OnButtonReleased += OnRadioButtonReleased;
} }
// Update is called once per frame // Update is called once per frame
@@ -38,12 +39,20 @@ public class RadioTransmitter : XRGrabInteractable
fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished; fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished;
AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject); AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject);
fmodWhisperBridge.ActivateRecording(); fmodWhisperBridge.ActivateRecording();
// TODO: deactivate when button is released }
private void OnRadioButtonReleased()
{
fmodWhisperBridge.OnWhisperSegmentUpdated -= OnPlayerSpeechUpdated;
fmodWhisperBridge.OnWhisperSegmentFinished -= OnPlayerSpeechFinished;
AudioManager.Instance.PlayAttachedInstance(FMODEvents.Instance.RadioButton, gameObject);
fmodWhisperBridge.DeactivateRecording();
} }
private void OnPlayerSpeechUpdated(string text) private void OnPlayerSpeechUpdated(string text)
{ {
computerScreen.text = text; computerScreen.text = text;
OnPlayerFinishedSpeaking?.Invoke();
} }
private void OnPlayerSpeechFinished(string playerText) private void OnPlayerSpeechFinished(string playerText)