// Origin: DeltaVR3DModelGeneration/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs
// (repository forked from cgvr/DeltaVR; C#, 363 lines, 12 KiB in the original listing).
// Note: the repository viewer flagged this file as containing ambiguous Unicode characters
// that might be confused with other characters.
using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper; // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils;
using Debug = UnityEngine.Debug; // AudioChunk
/// <summary>
/// Captures microphone audio through FMOD and feeds it to a Whisper speech-to-text stream.
/// <para>
/// The FMOD recording is started in <c>Start</c> (and restarted in <c>OnEnable</c> after the component
/// was disabled) and runs continuously into a looping PCM16 ring buffer.
/// The Whisper stream only exists between <see cref="ActivateRecording"/> and
/// <see cref="DeactivateRecording"/>.
/// Optional loopback playback is paused/resumed instead of started/stopped to avoid stalls.
/// </para>
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    /// <summary>Size in bytes of one PCM16 sample.</summary>
    private const int BytesPerSample = sizeof(short);

    /// <summary>Length passed to FMOD for the record driver name.</summary>
    private const int DriverNameLength = 256;

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper; // assign in Inspector
    [SerializeField] private bool useVadInStream = true; // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;

    // NOTE: the code prefers the rate reported by the record driver and only uses this
    // value when the driver reports no rate (see EffectiveSampleRate).
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;

    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;

    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;

    [Tooltip("If true, loopback plays only while active; otherwise its always on.")]
    public bool loopbackOnlyWhenActive = true;

    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    /// <summary>Signature of <see cref="OnWhisperSegmentUpdated"/> handlers.</summary>
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);

    /// <summary>Raised with the text of a segment update that passes the "meaningful speech" filter.</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    /// <summary>Signature of <see cref="OnWhisperSegmentFinished"/> handlers.</summary>
    public delegate void OnWhisperSegmentFinishedDelegate(string result);

    /// <summary>Raised with the text of a finished segment that passes the "meaningful speech" filter.</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength; // ring buffer length in PCM samples (per channel)
    private int _nativeRate;
    private int _nativeChannels;

    // Ring-buffer tracking: record position seen on the previous Update.
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // Reusable PCM16 staging buffer for Marshal.Copy.
    private short[] _shortOverlay;

    // Activation state
    private bool isRecordingActivated = false;
    private bool _activationPending;   // true while ActivateRecording awaits stream creation
    private int _activationVersion;    // bumped on every activate/cancel to detect stale continuations
    private bool _hasStarted;          // true once Start() has run

    /// <summary>Sample rate used for the ring buffer and for Whisper: driver rate if known, else the desired rate.</summary>
    private int EffectiveSampleRate => (_nativeRate > 0) ? _nativeRate : desiredSampleRate;

    private void Awake()
    {
        if (!whisper) whisper = FindObjectOfType<WhisperManager>();
        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    private void Start()
    {
        _hasStarted = true;
        InitializeCapture();
        // No Whisper stream here. It is created on ActivateRecording().
    }

    private void OnEnable()
    {
        // Start() runs only once, but OnDisable() releases the FMOD resources.
        // Re-create them when the component is re-enabled so capture keeps working.
        if (_hasStarted && !_recSound.hasHandle())
            InitializeCapture();
    }

    /// <summary>
    /// Queries the record driver, creates the looping PCM16 ring buffer, starts recording into it
    /// and (optionally) starts the loopback channel. Logs and returns on any FMOD failure.
    /// </summary>
    private void InitializeCapture()
    {
        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out string deviceName, DriverNameLength,
            out Guid _,
            out _nativeRate,
            out SPEAKERMODE _,
            out int speakerModeChannels,
            out DRIVER_STATE _);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        _nativeChannels = channels > 0 ? channels : speakerModeChannels;
        int rate = EffectiveSampleRate;
        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{deviceName}\" rate={rate} ch={_nativeChannels}");

        if (rate <= 0 || _nativeChannels <= 0)
        {
            Debug.LogError($"[FMOD→Whisper] Invalid capture format: rate={rate} ch={_nativeChannels}");
            return;
        }

        // [Range] only constrains the Inspector; guard against a zero-length buffer set from code.
        int seconds = Mathf.Max(1, bufferLengthSec);

        // Build user sound (ring buffer) holding several seconds of audio.
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(rate * _nativeChannels * BytesPerSample * seconds)
        };
        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            _recSound.clearHandle();
            return;
        }

        // The PCM length is the modulus for the wrap-around arithmetic in Update(); it must be valid.
        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed: {res} (length={_soundPcmLength})");
            ReleaseRecordSound(stopRecording: false);
            return;
        }

        // Start recording (looping).
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            ReleaseRecordSound(stopRecording: false);
            return;
        }

        // Initialize record position to avoid a huge first delta.
        _core.getRecordPosition(recordDriverId, out _lastRecordPos);
        Debug.Log("[FMOD→Whisper] Recording started.");

        // Loopback channel (optional). Start once; pause when inactive if desired.
        _core.getMasterChannelGroup(out _masterGroup);
        if (!playLoopback) return;

        res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
        if (res == RESULT.OK && _playChannel.hasHandle())
        {
            _playChannel.setMode(MODE._2D);
            _playChannel.setVolume(loopbackVolume);
            // Keep paused until Activate, unless a stream is already active (re-init case).
            if (loopbackOnlyWhenActive && !isRecordingActivated) _playChannel.setPaused(true);
            Debug.Log("[FMOD→Whisper] Loopback playback ready.");
        }
        else
        {
            Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
        }
    }

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
    /// Calls made while already active, or while a previous activation is still being set up, are ignored.
    /// If <see cref="DeactivateRecording"/> is called before the stream is ready, the activation is abandoned.
    /// </summary>
    public async void ActivateRecording()
    {
        if (isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }
        if (_activationPending)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (activation already in progress).");
            return;
        }
        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }
        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found in the scene.");
            return;
        }

        int rate = EffectiveSampleRate;

        // Set before creating the stream: presumably the manager's settings are captured when the
        // stream is created (whisper.unity WhisperManager.CreateStream) — TODO confirm.
        whisper.useVad = useVadInStream;

        _activationPending = true;
        int version = ++_activationVersion;

        WhisperStream created;
        try
        {
            created = await whisper.CreateStream(rate, _nativeChannels);
        }
        catch (Exception e)
        {
            Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
            if (version == _activationVersion)
            {
                _activationPending = false;
                _stream = null;
                _streamStarted = false;
            }
            return;
        }

        // Deactivated (or destroyed) while awaiting: drop the never-started stream.
        if (version != _activationVersion || this == null)
        {
            Debug.Log("[FMOD→Whisper] Activation abandoned (deactivated before the stream was ready).");
            return;
        }
        _activationPending = false;

        if (created == null)
        {
            Debug.LogError("[FMOD→Whisper] CreateStream returned no stream (is the Whisper model loaded?).");
            return;
        }

        _stream = created;

        // Wire events: forward only results that contain real speech.
        _stream.OnSegmentUpdated += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
                OnWhisperSegmentUpdated?.Invoke(seg.Result);
        };
        _stream.OnSegmentFinished += (seg) =>
        {
            if (IsSpeechMeaningful(seg.Result))
                OnWhisperSegmentFinished?.Invoke(seg.Result);
        };

        _stream.StartStream();
        _streamStarted = true;

        // Unpause loopback if it's meant to play only while active.
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
            _playChannel.setPaused(false);

        isRecordingActivated = true;
        Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording).");
    }

    /// <summary>
    /// Stops and drops the Whisper stream only. FMOD keeps recording.
    /// Also cancels an activation that is still waiting for its stream.
    /// </summary>
    public void DeactivateRecording()
    {
        if (_activationPending)
        {
            // Invalidate the in-flight ActivateRecording continuation.
            _activationPending = false;
            _activationVersion++;
        }

        if (!isRecordingActivated && !_streamStarted)
            return;

        isRecordingActivated = false;

        // Pause loopback if it should only be active during recording.
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
            _playChannel.setPaused(true);

        // Tear down Whisper stream (best effort: a failing stop must not block deactivation).
        if (_streamStarted)
        {
            try
            {
                _stream?.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed: {e.Message}");
            }
            _streamStarted = false;
        }
        _stream = null;

        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }

    /// <summary>
    /// Each frame: ticks FMOD, reads the samples recorded since the previous frame from the ring buffer
    /// and, while a stream is active, forwards them to Whisper.
    /// </summary>
    private void Update()
    {
        // Always tick FMOD.
        // NOTE(review): FMODUnity's RuntimeManager may already update the core system — confirm this is needed.
        if (_core.handle != IntPtr.Zero) _core.update();
        if (!_recSound.hasHandle()) return;

        var res = _core.getRecordPosition(recordDriverId, out uint recPos);
        if (res != RESULT.OK)
            return; // position unknown this frame; keep the last known one

        // Always advance the read cursor so no backlog builds up while inactive
        // and a failing frame is never re-read.
        uint previousPos = _lastRecordPos;
        _lastRecordPos = recPos;
        if (recPos == previousPos)
            return;

        // While inactive the new samples are simply skipped; no need to lock the buffer.
        bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null;
        if (!shouldFeed)
            return;

        // Samples recorded since last frame, accounting for ring wrap-around.
        uint deltaSamples = (recPos > previousPos)
            ? (recPos - previousPos)
            : (recPos + _soundPcmLength - previousPos);

        // Byte range to lock (interleaved 16-bit samples).
        uint bytesPerFrame = (uint)(_nativeChannels * BytesPerSample);
        uint startBytes = previousPos * bytesPerFrame;
        uint bytesToRead = deltaSamples * bytesPerFrame;

        res = _recSound.@lock(startBytes, bytesToRead, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (res != RESULT.OK)
            return;

        try
        {
            // A wrapped range comes back as two segments: tail of the buffer, then its head.
            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }
    }

    /// <summary>
    /// Returns false for empty text and for the placeholder outputs "BLANK_AUDIO" / "[ Silence ]".
    /// </summary>
    private bool IsSpeechMeaningful(string userText)
    {
        return !string.IsNullOrEmpty(userText)
               && !userText.Contains("BLANK_AUDIO")
               && !userText.Trim().Equals("[ Silence ]");
    }

    /// <summary>
    /// Converts <paramref name="byteLen"/> bytes of PCM16 at <paramref name="src"/> to floats
    /// (sample / 32768) and adds them to the active stream as one chunk.
    /// </summary>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (src == IntPtr.Zero || samples == 0)
            return;

        EnsureShortOverlay(samples, out short[] pcm);
        Marshal.Copy(src, pcm, 0, samples);

        // Each chunk gets its own array because it is handed over to the stream.
        // short / 32768 already lies within [-1, 1), so no clamping is required.
        var data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = pcm[i] / 32768f;
        }

        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = EffectiveSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };
        _stream.AddToStream(chunk);
    }

    /// <summary>Grows the reusable PCM16 staging buffer to at least <paramref name="samples"/> elements.</summary>
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        buf = _shortOverlay;
    }

    /// <summary>Releases the ring-buffer sound, optionally stopping the recording first. Best effort.</summary>
    private void ReleaseRecordSound(bool stopRecording)
    {
        if (!_recSound.hasHandle())
            return;

        try
        {
            if (stopRecording) _core.recordStop(recordDriverId);
            _recSound.release();
        }
        catch (Exception e)
        {
            Debug.LogWarning($"[FMOD→Whisper] Releasing record sound failed: {e.Message}");
        }
        _recSound.clearHandle();
    }

    private void OnDisable()
    {
        // Stop Whisper (if active or still activating).
        DeactivateRecording();

        // Stop/purge FMOD resources; OnEnable() re-creates them if the component is enabled again.
        if (_playChannel.hasHandle())
        {
            try
            {
                _playChannel.stop();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Stopping loopback channel failed: {e.Message}");
            }
            _playChannel.clearHandle();
        }

        ReleaseRecordSound(stopRecording: true);
    }
}