forked from cgvr/DeltaVR
431 lines
14 KiB
C#
431 lines
14 KiB
C#
|
||
using System;
|
||
using System.Runtime.InteropServices;
|
||
using UnityEngine;
|
||
using FMOD;
|
||
using FMODUnity;
|
||
using Whisper; // WhisperManager, WhisperStream, WhisperResult
|
||
using Whisper.Utils;
|
||
using Debug = UnityEngine.Debug; // AudioChunk
|
||
|
||
/// <summary>
/// Bridges an FMOD microphone capture to a Whisper speech-to-text stream.
/// <para>
/// The FMOD microphone recording is set up in <c>Start</c> (and again in <c>OnEnable</c> if the
/// component was disabled and re-enabled) and records continuously into a looping ring buffer.
/// </para>
/// <para>
/// A Whisper stream is created when <see cref="ActivateRecording"/> is called and dropped again by
/// <see cref="DeactivateRecording"/>; the FMOD recording keeps running in between.
/// </para>
/// <para>
/// The optional loopback channel is paused/resumed instead of being stopped/started.
/// </para>
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    /// <summary>Size in bytes of one PCM16 sample (the ring buffer format).</summary>
    private const int BytesPerSample = sizeof(short);

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper; // assign in Inspector
    [SerializeField] private bool useVadInStream = true; // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
    [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")]
    public bool loopbackOnlyWhenActive = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    /// <summary>Signature of <see cref="OnWhisperSegmentUpdated"/> handlers.</summary>
    /// <param name="result">Cleaned-up text of the segment.</param>
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);

    /// <summary>Raised when the stream reports an updated segment whose cleaned text is non-empty.</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    /// <summary>Signature of <see cref="OnWhisperSegmentFinished"/> handlers.</summary>
    /// <param name="result">Cleaned-up text of the segment.</param>
    public delegate void OnWhisperSegmentFinishedDelegate(string result);

    /// <summary>Raised when the stream reports a finished segment whose cleaned text is non-empty.</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength; // ring buffer length in PCM samples (per channel)
    private int _nativeRate;
    private int _nativeChannels;

    // ring-buffer tracking: record position we have consumed up to
    private uint _lastRecordPos;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // reusable PCM16 staging buffer (grown on demand)
    private short[] _shortOverlay;

    // activation state
    private bool isRecordingActivated;
    private bool _skipOneFeedFrame;

    // Bumped by every ActivateRecording/DeactivateRecording call so that an activation which is
    // still awaiting stream creation can tell it has been superseded or cancelled.
    private int _activationVersion;

    // True once Start has run; lets OnEnable re-create the capture after a disable/enable cycle.
    private bool _hasStarted;

    // cheap zeroing helper source (avoids allocating buffer-sized arrays on every clear)
    private static readonly byte[] _zeroChunk = new byte[16 * 1024]; // 16 KB

    /// <summary>Device sample rate when FMOD reported one, otherwise <see cref="desiredSampleRate"/>.</summary>
    private int EffectiveSampleRate => (_nativeRate > 0) ? _nativeRate : desiredSampleRate;

    private void Awake()
    {
        if (!whisper) whisper = FindObjectOfType<WhisperManager>();
        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    private void OnEnable()
    {
        // OnDisable releases the FMOD sound, and Unity only calls Start once, so a re-enabled
        // component has to rebuild the capture here. On the very first enable Start does it.
        if (_hasStarted && !_recSound.hasHandle())
            InitializeCapture();
    }

    private void Start()
    {
        _hasStarted = true;
        InitializeCapture();
        // No Whisper stream here. It is created on ActivateRecording().
    }

    /// <summary>
    /// Queries the record driver, creates the looping PCM16 ring-buffer sound, starts recording
    /// into it and (optionally) starts the loopback channel. Does nothing if a sound already exists.
    /// On any failure the error is logged and no sound handle is kept.
    /// </summary>
    private void InitializeCapture()
    {
        if (_recSound.hasHandle()) return;

        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out string deviceName, 256,
            out _,
            out _nativeRate,
            out _,
            out int speakerModeChannels,
            out _
        );

        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        // Never go below one channel / one second, otherwise the buffer length would be zero.
        _nativeChannels = Mathf.Max(1, channels > 0 ? channels : speakerModeChannels);
        int rate = EffectiveSampleRate;
        int seconds = Mathf.Max(1, bufferLengthSec);
        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{deviceName}\" rate={rate} ch={_nativeChannels}");

        // Build user sound (ring buffer) — multiple seconds
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(rate * _nativeChannels * BytesPerSample * seconds)
        };

        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            _recSound.clearHandle();
            return;
        }

        // The wrap-around arithmetic in Update depends on this length, so a failure is fatal.
        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed: {res}");
            ReleaseRecordSound();
            return;
        }

        // Start recording (looping)
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            ReleaseRecordSound();
            return;
        }

        // Initialize record position to avoid a huge first delta
        if (_core.getRecordPosition(recordDriverId, out _lastRecordPos) != RESULT.OK)
            _lastRecordPos = 0;
        Debug.Log("[FMOD→Whisper] Recording started.");

        StartLoopback();
    }

    /// <summary>
    /// Starts the loopback channel once (if enabled). When loopback should only be audible while
    /// recording is active, the channel is left paused until <see cref="ActivateRecording"/>.
    /// </summary>
    private void StartLoopback()
    {
        _core.getMasterChannelGroup(out _masterGroup);
        if (!playLoopback) return;

        var res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
        if (res == RESULT.OK && _playChannel.hasHandle())
        {
            _playChannel.setMode(MODE._2D);
            _playChannel.setVolume(loopbackVolume);
            if (loopbackOnlyWhenActive) _playChannel.setPaused(true); // keep muted until Activate
            Debug.Log("[FMOD→Whisper] Loopback playback ready.");
        }
        else
        {
            Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
        }
    }

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
    /// The call returns immediately; the stream becomes active once its creation has completed.
    /// If <see cref="DeactivateRecording"/> or another <see cref="ActivateRecording"/> is called
    /// while creation is still pending, this activation is abandoned.
    /// </summary>
    public async void ActivateRecording()
    {
        if (isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }

        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }

        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found.");
            return;
        }

        // Remember which activation this is; anything that bumps the counter while we await
        // (Deactivate, OnDisable, a newer Activate) invalidates this one.
        int version = unchecked(++_activationVersion);

        WhisperStream created;
        try
        {
            // Set before creating the stream. NOTE(review): presumably CreateStream reads the
            // manager's settings at creation time — confirm against the Whisper package.
            whisper.useVad = useVadInStream;
            created = await whisper.CreateStream(EffectiveSampleRate, _nativeChannels);
        }
        catch (Exception e)
        {
            Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
            if (version == _activationVersion)
            {
                _stream = null;
                _streamStarted = false;
            }
            return;
        }

        if (created == null)
        {
            Debug.LogError("[FMOD→Whisper] CreateStream returned no stream.");
            return;
        }

        // State may have changed while awaiting: cancelled, superseded, destroyed or torn down.
        if (version != _activationVersion || this == null || isRecordingActivated || !_recSound.hasHandle())
        {
            Debug.Log("[FMOD→Whisper] Activation abandoned (state changed while creating stream).");
            return;
        }

        // Wire events
        created.OnSegmentUpdated += (seg) =>
        {
            string cleanedText = PostProcessInput(seg.Result);
            if (!string.IsNullOrEmpty(cleanedText))
                OnWhisperSegmentUpdated?.Invoke(cleanedText);
        };
        created.OnSegmentFinished += (seg) =>
        {
            string cleanedText = PostProcessInput(seg.Result);
            if (!string.IsNullOrEmpty(cleanedText))
                OnWhisperSegmentFinished?.Invoke(cleanedText);
        };

        _stream = created;
        _stream.StartStream();
        _streamStarted = true;

        bool hasLoopback = playLoopback && _playChannel.hasHandle();

        // Pause loopback while we clear (optional, but avoids clicks)
        if (hasLoopback)
            _playChannel.setPaused(true);

        // Clear buffer bytes so no audio captured before activation can reach Whisper
        ClearRecordRingBuffer();

        // Reset our read pointer to the current write head
        if (_core.getRecordPosition(recordDriverId, out uint head) == RESULT.OK)
            _lastRecordPos = head;

        // Skip feeding for one frame to guarantee a clean start
        _skipOneFeedFrame = true;

        isRecordingActivated = true;

        // Recording is active now, so loopback (if enabled) must be audible in both modes.
        if (hasLoopback)
            _playChannel.setPaused(false);

        Debug.Log("[FMOD→Whisper] Stream activated (buffer cleared; reading from current head).");
    }

    /// <summary>
    /// Stops and drops the Whisper stream only. FMOD keeps recording.
    /// Also cancels an activation that is still waiting for its stream to be created.
    /// </summary>
    public void DeactivateRecording()
    {
        // Invalidate any ActivateRecording call that is still awaiting CreateStream.
        unchecked { _activationVersion++; }

        if (!isRecordingActivated && !_streamStarted)
            return;

        isRecordingActivated = false;

        // Pause loopback if it should only be active during recording
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
            _playChannel.setPaused(true);

        // Tear down Whisper stream (best effort: a failing stop must not block deactivation)
        if (_streamStarted)
        {
            try
            {
                _stream?.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed: {e.Message}");
            }
            _streamStarted = false;
        }
        _stream = null;

        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }

    /// <summary>
    /// Each frame: works out how many samples FMOD recorded since the previous frame, and forwards
    /// them to the Whisper stream when recording is active. The read position always advances,
    /// so nothing backlogs while inactive.
    /// </summary>
    private void Update()
    {
        // The core system obtained from RuntimeManager is not ticked here.
        // NOTE(review): per FMOD docs the Studio system update also updates the core system — confirm.
        if (!_recSound.hasHandle()) return;

        // Compute how many samples were recorded since last frame.
        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK)
            return; // position unknown this frame; retry on the next one

        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos); // write head wrapped around

        if (deltaSamples == 0)
        {
            _lastRecordPos = recPos;
            return;
        }

        // If not active, we *still* advance the ring (so we don't backlog data),
        // but we *don't* push chunks to Whisper.
        bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null;

        // Byte range to lock (PCM16, interleaved channels)
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * BytesPerSample;
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * BytesPerSample;

        var r = _recSound.@lock(startBytes, bytesToRead, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (r != RESULT.OK)
        {
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            // While skipping, the frame is discarded so no stale data leaks into the stream.
            if (shouldFeed && !_skipOneFeedFrame)
            {
                if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
                if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
            }
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _skipOneFeedFrame = false;
        _lastRecordPos = recPos;
    }

    /// <summary>
    /// Strips the silence/blank-audio markers and any square brackets from a transcription
    /// and trims surrounding whitespace.
    /// </summary>
    /// <param name="input">Raw segment text; may be null.</param>
    /// <returns>The cleaned text, or an empty string when <paramref name="input"/> is null or empty.</returns>
    private string PostProcessInput(string input)
    {
        if (string.IsNullOrEmpty(input)) return string.Empty;

        return input
            .Replace("[silence]", "")
            .Replace("[ Silence ]", "")
            .Replace("BLANK_AUDIO", "")
            .Replace("[", "")
            .Replace("]", "")
            .Trim();
    }

    /// <summary>
    /// Converts a block of PCM16 samples to floats and appends it to the Whisper stream.
    /// </summary>
    /// <param name="src">Pointer to the locked PCM16 data.</param>
    /// <param name="byteLen">Length of the data in bytes.</param>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (samples == 0 || src == IntPtr.Zero || _stream == null) return;

        EnsureShortOverlay(samples, out short[] pcm);
        Marshal.Copy(src, pcm, 0, samples);

        // Convert straight into the array handed to the stream (a fresh array per chunk, since
        // the chunk is given away). short / 32768f is already within [-1, 1), no clamp needed.
        var data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = pcm[i] / 32768f;
        }

        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = EffectiveSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };

        _stream.AddToStream(chunk);
    }

    /// <summary>Returns the reusable PCM16 staging buffer, growing it to hold at least <paramref name="samples"/> values.</summary>
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        buf = _shortOverlay;
    }

    private void OnDisable()
    {
        // Stop Whisper (if active) and cancel any pending activation
        DeactivateRecording();

        // Stop/purge FMOD resources; OnEnable rebuilds them if the component comes back.
        if (_playChannel.hasHandle())
        {
            try
            {
                _playChannel.stop();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Stopping loopback failed: {e.Message}");
            }
            _playChannel.clearHandle();
        }

        if (_recSound.hasHandle())
        {
            try
            {
                _core.recordStop(recordDriverId);
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] recordStop failed: {e.Message}");
            }
            ReleaseRecordSound();
        }
    }

    /// <summary>Releases the ring-buffer sound (best effort) and clears the state derived from it.</summary>
    private void ReleaseRecordSound()
    {
        if (_recSound.hasHandle())
        {
            try
            {
                _recSound.release();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Releasing record sound failed: {e.Message}");
            }
        }
        _recSound.clearHandle();
        _soundPcmLength = 0;
    }

    /// <summary>Overwrites the whole recording ring buffer with zeros (digital silence for PCM16).</summary>
    private void ClearRecordRingBuffer()
    {
        if (!_recSound.hasHandle() || _soundPcmLength == 0) return;

        uint totalBytes = _soundPcmLength * (uint)_nativeChannels * BytesPerSample;

        // Lock the whole buffer (start=0, length=totalBytes)
        var r = _recSound.@lock(0, totalBytes, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (r != FMOD.RESULT.OK)
        {
            Debug.LogWarning($"[FMOD→Whisper] Could not lock ring buffer to clear: {r}");
            return;
        }

        try
        {
            if (len1 > 0) ZeroMem(p1, (int)len1);
            if (len2 > 0) ZeroMem(p2, (int)len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }
    }

    /// <summary>
    /// Zeroes <paramref name="byteLen"/> bytes of unmanaged memory starting at <paramref name="dst"/>
    /// by repeatedly copying from a shared block of zeros.
    /// </summary>
    private static void ZeroMem(IntPtr dst, int byteLen)
    {
        if (dst == IntPtr.Zero) return;

        int offset = 0;
        while (byteLen > 0)
        {
            int n = Math.Min(_zeroChunk.Length, byteLen);
            Marshal.Copy(_zeroChunk, 0, IntPtr.Add(dst, offset), n);
            offset += n;
            byteLen -= n;
        }
    }
}
|