1
0
forked from cgvr/DeltaVR
Files
DeltaVR3DModelGeneration/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs

429 lines
14 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper;
using Whisper.Utils;
using Debug = UnityEngine.Debug;
/// <summary>
/// Bridges an FMOD microphone capture to a Whisper speech-to-text stream.
/// FMOD recording is started once (Start) and runs continuously into a looping ring buffer.
/// A Whisper stream is created by ActivateRecording(); after DeactivateRecording() the stream is
/// kept alive and fed silence instead of microphone audio, and it is stopped when the component
/// is disabled.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    /// <summary>Size in bytes of one PCM16 sample value.</summary>
    private const uint BytesPerSample = sizeof(short);

    /// <summary>How fast the volume meter reacts per 1/60 s (0.1-0.3 works well).</summary>
    private const float VolumeSmoothing = 0.15f;

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper; // assign in Inspector
    [SerializeField] private bool useVadInStream = true; // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    [Range(1, 10)] public int bufferLengthSec = 5;

    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
    /// <summary>Raised with the cleaned text whenever the stream updates a segment (never with empty text).</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    public delegate void OnWhisperSegmentFinishedDelegate(string result);
    /// <summary>Raised with the cleaned text whenever the stream finishes a segment (never with empty text).</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private uint _soundPcmLength; // ring-buffer length in PCM samples
    private int _nativeRate;
    private int _nativeChannels;

    // Ring-buffer tracking: record position up to which audio has been consumed.
    private uint _lastRecordPos;

    // Whisper
    private WhisperStream _stream;

    // Scratch buffer reused for every PCM16 copy out of the FMOD ring buffer.
    private short[] _shortScratch;

    // Activation state
    private bool _isRecordingActivated;
    private bool _isActivating;         // an ActivateRecording() call is awaiting its stream
    private bool _activationCancelled;  // DeactivateRecording() was called during that await
    private bool _skipOneFeedFrame;

    // Smoothed RMS of the microphone signal, exposed through the volume getters.
    private float _currentVolumeRms;

    /// <summary>Sample rate used for the recording: the device rate when FMOD reported one, otherwise desiredSampleRate.</summary>
    private int EffectiveSampleRate => (_nativeRate > 0) ? _nativeRate : desiredSampleRate;

    private void Awake()
    {
        if (!whisper)
        {
            whisper = FindObjectOfType<WhisperManager>();
        }

        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    /// <summary>
    /// Initializes the FMOD capture once: queries the record driver, creates the looping PCM16
    /// user sound that serves as ring buffer and starts recording into it.
    /// No Whisper stream is created here; that happens in ActivateRecording().
    /// </summary>
    private void Start()
    {
        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out string deviceName, 256,
            out Guid _,
            out _nativeRate,
            out SPEAKERMODE _,
            out int smChannels,
            out DRIVER_STATE _);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        _nativeChannels = channels > 0 ? channels : smChannels;
        int rate = EffectiveSampleRate;
        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{deviceName}\" rate={rate} ch={_nativeChannels}");

        // User sound acting as a ring buffer holding bufferLengthSec seconds of PCM16 audio.
        var exInfo = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(rate * _nativeChannels * bufferLengthSec) * BytesPerSample
        };
        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref exInfo, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            return;
        }

        // The wrap-around arithmetic in Update() depends on a valid buffer length.
        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed: {res}");
            ReleaseRecordSound();
            return;
        }

        // Start recording (looping)
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            ReleaseRecordSound();
            return;
        }

        // Initialize record position to avoid a huge first delta
        _core.getRecordPosition(recordDriverId, out _lastRecordPos);
        Debug.Log("[FMOD→Whisper] Recording started.");
    }

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding microphone audio to it (FMOD is already
    /// recording). Calls made while already active, or while a previous call is still waiting for
    /// its stream, are ignored. If DeactivateRecording() is called or the component is disabled
    /// before the stream is ready, the activation is abandoned.
    /// </summary>
    public async void ActivateRecording()
    {
        if (_isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }

        if (_isActivating)
        {
            // The caller wants recording on again: withdraw any cancellation of the pending activation.
            _activationCancelled = false;
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (activation already in progress).");
            return;
        }

        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }

        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found.");
            return;
        }

        _isActivating = true;
        _activationCancelled = false;
        try
        {
            // Retire the previous stream before creating a new one.
            if (_stream != null)
            {
                _stream.StopStream();
                _stream = null;
            }

            // Set before CreateStream: presumably the stream captures the manager's settings
            // when it is created - TODO confirm against WhisperManager.CreateStream.
            whisper.useVad = useVadInStream;

            WhisperStream created;
            try
            {
                created = await whisper.CreateStream(EffectiveSampleRate, _nativeChannels);
            }
            catch (Exception e)
            {
                Debug.LogError($"[FMOD→Whisper] CreateStream exception: {e}");
                return;
            }

            if (created == null)
            {
                Debug.LogError("[FMOD→Whisper] CreateStream returned no stream.");
                return;
            }

            // State may have changed while awaiting: the recording sound is released in
            // OnDisable(), and DeactivateRecording() flags the cancellation.
            if (_activationCancelled || !_recSound.hasHandle())
            {
                Debug.Log("[FMOD→Whisper] Activation cancelled before the stream was ready.");
                return;
            }

            _stream = created;

            // Wire events
            _stream.OnSegmentUpdated += (seg) =>
            {
                string cleanedText = PostProcessInput(seg.Result);
                if (!string.IsNullOrEmpty(cleanedText))
                {
                    OnWhisperSegmentUpdated?.Invoke(cleanedText);
                }
            };
            _stream.OnSegmentFinished += (seg) =>
            {
                string cleanedText = PostProcessInput(seg.Result);
                if (!string.IsNullOrEmpty(cleanedText))
                {
                    OnWhisperSegmentFinished?.Invoke(cleanedText);
                }
            };

            _stream.StartStream();

            // Reset our read pointer to the current write head
            _core.getRecordPosition(recordDriverId, out _lastRecordPos);

            // We'll skip feeding for one frame to guarantee a clean start
            _skipOneFeedFrame = true;
            _isRecordingActivated = true;
            Debug.Log("[FMOD→Whisper] Stream activated (buffer cleared; reading from current head).");
        }
        finally
        {
            _isActivating = false;
        }
    }

    /// <summary>
    /// Stops feeding microphone audio to the Whisper stream. FMOD keeps recording and the stream
    /// itself is kept; Update() feeds it silence from now on. Also cancels an activation that is
    /// still waiting for its stream.
    /// </summary>
    public void DeactivateRecording()
    {
        if (_isActivating)
        {
            _activationCancelled = true;
        }

        _isRecordingActivated = false;
    }

    /// <summary>
    /// Returns current microphone level in dBFS (decibels relative to full-scale).
    /// 0 dBFS = digital clipping; normal speech is typically around -35 to -20 dBFS.
    /// </summary>
    public float GetCurrentVolumeDb()
    {
        // Guard from log(0)
        const float minRms = 1e-7f;
        float rms = Mathf.Clamp(_currentVolumeRms, minRms, 1f);
        return 20f * Mathf.Log10(rms);
    }

    /// <summary>
    /// Returns a UI-friendly 0..1 loudness by mapping the current dBFS value from the
    /// range -60..-15 dB. Adjust the dB range to your content/environment if needed.
    /// </summary>
    public float GetNormalizedVolume01()
    {
        float db = GetCurrentVolumeDb();
        return Mathf.Clamp01(Mathf.InverseLerp(-60f, -15f, db));
    }

    /// <summary>
    /// Each frame: ticks FMOD, locks the part of the ring buffer recorded since the previous
    /// frame, updates the volume meter from it and feeds it (or silence) to the Whisper stream.
    /// </summary>
    private void Update()
    {
        // Always tick FMOD
        if (_core.handle != IntPtr.Zero)
        {
            _core.update();
        }

        if (!_recSound.hasHandle())
        {
            return;
        }

        // Without a valid position the delta below would be garbage, so skip this frame.
        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK)
        {
            return;
        }

        // Samples recorded since last frame, accounting for ring-buffer wrap-around.
        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);
        if (deltaSamples == 0)
        {
            return; // nothing new; _lastRecordPos already equals recPos
        }

        // Byte range to lock (PCM16)
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * BytesPerSample;
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * BytesPerSample;

        var lockResult = _recSound.@lock(startBytes, bytesToRead, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (lockResult != RESULT.OK)
        {
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            UpdateVolumeMeter(p1, len1, p2, len2);
            FeedLockedAudio(p1, len1, p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _skipOneFeedFrame = false;
        _lastRecordPos = recPos;
    }

    /// <summary>Measures the RMS of the locked regions and blends it into the smoothed volume.</summary>
    private void UpdateVolumeMeter(IntPtr p1, uint len1, IntPtr p2, uint len2)
    {
        float sumOfSquares = 0f;
        int sampleCount = 0;
        if (len1 > 0)
        {
            ComputeRmsFromPcm16(p1, len1, ref sumOfSquares, ref sampleCount);
        }

        if (len2 > 0)
        {
            ComputeRmsFromPcm16(p2, len2, ref sumOfSquares, ref sampleCount);
        }

        if (sampleCount <= 0)
        {
            return;
        }

        float rms = Mathf.Sqrt(sumOfSquares / sampleCount);
        // Frame-rate independent smoothing: VolumeSmoothing is the blend factor per 1/60 s.
        float blend = 1f - Mathf.Pow(1f - VolumeSmoothing, Time.deltaTime * 60f);
        _currentVolumeRms = Mathf.Lerp(_currentVolumeRms, rms, blend);
    }

    /// <summary>
    /// Feeds the locked regions to the stream, if there is one: real microphone audio while
    /// recording is activated (except on the skipped first frame), same-size silence otherwise.
    /// </summary>
    private void FeedLockedAudio(IntPtr p1, uint len1, IntPtr p2, uint len2)
    {
        if (_stream == null)
        {
            return;
        }

        if (_isRecordingActivated && !_skipOneFeedFrame)
        {
            if (len1 > 0)
            {
                CopyPcm16ToFloatAndFeed(p1, len1);
            }

            if (len2 > 0)
            {
                CopyPcm16ToFloatAndFeed(p2, len2);
            }
        }
        else
        {
            if (len1 > 0)
            {
                FeedSilentChunk(len1);
            }

            if (len2 > 0)
            {
                FeedSilentChunk(len2);
            }
        }
    }

    /// <summary>
    /// Strips the silence/blank markers, square brackets and "END PLAYBACK" from a transcription
    /// and trims it. Returns an empty string for null input.
    /// </summary>
    private string PostProcessInput(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return string.Empty;
        }

        return input
            .Replace("[silence]", "")
            .Replace("[ Silence ]", "")
            .Replace("BLANK_AUDIO", "")
            .Replace("[", "")
            .Replace("]", "")
            .Replace("END PLAYBACK", "")
            .Trim();
    }

    /// <summary>Feeds a chunk of zeros covering the same number of samples as byteLen bytes of PCM16.</summary>
    private void FeedSilentChunk(uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        var chunk = new AudioChunk
        {
            Data = new float[samples], // freshly allocated arrays are zero-filled
            Frequency = EffectiveSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = false
        };
        _stream.AddToStream(chunk);
    }

    /// <summary>Converts byteLen bytes of PCM16 at src to floats and feeds them to the stream (no downmix).</summary>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / BytesPerSample);
        short[] pcm = GetShortScratch(samples);
        Marshal.Copy(src, pcm, 0, samples);

        // Each chunk gets its own array; a short divided by 32768 already lies within [-1, 1).
        var data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = pcm[i] / 32768f;
        }

        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = EffectiveSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };
        _stream.AddToStream(chunk);
    }

    /// <summary>Returns the shared PCM16 scratch buffer, grown to hold at least the given number of samples.</summary>
    private short[] GetShortScratch(int samples)
    {
        if (_shortScratch == null || _shortScratch.Length < samples)
        {
            _shortScratch = new short[Mathf.NextPowerOfTwo(samples)];
        }

        return _shortScratch;
    }

    /// <summary>Releases the recording sound and clears its handle.</summary>
    private void ReleaseRecordSound()
    {
        _recSound.release();
        _recSound.clearHandle();
    }

    /// <summary>
    /// Stops feeding, stops the Whisper stream and releases the FMOD recording resources.
    /// Cleanup is best-effort: failures are logged and do not prevent the remaining steps.
    /// </summary>
    private void OnDisable()
    {
        DeactivateRecording();

        if (_stream != null)
        {
            try
            {
                _stream.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed during disable: {e}");
            }

            _stream = null;
        }

        if (_recSound.hasHandle())
        {
            try
            {
                _core.recordStop(recordDriverId);
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] recordStop failed during disable: {e}");
            }

            try
            {
                _recSound.release();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Sound release failed during disable: {e}");
            }

            _recSound.clearHandle();
        }
    }

    /// <summary>
    /// Adds the squared, normalized amplitudes of a PCM16 block to accumulator and the number of
    /// samples to sampleCount, using only safe code and the shared scratch buffer.
    /// The caller derives the RMS as sqrt(accumulator / sampleCount).
    /// </summary>
    private void ComputeRmsFromPcm16(IntPtr src, uint byteLen, ref float accumulator, ref int sampleCount)
    {
        int samples = (int)(byteLen / BytesPerSample);
        if (samples <= 0)
        {
            return;
        }

        // Copy PCM16 into the managed scratch buffer (safe)
        short[] pcm = GetShortScratch(samples);
        Marshal.Copy(src, pcm, 0, samples);

        for (int i = 0; i < samples; i++)
        {
            float v = pcm[i] / 32768f; // normalize to [-1..1)
            accumulator += v * v;
        }

        sampleCount += samples;
    }
}