// DeltaVR3DModelGeneration/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs
using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper; // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils; // AudioChunk
/// <summary>
/// Captures the microphone with FMOD and feeds audio chunks to a WhisperStream (no Unity Microphone API).
/// Optionally plays the captured audio back through FMOD for loopback monitoring.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
[Header("Whisper")]
[SerializeField] private WhisperManager whisper; // assign in Inspector
[SerializeField] private bool useVadInStream = false; // let WhisperStream do VAD or not
[Header("FMOD capture")]
[Tooltip("Recording device index (0 = default)")]
public int recordDriverId = 0;
[Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
public int desiredSampleRate = 48000;
[Tooltip("Mono recommended for Whisper")]
public int channels = 1;
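// Length (in seconds) of the looping FMOD record ring buffer created in Start().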
[Range(1, 10)] public int bufferLengthSec = 5;
[Header("Loopback (monitor your voice)")]
public bool playLoopback = true;
[Range(0f, 2f)] public float loopbackVolume = 1.0f;
public delegate void OnWhisperSegmentUpdatedDelegate(string result);
public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;
public delegate void OnWhisperSegmentFinishedDelegate(string result);
public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;
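// Both events deliver plain transcription text, e.g. a caller might do
//   bridge.OnWhisperSegmentFinished += text => UnityEngine.Debug.Log($"[Whisper] {text}");
// (the "bridge" variable is illustrative; see the example component at the end of this file).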
// FMOD
private FMOD.System _core;
private Sound _recSound;
private Channel _playChannel;
private ChannelGroup _masterGroup;
private uint _soundPcmLength; // in samples
private int _nativeRate;
private int _nativeChannels;
// ring-buffer tracking
private uint _lastRecordPos = 0;
// Whisper
private WhisperStream _stream;
private bool _streamStarted;
// temp conversion buffer
private float[] _floatTmp = new float[0];
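// When false, Update() skips reading the record ring buffer; FMOD itself keeps recording until OnDisable().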
private bool isRecordingActivated = false;
private void Awake()
{
if (!whisper) whisper = FindObjectOfType<WhisperManager>();
_core = RuntimeManager.CoreSystem; // FMOD core system
}
private async void Start()
{
// Query device info to get native rate/channels.
// (FMOD: getRecordDriverInfo gives you system rate & speaker mode)
string name;
Guid guid;
SPEAKERMODE sm;
int smChannels;
DRIVER_STATE driverState;
// signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState)
_core.getRecordDriverInfo(recordDriverId, out name, 256, out guid, out _nativeRate, out sm, out smChannels, out driverState);
_nativeChannels = channels > 0 ? channels : smChannels;
UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={_nativeRate} ch={_nativeChannels}");
// Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL).
CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
{
cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
numchannels = _nativeChannels,
defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
format = SOUND_FORMAT.PCM16,
length = (uint)((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels * sizeof(short) * bufferLengthSec) // ring buffer of bufferLengthSec seconds (we loop)
};
_core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
_recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
// Start FMOD recording into that sound (looping ring buffer).
_core.recordStart(recordDriverId, _recSound, true);
UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");
// Optional loopback playback using FMOD (plays same sound ring buffer).
_core.getMasterChannelGroup(out _masterGroup);
if (playLoopback)
{
_core.playSound(_recSound, _masterGroup, false, out _playChannel);
_playChannel.setMode(MODE._2D);
_playChannel.setVolume(loopbackVolume);
UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
}
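// Caveat (assumption based on FMOD's own record examples): starting playback on the same
// buffer immediately lets the play cursor sit right next to the record cursor; FMOD's
// samples usually wait a short latency period before starting playback to avoid artifacts.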
// Create Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels).
// We'll push AudioChunk manually.
// NOTE: WhisperStream's sliding window is governed by the manager's stepSec/keepSec/lengthSec.
_stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels);
_stream.OnResultUpdated += (txt) =>
{
//OnWhisperResultProcessed?.Invoke(txt);
//UnityEngine.Debug.Log($"[Whisper] result updated: {txt}");
};
_stream.OnSegmentUpdated += (seg) =>
{
OnWhisperSegmentUpdated?.Invoke(seg.Result);
//UnityEngine.Debug.Log($"[Whisper] Seg updated: {seg.Result}");
};
_stream.OnSegmentFinished += (seg) =>
{
OnWhisperSegmentFinished?.Invoke(seg.Result);
//UnityEngine.Debug.Log($"[Whisper] Seg finished: {seg.Result}");
};
// To let Whisper apply VAD to the stream, enable useVad on the manager (the manager controls the stream parameters).
whisper.useVad = useVadInStream;
_stream.StartStream();
_streamStarted = true;
// pre-allocate temp buffers for roughly 100 ms of audio
EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels);
}
private void Update()
{
if (!isRecordingActivated) return;
if (_core.handle != IntPtr.Zero) _core.update();
if (!_streamStarted || !_recSound.hasHandle()) return;
// How many samples recorded since last frame?
uint recPos;
_core.getRecordPosition(recordDriverId, out recPos);
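// The record position wraps when FMOD loops the ring buffer, so compute the delta modulo the buffer length in PCM samples.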
uint deltaSamples = (recPos >= _lastRecordPos)
? (recPos - _lastRecordPos)
: (recPos + _soundPcmLength - _lastRecordPos);
if (deltaSamples == 0) return;
// We'll read that region (16-bit) and convert it to float[] in [-1, 1].
// Calculate byte range to lock in sound buffer
uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 16-bit = 2 bytes
uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;
IntPtr p1, p2;
uint len1, len2;
// Lock can wrap — FMOD splits into p1/p2.
_recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
try
{
// Convert both parts to float and push to Whisper
if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
}
finally
{
_recSound.unlock(p1, p2, len1, len2);
}
_lastRecordPos = recPos;
}
public void ActivateRecording()
{
isRecordingActivated = true;
}
public void DeactivateRecording()
{
isRecordingActivated = false;
}
private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
{
int samples = (int)(byteLen / 2); // 2 bytes per sample
EnsureTmpCapacity(samples);
// Marshal.Copy the 16-bit PCM into a reusable managed short[] scratch buffer,
// then convert it to float in [-1, 1].
// (An unsafe pointer read could skip the intermediate short[] if this shows up in profiling.)
int shorts = samples;
EnsureShortOverlay(shorts, out short[] sBuf);
Marshal.Copy(src, sBuf, 0, shorts);
for (int i = 0; i < shorts; i++)
{
// Dividing by 32768 maps PCM16 onto [-1, 1]: -32768 lands exactly on -1, 32767 just below +1
_floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f);
}
// Build a chunk for WhisperStream; with VAD off, IsVoiceDetected=true is fine.
var chunk = new AudioChunk
{
Data = _floatTmp.AsSpan(0, shorts).ToArray(),
Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
Channels = _nativeChannels,
IsVoiceDetected = true
};
_stream.AddToStream(chunk);
}
private short[] _shortOverlay;
private void EnsureShortOverlay(int samples, out short[] buf)
{
if (_shortOverlay == null || _shortOverlay.Length < samples)
_shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
buf = _shortOverlay;
}
private void EnsureTmpCapacity(int samples)
{
if (_floatTmp == null || _floatTmp.Length < samples)
_floatTmp = new float[Mathf.NextPowerOfTwo(samples)];
}
private void OnDisable()
{
if (_streamStarted)
{
_stream.StopStream();
_streamStarted = false;
}
if (_playChannel.hasHandle()) { _playChannel.stop(); _playChannel.clearHandle(); }
if (_recSound.hasHandle()) { _core.recordStop(recordDriverId); _recSound.release(); _recSound.clearHandle(); }
}
}
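// --------------------------------------------------------------------------
// Minimal usage sketch (not part of the original class): a push-to-talk driver
// that toggles the bridge and logs finished segments. The component and field
// names below are illustrative assumptions; if used, place this in its own file
// named FMODWhisperBridgeExample.cs and wire it up as your scene requires.
// --------------------------------------------------------------------------
public class FMODWhisperBridgeExample : MonoBehaviour
{
[SerializeField] private FMODWhisperBridge bridge; // assign in Inspector, or found at runtime
[SerializeField] private KeyCode pushToTalkKey = KeyCode.Space;
private void OnEnable()
{
if (!bridge) bridge = FindObjectOfType<FMODWhisperBridge>();
if (bridge) bridge.OnWhisperSegmentFinished += HandleSegmentFinished;
}
private void OnDisable()
{
if (bridge) bridge.OnWhisperSegmentFinished -= HandleSegmentFinished;
}
private void Update()
{
if (!bridge) return;
// Hold the key to feed microphone audio to Whisper; release to stop feeding.
if (Input.GetKeyDown(pushToTalkKey)) bridge.ActivateRecording();
if (Input.GetKeyUp(pushToTalkKey)) bridge.DeactivateRecording();
}
private void HandleSegmentFinished(string text)
{
UnityEngine.Debug.Log($"[Whisper] Finished segment: {text}");
}
}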