forked from cgvr/DeltaVR
247 lines
8.9 KiB
C#
247 lines
8.9 KiB
C#
|
||
using System;
|
||
using System.Runtime.InteropServices;
|
||
using UnityEngine;
|
||
using FMOD;
|
||
using FMODUnity;
|
||
using Whisper; // WhisperManager, WhisperStream, WhisperResult
|
||
using Whisper.Utils; // AudioChunk
|
||
|
||
/// <summary>
/// Captures microphone audio with FMOD and feeds it, chunk by chunk, to a WhisperStream
/// (Unity's Microphone API is not used).
/// Optionally plays the recorded sound back through FMOD as a loopback monitor.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper;       // assign in Inspector; looked up in Awake when unset
    [SerializeField] private bool useVadInStream = false;  // copied to WhisperManager.useVad before the stream is created

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    // NOTE(review): this value is not applied anywhere in this class; the ring buffer
    // created in Start is always one second long.
    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    /// <summary>Signature of the handler invoked with the text of a segment that was updated.</summary>
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
    /// <summary>Raised whenever the Whisper stream reports an updated segment.</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    /// <summary>Signature of the handler invoked with the text of a finished segment.</summary>
    public delegate void OnWhisperSegmentFinishedDelegate(string result);
    /// <summary>Raised whenever the Whisper stream reports a finished segment.</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength; // ring-buffer length in PCM samples (per channel)
    private int _nativeRate;      // rate reported by the record driver (0 when unknown)
    private int _nativeChannels;  // channel count used for the sound and for Whisper
    private int _sampleRate;      // rate actually used: driver rate, or desiredSampleRate as fallback

    // ring-buffer tracking: PCM position up to which audio was already forwarded
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // reusable scratch buffer for the raw 16-bit samples copied out of FMOD
    private short[] _shortOverlay;

    private bool isRecordingActivated = false;
    private bool _lockFailureLogged;

    /// <summary>
    /// Resolves the WhisperManager (scene lookup when none is assigned) and caches the FMOD core system.
    /// </summary>
    private void Awake()
    {
        if (!whisper)
        {
            whisper = FindObjectOfType<WhisperManager>();
        }

        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    /// <summary>
    /// Creates the FMOD user sound used as a looping record buffer, starts recording and the
    /// optional loopback, then creates and starts the Whisper stream.
    /// Any failing FMOD step that makes capture impossible is logged and aborts the setup.
    /// </summary>
    private async void Start()
    {
        // Query device info to get the native rate and the speaker-mode channel count.
        RESULT result = _core.getRecordDriverInfo(
            recordDriverId, out string name, 256, out Guid _, out _nativeRate,
            out SPEAKERMODE _, out int smChannels, out DRIVER_STATE _);
        if (result != RESULT.OK)
        {
            UnityEngine.Debug.LogWarning($"[FMOD→Whisper] getRecordDriverInfo failed ({result}); falling back to configured values.");
        }

        _nativeChannels = channels > 0 ? channels : smChannels;
        if (_nativeChannels <= 0)
        {
            _nativeChannels = 1; // neither the Inspector nor the driver gave a usable count
        }

        _sampleRate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
        UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={_nativeRate} ch={_nativeChannels}");

        // Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL).
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = _sampleRate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(_sampleRate * _nativeChannels * sizeof(short)) // one second of audio; the sound loops
        };

        result = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (result != RESULT.OK)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] createSound failed: {result}");
            _recSound.clearHandle();
            return;
        }

        result = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (result != RESULT.OK || _soundPcmLength == 0)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] Could not determine record buffer length: {result}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Start FMOD recording into that sound (looping ring buffer).
        result = _core.recordStart(recordDriverId, _recSound, true);
        if (result != RESULT.OK)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] recordStart failed: {result}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }
        UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");

        // Optional loopback playback using FMOD (plays the same ring-buffer sound).
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
            result = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
            if (result == RESULT.OK)
            {
                _playChannel.setMode(MODE._2D);
                _playChannel.setVolume(loopbackVolume);
                UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
            }
            else
            {
                UnityEngine.Debug.LogWarning($"[FMOD→Whisper] Loopback playSound failed: {result}");
            }
        }

        if (!whisper)
        {
            UnityEngine.Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found; transcription disabled.");
            return;
        }

        // The VAD setting is applied BEFORE the stream is created so it is in place when the
        // manager builds the stream (presumably the manager snapshots its settings in
        // CreateStream — confirm against the whisper.unity version in use).
        whisper.useVad = useVadInStream;

        // Create the Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels).
        // AudioChunks are pushed manually from Update.
        WhisperStream stream = await whisper.CreateStream(_sampleRate, _nativeChannels);
        if (stream == null)
        {
            UnityEngine.Debug.LogError("[FMOD→Whisper] WhisperManager.CreateStream returned no stream.");
            return;
        }

        // The component may have been destroyed or disabled while awaiting; OnDisable has then
        // already released the sound and nothing would ever stop a stream started now.
        if (this == null || !_recSound.hasHandle())
        {
            return;
        }

        _stream = stream;
        _stream.OnSegmentUpdated += (seg) =>
        {
            OnWhisperSegmentUpdated?.Invoke(seg.Result);
        };
        _stream.OnSegmentFinished += (seg) =>
        {
            OnWhisperSegmentFinished?.Invoke(seg.Result);
        };

        _stream.StartStream();
        _streamStarted = true;

        // Pre-size the scratch buffer for roughly 100 ms of audio.
        EnsureShortOverlay((_sampleRate / 10) * _nativeChannels, out _);
    }

    /// <summary>
    /// While recording is activated, forwards every sample FMOD recorded since the previous
    /// call to the Whisper stream and advances the read cursor.
    /// </summary>
    private void Update()
    {
        if (!isRecordingActivated)
        {
            return;
        }

        if (_core.handle != IntPtr.Zero)
        {
            _core.update();
        }

        if (!_streamStarted || !_recSound.hasHandle() || _soundPcmLength == 0)
        {
            return;
        }

        // How many samples were recorded since the last forwarded position?
        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK)
        {
            return;
        }

        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos); // record cursor wrapped around

        if (deltaSamples == 0)
        {
            return;
        }

        // Byte range of that region inside the 16-bit interleaved sound buffer.
        uint bytesPerFrame = (uint)_nativeChannels * sizeof(short);
        uint startBytes = _lastRecordPos * bytesPerFrame;
        uint bytesToRead = deltaSamples * bytesPerFrame;

        // The lock may wrap past the end of the buffer; the region then comes back in two parts.
        RESULT lockResult = _recSound.@lock(startBytes, bytesToRead, out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (lockResult == RESULT.OK)
        {
            try
            {
                // Convert both parts to float and push them to Whisper in order.
                if (len1 > 0)
                {
                    CopyPcm16ToFloatAndFeed(p1, len1);
                }

                if (len2 > 0)
                {
                    CopyPcm16ToFloatAndFeed(p2, len2);
                }
            }
            finally
            {
                _recSound.unlock(p1, p2, len1, len2);
            }
        }
        else if (!_lockFailureLogged)
        {
            // Logged once only, so a persistent failure does not flood the console every frame.
            _lockFailureLogged = true;
            UnityEngine.Debug.LogWarning($"[FMOD→Whisper] Sound lock failed: {lockResult}");
        }

        _lastRecordPos = recPos;
    }

    /// <summary>
    /// Enables forwarding of recorded audio to Whisper in Update.
    /// The read cursor is not moved here, so the first Update after activation forwards the
    /// region between the cursor left by the last active Update and the current record position.
    /// </summary>
    public void ActivateRecording()
    {
        isRecordingActivated = true;
    }

    /// <summary>
    /// Disables forwarding of recorded audio to Whisper; Update becomes a no-op.
    /// </summary>
    public void DeactivateRecording()
    {
        isRecordingActivated = false;
    }

    /// <summary>
    /// Copies 16-bit PCM from native memory, converts it to floats and adds it to the
    /// Whisper stream as one AudioChunk.
    /// </summary>
    /// <param name="src">Pointer to the first 16-bit sample.</param>
    /// <param name="byteLen">Number of bytes readable at <paramref name="src"/>.</param>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / sizeof(short));
        if (samples == 0 || src == IntPtr.Zero || _stream == null)
        {
            return;
        }

        // Marshal the raw samples into the reusable managed scratch buffer.
        EnsureShortOverlay(samples, out short[] raw);
        Marshal.Copy(src, raw, 0, samples);

        // Each chunk gets its own array because it is handed over to the stream.
        // Dividing by 32768 maps the full short range into [-1, 1), so no clamping is needed.
        float[] data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = raw[i] / 32768f;
        }

        // Every chunk is flagged as containing voice; no detection is done on this side.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = _sampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };

        _stream.AddToStream(chunk);
    }

    /// <summary>
    /// Makes sure the scratch buffer holds at least <paramref name="samples"/> elements,
    /// growing it to the next power of two when it is too small.
    /// </summary>
    /// <param name="samples">Minimum number of elements required.</param>
    /// <param name="buf">Receives the (possibly reallocated) scratch buffer.</param>
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
        {
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        }

        buf = _shortOverlay;
    }

    /// <summary>
    /// Stops the Whisper stream, the loopback channel and the FMOD recording, and releases the sound.
    /// </summary>
    private void OnDisable()
    {
        if (_streamStarted)
        {
            _stream?.StopStream();
            _streamStarted = false;
        }

        if (_playChannel.hasHandle())
        {
            _playChannel.stop();
            _playChannel.clearHandle();
        }

        if (_recSound.hasHandle())
        {
            _core.recordStop(recordDriverId);
            _recSound.release();
            _recSound.clearHandle();
        }
    }
}
|