forked from cgvr/DeltaVR
microphone recording working with FMOD!!!
Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs (new file, 238 lines)
@@ -0,0 +1,238 @@
using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper; // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils; // AudioChunk

/// <summary>
/// Capture microphone with FMOD and feed chunks to WhisperStream (no Unity Microphone).
/// Also (optionally) plays the recorded sound back via FMOD loopback.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper; // assign in Inspector
    [SerializeField] private bool useVadInStream = false; // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    public delegate void OnWhisperResultProcessedDelegate(string result);
    public event OnWhisperResultProcessedDelegate OnWhisperResultProcessed;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;
    private Channel _playChannel;
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength; // in samples
    private int _nativeRate;
    private int _nativeChannels;

    // ring-buffer tracking
    private uint _lastRecordPos = 0;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // temp conversion buffer
    private float[] _floatTmp = new float[0];

    private bool isRecordingActivated = false;
    private void Awake()
    {
        if (!whisper) whisper = FindObjectOfType<WhisperManager>();
        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    private async void Start()
    {
        // Query device info to get native rate/channels.
        // (FMOD: getRecordDriverInfo gives you system rate & speaker mode)
        string name;
        Guid guid;
        SPEAKERMODE sm;
        int smChannels;
        DRIVER_STATE driverState;
        // signature: getRecordDriverInfo(id, out name, nameLen, out guid, out systemrate, out speakermode, out speakermodechannels, out driverState)
        _core.getRecordDriverInfo(recordDriverId, out name, 256, out guid, out _nativeRate, out sm, out smChannels, out driverState);
        _nativeChannels = channels > 0 ? channels : smChannels;
        UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\" rate={_nativeRate} ch={_nativeChannels}");

        // Build a user sound buffer that FMOD will fill (OPENUSER | LOOP_NORMAL).
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(((_nativeRate > 0 ? _nativeRate : desiredSampleRate) * _nativeChannels) * sizeof(short)) // seconds=1 (we loop)
        };
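        // Worked example (assuming the Quest default of 48 kHz mono):
        // length = 48000 samples/s * 1 channel * 2 bytes = 96000 bytes, i.e. exactly one second
        // of PCM16 that FMOD records into as a looping ring buffer.
        // NOTE: bufferLengthSec is declared above but not applied here; the ring buffer stays at 1 second.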

        _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);

        // Start FMOD recording into that sound (looping ring buffer).
        _core.recordStart(recordDriverId, _recSound, true);
        UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");

        // Optional loopback playback using FMOD (plays the same sound ring buffer).
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
            _core.playSound(_recSound, _masterGroup, false, out _playChannel);
            _playChannel.setMode(MODE._2D);
            _playChannel.setVolume(loopbackVolume);
            UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
        }

        // Create the Whisper stream WITHOUT MicrophoneRecord, just from (freq, channels).
        // We'll push AudioChunk instances manually.
        // NOTE: WhisperStream's sliding window is governed by the manager's stepSec/keepSec/lengthSec.
        _stream = await whisper.CreateStream(ex.defaultfrequency, _nativeChannels);
        _stream.OnResultUpdated += (txt) =>
        {
            //OnWhisperResultProcessed?.Invoke(txt);
            //UnityEngine.Debug.Log($"[Whisper] result updated: {txt}");
        };
        _stream.OnSegmentUpdated += (seg) =>
        {
            OnWhisperResultProcessed?.Invoke(seg.Result);
            //UnityEngine.Debug.Log($"[Whisper] Seg finished: {seg.Result}");
        };

        // If you want Whisper to respect VAD, enable it in the manager or set useVad (the manager controls stream params).
        whisper.useVad = useVadInStream;

        _stream.StartStream();
        _streamStarted = true;

        // Prepare temp arrays for roughly 100 ms of audio.
        EnsureTmpCapacity((ex.defaultfrequency / 10) * _nativeChannels);
    }

    private void Update()
    {
        if (!isRecordingActivated) return;
        if (_core.handle != IntPtr.Zero) _core.update();
        if (!_streamStarted || !_recSound.hasHandle()) return;

        // How many samples have been recorded since last frame?
        uint recPos;
        _core.getRecordPosition(recordDriverId, out recPos);

        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);
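        // Example: with a 48000-sample ring buffer, _lastRecordPos = 47500 and recPos = 500
        // means the write cursor wrapped, so deltaSamples = 500 + 48000 - 47500 = 1000 new samples.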

        if (deltaSamples == 0) return;

        // We'll read that region (16-bit) and convert it to float[] in [-1..1].
        // Calculate the byte range to lock in the sound buffer.
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2; // 16-bit = 2 bytes
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;

        IntPtr p1, p2;
        uint len1, len2;
        // The lock can wrap; FMOD splits the region into p1/p2.
        _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);

        try
        {
            // Convert both parts to float and push them to Whisper.
            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _lastRecordPos = recPos;
    }

    public void ActivateRecording()
    {
        isRecordingActivated = true;
    }

    public void DeactivateRecording()
    {
        isRecordingActivated = false;
    }

    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / 2); // 2 bytes per 16-bit sample
        EnsureTmpCapacity(samples);

        // Marshal the 16-bit PCM into managed memory, reusing a short[] scratch buffer
        // so there are no per-frame allocations (a single managed copy per region).
        int shorts = samples;

        // Marshal.Copy into the short[] then convert to float in [-1..1].
        // (You can also do an unsafe copy for speed if needed.)
        EnsureShortOverlay(shorts, out short[] sBuf);
        Marshal.Copy(src, sBuf, 0, shorts);

        for (int i = 0; i < shorts; i++)
        {
            // Dividing by 32768 maps the full 16-bit range [-32768, 32767] into [-1, 1); the clamp is just a guard.
            _floatTmp[i] = Mathf.Clamp(sBuf[i] / 32768f, -1f, 1f);
        }

        // Build a chunk for WhisperStream; with VAD off, IsVoiceDetected=true is fine.
        var chunk = new AudioChunk
        {
            Data = _floatTmp.AsSpan(0, shorts).ToArray(),
            Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };

        _stream.AddToStream(chunk);
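        // Chunk sizes track the frame rate rather than a fixed hop: at 48 kHz mono and roughly 72 fps
        // on Quest, Update() delivers about 48000 / 72 ≈ 667 samples (~14 ms) per frame; the stream's
        // sliding window (stepSec/keepSec/lengthSec on the manager) handles the accumulation.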
    }

    private short[] _shortOverlay;
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        if (_shortOverlay == null || _shortOverlay.Length < samples)
            _shortOverlay = new short[Mathf.NextPowerOfTwo(samples)];
        buf = _shortOverlay;
    }

    private void EnsureTmpCapacity(int samples)
    {
        if (_floatTmp == null || _floatTmp.Length < samples)
            _floatTmp = new float[Mathf.NextPowerOfTwo(samples)];
    }
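    // Both Ensure* helpers grow their buffers to the next power of two, so reallocations stay rare
    // as the per-frame chunk size fluctuates.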

    private void OnDisable()
    {
        if (_streamStarted)
        {
            _stream.StopStream();
            _streamStarted = false;
        }

        if (_playChannel.hasHandle()) { _playChannel.stop(); _playChannel.clearHandle(); }
        if (_recSound.hasHandle()) { _core.recordStop(recordDriverId); _recSound.release(); _recSound.clearHandle(); }
    }
}
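
// --- Hypothetical usage sketch (not part of the original commit) ---
// A minimal listener showing how the public API above is meant to be driven: it assumes an
// FMODWhisperBridge exists in the scene and is assigned in the Inspector. The class and field
// names below are illustrative only.
public class WhisperTranscriptLogger : MonoBehaviour
{
    [SerializeField] private FMODWhisperBridge bridge;

    private void OnEnable()
    {
        bridge.OnWhisperResultProcessed += HandleSegment;
        bridge.ActivateRecording(); // start draining the FMOD ring buffer in Update()
    }

    private void OnDisable()
    {
        bridge.DeactivateRecording();
        bridge.OnWhisperResultProcessed -= HandleSegment;
    }

    private void HandleSegment(string text)
    {
        UnityEngine.Debug.Log($"[Whisper] segment: {text}");
    }
}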