// DeltaVR3DModelGeneration/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs


using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper;          // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils;    // AudioChunk

/// <summary>
/// Captures microphone audio through the FMOD core API (no Unity <c>Microphone</c>) and
/// streams it to a <see cref="WhisperStream"/> for live transcription.
/// Optionally plays the captured audio back through FMOD as a loopback monitor.
/// </summary>
/// <remarks>
/// FMOD records into a looping PCM16 user sound that is used as a ring buffer. While
/// recording is activated (<see cref="ActivateRecording"/>), each <c>Update</c> reads the
/// samples written since the previous frame, converts them to floats and pushes them to
/// the Whisper stream. Capture is set up once in <c>Start</c> and torn down in
/// <c>OnDisable</c>; re-enabling the component does not restart capture.
/// </remarks>
public class FMODWhisperBridge : MonoBehaviour
{
    // The ring buffer is PCM16, so every sample occupies two bytes.
    private const int BytesPerSample = sizeof(short);

    [Header("Whisper")]
    [SerializeField] private WhisperManager whisper;        // assign in Inspector
    [SerializeField] private bool useVadInStream = false;   // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    public bool playLoopback = true;
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    /// <summary>Signature of <see cref="OnWhisperSegmentUpdated"/> handlers.</summary>
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);

    /// <summary>Raised with the segment text whenever the Whisper stream updates a segment.</summary>
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    /// <summary>Signature of <see cref="OnWhisperSegmentFinished"/> handlers.</summary>
    public delegate void OnWhisperSegmentFinishedDelegate(string result);

    /// <summary>Raised with the segment text whenever the Whisper stream finishes a segment.</summary>
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;
    private Sound _recSound;               // looping user sound FMOD records into
    private Channel _playChannel;          // loopback playback channel (if enabled)
    private ChannelGroup _masterGroup;
    private uint _soundPcmLength;          // ring buffer length, in PCM sample frames
    private int _sampleRate;               // rate of the ring buffer and of the Whisper stream
    private int _nativeChannels;

    // Ring-buffer read cursor (PCM sample frames): where the previous read ended.
    private uint _lastRecordPos;

    // Whisper
    private WhisperStream _stream;
    private bool _streamStarted;

    // Reusable scratch buffer for marshalling PCM16 data out of FMOD.
    private short[] _pcmScratch;

    private bool _recordingActive;

    private void Awake()
    {
        if (!whisper) whisper = FindObjectOfType<WhisperManager>();
        _core = RuntimeManager.CoreSystem; // FMOD core system
    }

    /// <summary>
    /// Starts FMOD capture (and optional loopback), then creates and starts the Whisper stream.
    /// </summary>
    private async void Start()
    {
        if (!whisper)
        {
            UnityEngine.Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found in the scene; capture not started.");
            return;
        }

        if (!StartCapture()) return;
        if (playLoopback) StartLoopback();

        // Set the VAD flag BEFORE creating the stream so the stream is created with it.
        whisper.useVad = useVadInStream;

        // Create the Whisper stream from (frequency, channels) only; audio chunks are pushed manually.
        WhisperStream stream = await whisper.CreateStream(_sampleRate, _nativeChannels);

        // The component may have been destroyed or disabled while awaiting; OnDisable has
        // then already released the FMOD sound, so there is nothing left to stream.
        if (this == null || !_recSound.hasHandle()) return;

        if (stream == null)
        {
            UnityEngine.Debug.LogError("[FMOD→Whisper] WhisperManager.CreateStream returned no stream; transcription disabled.");
            return;
        }

        _stream = stream;
        _stream.OnSegmentUpdated += (seg) => OnWhisperSegmentUpdated?.Invoke(seg.Result);
        _stream.OnSegmentFinished += (seg) => OnWhisperSegmentFinished?.Invoke(seg.Result);

        _stream.StartStream();
        _streamStarted = true;
    }

    /// <summary>
    /// Queries the record device, creates the looping PCM16 ring-buffer sound and starts recording into it.
    /// </summary>
    /// <returns><c>true</c> when FMOD is recording; <c>false</c> (after logging) on any failure.</returns>
    private bool StartCapture()
    {
        RESULT res = _core.getRecordDriverInfo(recordDriverId, out string deviceName, 256, out _,
            out int deviceRate, out _, out int deviceChannels, out _);
        if (res != RESULT.OK)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed for device #{recordDriverId}: {res}");
            return false;
        }

        // An explicit channel count wins; otherwise use what the device reports.
        _nativeChannels = channels > 0 ? channels : deviceChannels;
        // Prefer the device rate; fall back to the configured rate if the device reports none.
        _sampleRate = deviceRate > 0 ? deviceRate : desiredSampleRate;
        if (_nativeChannels <= 0 || _sampleRate <= 0)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] Invalid capture format: rate={_sampleRate} ch={_nativeChannels}");
            return false;
        }

        UnityEngine.Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{deviceName}\"  rate={deviceRate} ch={_nativeChannels}");

        // User sound that FMOD fills while recording (OPENUSER | LOOP_NORMAL), sized to bufferLengthSec.
        int bufferSeconds = Mathf.Max(1, bufferLengthSec);
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = _sampleRate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(_sampleRate * _nativeChannels * BytesPerSample * bufferSeconds)
        };

        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            _recSound.clearHandle();
            return false;
        }
        _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);

        // Start FMOD recording into that sound (looping ring buffer).
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            UnityEngine.Debug.LogError($"[FMOD→Whisper] recordStart failed for device #{recordDriverId}: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return false;
        }

        _lastRecordPos = 0;
        UnityEngine.Debug.Log("[FMOD→Whisper] Recording started.");
        return true;
    }

    /// <summary>Plays the recording ring buffer on the master channel group as a 2D monitor.</summary>
    private void StartLoopback()
    {
        _core.getMasterChannelGroup(out _masterGroup);
        RESULT res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
        if (res != RESULT.OK)
        {
            UnityEngine.Debug.LogWarning($"[FMOD→Whisper] Loopback playback failed: {res}");
            _playChannel.clearHandle();
            return;
        }

        _playChannel.setMode(MODE._2D);
        _playChannel.setVolume(loopbackVolume);
        UnityEngine.Debug.Log("[FMOD→Whisper] Loopback playback started.");
    }

    /// <summary>
    /// While recording is activated, drains the samples FMOD recorded since the last frame
    /// from the ring buffer and feeds them to the Whisper stream.
    /// </summary>
    private void Update()
    {
        if (!_recordingActive) return;
        // NOTE(review): the FMOD Unity RuntimeManager normally updates the FMOD system itself;
        // confirm this extra core update is actually required.
        if (_core.handle != IntPtr.Zero) _core.update();
        if (!_streamStarted || !_recSound.hasHandle() || _soundPcmLength == 0) return;

        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK) return;

        // Sample frames written since the previous read, accounting for ring-buffer wrap-around.
        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);
        if (deltaSamples == 0) return;

        uint bytesPerFrame = (uint)(_nativeChannels * BytesPerSample);
        uint startBytes = _lastRecordPos * bytesPerFrame;
        uint bytesToRead = deltaSamples * bytesPerFrame;

        // The locked region may wrap past the end of the buffer; FMOD then returns it as two parts.
        RESULT lockResult = _recSound.@lock(startBytes, bytesToRead,
            out IntPtr p1, out IntPtr p2, out uint len1, out uint len2);
        if (lockResult != RESULT.OK)
        {
            // Drop this region rather than unlocking pointers that were never locked.
            UnityEngine.Debug.LogWarning($"[FMOD→Whisper] Failed to lock record buffer: {lockResult}");
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _lastRecordPos = recPos;
    }

    /// <summary>
    /// Starts forwarding captured audio to Whisper. Audio recorded while deactivated is skipped.
    /// </summary>
    public void ActivateRecording()
    {
        // Update does not advance the read cursor while deactivated, so move it to the live
        // record position; otherwise stale audio from before activation would be transcribed.
        if (!_recordingActive
            && _recSound.hasHandle()
            && _core.getRecordPosition(recordDriverId, out uint recPos) == RESULT.OK)
        {
            _lastRecordPos = recPos;
        }

        _recordingActive = true;
    }

    /// <summary>Stops forwarding captured audio to Whisper (FMOD keeps recording).</summary>
    public void DeactivateRecording()
    {
        _recordingActive = false;
    }

    /// <summary>
    /// Converts a block of native PCM16 samples to floats in [-1, 1) and pushes it to the Whisper stream.
    /// </summary>
    /// <param name="src">Pointer to the locked PCM16 data.</param>
    /// <param name="byteLen">Length of the data at <paramref name="src"/>, in bytes.</param>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int sampleCount = (int)(byteLen / BytesPerSample);
        if (src == IntPtr.Zero || sampleCount == 0) return;

        // Marshal into a reusable short[] so only the chunk's own array is allocated per call.
        if (_pcmScratch == null || _pcmScratch.Length < sampleCount)
        {
            _pcmScratch = new short[Mathf.NextPowerOfTwo(sampleCount)];
        }
        Marshal.Copy(src, _pcmScratch, 0, sampleCount);

        // The chunk needs an array of its own, so convert straight into it.
        float[] samples = new float[sampleCount];
        for (int i = 0; i < sampleCount; i++)
        {
            // Dividing by 32768 maps the full short range into [-1, 1) without clipping.
            samples[i] = _pcmScratch[i] / 32768f;
        }

        // Voice is always flagged as detected here; no detection is done on this side.
        var chunk = new AudioChunk
        {
            Data = samples,
            Frequency = _sampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };

        _stream.AddToStream(chunk);
    }

    /// <summary>Stops the Whisper stream and releases all FMOD capture resources.</summary>
    private void OnDisable()
    {
        if (_streamStarted)
        {
            _stream.StopStream();
            _streamStarted = false;
        }

        StopCapture();
    }

    /// <summary>Stops loopback playback and recording, and releases the ring-buffer sound.</summary>
    private void StopCapture()
    {
        if (_playChannel.hasHandle())
        {
            _playChannel.stop();
            _playChannel.clearHandle();
        }

        if (_recSound.hasHandle())
        {
            _core.recordStop(recordDriverId);
            _recSound.release();
            _recSound.clearHandle();
        }
    }
}