DeltaVR3DModelGeneration/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs


using System;
using System.Runtime.InteropServices;
using UnityEngine;
using FMOD;
using FMODUnity;
using Whisper;          // WhisperManager, WhisperStream, WhisperResult
using Whisper.Utils;
using Debug = UnityEngine.Debug;    // AudioChunk

/// <summary>
/// FMOD mic is initialized once (Start) and runs continuously in a ring buffer.
/// Whisper stream is created when ActivateRecording() is called and disposed on DeactivateRecording().
/// Optional loopback can be paused/resumed instead of starting/stopping to avoid stalls.
/// </summary>
public class FMODWhisperBridge : MonoBehaviour
{
    [Header("Whisper")]
    // Whisper entry point used to create streams. Awake() falls back to FindObjectOfType when left unassigned.
    [SerializeField] private WhisperManager whisper;        // assign in Inspector
    // Copied onto whisper.useVad when recording is activated.
    [SerializeField] private bool useVadInStream = true;   // let WhisperStream do VAD or not

    [Header("FMOD capture")]
    // Driver id handed to every FMOD record call in this class (driver info, start, position, stop).
    [Tooltip("Recording device index (0 = default)")]
    public int recordDriverId = 0;
    // NOTE: the code prefers the rate reported by the record driver and only uses this value
    // when the reported rate is not positive (the tooltip wording reads the other way round).
    [Tooltip("Set 48000 on Quest; falls back to device rate automatically")]
    public int desiredSampleRate = 48000;
    // When > 0 this is the channel count of the ring buffer and of the chunks fed to Whisper;
    // otherwise the driver's speaker-mode channel count is used.
    [Tooltip("Mono recommended for Whisper")]
    public int channels = 1;
    // Length of the looping record buffer, in seconds.
    [Range(1, 10)] public int bufferLengthSec = 5;

    [Header("Loopback (monitor your voice)")]
    // When true, Start() plays the record sound back on the master channel group.
    public bool playLoopback = true;
    // When true, the loopback channel is paused right after creation and again on DeactivateRecording().
    [Tooltip("If true, loopback plays only while active; otherwise it’s always on.")]
    public bool loopbackOnlyWhenActive = true;
    // Applied to the loopback channel once, in Start(); not re-applied anywhere else in this file.
    [Range(0f, 2f)] public float loopbackVolume = 1.0f;

    // Raised from the stream's OnSegmentUpdated handler with the post-processed text;
    // not raised when the cleaned text is empty.
    public delegate void OnWhisperSegmentUpdatedDelegate(string result);
    public event OnWhisperSegmentUpdatedDelegate OnWhisperSegmentUpdated;

    // Raised from the stream's OnSegmentFinished handler with the post-processed text;
    // not raised when the cleaned text is empty.
    public delegate void OnWhisperSegmentFinishedDelegate(string result);
    public event OnWhisperSegmentFinishedDelegate OnWhisperSegmentFinished;

    // FMOD
    private FMOD.System _core;             // RuntimeManager.CoreSystem, cached in Awake()
    private Sound _recSound;               // looping user sound that FMOD records into (the ring buffer)
    private Channel _playChannel;          // loopback playback of _recSound (only when playLoopback)
    private ChannelGroup _masterGroup;     // master channel group the loopback channel plays on
    private uint _soundPcmLength;          // in samples
    private int _nativeRate;               // system rate reported by getRecordDriverInfo
    private int _nativeChannels;           // channel count actually used for the buffer and for Whisper chunks

    // ring-buffer tracking
    private uint _lastRecordPos = 0;       // record position consumed up to by the previous Update()

    // Whisper
    private WhisperStream _stream;         // created in ActivateRecording(), dropped in DeactivateRecording()
    private bool _streamStarted;           // true between StartStream() and StopStream()

    // temp conversion buffer
    private float[] _floatTmp = new float[0];   // float scratch buffer, grown on demand by EnsureTmpCapacity()
    private short[] _shortOverlay;              // PCM16 staging buffer, grown on demand by EnsureShortOverlay()

    // activation flag
    private bool isRecordingActivated = false;  // true while captured audio should be forwarded to Whisper
    private bool _skipOneFeedFrame = false;     // set on activation; Update() discards one captured span while it is set

    /// <summary>
    /// Resolves dependencies before Start(): uses the Inspector-assigned WhisperManager when
    /// present, otherwise looks one up in the scene, and caches the FMOD core system.
    /// </summary>
    private void Awake()
    {
        if (whisper == null)
        {
            // Nothing wired in the Inspector: fall back to a scene lookup.
            whisper = FindObjectOfType<WhisperManager>();
        }

        // FMOD core system
        _core = RuntimeManager.CoreSystem;
    }

    /// <summary>
    /// One-time FMOD setup: queries the record driver, creates a looping PCM16 user sound that
    /// serves as the ring buffer, starts recording into it and, when <see cref="playLoopback"/>
    /// is set, starts a loopback channel. No Whisper stream is created here.
    /// </summary>
    /// <remarks>
    /// Every failure path logs and returns with <c>_recSound</c> left without a handle, so code
    /// that checks <c>_recSound.hasHandle()</c> sees the component as not initialised.
    /// </remarks>
    private void Start()
    {
        // -------------- FMOD initialize ONCE --------------
        // Query device info
        string name;
        Guid guid;
        SPEAKERMODE sm;
        int smChannels;
        DRIVER_STATE driverState;

        var res = _core.getRecordDriverInfo(
            recordDriverId,
            out name, 256,
            out guid,
            out _nativeRate,
            out sm,
            out smChannels,
            out driverState
        );

        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] getRecordDriverInfo failed: {res}");
            return;
        }

        // Inspector value wins; otherwise use what the driver reports. Never go below mono,
        // because a zero channel count would yield a zero-length buffer.
        _nativeChannels = channels > 0 ? channels : smChannels;
        if (_nativeChannels <= 0) _nativeChannels = 1;

        int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;
        Debug.Log($"[FMOD→Whisper] Using input device #{recordDriverId}: \"{name}\"  rate={rate} ch={_nativeChannels}");

        // Build user sound (ring buffer) — multiple seconds
        CREATESOUNDEXINFO ex = new CREATESOUNDEXINFO
        {
            cbsize = Marshal.SizeOf(typeof(CREATESOUNDEXINFO)),
            numchannels = _nativeChannels,
            defaultfrequency = rate,
            format = SOUND_FORMAT.PCM16,
            length = (uint)(rate * _nativeChannels * sizeof(short) * bufferLengthSec)
        };

        res = _core.createSound("", MODE.OPENUSER | MODE.LOOP_NORMAL | MODE.CREATESAMPLE, ref ex, out _recSound);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] createSound failed: {res}");
            _recSound.clearHandle();
            return;
        }

        // The buffer length (in PCM samples) drives the wrap-around arithmetic in Update();
        // a failed or zero length would make every delta wrong, so treat it as fatal.
        res = _recSound.getLength(out _soundPcmLength, TIMEUNIT.PCM);
        if (res != RESULT.OK || _soundPcmLength == 0)
        {
            Debug.LogError($"[FMOD→Whisper] getLength failed or returned 0: {res}");
            _soundPcmLength = 0;
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Start recording (looping)
        res = _core.recordStart(recordDriverId, _recSound, true);
        if (res != RESULT.OK)
        {
            Debug.LogError($"[FMOD→Whisper] recordStart failed: {res}");
            _recSound.release();
            _recSound.clearHandle();
            return;
        }

        // Initialize record position to avoid a huge first delta
        _core.getRecordPosition(recordDriverId, out _lastRecordPos);
        Debug.Log("[FMOD→Whisper] Recording started.");

        // Loopback channel (optional). Start once; pause when inactive if desired.
        _core.getMasterChannelGroup(out _masterGroup);
        if (playLoopback)
        {
            res = _core.playSound(_recSound, _masterGroup, false, out _playChannel);
            if (res == RESULT.OK && _playChannel.hasHandle())
            {
                _playChannel.setMode(MODE._2D);
                _playChannel.setVolume(loopbackVolume);
                if (loopbackOnlyWhenActive) _playChannel.setPaused(true); // keep muted until Activate
                Debug.Log("[FMOD→Whisper] Loopback playback ready.");
            }
            else
            {
                Debug.LogWarning($"[FMOD→Whisper] playSound failed or channel invalid: {res}");
            }
        }

        // No Whisper stream here. It will be created on ActivateRecording().
    }

    // Activation bookkeeping. ActivateRecording() awaits stream creation, so other calls can
    // arrive before it finishes. The ticket identifies the most recent request; a pending
    // activation whose ticket no longer matches was cancelled by DeactivateRecording().
    private int _activationTicket;
    private bool _activationPending;

    /// <summary>
    /// Creates a fresh Whisper stream and starts feeding audio (FMOD already recording).
    /// </summary>
    /// <remarks>
    /// Fire-and-forget: the method returns at the first await and finishes later. Calls made
    /// while already active, or while an activation is still pending, are ignored. If
    /// <see cref="DeactivateRecording"/> runs before the stream is ready, the activation is
    /// abandoned and the created stream is dropped without being started.
    /// </remarks>
    public async void ActivateRecording()
    {
        if (isRecordingActivated)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (already active).");
            return;
        }

        if (_activationPending)
        {
            Debug.Log("[FMOD→Whisper] ActivateRecording ignored (activation already in progress).");
            return;
        }

        if (!_recSound.hasHandle())
        {
            Debug.LogError("[FMOD→Whisper] FMOD not initialized or recording not running.");
            return;
        }

        if (!whisper)
        {
            Debug.LogError("[FMOD→Whisper] No WhisperManager assigned or found in the scene.");
            return;
        }

        int rate = (_nativeRate > 0) ? _nativeRate : desiredSampleRate;

        // Set before creating the stream so the new stream can pick the value up.
        // NOTE(review): assumes CreateStream reads whisper.useVad while building the stream —
        // confirm against the whisper.unity version in use. Setting it first is safe either way.
        whisper.useVad = useVadInStream;

        int ticket = ++_activationTicket;
        _activationPending = true;

        WhisperStream created = null;
        Exception failure = null;
        try
        {
            created = await whisper.CreateStream(rate, _nativeChannels);
        }
        catch (Exception e)
        {
            failure = e;
        }

        // DeactivateRecording() (also called from OnDisable) bumps the ticket. If that happened
        // while we were awaiting, this request is stale: leave all state to the newer caller.
        if (ticket != _activationTicket)
        {
            Debug.Log("[FMOD→Whisper] Activation cancelled before the Whisper stream was ready.");
            return;
        }
        _activationPending = false;

        if (failure != null || created == null)
        {
            Debug.LogError(failure != null
                ? $"[FMOD→Whisper] CreateStream exception: {failure}"
                : "[FMOD→Whisper] CreateStream returned no stream (is the Whisper model loaded?).");
            _stream = null;
            _streamStarted = false;
            return;
        }

        // Wire events: forward cleaned, non-empty text to our own subscribers.
        created.OnSegmentUpdated += (seg) =>
        {
            string cleanedText = PostProcessInput(seg.Result);
            if (!string.IsNullOrEmpty(cleanedText))
                OnWhisperSegmentUpdated?.Invoke(cleanedText);
        };
        created.OnSegmentFinished += (seg) =>
        {
            string cleanedText = PostProcessInput(seg.Result);
            if (!string.IsNullOrEmpty(cleanedText))
                OnWhisperSegmentFinished?.Invoke(cleanedText);
        };

        _stream = created;
        _stream.StartStream();
        _streamStarted = true;

        // Pause loopback while we clear (optional, but avoids clicks)
        if (playLoopback && _playChannel.hasHandle())
            _playChannel.setPaused(true);

        // Clear buffer bytes so nothing captured before activation can reach Whisper.
        ClearRecordRingBuffer();

        // Reset our read pointer to the current write head
        _core.getRecordPosition(recordDriverId, out _lastRecordPos);

        // Skip feeding for one frame to guarantee a clean start
        _skipOneFeedFrame = true;

        // Resume loopback. It was paused just above in every configuration, and recording is
        // about to be active, so both "only while active" and "always on" want it playing now.
        if (playLoopback && _playChannel.hasHandle())
            _playChannel.setPaused(false);

        isRecordingActivated = true;
        Debug.Log("[FMOD→Whisper] Stream activated (buffer cleared; reading from current head).");
    }

    /// <summary>
    /// Stops and disposes the Whisper stream only. FMOD keeps recording.
    /// Also cancels an activation that is still waiting for its stream to be created.
    /// </summary>
    public void DeactivateRecording()
    {
        if (!isRecordingActivated && !_streamStarted && !_activationPending)
            return;

        // Invalidate any ActivateRecording() call still awaiting CreateStream.
        _activationTicket++;
        _activationPending = false;

        isRecordingActivated = false;

        // Pause loopback if it should only be active during recording
        if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle())
            _playChannel.setPaused(true);

        // Tear down Whisper stream (best effort: a failing stop must not block deactivation)
        if (_streamStarted)
        {
            try
            {
                _stream?.StopStream();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] StopStream failed: {e}");
            }
            _streamStarted = false;
        }
        _stream = null;

        Debug.Log("[FMOD→Whisper] Stream deactivated (Whisper stopped; FMOD still recording).");
    }

    /// <summary>
    /// Per-frame pump: measures how far the FMOD record position moved since the previous frame
    /// and, while a Whisper stream is active, forwards that span of the ring buffer to it.
    /// When nothing is being fed, the span is skipped so no backlog builds up.
    /// </summary>
    private void Update()
    {
        // Always tick FMOD
        // NOTE(review): the FMOD Unity integration presumably already updates the system each
        // frame — confirm whether this extra core update is required.
        if (_core.handle != IntPtr.Zero) _core.update();

        // Without a sound or a known buffer length the wrap-around arithmetic below is meaningless.
        if (!_recSound.hasHandle() || _soundPcmLength == 0) return;

        // A failed query leaves no trustworthy position; keep the old one and retry next frame
        // instead of computing a bogus delta.
        if (_core.getRecordPosition(recordDriverId, out uint recPos) != RESULT.OK) return;

        // Samples recorded since last frame, accounting for the position wrapping to the buffer start.
        uint deltaSamples = (recPos >= _lastRecordPos)
            ? (recPos - _lastRecordPos)
            : (recPos + _soundPcmLength - _lastRecordPos);

        if (deltaSamples == 0)
        {
            _lastRecordPos = recPos;
            return;
        }

        // If not active, we *still* advance the ring (so we don't backlog data),
        // but we *don't* push chunks to Whisper.
        bool shouldFeed = isRecordingActivated && _streamStarted && _stream != null;

        if (!shouldFeed || _skipOneFeedFrame)
        {
            // Nothing will be read from this span, so there is no need to lock the buffer.
            // The skip flag covers exactly one captured span after activation.
            _skipOneFeedFrame = false;
            _lastRecordPos = recPos;
            return;
        }

        // Byte range to lock: positions are multiplied by channel count and 2 bytes (PCM16).
        uint bytesToRead = deltaSamples * (uint)_nativeChannels * 2;
        uint startBytes = _lastRecordPos * (uint)_nativeChannels * 2;

        IntPtr p1, p2;
        uint len1, len2;

        var r = _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2);
        if (r != RESULT.OK)
        {
            // Could not read this span; drop it rather than re-reading a growing range next frame.
            _lastRecordPos = recPos;
            return;
        }

        try
        {
            // A span that crosses the end of the buffer comes back as two pieces.
            if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1);
            if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2);
        }
        finally
        {
            _recSound.unlock(p1, p2, len1, len2);
        }

        _lastRecordPos = recPos;
    }

    /// <summary>
    /// Strips Whisper's non-speech markers ("[silence]", "[ Silence ]", "BLANK_AUDIO") and any
    /// remaining square brackets from a transcription, then trims surrounding whitespace.
    /// </summary>
    /// <param name="input">Raw transcription text; may be null or empty.</param>
    /// <returns>The cleaned text, or an empty string when <paramref name="input"/> is null or empty.</returns>
    private static string PostProcessInput(string input)
    {
        // Callers already treat an empty result as "nothing to report", so map null to empty
        // instead of throwing inside a stream callback.
        if (string.IsNullOrEmpty(input)) return string.Empty;

        // Order matters: whole markers are removed before the leftover brackets are stripped.
        return input
            .Replace("[silence]", "")
            .Replace("[ Silence ]", "")
            .Replace("BLANK_AUDIO", "")
            .Replace("[", "")
            .Replace("]", "")
            .Trim();
    }

    /// <summary>
    /// Converts a span of 16-bit PCM from native memory to floats and pushes it to the
    /// active Whisper stream as one <c>AudioChunk</c>.
    /// </summary>
    /// <param name="src">Pointer to the first PCM16 sample of the span.</param>
    /// <param name="byteLen">Length of the span in bytes; a trailing odd byte is ignored.</param>
    private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen)
    {
        int samples = (int)(byteLen / sizeof(short)); // 2 bytes per sample
        if (samples <= 0 || src == IntPtr.Zero || _stream == null) return;

        // Stage the native bytes in a reusable managed short[] so they can be indexed.
        EnsureShortOverlay(samples, out short[] pcm);
        Marshal.Copy(src, pcm, 0, samples);

        // Each chunk needs its own exactly-sized array, so convert straight into it rather
        // than into a scratch buffer that is then copied again. short / 32768f is already
        // within [-1, 1), so no clamping is required.
        var data = new float[samples];
        for (int i = 0; i < samples; i++)
        {
            data[i] = pcm[i] / 32768f;
        }

        // TODO (optional): downmix to mono and/or run a light gate before feeding.
        // NOTE(review): IsVoiceDetected is hard-coded to true, so every chunk is reported as
        // voiced — confirm that is what the stream's VAD handling expects.
        var chunk = new AudioChunk
        {
            Data = data,
            Frequency = (_nativeRate > 0) ? _nativeRate : desiredSampleRate,
            Channels = _nativeChannels,
            IsVoiceDetected = true
        };

        _stream.AddToStream(chunk);
    }

    /// <summary>
    /// Hands out the shared PCM16 staging buffer, growing it first when it cannot hold
    /// <paramref name="samples"/> elements. The returned array may be longer than requested.
    /// </summary>
    /// <param name="samples">Minimum number of elements the buffer must hold.</param>
    /// <param name="buf">Receives the shared staging buffer.</param>
    private void EnsureShortOverlay(int samples, out short[] buf)
    {
        bool needsGrowth = _shortOverlay == null || _shortOverlay.Length < samples;
        if (needsGrowth)
        {
            // Grow to a power of two so slightly larger requests do not reallocate again.
            int capacity = Mathf.NextPowerOfTwo(samples);
            _shortOverlay = new short[capacity];
        }

        buf = _shortOverlay;
    }

    /// <summary>
    /// Makes sure the float scratch buffer can hold at least <paramref name="samples"/> elements,
    /// replacing it with a larger power-of-two sized array when it cannot.
    /// </summary>
    /// <param name="samples">Minimum number of elements required.</param>
    private void EnsureTmpCapacity(int samples)
    {
        // Already large enough: keep the existing array.
        if (_floatTmp != null && _floatTmp.Length >= samples)
        {
            return;
        }

        int capacity = Mathf.NextPowerOfTwo(samples);
        _floatTmp = new float[capacity];
    }

    /// <summary>
    /// Tears everything down when the component is disabled: stops the Whisper stream, stops
    /// the loopback channel, stops FMOD recording and releases the ring-buffer sound.
    /// </summary>
    /// <remarks>
    /// Teardown is best effort: a failing step is logged and the remaining steps still run.
    /// NOTE(review): the FMOD resources are created in Start(), which Unity runs only once, so
    /// nothing in this file recreates them if the component is enabled again afterwards.
    /// </remarks>
    private void OnDisable()
    {
        // Stop Whisper (if active)
        DeactivateRecording();

        // Stop the loopback channel first so nothing is still playing the sound we release below.
        if (_playChannel.hasHandle())
        {
            try
            {
                _playChannel.stop();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Stopping loopback channel failed: {e.Message}");
            }
            _playChannel.clearHandle();
        }

        if (_recSound.hasHandle())
        {
            try
            {
                _core.recordStop(recordDriverId);
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] recordStop failed: {e.Message}");
            }

            try
            {
                _recSound.release();
            }
            catch (Exception e)
            {
                Debug.LogWarning($"[FMOD→Whisper] Releasing record sound failed: {e.Message}");
            }

            // Clear the handle regardless, so later hasHandle() checks see the sound as gone.
            _recSound.clearHandle();
        }
    }


    /// <summary>
    /// Overwrites the whole record ring buffer with zeros. Does nothing when the sound has no
    /// handle or its length is unknown; logs a warning and leaves the buffer untouched when the
    /// lock fails.
    /// </summary>
    private void ClearRecordRingBuffer()
    {
        if (_soundPcmLength == 0 || !_recSound.hasHandle())
        {
            return;
        }

        // PCM16: two bytes per sample, per channel.
        const uint bytesPerSample = 2;
        uint bufferBytes = _soundPcmLength * (uint)_nativeChannels * bytesPerSample;

        // Lock the entire buffer, starting at byte 0.
        RESULT lockResult = _recSound.@lock(
            0, bufferBytes,
            out IntPtr firstPtr, out IntPtr secondPtr,
            out uint firstLen, out uint secondLen);

        if (lockResult != RESULT.OK)
        {
            Debug.LogWarning($"[FMOD→Whisper] Could not lock ring buffer to clear: {lockResult}");
            return;
        }

        try
        {
            // The lock may hand the region back as two pieces; zero whichever are non-empty.
            if (firstLen > 0) ZeroMem(firstPtr, (int)firstLen);
            if (secondLen > 0) ZeroMem(secondPtr, (int)secondLen);
        }
        finally
        {
            // Always unlock, even if zeroing threw.
            _recSound.unlock(firstPtr, secondPtr, firstLen, secondLen);
        }
    }

    // Shared, never-written block of zeros used as the copy source by ZeroMem, so clearing a
    // large region does not allocate an array of the region's size.
    private static readonly byte[] _zeroChunk = new byte[16 * 1024]; // 16 KB

    /// <summary>
    /// Fills <paramref name="byteLen"/> bytes of native memory starting at <paramref name="dst"/>
    /// with zeros, copying from the shared zero block in pieces of at most its length.
    /// A non-positive <paramref name="byteLen"/> writes nothing.
    /// </summary>
    private static void ZeroMem(IntPtr dst, int byteLen)
    {
        for (int written = 0; written < byteLen;)
        {
            int count = Math.Min(_zeroChunk.Length, byteLen - written);
            Marshal.Copy(_zeroChunk, 0, IntPtr.Add(dst, written), count);
            written += count;
        }
    }

}