From 8a3f10a9a202f0f5a9c941ba9488dd92ea5cc461 Mon Sep 17 00:00:00 2001 From: henrisel Date: Sat, 21 Feb 2026 22:06:23 +0200 Subject: [PATCH] cafe waiter npc and mic detect speech more --- .../_PROJECT/Scenes/DeltaBuilding_base.unity | 4 +- .../ArcheryRange/MicrophoneStand.cs | 1 + .../ModeGeneration/FMODWhisperBridge.cs | 102 +++++++++++++++--- .../ModeGeneration/NPCs/CafeWaiterNPC.cs | 27 ++++- ProjectSettings/ProjectSettings.asset | 4 +- 5 files changed, 113 insertions(+), 25 deletions(-) diff --git a/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity b/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity index c3435584..ebdda0fe 100644 --- a/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity +++ b/Assets/_PROJECT/Scenes/DeltaBuilding_base.unity @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc7db0e218123c5dc98ba87fcc18357f1f55b3376c38e3a55b270fe0b26bfc1d -size 68526226 +oid sha256:53285491147fecd2b93e5c047688195eae2aefa6731e22d27e4577d7f4ab4c0c +size 68499163 diff --git a/Assets/_PROJECT/Scripts/ModeGeneration/ArcheryRange/MicrophoneStand.cs b/Assets/_PROJECT/Scripts/ModeGeneration/ArcheryRange/MicrophoneStand.cs index 5fc3cd54..f0c49fcb 100644 --- a/Assets/_PROJECT/Scripts/ModeGeneration/ArcheryRange/MicrophoneStand.cs +++ b/Assets/_PROJECT/Scripts/ModeGeneration/ArcheryRange/MicrophoneStand.cs @@ -61,6 +61,7 @@ public class MicrophoneStand : MonoBehaviour private void OnPlayerSpeechUpdated(string text) { outputText.text = text; + OnPlayerFinishedSpeaking?.Invoke(); } private void OnPlayerSpeechFinished(string playerText) diff --git a/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs b/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs index c7ec4473..98e44952 100644 --- a/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs +++ b/Assets/_PROJECT/Scripts/ModeGeneration/FMODWhisperBridge.cs @@ -62,6 +62,7 @@ public class FMODWhisperBridge : MonoBehaviour // activation flag private bool isRecordingActivated = false; + private bool _skipOneFeedFrame = false; private void Awake() { @@ -188,13 +189,15 @@ public class FMODWhisperBridge : MonoBehaviour // Wire events _stream.OnSegmentUpdated += (seg) => { - if (IsSpeechMeaningful(seg.Result)) - OnWhisperSegmentUpdated?.Invoke(seg.Result); + string cleanedText = PostProcessInput(seg.Result); + if (!string.IsNullOrEmpty(cleanedText)) + OnWhisperSegmentUpdated?.Invoke(cleanedText); }; _stream.OnSegmentFinished += (seg) => { - if (IsSpeechMeaningful(seg.Result)) - OnWhisperSegmentFinished?.Invoke(seg.Result); + string cleanedText = PostProcessInput(seg.Result); + if (!string.IsNullOrEmpty(cleanedText)) + OnWhisperSegmentFinished?.Invoke(cleanedText); }; whisper.useVad = useVadInStream; @@ -202,15 +205,28 @@ public class FMODWhisperBridge : MonoBehaviour _stream.StartStream(); _streamStarted = true; - // Unpause loopback if it's meant to play only while active - if (playLoopback && loopbackOnlyWhenActive && _playChannel.hasHandle()) - _playChannel.setPaused(false); - // Prepare temp arrays roughly 100ms of audio - EnsureTmpCapacity((rate / 10) * _nativeChannels); + // --- NEW: Clear the ring buffer and reset read pointer --- + // Pause loopback while we clear (optional, but avoids clicks) + if (playLoopback && _playChannel.hasHandle()) + _playChannel.setPaused(true); + + // Clear buffer bytes + ClearRecordRingBuffer(); + + // Reset our read pointer to the current write head + _core.getRecordPosition(recordDriverId, out _lastRecordPos); + + // We’ll skip feeding for one frame to guarantee a clean start + _skipOneFeedFrame = true; + + // Unpause loopback if we want it active during recording + if (playLoopback && _playChannel.hasHandle() && (!loopbackOnlyWhenActive || isRecordingActivated)) + _playChannel.setPaused(loopbackOnlyWhenActive ? false : _playChannel.getPaused(out var paused) == FMOD.RESULT.OK && paused ? false : false); isRecordingActivated = true; - Debug.Log("[FMOD→Whisper] Stream activated (Whisper started; FMOD was already recording)."); + Debug.Log("[FMOD→Whisper] Stream activated (buffer cleared; reading from current head)."); + } /// @@ -269,36 +285,37 @@ public class FMODWhisperBridge : MonoBehaviour IntPtr p1, p2; uint len1, len2; + var r = _recSound.@lock(startBytes, bytesToRead, out p1, out p2, out len1, out len2); if (r != RESULT.OK) { - // If lock fails, still advance last position to avoid spin _lastRecordPos = recPos; return; } try { - if (shouldFeed) + if (shouldFeed && !_skipOneFeedFrame) { if (len1 > 0) CopyPcm16ToFloatAndFeed(p1, len1); if (len2 > 0) CopyPcm16ToFloatAndFeed(p2, len2); } - // else: just discard; we’re only keeping the ring fresh. + // If skipping, we just discard this frame to ensure no stale data leaks. } finally { _recSound.unlock(p1, p2, len1, len2); } + if (_skipOneFeedFrame) _skipOneFeedFrame = false; + _lastRecordPos = recPos; + } - private bool IsSpeechMeaningful(string userText) + private string PostProcessInput(string input) { - return !string.IsNullOrEmpty(userText) - && !userText.Contains("BLANK_AUDIO") - && !userText.Trim().Equals("[ Silence ]"); + return input.Replace("[silence]", "").Replace("[ Silence ]", "").Replace("BLANK_AUDIO", "").Replace("[", "").Replace("]", "").Trim(); } private void CopyPcm16ToFloatAndFeed(IntPtr src, uint byteLen) @@ -359,4 +376,55 @@ public class FMODWhisperBridge : MonoBehaviour _recSound.clearHandle(); } } + + + private void ClearRecordRingBuffer() + { + if (!_recSound.hasHandle() || _soundPcmLength == 0) return; + + uint totalBytes = _soundPcmLength * (uint)_nativeChannels * 2; // PCM16 + IntPtr p1, p2; + uint len1, len2; + + // Lock the whole buffer (start=0, length=totalBytes) + var r = _recSound.@lock(0, totalBytes, out p1, out p2, out len1, out len2); + if (r != FMOD.RESULT.OK) + { + Debug.LogWarning($"[FMOD→Whisper] Could not lock ring buffer to clear: {r}"); + return; + } + + try + { + if (len1 > 0) + { + // zero p1 + // We’ll reuse a static zero array to avoid allocating huge buffers repeatedly + ZeroMem(p1, (int)len1); + } + if (len2 > 0) + { + ZeroMem(p2, (int)len2); + } + } + finally + { + _recSound.unlock(p1, p2, len1, len2); + } + } + + // cheap zeroing helper (avoids allocating len-sized arrays each time) + private static readonly byte[] _zeroChunk = new byte[16 * 1024]; // 16 KB + private static void ZeroMem(IntPtr dst, int byteLen) + { + int offset = 0; + while (byteLen > 0) + { + int n = Math.Min(_zeroChunk.Length, byteLen); + Marshal.Copy(_zeroChunk, 0, dst + offset, n); + offset += n; + byteLen -= n; + } + } + } diff --git a/Assets/_PROJECT/Scripts/ModeGeneration/NPCs/CafeWaiterNPC.cs b/Assets/_PROJECT/Scripts/ModeGeneration/NPCs/CafeWaiterNPC.cs index 6ec84a6a..055a816c 100644 --- a/Assets/_PROJECT/Scripts/ModeGeneration/NPCs/CafeWaiterNPC.cs +++ b/Assets/_PROJECT/Scripts/ModeGeneration/NPCs/CafeWaiterNPC.cs @@ -50,9 +50,9 @@ public class CafeWaiterNPC : NPCController { SpeakVoiceLine(0); + fmodWhisperBridge.ActivateRecording(); fmodWhisperBridge.OnWhisperSegmentUpdated += OnPlayerSpeechUpdated; fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished; - fmodWhisperBridge.ActivateRecording(); notepad.SetActive(true); state = 1; @@ -76,7 +76,7 @@ public class CafeWaiterNPC : NPCController private void OnPlayerSpeechFinished(string playerText) { - if (Time.time < lastPlayerVoiceUpdateTime + 0.5f) + if (Time.time < lastPlayerVoiceUpdateTime + 1.0f) { return; } @@ -85,23 +85,34 @@ public class CafeWaiterNPC : NPCController if (state == 1) { // Show transcription and ask whether it is correct + fmodWhisperBridge.DeactivateRecording(); notepadText.text = playerText; - notepad.transform.DOLocalRotate(notepadFlippedRotation, 0.5f); + notepad.transform.DOLocalRotate(notepadFlippedRotation, 0.5f).OnComplete(() => + { + fmodWhisperBridge.ActivateRecording(); + fmodWhisperBridge.OnWhisperSegmentUpdated += OnPlayerSpeechUpdated; + fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished; + }); SpeakVoiceLine(1); state = 2; } else if (state == 2) { + fmodWhisperBridge.DeactivateRecording(); // Flip notepad back notepad.transform.DOLocalRotate(notepadOriginalRotation, 0.5f); // if player answered positively, bring food, otherwise ask again - if (playerText.ToLower().Contains("yes")) + if (playerText.ToLower().Contains("ye")) { + SpeakVoiceLine(2); Invoke("BringFood", 1f); state = 3; } else { SpeakVoiceLine(3); + fmodWhisperBridge.ActivateRecording(); + fmodWhisperBridge.OnWhisperSegmentUpdated += OnPlayerSpeechUpdated; + fmodWhisperBridge.OnWhisperSegmentFinished += OnPlayerSpeechFinished; state = 1; } } @@ -113,6 +124,14 @@ public class CafeWaiterNPC : NPCController if (state == 1) { notepadText.text = playerText; + + // For now, when something is transcribed, treat it as player finished speaking + OnPlayerSpeechFinished(playerText); + } + // faster reaction to player answering yes/no + else if (state == 2) + { + OnPlayerSpeechFinished(playerText); } } diff --git a/ProjectSettings/ProjectSettings.asset b/ProjectSettings/ProjectSettings.asset index 558f4158..982ff64b 100644 --- a/ProjectSettings/ProjectSettings.asset +++ b/ProjectSettings/ProjectSettings.asset @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d0812577ef673b87b04aba8d878bfe8a84068278b9368bcbe0700a93d87ad50 -size 26527 +oid sha256:69667d5279fb448c3347888d340da965b1e2acd14f1bef8f84a74d290ff6970f +size 26836