using System;

namespace Photon.Voice
{
// NOTE(review): the element type T used by several members of this class is not declared anywhere
// in the visible source - presumably generic type parameters were lost; confirm against the original file.
/// <summary>Collection of Audio Utility functions and classes.</summary>
public static partial class AudioUtil
{
/// <summary>
/// Resample audio data so that the complete src buffer fits into dstCount samples in the dst buffer.
/// </summary>
/// <remarks>
/// This implements a primitive nearest-neighbor resampling algorithm for an arbitrary number of channels.
/// Equivalent to the offset-based overload called with offset 0 and the full length of src.
/// </remarks>
/// <param name="src">Source buffer (interleaved); its whole length is used.</param>
/// <param name="dst">Destination buffer (interleaved).</param>
/// <param name="dstCount">Number of samples to produce in dst, counted across all interleaved channels.</param>
/// <param name="channels">Number of channels in the signal (1=mono, 2=stereo). Must be positive.</param>
public static void Resample(T[] src, T[] dst, int dstCount, int channels)
{
    Resample(src, 0, src.Length, dst, 0, dstCount, channels);
}

/// <summary>
/// Nearest-neighbor resampling of srcCount samples of src (starting at srcOffset) into dstCount
/// samples of dst (starting at dstOffset).
/// </summary>
/// <param name="src">Source buffer (interleaved).</param>
/// <param name="srcOffset">Index of the first source sample.</param>
/// <param name="srcCount">Number of source samples, counted across all interleaved channels.</param>
/// <param name="dst">Destination buffer (interleaved).</param>
/// <param name="dstOffset">Index of the first destination sample.</param>
/// <param name="dstCount">Number of destination samples, counted across all interleaved channels.
/// Only whole frames are written: dstCount / channels frames.</param>
/// <param name="channels">Number of channels in both buffers. Must be positive.</param>
public static void Resample(T[] src, int srcOffset, int srcCount, T[] dst, int dstOffset, int dstCount, int channels)
{
    if (channels == 1)
    {
        for (int i = 0; i < dstCount; i++)
        {
            // 64-bit intermediate: i * srcCount does not fit in an int for large buffers.
            dst[dstOffset + i] = src[srcOffset + (int)((long)i * srcCount / dstCount)];
        }
        return;
    }

    int frames = dstCount / channels;
    for (int frame = 0; frame < frames; frame++)
    {
        // Pick the nearest source frame, then copy all of its channels.
        int srcPos = srcOffset + (int)((long)frame * srcCount / dstCount) * channels;
        int dstPos = dstOffset + frame * channels;
        for (int ch = 0; ch < channels; ch++)
        {
            dst[dstPos + ch] = src[srcPos + ch];
        }
    }
}

/// <summary>
/// Nearest-neighbor resampling combined with a change of the channel count.
/// </summary>
/// <remarks>
/// Does not sum channel values but only maps channel to channel (it's not possible to
/// apply math to generic type T). Mono to stereo duplicates the sample, stereo to mono keeps the
/// first channel. For other combinations the channels present on both sides are copied; surplus
/// source channels are dropped and surplus destination channels are left untouched.
/// </remarks>
/// <param name="src">Source buffer (interleaved).</param>
/// <param name="srcOffset">Index of the first source sample.</param>
/// <param name="srcCount">Number of source samples, counted across all source channels.</param>
/// <param name="srcChannels">Number of channels in src.</param>
/// <param name="dst">Destination buffer (interleaved).</param>
/// <param name="dstOffset">Index of the first destination sample.</param>
/// <param name="dstCount">Number of destination samples, counted across all destination channels.</param>
/// <param name="dstChannels">Number of channels in dst.</param>
public static void Resample(T[] src, int srcOffset, int srcCount, int srcChannels, T[] dst, int dstOffset, int dstCount, int dstChannels)
{
    if (srcChannels == dstChannels)
    {
        Resample(src, srcOffset, srcCount, dst, dstOffset, dstCount, dstChannels);
        return;
    }

    if (srcChannels == 1 && dstChannels == 2)
    {
        int stereoFrames = dstCount / 2;
        for (int frame = 0; frame < stereoFrames; frame++)
        {
            var v = src[srcOffset + (int)((long)frame * srcCount * 2 / dstCount)];
            int dstPos = dstOffset + frame * 2;
            dst[dstPos] = v;
            dst[dstPos + 1] = v;
        }
    }
    else if (srcChannels == 2 && dstChannels == 1)
    {
        for (int i = 0; i < dstCount; i++)
        {
            // "/ 2 * 2" rounds the source index down to the first channel of its frame.
            dst[dstOffset + i] = src[srcOffset + (int)((long)i * srcCount / dstCount / 2 * 2)];
        }
    }
    else
    {
        int shared = srcChannels < dstChannels ? srcChannels : dstChannels;
        int frames = dstCount / dstChannels;
        for (int frame = 0; frame < frames; frame++)
        {
            // "/ srcChannels * srcChannels" rounds the source index down to a frame boundary.
            int srcPos = srcOffset + (int)((long)frame * srcCount * dstChannels / dstCount / srcChannels * srcChannels);
            int dstPos = dstOffset + frame * dstChannels;
            for (int ch = 0; ch < shared; ch++)
            {
                dst[dstPos + ch] = src[srcPos + ch];
            }
        }
    }
}

/// <summary>
/// Resample audio data so that the complete src buffer fits into dstCount samples in the dst buffer,
/// and convert short to float samples along the way.
/// </summary>
/// <remarks>This implements a primitive nearest-neighbor resampling algorithm for an arbitrary number of channels.</remarks>
/// <param name="src">Source buffer.</param>
/// <param name="dst">Destination buffer.</param>
/// <param name="dstCount">Number of samples to produce in dst, counted across all interleaved channels.</param>
/// <param name="channels">Number of channels in the signal (1=mono, 2=stereo). Must be positive.</param>
public static void ResampleAndConvert(short[] src, float[] dst, int dstCount, int channels)
{
    // Samples are normalized by dividing by short.MaxValue, so short.MaxValue maps to 1.0f.
    const float fullScale = short.MaxValue;

    if (channels == 1)
    {
        for (int i = 0; i < dstCount; i++)
        {
            // 64-bit intermediate: i * src.Length does not fit in an int for large buffers.
            dst[i] = src[(int)((long)i * src.Length / dstCount)] / fullScale;
        }
        return;
    }

    // Only whole frames are written; dstCount counts samples across all interleaved channels.
    int frames = dstCount / channels;
    for (int frame = 0; frame < frames; frame++)
    {
        // Pick the nearest source frame, then convert all of its channels.
        int srcPos = (int)((long)frame * src.Length / dstCount) * channels;
        int dstPos = frame * channels;
        for (int ch = 0; ch < channels; ch++)
        {
            dst[dstPos + ch] = src[srcPos + ch] / fullScale;
        }
    }
}

/// <summary>
/// Resample audio data so that the complete src buffer fits into dstCount samples in the dst buffer,
/// and convert float to short samples along the way.
/// </summary>
/// <remarks>This implements a primitive nearest-neighbor resampling algorithm for an arbitrary number of channels.</remarks>
/// <param name="src">Source buffer.</param>
/// <param name="dst">Destination buffer.</param>
/// <param name="dstCount">Number of samples to produce in dst, counted across all interleaved channels.</param>
/// <param name="channels">Number of channels in the signal (1=mono, 2=stereo). Must be positive.</param>
public static void ResampleAndConvert(float[] src, short[] dst, int dstCount, int channels)
{
    if (channels == 1)
    {
        for (int i = 0; i < dstCount; i++)
        {
            // 64-bit intermediate: i * src.Length does not fit in an int for large buffers.
            float scaled = src[(int)((long)i * src.Length / dstCount)] * short.MaxValue;
            // Clamp before the cast: an out-of-range float-to-short conversion has no defined result.
            dst[i] = (short)Math.Max((float)short.MinValue, Math.Min((float)short.MaxValue, scaled));
        }
        return;
    }

    // Only whole frames are written; dstCount counts samples across all interleaved channels.
    int frames = dstCount / channels;
    for (int frame = 0; frame < frames; frame++)
    {
        // Pick the nearest source frame, then convert all of its channels.
        int srcPos = (int)((long)frame * src.Length / dstCount) * channels;
        int dstPos = frame * channels;
        for (int ch = 0; ch < channels; ch++)
        {
            float scaled = src[srcPos + ch] * short.MaxValue;
            dst[dstPos + ch] = (short)Math.Max((float)short.MinValue, Math.Min((float)short.MaxValue, scaled));
        }
    }
}

/// <summary>Convert audio buffer from float to short samples.</summary>
/// <remarks>Samples are scaled by short.MaxValue and clamped to the range of short.</remarks>
/// <param name="src">Source buffer.</param>
/// <param name="dst">Destination buffer.</param>
/// <param name="dstCount">Size of destination buffer (in total samples), source buffer must be of same length or longer.</param>
public static void Convert(float[] src, short[] dst, int dstCount)
{
    for (int i = 0; i < dstCount; i++)
    {
        float scaled = src[i] * short.MaxValue;
        // Clamp before the cast: an out-of-range float-to-short conversion has no defined result.
        dst[i] = (short)Math.Max((float)short.MinValue, Math.Min((float)short.MaxValue, scaled));
    }
}

/// <summary>Convert audio buffer from short to float samples.</summary>
/// <remarks>Samples are divided by short.MaxValue.</remarks>
/// <param name="src">Source buffer.</param>
/// <param name="dst">Destination buffer.</param>
/// <param name="dstCount">Size of destination buffer (in total samples), source buffer must be of same length or longer.</param>
public static void Convert(short[] src, float[] dst, int dstCount)
{
    const float fullScale = short.MaxValue;
    for (int i = 0; i < dstCount; i++)
    {
        dst[i] = src[i] / fullScale;
    }
}

/// <summary>Convert audio buffer with arbitrary number of channels to stereo.</summary>
/// <remarks>
/// For mono sources (srcChannels==1), the signal will be copied to both Left and Right stereo channels.
/// For all others, the first two available channels will be used, any other channels will be discarded.
/// </remarks>
/// <param name="src">Source buffer.</param>
/// <param name="dst">Destination buffer.</param>
/// <param name="srcChannels">Number of (interleaved) channels in src.</param>
public static void ForceToStereo(T[] src, T[] dst, int srcChannels)
{
    // One stereo frame is written per source frame until dst has no room for another pair.
    for (int srcPos = 0, dstPos = 0; dstPos < dst.Length - 1; srcPos += srcChannels, dstPos += 2)
    {
        dst[dstPos] = src[srcPos];
        // Mono sources have no second channel, so the first one is duplicated.
        dst[dstPos + 1] = srcChannels > 1 ? src[srcPos + 1] : src[srcPos];
    }
}

/// <summary>Formats up to lim leading elements of x, each prefixed with "-".</summary>
/// <param name="x">Array to format.</param>
/// <param name="lim">Maximum number of elements to include.</param>
/// <returns>The concatenated string.</returns>
internal static string tostr(T[] x, int lim = 10)
{
    var b = new System.Text.StringBuilder();
    int count = x.Length < lim ? x.Length : lim;
    for (var i = 0; i < count; i++)
    {
        b.Append("-");
        b.Append(x[i]);
    }
    return b.ToString();
}

/// <summary>
/// Shortens a signal by dropping groups of whole "waves". A wave boundary is counted each time the
/// sum of a frame's channels returns to non-negative after having been negative.
/// </summary>
/// <remarks>
/// Out of every skipGroup * skipFactor waves, the first skipGroup waves are dropped.
/// Buffers are treated as short[] when the element size is 2 bytes and as float[] otherwise.
/// NOTE(review): T is not declared in the visible source - presumably a lost generic parameter; confirm.
/// </remarks>
public class TempoUp
{
    readonly int sizeofT = System.Runtime.InteropServices.Marshal.SizeOf(default(T));
    int channels;
    int skipGroup;
    int skipFactor;
    int sign = 0;
    int waveCnt;
    bool skipping;

    /// <summary>Resets the state and sets the parameters for subsequent Process calls.</summary>
    /// <param name="channels">Number of interleaved channels in the buffers.</param>
    /// <param name="changePerc">Skip factor is computed as 100 / changePerc (integer division);
    /// 0 throws DivideByZeroException here.</param>
    /// <param name="skipGroup">Number of consecutive waves dropped per skip cycle.</param>
    public void Begin(int channels, int changePerc, int skipGroup)
    {
        this.channels = channels;
        this.skipFactor = 100 / changePerc;
        this.skipGroup = skipGroup;
        sign = 0;
        skipping = false;
        waveCnt = 0;
    }

    /// <summary>Copies the frames of s that are not being skipped to d.</summary>
    /// <param name="s">Source buffer (interleaved).</param>
    /// <param name="d">Destination buffer (interleaved).</param>
    /// <returns>Number of frames written to d.</returns>
    public int Process(T[] s, T[] d)
    {
        if (sizeofT == 2)
        {
            return processShort(s as short[], d as short[]);
        }
        else
        {
            return processFloat(s as float[], d as float[]);
        }
    }

    /// <summary>
    /// Returns the number of samples required to skip in order to complete currently skipping wave.
    /// </summary>
    /// <param name="s">Buffer following the data already processed.</param>
    /// <returns>Frame index in s at which skipping ends; 0 if not currently skipping or if no such
    /// position is found in s.</returns>
    public int End(T[] s)
    {
        if (!skipping)
        {
            return 0;
        }
        if (sizeofT == 2)
        {
            return endShort(s as short[]);
        }
        else
        {
            return endFloat(s as float[]);
        }
    }

    // Counts a completed wave and re-evaluates whether the following wave is dropped.
    void nextWave()
    {
        waveCnt++;
        skipping = waveCnt % (skipGroup * skipFactor) < skipGroup;
    }

    int processFloat(float[] s, float[] d)
    {
        int dPos = 0;
        for (int i = 0; i < s.Length; i += channels)
        {
            // The sign of the channel sum decides which half of the wave this frame is in.
            float sum = s[i];
            for (int ch = 1; ch < channels; ch++)
            {
                sum += s[i + ch];
            }

            if (sum < 0)
            {
                sign = -1;
            }
            else if (sign < 0)
            {
                nextWave();
                sign = 0;
            }

            if (!skipping)
            {
                for (int ch = 0; ch < channels; ch++)
                {
                    d[dPos++] = s[i + ch];
                }
            }
        }
        return dPos / channels;
    }

    public int endFloat(float[] s)
    {
        for (int i = 0; i < s.Length; i += channels)
        {
            float sum = s[i];
            for (int ch = 1; ch < channels; ch++)
            {
                sum += s[i + ch];
            }

            if (sum < 0)
            {
                sign = -1;
            }
            else if (sign < 0)
            {
                nextWave();
                if (!skipping)
                {
                    return i / channels;
                }
                sign = 0;
            }
        }
        return 0;
    }

    int processShort(short[] s, short[] d)
    {
        int dPos = 0;
        for (int i = 0; i < s.Length; i += channels)
        {
            // Summed as int, so adding several short samples cannot overflow.
            int sum = s[i];
            for (int ch = 1; ch < channels; ch++)
            {
                sum += s[i + ch];
            }

            if (sum < 0)
            {
                sign = -1;
            }
            else if (sign < 0)
            {
                nextWave();
                sign = 0;
            }

            if (!skipping)
            {
                for (int ch = 0; ch < channels; ch++)
                {
                    d[dPos++] = s[i + ch];
                }
            }
        }
        return dPos / channels;
    }

    public int endShort(short[] s)
    {
        for (int i = 0; i < s.Length; i += channels)
        {
            int sum = s[i];
            for (int ch = 1; ch < channels; ch++)
            {
                sum += s[i + ch];
            }

            if (sum < 0)
            {
                sign = -1;
            }
            else if (sign < 0)
            {
                nextWave();
                if (!skipping)
                {
                    return i / channels;
                }
                sign = 0;
            }
        }
        return 0;
    }
}

/// <summary>Sample-rate conversion Audio Processor.</summary>
/// <remarks>
/// This processor converts the sample-rate of the source stream. Internally, it uses AudioUtil.Resample.
/// NOTE(review): T is not declared in the visible source - presumably a lost generic parameter; confirm.
/// </remarks>
public class Resampler : IProcessor
{
    protected T[] frameResampled;
    int channels;

    /// <summary>Create a new Resampler instance.</summary>
    /// <param name="dstSize">Frame size of a destination frame. Determines output rate.</param>
    /// <param name="channels">Number of audio channels expected in both in- and output.</param>
    public Resampler(int dstSize, int channels)
    {
        this.frameResampled = new T[dstSize];
        this.channels = channels;
    }

    /// <summary>Resamples buf into the internal frame buffer.</summary>
    /// <param name="buf">Source frame.</param>
    /// <returns>The internal frame buffer; the same array instance is returned by every call.</returns>
    public T[] Process(T[] buf)
    {
        AudioUtil.Resample(buf, this.frameResampled, this.frameResampled.Length, channels);
        return this.frameResampled;
    }

    public void Dispose()
    {
    }
}

/// <summary>Audio Level Metering interface.</summary>
public interface ILevelMeter
{
    /// <summary>
    /// Average amplitude value over last half second.
    /// </summary>
    float CurrentAvgAmp { get; }

    /// <summary>
    /// Maximum amplitude value over last half second.
    /// </summary>
    float CurrentPeakAmp { get; }

    /// <summary>
    /// Average of CurrentPeakAmps since last reset.
    /// </summary>
    float AccumAvgPeakAmp { get; }

    /// <summary>
    /// Reset AccumAvgPeakAmp.
    /// </summary>
    void ResetAccumAvgPeakAmp();
}

/// <summary>Dummy Audio Level Meter that doesn't actually do anything.</summary>
public class LevelMeterDummy : ILevelMeter
{
    public float CurrentAvgAmp { get { return 0; } }
    public float CurrentPeakAmp { get { return 0; } }
    public float AccumAvgPeakAmp { get { return 0; } }

    public void ResetAccumAvgPeakAmp()
    {
    }
}

/// <summary>
/// Audio Level Meter.
/// </summary>
/// <remarks>
/// Keeps the absolute values of the most recent bufferSize samples in a ring buffer together with
/// their running sum. Derived classes feed the samples and set norm.
/// NOTE(review): T is not declared in the visible source - presumably a lost generic parameter; confirm.
/// </remarks>
public abstract class LevelMeter : IProcessor, ILevelMeter
{
    // sum of all values in buffer
    protected float ampSum;
    // max of values from start buffer to current pos
    protected float ampPeak;
    // number of values kept in prevValues
    protected int bufferSize;
    // ring buffer of the most recent absolute sample values
    protected float[] prevValues;
    // index in prevValues that the next sample overwrites
    protected int prevValuesHead;
    // sum and count of the peaks recorded since the last ResetAccumAvgPeakAmp()
    protected float accumAvgPeakAmpSum;
    protected int accumAvgPeakAmpCount;
    // peak recorded when the ring buffer last wrapped around (not yet multiplied by norm)
    protected float currentPeakAmp;
    // factor applied to stored values when they are reported through the public properties
    protected float norm;

    internal LevelMeter(int samplingRate, int numChannels)
    {
        this.bufferSize = samplingRate * numChannels / 2; // 1/2 sec
        this.prevValues = new float[this.bufferSize];
    }

    public float CurrentAvgAmp
    {
        get { return ampSum / this.bufferSize * norm; }
    }

    public float CurrentPeakAmp
    {
        get { return currentPeakAmp * norm; }
        protected set { currentPeakAmp = value / norm; }
    }

    public float AccumAvgPeakAmp
    {
        get
        {
            if (this.accumAvgPeakAmpCount == 0)
            {
                return 0;
            }
            return accumAvgPeakAmpSum / this.accumAvgPeakAmpCount * norm;
        }
    }

    /// <summary>Clears the accumulated peak average and the running peak.</summary>
    public void ResetAccumAvgPeakAmp()
    {
        this.accumAvgPeakAmpSum = 0;
        this.accumAvgPeakAmpCount = 0;
        ampPeak = 0;
    }

    public abstract T[] Process(T[] buf);

    public void Dispose()
    {
    }
}

/// <summary>
/// LevelMeter specialization for float audio.
/// </summary>
public class LevelMeterFloat : LevelMeter
{
    /// <summary>Create new LevelMeterFloat instance.</summary>
    /// <param name="samplingRate">Sampling rate of the audio signal (in Hz).</param>
    /// <param name="numChannels">Number of channels in the audio signal.</param>
    public LevelMeterFloat(int samplingRate, int numChannels) : base(samplingRate, numChannels)
    {
        norm = 1.0f;
    }

    /// <summary>Updates the level statistics with the samples of buf.</summary>
    /// <param name="buf">Samples to meter.</param>
    /// <returns>buf, unmodified.</returns>
    public override float[] Process(float[] buf)
    {
        foreach (var v in buf)
        {
            float a = v < 0 ? -v : v;

            // Replace the oldest value in the ring buffer and keep the running sum in step.
            ampSum = ampSum + a - this.prevValues[this.prevValuesHead];
            this.prevValues[this.prevValuesHead] = a;

            if (ampPeak < a)
            {
                ampPeak = a;
            }

            // Each time the write position is at the start of the ring buffer, publish the peak
            // collected so far and start a new peak window.
            if (this.prevValuesHead == 0)
            {
                currentPeakAmp = ampPeak;
                ampPeak = 0;
                accumAvgPeakAmpSum += currentPeakAmp;
                accumAvgPeakAmpCount++;
            }

            this.prevValuesHead = (this.prevValuesHead + 1) % this.bufferSize;
        }
        return buf;
    }
}

/// <summary>
/// LevelMeter specialization for short audio.
/// </summary>
public class LevelMeterShort : LevelMeter
{
    /// <summary>Create new LevelMeterShort instance.</summary>
    /// <param name="samplingRate">Sampling rate of the audio signal (in Hz).</param>
    /// <param name="numChannels">Number of channels in the audio signal.</param>
    public LevelMeterShort(int samplingRate, int numChannels) : base(samplingRate, numChannels)
    {
        norm = 1.0f / short.MaxValue;
    }

    /// <summary>Updates the level statistics with the samples of buf.</summary>
    /// <param name="buf">Samples to meter.</param>
    /// <returns>buf, unmodified.</returns>
    public override short[] Process(short[] buf)
    {
        foreach (var v in buf)
        {
            // Negate as int: -short.MinValue (32768) does not fit in a short and would wrap back
            // to a negative amplitude.
            int a = v < 0 ? -v : v;

            // Replace the oldest value in the ring buffer and keep the running sum in step.
            ampSum = ampSum + a - this.prevValues[this.prevValuesHead];
            this.prevValues[this.prevValuesHead] = a;

            if (ampPeak < a)
            {
                ampPeak = a;
            }

            // Each time the write position is at the start of the ring buffer, publish the peak
            // collected so far and start a new peak window.
            if (this.prevValuesHead == 0)
            {
                currentPeakAmp = ampPeak;
                ampPeak = 0;
                accumAvgPeakAmpSum += currentPeakAmp;
                accumAvgPeakAmpCount++;
            }

            this.prevValuesHead = (this.prevValuesHead + 1) % this.bufferSize;
        }
        return buf;
    }
}

/// <summary>Voice Activity Detector interface.</summary>
public interface IVoiceDetector
{
    /// <summary>If true, voice detection enabled.</summary>
    bool On { get; set; }

    /// <summary>Voice detected as soon as signal level exceeds threshold.</summary>
    float Threshold { get; set; }

    /// <summary>If true, voice detected.</summary>
    bool Detected { get; }

    /// <summary>Last time when switched to detected state.</summary>
    DateTime DetectedTime { get; }

    /// <summary>Called when switched to detected state.</summary>
    event Action OnDetected;

    /// <summary>Keep detected state during this time after signal level dropped below threshold.</summary>
    int ActivityDelayMs { get; set; }
}

/// <summary>Calibration Utility for Voice Detector.</summary>
/// <remarks>
/// Using this audio processor, you can calibrate the threshold of the given voice detector.
/// NOTE(review): T is not declared in the visible source, and Action is invoked with one argument -
/// presumably generic arguments were lost; confirm against the original file.
/// </remarks>
public class VoiceDetectorCalibration : IProcessor
{
    IVoiceDetector voiceDetector;
    ILevelMeter levelMeter;
    int valuesPerSec;

    /// <summary>True while a calibration started with Calibrate is still consuming samples.</summary>
    public bool IsCalibrating
    {
        get { return calibrateCount > 0; }
    }

    // Number of samples still to be seen before the calibration completes; 0 when idle.
    protected int calibrateCount;
    private Action onCalibrated;

    /// <summary>Create new VoiceDetectorCalibration instance.</summary>
    /// <param name="voiceDetector">Voice Detector to calibrate.</param>
    /// <param name="levelMeter">Level Meter to look at for calibration.</param>
    /// <param name="samplingRate">Sampling rate of the audio signal (in Hz).</param>
    /// <param name="channels">Number of channels in the audio signal.</param>
    public VoiceDetectorCalibration(IVoiceDetector voiceDetector, ILevelMeter levelMeter, int samplingRate, int channels)
    {
        this.valuesPerSec = samplingRate * channels;
        this.voiceDetector = voiceDetector;
        this.levelMeter = levelMeter;
    }

    /// <summary>Start calibration.</summary>
    /// <param name="durationMs">Duration of the calibration procedure (in milliseconds).</param>
    /// <param name="onCalibrated">Optional callback that is called after calibration is complete.</param>
    /// <remarks>
    /// This activates the Calibration process.
    /// It will reset the given LevelMeter's AccumAvgPeakAmp (accumulated average peak amplitude),
    /// and when the duration has passed, use it for the VoiceDetector's detection threshold.
    /// </remarks>
    public void Calibrate(int durationMs, Action onCalibrated = null)
    {
        // 64-bit intermediate: valuesPerSec * durationMs exceeds int.MaxValue for durations of a
        // few tens of seconds, which would end the calibration immediately.
        long sampleCount = (long)valuesPerSec * durationMs / 1000;
        this.calibrateCount = (int)Math.Min(int.MaxValue, sampleCount);
        this.onCalibrated = onCalibrated;
        levelMeter.ResetAccumAvgPeakAmp();
    }

    /// <summary>
    /// Counts down the calibration period by the length of buf; when it has elapsed, sets the
    /// detector threshold to twice the level meter's AccumAvgPeakAmp and invokes the callback.
    /// </summary>
    /// <param name="buf">Current audio frame.</param>
    /// <returns>buf, unmodified.</returns>
    public T[] Process(T[] buf)
    {
        if (this.calibrateCount == 0)
        {
            return buf;
        }

        this.calibrateCount -= buf.Length;
        if (this.calibrateCount > 0)
        {
            return buf;
        }

        this.calibrateCount = 0;
        this.voiceDetector.Threshold = levelMeter.AccumAvgPeakAmp * 2;
        if (this.onCalibrated != null)
        {
            this.onCalibrated(this.voiceDetector.Threshold);
        }
        return buf;
    }

    public void Dispose()
    {
    }
}

/// <summary>Dummy VoiceDetector that doesn't actually do anything.</summary>
public class VoiceDetectorDummy : IVoiceDetector
{
    public bool On { get { return false; } set { } }
    public float Threshold { get { return 0; } set { } }
    public bool Detected { get { return false; } }
    public int ActivityDelayMs { get { return 0; } set { } }
    public DateTime DetectedTime { get; private set; }
    // Empty accessors: disabling Warning CS0067 The event 'AudioUtil.VoiceDetectorDummy.OnDetected' is never used.
    public event Action OnDetected { add { } remove { } }
}

/// <summary>
/// Simple voice activity detector triggered by signal level.
/// </summary>
/// <remarks>
/// Derived classes set norm and implement Process.
/// NOTE(review): T is not declared in the visible source - presumably a lost generic parameter; confirm.
/// </remarks>
public abstract class VoiceDetector : IProcessor, IVoiceDetector
{
    /// <summary>If true, voice detection enabled.</summary>
    public bool On { get; set; }

    /// <summary>Voice detected as soon as signal level exceeds threshold.</summary>
    /// <remarks>The value is stored divided by norm and reported multiplied by norm.</remarks>
    public float Threshold
    {
        get { return threshold * norm; }
        set { threshold = value / norm; }
    }

    protected float norm;
    protected float threshold;
    bool detected;

    /// <summary>If true, voice detected.</summary>
    /// <remarks>
    /// Every change of the value (in either direction) stamps DetectedTime; OnDetected is raised
    /// only when the value changes to true.
    /// </remarks>
    public bool Detected
    {
        get { return detected; }
        protected set
        {
            if (detected == value)
            {
                return;
            }
            detected = value;
            DetectedTime = DateTime.Now;
            if (detected && OnDetected != null)
            {
                OnDetected();
            }
        }
    }

    /// <summary>Time of the most recent change of Detected.</summary>
    public DateTime DetectedTime { get; private set; }

    /// <summary>Keep detected state during this time after signal level dropped below threshold.</summary>
    public int ActivityDelayMs
    {
        get { return this.activityDelay; }
        set
        {
            this.activityDelay = value;
            // 64-bit intermediate: value * valuesCountPerSec exceeds int.MaxValue for delays of a
            // few tens of seconds, which would yield a negative sample count.
            long sampleCount = (long)value * valuesCountPerSec / 1000;
            this.activityDelayValuesCount = (int)Math.Min(int.MaxValue, sampleCount);
        }
    }

    /// <summary>Called when switched to detected state.</summary>
    public event Action OnDetected;

    // Activity delay in milliseconds, as last assigned through ActivityDelayMs.
    protected int activityDelay;
    // Counter maintained by derived Process implementations.
    protected int autoSilenceCounter = 0;
    // samplingRate * numChannels
    protected int valuesCountPerSec;
    // Activity delay expressed in number of sample values.
    protected int activityDelayValuesCount;

    internal VoiceDetector(int samplingRate, int numChannels)
    {
        // valuesCountPerSec must be set before ActivityDelayMs, whose setter reads it.
        this.valuesCountPerSec = samplingRate * numChannels;
        this.ActivityDelayMs = 500;
        this.On = true;
    }

    public abstract T[] Process(T[] buf);

    public void Dispose()
    {
    }
}

/// <summary>VoiceDetector specialization for float audio.</summary>
public class VoiceDetectorFloat : VoiceDetector
{
    /// <summary>Create a new VoiceDetectorFloat instance.</summary>
    /// <param name="samplingRate">Sampling rate of the audio signal (in Hz).</param>
    /// <param name="numChannels">Number of channels in the audio signal.</param>
    public VoiceDetectorFloat(int samplingRate, int numChannels) : base(samplingRate, numChannels)
    {
        norm = 1f;
    }

    /// <summary>Updates the detection state from the samples of buffer.</summary>
    /// <param name="buffer">Current audio frame.</param>
    /// <returns>buffer if detection is off or voice is detected; otherwise null.</returns>
    public override float[] Process(float[] buffer)
    {
        if (!this.On)
        {
            return buffer;
        }

        foreach (var s in buffer)
        {
            // Only samples strictly above the threshold count; the sample's sign is not removed.
            if (s > this.threshold)
            {
                this.Detected = true;
                this.autoSilenceCounter = 0;
            }
            else
            {
                this.autoSilenceCounter++;
            }
        }

        // The detected state is dropped once more values than the activity delay have passed
        // without one exceeding the threshold.
        if (this.autoSilenceCounter > this.activityDelayValuesCount)
        {
            this.Detected = false;
        }

        return Detected ? buffer : null;
    }
}

/// <summary>VoiceDetector specialization for short audio.</summary>
public class VoiceDetectorShort : VoiceDetector
{
    /// <summary>Create a new VoiceDetectorShort instance.</summary>
    /// <param name="samplingRate">Sampling rate of the audio signal (in Hz).</param>
    /// <param name="numChannels">Number of channels in the audio signal.</param>
    public VoiceDetectorShort(int samplingRate, int numChannels) : base(samplingRate, numChannels)
    {
        norm = 1.0f / short.MaxValue;
    }

    /// <summary>Updates the detection state from the samples of buffer.</summary>
    /// <param name="buffer">Current audio frame.</param>
    /// <returns>buffer if detection is off or voice is detected; otherwise null.</returns>
    public override short[] Process(short[] buffer)
    {
        if (!this.On)
        {
            return buffer;
        }

        foreach (var s in buffer)
        {
            // Only samples strictly above the threshold count; the sample's sign is not removed.
            if (s > this.threshold)
            {
                this.Detected = true;
                this.autoSilenceCounter = 0;
            }
            else
            {
                this.autoSilenceCounter++;
            }
        }

        // The detected state is dropped once more values than the activity delay have passed
        // without one exceeding the threshold.
        if (this.autoSilenceCounter > this.activityDelayValuesCount)
        {
            this.Detected = false;
        }

        return Detected ? buffer : null;
    }
}

/// <summary>Utility Audio Processor Voice Detection Calibration.</summary>
/// <remarks>
/// Encapsulates level meter, voice detector and voice detector calibrator in single instance.
/// NOTE(review): T is not declared in the visible source - presumably a lost generic parameter; confirm.
/// </remarks>
public class VoiceLevelDetectCalibrate : IProcessor
{
    /// <summary>The LevelMeter in use.</summary>
    public ILevelMeter LevelMeter { get; private set; }

    /// <summary>The VoiceDetector in use.</summary>
    public IVoiceDetector VoiceDetector { get; private set; }

    /// <summary>The VoiceDetectorCalibration in use.</summary>
    VoiceDetectorCalibration calibration;

    /// <summary>Create new VoiceLevelDetectCalibrate instance.</summary>
    /// <param name="samplingRate">Sampling rate of the audio signal (in Hz).</param>
    /// <param name="channels">Number of channels in the audio signal.</param>
    /// <exception cref="NotSupportedException">T is neither float nor short.</exception>
    public VoiceLevelDetectCalibrate(int samplingRate, int channels)
    {
        // Inspect the type itself rather than a default element: a default element of a
        // reference or nullable type is null and cannot report its type.
        var sampleType = typeof(T);
        if (sampleType == typeof(float))
        {
            LevelMeter = new LevelMeterFloat(samplingRate, channels);
            VoiceDetector = new VoiceDetectorFloat(samplingRate, channels);
        }
        else if (sampleType == typeof(short))
        {
            LevelMeter = new LevelMeterShort(samplingRate, channels);
            VoiceDetector = new VoiceDetectorShort(samplingRate, channels);
        }
        else
        {
            throw new NotSupportedException("VoiceLevelDetectCalibrate: type not supported: " + sampleType);
        }
        calibration = new VoiceDetectorCalibration(VoiceDetector, LevelMeter, samplingRate, channels);
    }

    /// <summary>Start calibration.</summary>
    /// <param name="durationMs">Duration of the calibration procedure (in milliseconds).</param>
    /// <param name="onCalibrated">Called when calibration is complete. Parameter is new threshold value.</param>
    /// <remarks>
    /// This activates the Calibration process.
    /// It will reset the given LevelMeter's AccumAvgPeakAmp (accumulated average peak amplitude),
    /// and when the duration has passed, use it for the VoiceDetector's detection threshold.
    /// </remarks>
    public void Calibrate(int durationMs, Action onCalibrated = null)
    {
        calibration.Calibrate(durationMs, onCalibrated);
    }

    /// <summary>True while the wrapped calibration is in progress.</summary>
    public bool IsCalibrating
    {
        get { return calibration.IsCalibrating; }
    }

    /// <summary>Passes buf through the level meter, the calibration and the voice detector, in that order.</summary>
    /// <param name="buf">Current audio frame.</param>
    /// <returns>The result of the voice detector stage.</returns>
    public T[] Process(T[] buf)
    {
        buf = (LevelMeter as IProcessor).Process(buf);
        buf = (calibration as IProcessor).Process(buf);
        buf = (VoiceDetector as IProcessor).Process(buf);
        return buf;
    }

    public void Dispose()
    {
        (LevelMeter as IProcessor).Dispose();
        (VoiceDetector as IProcessor).Dispose();
        calibration.Dispose();
    }
}
}
}