/** A base class for ultra-fast audio recognition.
    [Part of the Ultrafa.st framework]

    Idea: We do NOT spend time on a full FFT/DCT in the early stages of the
    recognition. Instead we stay in the time domain, turn the sample data
    into pixels and then convert that verybig*1px image into an "integral"
    image. Then we use the integral image access functions to probe for
    various frequencies and wavelets we come up with during the recognition
    of whatever we are currently listening to.

    Note we want a full 16 bit range for each pixel's value to make this
    truly hi-fi, so we actually reserve a whole 8 bytes for each cell in
    the (1D) table.

    Stefan Reich, Gaz.AI, Sep 3 2021
    [Insert very liberal license here] **/

sclass AudioRecognizer {
  IAudioSample mainSample;

  double defaultInputSampleRate() { ret 44100; }

  // It works like this: There is a general interface for accessing
  // an "integrated" audio clip - IAudioSample.

  interface IAudioSample {
    int channels();      // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
    double length();     // in samples according to sampleRate
    double sampleRate(); // in hertz

    // Query the integral.
    // Result is in the range -32768*(end-start) to 32767*(end-start)...
    // unless you applied too much gain (there is no clipping).
    // channel is between 0 and channels()-1 from here on out.
    double sampleSum(int channel, double start, double end);

    // Here the range is -1 to 1 just to spice things up.
    default double getPixel(int channel, double start, double end) {
      ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768);
    }
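    // Worked example (a sketch added for illustration, not from the
    // original comments): for a mono clip with raw samples 3, 5, -2, 7
    // the contract above gives
    //
    //   sampleSum(0, 1, 3) = 5 + (-2) = 3
    //   getPixel(0, 1, 3)  = 3/(2*32768) ~= 0.000046 (near silence)
    //
    // The AudioSample implementation below answers such queries in O(1),
    // whatever the interval length, by subtracting two prefix sums:
    // S = [3, 8, 6, 13], so sampleSum(0, 1, 3) = S[2] - S[0] = 3.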
    // render audio as black-and-white (grayscale) stripes
    // h = height per channel
    default BufferedImage stripes(int h default 50) {
      int w = iceil(length());
      int channels = channels();
      ret imageFromFunction(w, h*channels, (x, y) -> {
        int channel = y/h;
        double value = sampleSum(channel, x, x+1);
        // lose lower 8 bits and shift to 0 to 255
        int digital = ifloor(value/256)+128;
        ret rgbIntFullAlpha(digital, digital, digital);
      });
    }

    // render audio as graph
    // h = height per channel
    default BufferedImage graph(int h default 100) {
      int w = iceil(length());
      ret mergeBufferedImagesVertically(
        countIteratorToList(channels(), c ->
          simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
    }

    // render audio as stripes + graph (best way to look at it)
    default BufferedImage render(int h default 100) {
      ret mergeBufferedImagesVertically(stripes(h/2), graph(h));
    }

    // find maximum amplitude, going pixel-by-pixel
    // (remember: This clip may already have been temporally scaled with
    // speedUp(), so a "pixel" may represent the average of multiple
    // audio samples.)
    default double maxAmplitude() {
      int n = iceil(length()), channels = channels();
      double max = 0;
      for i to n:
        for c to channels:
          max = max(max, abs(sampleSum(c, i, i+1)));
      ret min(32767, max);
    }

    // There are various non-destructive virtual transformations
    // which you can do on the audio clip (gain, speed-up and time-shift).
    // All transformations are affine in time and amplitude and thus
    // preserve the "integral image" property.

    default IAudioSample gain(double factor) {
      ret factor == 1 ? this : new Gain(factor, this);
    }

    // gain to maximum volume possible without clipping
    default IAudioSample normalize() {
      ret gain(doubleRatio(32767, maxAmplitude()));
    }

    // resample with a factor
    public default IAudioSample speedUp(double factor) {
      ret factor == 1 ? this : new SpeedUp(factor, this);
    }

    // resample to a target frequency
    public default IAudioSample sampleAt(double freq) {
      ret speedUp(sampleRate()/freq);
    }

    public default IAudioSample timeShift aka shift(double shift) {
      ret shift == 0 ? this : new TimeShift(shift, this);
    }

    // For debug-printing. Values are scaled to -1 to 1 (like getPixel)
    // because why not. First channel only.
    default L firstPixels(int n default 20) {
      double[] pixels = new[n];
      for i to n:
        pixels[i] = sampleSum(0, i, i+1)/32768;
      ret wrapDoubleArrayAsList(pixels);
    }
  } // end of IAudioSample

  // The core integral 1D image.
  sclass AudioSample implements IAudioSample {
    int channels;
    double sampleRate;
    int length;

    // Here they are: the partial sums of the 16 bit audio samples.
    // Channels are stored interleaved.
    long[] data;

    public double sampleRate() { ret sampleRate; }
    public int channels() { ret channels; }
    public double length() { ret length; }

    // result is in the range -32768*(end-start) to 32767*(end-start)
    public double sampleSum(int channel, double start, double end) {
      // We could do linear interpolation here if we weren't so basic.
      int a = ifloor(start), b = ifloor(end);
      ret getEntry(channel, b-1)-getEntry(channel, a-1);
    }

    // Get an entry of the sum table - allow for out-of-bounds
    // requests (those just default to silence).
    long getEntry(int channel, int i) {
      if (i < 0) ret 0;
      i = min(i, length-1);
      ret data[i*channels+channel];
    }

    // perform the integration of the raw audio data
    *(L samples, int *channels, double *sampleRate) {
      length = lengthLevel2_shortArrays(samples);
      data = new long[length*channels];
      long[] sums = new[channels];
      int iSample = 0, iChunk = 0, iInArray = 0;
      short[] chunk = null;
      for i to length:
        for c to channels: {
          if (chunk == null || iInArray >= chunk.length) {
            chunk = samples.get(iChunk++);
            iInArray = 0;
          }
          data[iSample++] = (sums[c] += chunk[iInArray++]);
        }
    }
  }

  // implementation of gain modifier
  srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
    public double sampleRate() { ret original.sampleRate(); }
    public int channels() { ret original.channels(); }
    public double length() { ret original.length(); }

    public double sampleSum(int channel, double start, double end) {
      ret original.sampleSum(channel, start, end)*factor;
    }

    // coalesce consecutive gains
    public IAudioSample gain(double factor) {
      ret original.gain(this.factor*factor);
    }
  }

  // Implementation of the time-shift modifier.
  // Moves the input samples to the left (cuts off the beginning).
  // Shift can be fractional - we're in integral image (audio) wonderland
  // after all, where a traditional pixel has no meaning.
  srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
    public double sampleRate() { ret original.sampleRate(); }
    public int channels() { ret original.channels(); }
    public double length() { ret original.length()-shift; }

    public double sampleSum(int channel, double start, double end) {
      ret original.sampleSum(channel, start+shift, end+shift);
    }

    // coalesce consecutive time-shifts
    public IAudioSample timeShift(double shift) {
      ret original.timeShift(this.shift+shift);
    }
  }
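  // How the modifiers compose (a usage sketch added for illustration;
  // "clip" stands for any IAudioSample). Each call only wraps or rewraps -
  // no sample data is copied, and the coalescing overrides keep the
  // wrapper chain flat:
  //
  //   clip.gain(2).gain(0.5)     // == clip (gains coalesce to factor 1)
  //   clip.shift(5).shift(3)     // == TimeShift(8, clip)
  //   clip.speedUp(2).speedUp(3) // == SpeedUp(6, clip)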
" + factor); invFactor = 1/factor; } public double sampleRate() { ret original.sampleRate()*invFactor; } public int channels() { ret original.channels(); } public double length() { ret original.length()*invFactor; } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start*factor, end*factor)*invFactor; } // coalesce consecutive speed-ups public IAudioSample speedUp(double factor) { ret original.speedUp(this.factor*factor); } } // Constructors from various types of PCM data (including rendered-on-the-spot) *() {} *(short[] samples, int channels) { this(ll(samples), channels); } *(L samples, int channels) { mainSample = new AudioSample(samples, channels, defaultInputSampleRate()); } *(double seconds, VF1 soundSource, int channels) { this(soundSourceToShortArrays(seconds, soundSource, channels), channels); } // in-place modifiers for mainSample (convenience functions) void applyGain(double factor) { mainSample = mainSample.gain(factor); } void normalize { mainSample = mainSample.normalize(); } void speedUp(double factor) { mainSample = mainSample.speedUp(factor); } // Here come the actual analysis functions. // This looks at a number of periods of a given frequency starting at a certain time in the audio // and returns an intensity value. // No phase adjustment here, so you have to call this twice to get meaningful (complex) results. srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { double period, end; double rawSum() { period = sample.sampleRate()/freq; double sum = 0, t = start; for p to periods: { // Subtract an expected trough from an expected neighboring peak and add to overall sum. // Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature! // By the use of which we elegantly get around nasty complications like DC offsets. sum += sample.sampleSum(channel, t, t+period/2) - sample.sampleSum(channel, t+period/2, t+period); t += period; } end = t; ret sum; } // alternate calculation adjusted for duration double sumDividedByDuration() { ret rawSum()/(end-start); } } // divided by duration Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { double duration = sample.sampleRate()/freq*periods; ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration); } // Not divided by duration - this seems like the best frequency detector at this point. // As in a proper FFT/DCT, we return a complex value to represent phase. // Call abs() to get the desired intensity value. Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) { SumOfVibrations sum = new(sample, channel, start, freq, periods); double re = sum.rawSum(); sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle double im = sum.rawSum(); ret Complex(re, im); } }