// A base class for ultra-fast audio recognition. // [Part of the Ultrafa.st framework] // Idea: We do NOT spend time on a full FFT/DCT in the early stages // of the recognition. // Instead we stay in the time domain, turn the sample data into pixels // and then convert that verybig*1px image into an "integral" image. // Then we use the integral image access functions to probe for // various frequencies and wavelets we come up with during the // recognition of whatever we are currently listening to (that's // what the higher-level algorithms based on this class do). // Note we want a full 16 bit range for each pixel's value to make // this truly hi-fi, so we actually reserve a whole 8 bytes for each // cell in the (1D) table (could make that 6 but that's annoying to // handle). // Stefan Reich, Gaz.AI, Sep 3 2021 // // [Insert very liberal license here] sclass AudioRecognizer { IAudioSample mainSample; double defaultInputSampleRate() { ret 44100; } // It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample. interface IAudioSample { int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer double length(); // in samples according to sampleRate double sampleRate(); // in hertz // Query the integral. // Result is in the range -32768*(end-start) to 32767*(end-start)... // unless you applied too much gain (there is no clipping). // channel is between 0 and channels()-1 from here on out double sampleSum(int channel, double start, double end); // Here the range is -1 to 1 just to spice things up default double getPixel(int channel, double start, double end) { ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768); } // RENDERING FUNCTIONS (visualize audio as BufferedImage) // render audio as black-and-white (grayscale) stripes // h = height per channel default BufferedImage stripes(int h default 50) { int w = iceil(length()); int channels = channels(); ret imageFromFunction(w, h*channels, (x, y) -> { int channel = y/h; double value = sampleSum(channel, x, x+1); // lose lower 8 bits and shift to 0 to 255 int digital = ifloor(value/256)+128; ret rgbIntFullAlpha(digital, digital, digital); }); } // render audio as graph // h = height per channel default BufferedImage graph(int h default 100) { int w = iceil(length()); ret mergeBufferedImagesVertically( countIteratorToList(channels(), c -> simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767))); } // render audio as stripes + graph (best way to look at it) default BufferedImage render(int h default 100) { ret mergeBufferedImagesVertically(stripes(h/2), graph(h)); } // END OF RENDERING FUNCTIONS // find maximum amplitude, going pixel-by-pixel // (remember: This clip may already have been temporally // scaled with speedUp(), so a "pixel" may represent the average // of multiple audio samples.) default double maxAmplitude() { int n = iceil(length()), channels = channels(); double max = 0; for i to n: for c to channels: max = max(max, abs(sampleSum(c, i, i+1))); ret min(32767, max); } // There are various non-destructive virtual transformations // which you can do on the audio clip (gain, speed-up and time-shift). // All transformations are affine in time and amplitude and thus // preserve the "integral image" property. default IAudioSample gain(double factor) { ret factor == 1 ? this : new Gain(factor, this); } // gain to maximum volume possible without clipping // (even though clipping isn't even a thing in integral audio wonderland, // so we just define "clipping" as exceeding the 32767 value we are used to from real audio.) default IAudioSample normalize() { ret gain(doubleRatio(32767, maxAmplitude())); } // resample with a factor public default IAudioSample speedUp(double factor) { ret factor == 1 ? this : new SpeedUp(factor, this); } // resample to a target frequency public default IAudioSample sampleAt(double freq) { ret speedUp(sampleRate()/freq); } public default IAudioSample timeShift aka shift(double shift) { ret shift == 0 ? this : new TimeShift(shift, this); } // For debug-printing. Valued from 0 to 1 this time because why not. First channel only default L firstPixels(int n default 20) { double[] pixels = new[n]; for i to n: pixels[i] = sampleSum(0, i, i+1)/32768; ret wrapDoubleArrayAsList(pixels); } } // end of IAudioSample // The core integral 1D image. sclass AudioSample implements IAudioSample { int channels; double sampleRate; int length; // Here they are: the partial sums of the 16 bit audio samples. // Channels are stored interleaved long[] data; public double sampleRate() { ret sampleRate; } public int channels() { ret channels; } public double length() { ret length; } // result is in the range -32768*(end-start) to 32767*(end-start) public double sampleSum(int channel, double start, double end) { // We could do linear interpolation here if we weren't so basic. int a = ifloor(start), b = ifloor(end); ret getEntry(channel, b-1)-getEntry(channel, a-1); } // Get an entry of the sum table - allow for out-of-bounds // requests (those just default to silence). long getEntry(int channel, int i) { if (i < 0) ret 0; i = min(i, length-1); ret data[i*channels+channel]; } // perform the integration of the raw audio data *(L samples, int *channels, double *sampleRate) { length = lengthLevel2_shortArrays(samples); data = new long[length*channels]; long[] sums = new[channels]; int iSample = 0, iChunk = 0, iInArray = 0; short[] chunk = null; for i to length: for c to channels: { if (chunk == null || iInArray >= chunk.length) { chunk = samples.get(iChunk++); iInArray = 0; } data[iSample++] = (sums[c] += chunk[iInArray++]); } } } // implementation of gain modifier srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample { public double sampleRate() { ret original.sampleRate(); } public int channels() { ret original.channels(); } public double length() { ret original.length(); } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start, end)*factor; } // coalesce consecutive gains public IAudioSample gain(double factor) { ret original.gain(this.factor*factor); } } // Implementation of the time-shift modifier. // moves the input samples to the left (cuts off beginning). // Shift can be fractional - we're in integral image (audio) wonderland after all // where a traditional pixel has no meaning. srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample { public double sampleRate() { ret original.sampleRate(); } public int channels() { ret original.channels(); } public double length() { ret original.length()-shift; } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start+shift, end+shift); } // coalesce consecutive time-shifts public IAudioSample timeShift(double shift) { ret original.timeShift(this.shift+shift); } } // Implementation of the speed-up modifier which transforms every frequency f to f*factor. // This is for convenience, you could also just call sampleSum() directly with larger intervals. sclass SpeedUp implements IAudioSample { double factor, invFactor; IAudioSample original; *(double *factor, IAudioSample *original) { if (factor < 1) fail("Can't slow down. " + factor); invFactor = 1/factor; } public double sampleRate() { ret original.sampleRate()*invFactor; } public int channels() { ret original.channels(); } public double length() { ret original.length()*invFactor; } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start*factor, end*factor)*invFactor; } // coalesce consecutive speed-ups public IAudioSample speedUp(double factor) { ret original.speedUp(this.factor*factor); } } // Constructors from various types of PCM data (including rendered-on-the-spot) *() {} *(short[] samples, int channels) { this(ll(samples), channels); } *(L samples, int channels) { mainSample = new AudioSample(samples, channels, defaultInputSampleRate()); } *(double seconds, VF1 soundSource, int channels) { this(soundSourceToShortArrays(seconds, soundSource, channels), channels); } // in-place modifiers for mainSample (convenience functions) void applyGain(double factor) { mainSample = mainSample.gain(factor); } void normalize { mainSample = mainSample.normalize(); } void speedUp(double factor) { mainSample = mainSample.speedUp(factor); } // Here come the actual analysis functions. // This looks at a number of periods of a given frequency starting at a certain time in the audio // and returns an intensity value. // No phase adjustment here, so you have to call this twice to get meaningful (complex) results. srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { double period, end; double rawSum() { period = sample.sampleRate()/freq; double sum = 0, t = start; for p to periods: { // Subtract an expected trough from an expected neighboring peak and add to overall sum. // Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature! // By the use of which we automatically get around nasty complications like DC offsets in the input data. sum += sample.sampleSum(channel, t, t+period/2) - sample.sampleSum(channel, t+period/2, t+period); t += period; } end = t; ret sum; } // alternate calculation adjusted for duration double sumDividedByDuration() { ret rawSum()/(end-start); } } // divided by duration Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { double duration = sample.sampleRate()/freq*periods; ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration); } // Not divided by duration - this seems like the best frequency detector at this point. // As in a proper FFT/DCT, we return a complex value to represent phase. // Call abs() to get the desired intensity value. Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) { SumOfVibrations sum = new(sample, channel, start, freq, periods); double re = sum.rawSum(); sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle double im = sum.rawSum(); ret Complex(re, im); } }