// Idea: We simply turn the sample data into pixels and // create ourselves a verybig*1px integral image. // Then we use the integral image access functions to probe for // various frequencies and wavelets we come up with during the // recognition of whatever we are currently listening to. // For audio recognition, however, we need twice the bit depth // for each pixel's value, seeing as satisfying audio data usually // has at least 16 bits per sample, not 8 like a true-color // RGB channel. (Granted, those are evolving too these days.) // I mean we could reduce the audio to 8 bit as step 1... but do you // really want to torture your nascent audio recognition with nasty // dithered 8 bit retro sound? sclass AudioRecognizer { IAudioSample mainSample; double defaultInputSampleRate() { ret 44100; } // It works like this: There is a general interface for // accessing an "integrated" audio clip - IAudioSample. interface IAudioSample { int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer double length(); // in samples according to sampleRate double sampleRate(); // in hertz // Query the integral. // Result is in the range -32768*(end-start) to 32767*(end-start). // unless you applied too much gain (there is no clipping) double sampleSum(int channel, double start, double end); // Here the range is -1 to 1 just to spice things up default double getPixel(int channel, double start, double end) { ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768); } // render audio as black-and-white (grayscale) stripes // h = height per channel default BufferedImage stripes(int h default 50) { int w = iceil(length()); int channels = channels(); ret imageFromFunction(w, h*channels, (x, y) -> { int channel = y/h; double value = sampleSum(channel, x, x+1); // lose lower 8 bits and shift to 0 to 255 int digital = ifloor(value/256)+128; //if (x < 20) printVars(+value, +digital); ret rgbIntFullAlpha(digital, digital, digital); }); } // render audio as graph // h = height per channel default BufferedImage graph(int h default 100) { int w = iceil(length()); ret mergeBufferedImagesVertically( countIteratorToList(channels(), c -> simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767))); } // render audio as stripes + graph (best way to look at it) default BufferedImage render(int h default 100) { ret mergeBufferedImagesVertically(stripes(h/2), graph(h)); } // find maximum amplitude, going pixel-by-pixel // (remember: This clip may already have been temporally // scaled with speedUp(), so a "pixel" may represent the average // of multiple audio samples.) default double maxAmplitude() { int n = iceil(length()), channels = channels(); double max = 0; for i to n: for c to channels: max = max(max, abs(sampleSum(c, i, i+1))); ret min(32767, max); } // There are various non-destructive virtual // transformations which you can do on the audio clip // (gain, speed-up and time-shift). // All transformations are affine and thus preserve the // "integral image" property. default IAudioSample gain(double factor) { ret factor == 1 ? this : new Gain(factor, this); } default IAudioSample normalize() { ret gain(doubleRatio(32767, maxAmplitude())); } public default IAudioSample speedUp(double factor) { ret factor == 1 ? this : new SpeedUp(factor, this); } public default IAudioSample sampleAt(double freq) { ret speedUp(sampleRate()/freq); } public default IAudioSample timeShift aka shift(double shift) { ret shift == 0 ? this : new TimeShift(shift, this); } // valued from 0 to 1 because why not // first channel only default L firstPixels(int n default 20) { double[] pixels = new[n]; for i to n: pixels[i] = sampleSum(0, i, i+1)/32768; ret wrapDoubleArrayAsList(pixels); } } // end of IAudioSample // the 16 bit per channel 1D integral image // we use to represent audio samples sclass AudioSample implements IAudioSample { int channels; double sampleRate; int length; // Here they are: the partial sums of the 16 bit audio samples. // Channels are stored interleaved long[] data; public double sampleRate() { ret sampleRate; } public int channels() { ret channels; } public double length() { ret length; } // result is in the range -32768*(end-start) to 32767*(end-start) public double sampleSum(int channel, double start, double end) { // We could do linear interpolation here if we weren't so basic. int a = ifloor(start), b = ifloor(end); ret getEntry(channel, b-1)-getEntry(channel, a-1); } // Get an entry of the sum table - allow for out-of-bounds // requests (those just default to silence). long getEntry(int channel, int i) { if (i < 0) ret 0; i = min(i, length-1); ret data[i*channels+channel]; } // perform the integration of the raw audio data *(L samples, int *channels, double *sampleRate) { length = lengthLevel2_shortArrays(samples); data = new long[length*channels]; long[] sums = new[channels]; int iSample = 0, iChunk = 0, iInArray = 0; short[] chunk = null; for i to length: { if (chunk == null || iInArray >= chunk.length) { chunk = samples.get(iChunk++); iInArray = 0; } for c to channels: data[iSample++] = (sums[c] += chunk[iInArray++]); } } } // implementation of gain modifier srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample { public double sampleRate() { ret original.sampleRate(); } public int channels() { ret original.channels(); } public double length() { ret original.length(); } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start, end)*factor; } // optimize double gain public IAudioSample gain(double factor) { ret original.gain(this.factor*factor); } } // Implementation of the time-shift modifier. // moves the input samples to the left (cuts off beginning). // Shift can be fractional - we're in integral image (audio) wonderland after all // where a traditional pixel has no meaning. srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample { public double sampleRate() { ret original.sampleRate(); } public int channels() { ret original.channels(); } public double length() { ret original.length()-shift; } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start+shift, end+shift); } // optimize double shift public IAudioSample timeShift(double shift) { ret original.timeShift(this.shift+shift); } } // Implementation of the speed-up modifier // which transforms every frequency f to f*factor. // This is for convenience, you could also just // call sampleSum() directly with larger intervals. sclass SpeedUp implements IAudioSample { double factor, invFactor; IAudioSample original; *(double *factor, IAudioSample *original) { if (factor < 1) fail("Can't slow down. " + factor); invFactor = 1/factor; } public double sampleRate() { ret original.sampleRate()*invFactor; } public int channels() { ret original.channels(); } public double length() { ret original.length()*invFactor; } public double sampleSum(int channel, double start, double end) { ret original.sampleSum(channel, start*factor, end*factor)*invFactor; } // optimize double speed-up public IAudioSample speedUp(double factor) { ret original.speedUp(this.factor*factor); } } // Constructors from various types of PCM data *() {} *(short[] samples, int channels) { this(ll(samples), channels); } *(L samples, int channels) { mainSample = new AudioSample(samples, channels, defaultInputSampleRate()); } *(double seconds, VF1 soundSource, int channels) { this(soundSourceToShortArrays(seconds, soundSource, channels), channels); } // in-place modifiers for mainSample (convenience functions) void applyGain(double factor) { mainSample = mainSample.gain(factor); } void normalize { mainSample = mainSample.normalize(); } void speedUp(double factor) { mainSample = mainSample.speedUp(factor); } // Here come the actual analysis functions. // This looks at a number of periods of a given frequency // starting at a certain time in the audio // and returns an intensity value. srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { double period, end; double rawSum() { period = sample.sampleRate()/freq; double sum = 0; double t = start; for p to periods: { // Subtract a trough from a neighboring peak. // Nota bene: This is basically a Haar-like feature! // By the use of which we elegantly get around nasty // complications like DC offsets. sum += sample.sampleSum(channel, t, t+period/2) - sample.sampleSum(channel, t+period/2, t+period); t += period; } end = t; ret sum; } // alternate calculation adjusted for duration double sumDividedByDuration() { ret rawSum()/(end-start); } } // divided by duration Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { double duration = sample.sampleRate()/freq*periods; ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration); } // Not divided by duration - this seems like the best frequency detector at this point. // As in a proper FFT/DCT, we return a complex value to represent // phase. Call abs() to get the desired intensity value. Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) { double period = sample.sampleRate()/freq; SumOfVibrations sum = new(sample, channel, start, freq, periods); double re = sum.rawSum(); sum.start += sum.period/4; double im = sum.rawSum(); ret Complex(re, im); } }