AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry] [1032438]

// A base class for ultra-fast audio recognition.
// [Part of the Ultrafa.st framework]

// Idea: We do NOT spend time on a full FFT/DCT in the early stages
// of the recognition.

// Instead we stay in the time domain, turn the sample data into pixels
// and then convert that verybig*1px image into an "integral" image.

// Then we use the integral image access functions to probe for
// various frequencies and wavelets we come up with during the
// recognition of whatever we are currently listening to (that's
// what the higher-level algorithms based on this class do).

// Note we want a full 16 bit range for each pixel's value to make
// this truly hi-fi, so we actually reserve a whole 8 bytes for each 
// cell in the (1D) table (could make that 6 but that's annoying to
// handle).

// Stefan Reich, Gaz.AI, Sep 3 2021
//
// [Insert very liberal license here]

sclass AudioRecognizer {
  // The clip this recognizer works on. Assigned by the sample-taking
  // constructors and replaced by the in-place modifiers (applyGain etc.);
  // the no-argument constructor leaves it null.
  IAudioSample mainSample;
  
  // Sample rate in hertz that the constructors of this class pass to AudioSample.
  double defaultInputSampleRate() { ret 44100; }
  
  // Common view onto an "integrated" audio clip. The raw AudioSample as well as
  // the Gain/TimeShift/SpeedUp wrappers are all accessed through this interface.
  interface IAudioSample {
    // number of channels: 1 for mono, 2 for left+right, 3 for center+left+right...
    // or whatever channel model you prefer
    int channels();
    
    // clip length, counted in samples at sampleRate()
    double length();

    // samples per second (hertz)
    double sampleRate();
    
    // The integral query: sum of the sample values of one channel over start..end.
    // The result lies between -32768*(end-start) and 32767*(end-start)...
    // unless too much gain was applied (nothing is ever clipped).
    // channel runs from 0 to channels()-1, here and in every other method.
    double sampleSum(int channel, double start, double end);
    
    // Same query, but averaged and scaled to the range -1 to 1
    default double getPixel(int channel, double start, double end) {
      double sum = sampleSum(channel, start, end);
      double fullScale = (end-start)*32768;
      return doubleRatio(sum, fullScale);
    }
    
    // RENDERING FUNCTIONS (visualize audio as BufferedImage)
    
    // Draws the clip as grayscale stripes, one band of height h per channel,
    // one image column per sample position.
    default BufferedImage stripes(int h default 50) {
      int width = iceil(length());
      int height = h*channels();
      return imageFromFunction(width, height, (x, y) -> {
        // the row decides which channel's band we are in
        double sample = sampleSum(y/h, x, x+1);
        
        // drop the lower 8 bits, then move -128..127 up to 0..255
        int gray = ifloor(sample/256)+128;
        return rgbIntFullAlpha(gray, gray, gray);
      });
    }
   
    // Draws the clip as a graph, one graph of height h per channel,
    // stacked on top of each other.
    default BufferedImage graph(int h default 100) {
      int width = iceil(length());
      return mergeBufferedImagesVertically(
        countIteratorToList(channels(), c ->
          simpleGraph(width, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
    }
    
    // Stripes (at half height) above the graph - best way to look at a clip
    default BufferedImage render(int h default 100) {
      BufferedImage top = stripes(h/2);
      BufferedImage bottom = graph(h);
      return mergeBufferedImagesVertically(top, bottom);
    }
    
    // END OF RENDERING FUNCTIONS
   
    // Scans the clip pixel by pixel over all channels and returns the largest
    // absolute value found, capped at 32767.
    // (Keep in mind that after speedUp() a "pixel" can be the average
    // of several original audio samples.)
    default double maxAmplitude() {
      int pixels = iceil(length());
      int channels = channels();
      double loudest = 0;
      for (int i = 0; i < pixels; i++)
        for (int c = 0; c < channels; c++) {
          double amplitude = abs(sampleSum(c, i, i+1));
          loudest = max(loudest, amplitude);
        }
      return min(32767, loudest);
    }
    
    // Below: non-destructive, virtual transformations of the clip
    // (gain, speed-up and time-shift). Each one is affine in time and
    // amplitude, so the result still behaves like an integral image.
    
    // multiply all amplitudes by factor (factor 1 returns this clip itself)
    default IAudioSample gain(double factor) {
      if (factor == 1) return this;
      return new Gain(factor, this);
    }
    
    // Gain up to the loudest level that stays within 32767.
    // (Integral audio never actually clips; "clipping" here just means
    // exceeding the 32767 limit known from ordinary 16 bit audio.)
    default IAudioSample normalize() {
      double peak = maxAmplitude();
      return gain(doubleRatio(32767, peak));
    }
    
    // resample by a factor (factor 1 returns this clip itself)
    public default IAudioSample speedUp(double factor) {
      if (factor == 1) return this;
      return new SpeedUp(factor, this);
    }
    
    // resample so that the clip has the given sample rate
    public default IAudioSample sampleAt(double freq) {
      double factor = sampleRate()/freq;
      return speedUp(factor);
    }
    
    // move the clip in time (shift 0 returns this clip itself)
    public default IAudioSample timeShift aka shift(double shift) {
      if (shift == 0) return this;
      return new TimeShift(shift, this);
    }
    
    // For debug-printing: the first n pixels of channel 0, each divided by 32768
    // (so roughly -1 to 1 for audio within the 16 bit range).
    default L<Double> firstPixels(int n default 20) {
      double[] pixels = new double[n];
      for (int i = 0; i < n; i++)
        pixels[i] = sampleSum(0, i, i+1)/32768;
      return wrapDoubleArrayAsList(pixels);
    }
  } // end of IAudioSample
  
  // The core integral 1D image: per-channel running sums of the raw samples.
  sclass AudioSample implements IAudioSample {
    int channels;
    double sampleRate;
    
    // number of frames (one value per channel each), NOT the number of raw shorts
    int length;
    
    // Here they are: the partial sums of the 16 bit audio samples.
    // Channels are stored interleaved: the entry for frame i, channel c
    // is data[i*channels+c] and holds the sum of frames 0..i of channel c.
    long[] data;
    
    public double sampleRate() { ret sampleRate; }
    public int channels() { ret channels; }
    public double length() { ret length; }
    
    // Sum of frames floor(start) .. floor(end)-1 of the given channel.
    // result is in the range -32768*(end-start) to 32767*(end-start)
    public double sampleSum(int channel, double start, double end) {
      // We could do linear interpolation here if we weren't so basic.
      int a = ifloor(start), b = ifloor(end);
      ret getEntry(channel, b-1)-getEntry(channel, a-1);
    }
    
    // Get an entry of the sum table - allow for out-of-bounds
    // requests (those just default to silence).
    long getEntry(int channel, int i) {
      // Clamp first, then test: for an empty clip (length == 0) the clamp
      // yields -1, which has to read as silence instead of indexing data[-1...].
      i = min(i, length-1);
      if (i < 0) ret 0;
      ret data[i*channels+channel];
    }
    
    // Perform the integration of the raw audio data.
    // samples: PCM chunks holding interleaved frames (c0, c1, ..., c0, c1, ...).
    // A trailing incomplete frame (total size not divisible by channels) is dropped.
    *(L<short[]> samples, int *channels, double *sampleRate) {
      if (channels < 1) fail("Need at least one channel. " + channels);
      
      // The chunks contain frames*channels shorts in total, so the frame count
      // is the total divided by the channel count. (Using the raw total made
      // the loop below read past the input for anything but mono.)
      length = lengthLevel2_shortArrays(samples)/channels;
      data = new long[length*channels];
      long[] sums = new long[channels];
      int iSample = 0, iChunk = 0, iInArray = 0;
      short[] chunk = null;
      for i to length:
        for c to channels: {
          // 'while' rather than 'if' so that empty chunks are skipped
          while (chunk == null || iInArray >= chunk.length) {
            chunk = samples.get(iChunk++);
            iInArray = 0;
          }
          data[iSample++] = (sums[c] += chunk[iInArray++]);
        }
    }
  }

  // Gain modifier: a view on another clip with every amplitude multiplied by <factor>.
  srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
    // timing and channel layout are simply those of the wrapped clip
    public double sampleRate() { return original.sampleRate(); }
    public int channels() { return original.channels(); }
    public double length() { return original.length(); }
    
    // scale whatever the wrapped clip reports for the interval
    public double sampleSum(int channel, double start, double end) {
      double unscaled = original.sampleSum(channel, start, end);
      return unscaled*factor;
    }
    
    // A gain on top of a gain collapses into a single gain on the wrapped clip
    public IAudioSample gain(double factor) {
      double combined = this.factor*factor;
      return original.gain(combined);
    }
  }
  
  // Time-shift modifier: a view on another clip that is moved <shift> samples
  // to the left, i.e. the beginning is cut off.
  // <shift> may be fractional - in integral image (audio) wonderland
  // a traditional pixel has no meaning anyway.
  srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
    public double sampleRate() { return original.sampleRate(); }
    public int channels() { return original.channels(); }
    
    // the shifted-away part no longer counts towards the length
    public double length() {
      double fullLength = original.length();
      return fullLength-shift;
    }
    
    // translate the interval into the wrapped clip's time axis
    public double sampleSum(int channel, double start, double end) {
      double from = start+shift, to = end+shift;
      return original.sampleSum(channel, from, to);
    }
    
    // A shift on top of a shift collapses into a single shift on the wrapped clip
    public IAudioSample timeShift(double shift) {
      double combined = this.shift+shift;
      return original.timeShift(combined);
    }
  }
  
  // Speed-up modifier: a view on another clip in which every frequency f becomes f*factor.
  // Purely a convenience - calling sampleSum() on the original with larger
  // intervals gives you the same information.
  sclass SpeedUp implements IAudioSample {
    double factor;
    double invFactor; // 1/factor, computed once in the constructor
    IAudioSample original;

    // factors below 1 (slowing down) are rejected
    *(double *factor, IAudioSample *original) {
      if (factor < 1) fail("Can't slow down. " + factor);
      invFactor = 1/factor;
    }
    
    // sample rate and length both shrink by the speed-up factor
    public double sampleRate() {
      double rate = original.sampleRate();
      return rate*invFactor;
    }

    public int channels() { return original.channels(); }

    public double length() {
      double fullLength = original.length();
      return fullLength*invFactor;
    }
    
    // One interval here covers <factor> times as much of the original;
    // the sum is scaled back by invFactor so the amplitude range is unchanged.
    public double sampleSum(int channel, double start, double end) {
      double from = start*factor, to = end*factor;
      return original.sampleSum(channel, from, to)*invFactor;
    }
    
    // A speed-up on top of a speed-up collapses into a single one on the wrapped clip
    public IAudioSample speedUp(double factor) {
      double combined = this.factor*factor;
      return original.speedUp(combined);
    }
  }
  
  // Constructors from various types of PCM data (including rendered-on-the-spot)
  
  // Creates a recognizer without a clip; mainSample stays null.
  *() {}
  // Single PCM buffer: wrapped into a list via ll() and handed to the
  // list-based constructor below.
  *(short[] samples, int channels) {
    this(ll(samples), channels);
  }
  
  // PCM data in chunks: builds mainSample as an AudioSample over the chunks,
  // using defaultInputSampleRate() as the sample rate.
  *(L<short[]> samples, int channels) {
    mainSample = new AudioSample(samples, channels, defaultInputSampleRate());
  }
  
  // Rendered-on-the-spot audio: converts the sound source to short arrays with
  // soundSourceToShortArrays() and delegates to the list-based constructor.
  // NOTE(review): presumably renders <seconds> seconds of audio - confirm
  // against soundSourceToShortArrays(), which is not defined in this file.
  *(double seconds, VF1<double[]> soundSource, int channels) {
    this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
  }
  
  // in-place modifiers for mainSample (convenience functions)
  
  // Each of these replaces mainSample with the clip returned by the
  // IAudioSample method of the same purpose (gain / normalize / speedUp).
  // mainSample must be set: after the no-argument constructor it is null
  // and these calls dereference it.
  void applyGain(double factor) { mainSample = mainSample.gain(factor); }
  void normalize                { mainSample = mainSample.normalize(); }
  void speedUp(double factor)   { mainSample = mainSample.speedUp(factor); }
  
  // Here come the actual analysis functions.
  
  // Probes <periods> consecutive periods of frequency <freq>, beginning at time
  // <start> of the given channel, and condenses them into one intensity value.
  // There is no phase adjustment in here, so a meaningful (complex) result
  // needs two calls with different start offsets.
  srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    // both are filled in by rawSum()
    double period; // length of one period, in samples
    double end;    // time at which the last probed period ends
    
    double rawSum() {
      period = sample.sampleRate()/freq;
      double total = 0, cursor = start;
      for (int p = 0; p < periods; p++) {
        // Expected peak = first half of the period, expected trough = second half.
        // Both halves are equally long, which makes this a Haar-like feature
        // and cancels out any DC offset in the input data.
        double mid = cursor+period/2;
        double next = cursor+period;
        double firstHalf = sample.sampleSum(channel, cursor, mid);
        double secondHalf = sample.sampleSum(channel, mid, next);
        total += firstHalf - secondHalf;
        
        cursor = next;
      }
      end = cursor;
      return total;
    }
    
    // alternative result: raw sum divided by the length of the probed interval
    double sumDividedByDuration() {
      double raw = rawSum(); // also updates 'end', so it has to run first
      return raw/(end-start);
    }
  }
  
  // Complex sum of vibrations, divided by the duration (in samples)
  // of the probed interval.
  Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    double period = sample.sampleRate()/freq;
    double duration = period*periods;
    Complex raw = complexSumOfVibrations_raw(sample, channel, start, freq, periods);
    return div(raw, duration);
  }
  
  // Complex sum of vibrations without the division by duration - so far this
  // looks like the best frequency detector.
  // Like a proper FFT/DCT it yields a complex value in order to capture phase;
  // take abs() of the result to get the intensity.
  Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
    SumOfVibrations probe = new SumOfVibrations(sample, channel, start, freq, periods);
    
    // real part: probe at the requested start
    double re = probe.rawSum();
    
    // imaginary part: same probe a quarter period (90°) later;
    // probe.period has been set by the rawSum() call above
    probe.start += probe.period/4;
    double im = probe.rawSum();
    
    return Complex(re, im);
  }
}

Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx

Snippet ID:	#1032438
Snippet name:	AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry]
Eternal ID of this version:	#1032438/1
Text MD5:	cabe295b99480bb1fedbc48347cd8802
Transpilation MD5:	2c9c0805c0bcb6b2b4b520642d95cc2a
Author:	stefan
Category:	javax / audio recognition
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2021-09-05 06:21:13
Source code size:	11932 bytes / 307 lines
Pitched / IR pitched:	No / No
Views / Downloads:	585 / 690
Referenced in:	[show references]

< > BotCompany Repo | #1032438 // AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry]

JavaX fragment (include) [tags: use-pretranspiled]

Author comment

1	// A base class for ultra-fast audio recognition.
2	// [Part of the Ultrafa.st framework]
3
4	// Idea: We do NOT spend time on a full FFT/DCT in the early stages
5	// of the recognition.
6
7	// Instead we stay in the time domain, turn the sample data into pixels
8	// and then convert that verybig*1px image into an "integral" image.
9
10	// Then we use the integral image access functions to probe for
11	// various frequencies and wavelets we come up with during the
12	// recognition of whatever we are currently listening to (that's
13	// what the higher-level algorithms based on this class do).
14
15	// Note we want a full 16 bit range for each pixel's value to make
16	// this truly hi-fi, so we actually reserve a whole 8 bytes for each
17	// cell in the (1D) table (could make that 6 but that's annoying to
18	// handle).
19
20	// Stefan Reich, Gaz.AI, Sep 3 2021
21	//
22	// [Insert very liberal license here]
23
24	sclass AudioRecognizer {
25	IAudioSample mainSample;
26
27	double defaultInputSampleRate() { ret 44100; }
28
29	// It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample.
30	interface IAudioSample {
31	int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
32
33	double length(); // in samples according to sampleRate
34	double sampleRate(); // in hertz
35
36	// Query the integral.
37	// Result is in the range -32768*(end-start) to 32767*(end-start)...
38	// unless you applied too much gain (there is no clipping).
39	// channel is between 0 and channels()-1 from here on out
40	double sampleSum(int channel, double start, double end);
41
42	// Here the range is -1 to 1 just to spice things up
43	default double getPixel(int channel, double start, double end) {
44	ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768);
45	}
46
47	// RENDERING FUNCTIONS (visualize audio as BufferedImage)
48
49	// render audio as black-and-white (grayscale) stripes
50	// h = height per channel
51	default BufferedImage stripes(int h default 50) {
52	int w = iceil(length());
53	int channels = channels();
54	ret imageFromFunction(w, h*channels, (x, y) -> {
55	int channel = y/h;
56	double value = sampleSum(channel, x, x+1);
57
58	// lose lower 8 bits and shift to 0 to 255
59	int digital = ifloor(value/256)+128;
60	ret rgbIntFullAlpha(digital, digital, digital);
61	});
62	}
63
64	// render audio as graph
65	// h = height per channel
66	default BufferedImage graph(int h default 100) {
67	int w = iceil(length());
68	ret mergeBufferedImagesVertically(
69	countIteratorToList(channels(), c ->
70	simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
71	}
72
73	// render audio as stripes + graph (best way to look at it)
74	default BufferedImage render(int h default 100) {
75	ret mergeBufferedImagesVertically(stripes(h/2), graph(h));
76	}
77
78	// END OF RENDERING FUNCTIONS
79
80	// find maximum amplitude, going pixel-by-pixel
81	// (remember: This clip may already have been temporally
82	// scaled with speedUp(), so a "pixel" may represent the average
83	// of multiple audio samples.)
84	default double maxAmplitude() {
85	int n = iceil(length()), channels = channels();
86	double max = 0;
87	for i to n:
88	for c to channels:
89	max = max(max, abs(sampleSum(c, i, i+1)));
90	ret min(32767, max);
91	}
92
93	// There are various non-destructive virtual transformations
94	// which you can do on the audio clip (gain, speed-up and time-shift).
95	// All transformations are affine in time and amplitude and thus
96	// preserve the "integral image" property.
97
98	default IAudioSample gain(double factor) {
99	ret factor == 1 ? this : new Gain(factor, this);
100	}
101
102	// gain to maximum volume possible without clipping
103	// (even though clipping isn't even a thing in integral audio wonderland,
104	// so we just define "clipping" as exceeding the 32767 value we are used to from real audio.)
105	default IAudioSample normalize() {
106	ret gain(doubleRatio(32767, maxAmplitude()));
107	}
108
109	// resample with a factor
110	public default IAudioSample speedUp(double factor) {
111	ret factor == 1 ? this : new SpeedUp(factor, this);
112	}
113
114	// resample to a target frequency
115	public default IAudioSample sampleAt(double freq) {
116	ret speedUp(sampleRate()/freq);
117	}
118
119	public default IAudioSample timeShift aka shift(double shift) {
120	ret shift == 0 ? this : new TimeShift(shift, this);
121	}
122
123	// For debug-printing. Valued from 0 to 1 this time because why not. First channel only
124	default L<Double> firstPixels(int n default 20) {
125	double[] pixels = new[n];
126	for i to n:
127	pixels[i] = sampleSum(0, i, i+1)/32768;
128	ret wrapDoubleArrayAsList(pixels);
129	}
130	} // end of IAudioSample
131
132	// The core integral 1D image.
133	sclass AudioSample implements IAudioSample {
134	int channels;
135	double sampleRate;
136	int length;
137
138	// Here they are: the partial sums of the 16 bit audio samples.
139	// Channels are stored interleaved
140	long[] data;
141
142	public double sampleRate() { ret sampleRate; }
143	public int channels() { ret channels; }
144	public double length() { ret length; }
145
146	// result is in the range -32768*(end-start) to 32767*(end-start)
147	public double sampleSum(int channel, double start, double end) {
148	// We could do linear interpolation here if we weren't so basic.
149	int a = ifloor(start), b = ifloor(end);
150	ret getEntry(channel, b-1)-getEntry(channel, a-1);
151	}
152
153	// Get an entry of the sum table - allow for out-of-bounds
154	// requests (those just default to silence).
155	long getEntry(int channel, int i) {
156	if (i < 0) ret 0;
157	i = min(i, length-1);
158	ret data[i*channels+channel];
159	}
160
161	// perform the integration of the raw audio data
162	*(L<short[]> samples, int *channels, double *sampleRate) {
163	length = lengthLevel2_shortArrays(samples);
164	data = new long[length*channels];
165	long[] sums = new[channels];
166	int iSample = 0, iChunk = 0, iInArray = 0;
167	short[] chunk = null;
168	for i to length:
169	for c to channels: {
170	if (chunk == null || iInArray >= chunk.length) {
171	chunk = samples.get(iChunk++);
172	iInArray = 0;
173	}
174	data[iSample++] = (sums[c] += chunk[iInArray++]);
175	}
176	}
177	}
178
179	// implementation of gain modifier
180	srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
181	public double sampleRate() { ret original.sampleRate(); }
182	public int channels() { ret original.channels(); }
183	public double length() { ret original.length(); }
184
185	public double sampleSum(int channel, double start, double end) {
186	ret original.sampleSum(channel, start, end)*factor;
187	}
188
189	// coalesce consecutive gains
190	public IAudioSample gain(double factor) {
191	ret original.gain(this.factor*factor);
192	}
193	}
194
195	// Implementation of the time-shift modifier.
196	// moves the input <shift> samples to the left (cuts off beginning).
197	// Shift can be fractional - we're in integral image (audio) wonderland after all
198	// where a traditional pixel has no meaning.
199	srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
200	public double sampleRate() { ret original.sampleRate(); }
201	public int channels() { ret original.channels(); }
202	public double length() { ret original.length()-shift; }
203
204	public double sampleSum(int channel, double start, double end) {
205	ret original.sampleSum(channel, start+shift, end+shift);
206	}
207
208	// coalesce consecutive time-shifts
209	public IAudioSample timeShift(double shift) {
210	ret original.timeShift(this.shift+shift);
211	}
212	}
213
214	// Implementation of the speed-up modifier which transforms every frequency f to f*factor.
215	// This is for convenience, you could also just call sampleSum() directly with larger intervals.
216	sclass SpeedUp implements IAudioSample {
217	double factor, invFactor;
218	IAudioSample original;
219
220	*(double *factor, IAudioSample *original) {
221	if (factor < 1) fail("Can't slow down. " + factor);
222	invFactor = 1/factor;
223	}
224
225	public double sampleRate() { ret original.sampleRate()*invFactor; }
226	public int channels() { ret original.channels(); }
227	public double length() { ret original.length()*invFactor; }
228
229	public double sampleSum(int channel, double start, double end) {
230	ret original.sampleSum(channel, start*factor, end*factor)*invFactor;
231	}
232
233	// coalesce consecutive speed-ups
234	public IAudioSample speedUp(double factor) {
235	ret original.speedUp(this.factor*factor);
236	}
237	}
238
239	// Constructors from various types of PCM data (including rendered-on-the-spot)
240
241	*() {}
242	*(short[] samples, int channels) {
243	this(ll(samples), channels);
244	}
245
246	*(L<short[]> samples, int channels) {
247	mainSample = new AudioSample(samples, channels, defaultInputSampleRate());
248	}
249
250	*(double seconds, VF1<double[]> soundSource, int channels) {
251	this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
252	}
253
254	// in-place modifiers for mainSample (convenience functions)
255
256	void applyGain(double factor) { mainSample = mainSample.gain(factor); }
257	void normalize { mainSample = mainSample.normalize(); }
258	void speedUp(double factor) { mainSample = mainSample.speedUp(factor); }
259
260	// Here come the actual analysis functions.
261
262	// This looks at a number of periods of a given frequency starting at a certain time in the audio
263	// and returns an intensity value.
264	// No phase adjustment here, so you have to call this twice to get meaningful (complex) results.
265	srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
266	double period, end;
267
268	double rawSum() {
269	period = sample.sampleRate()/freq;
270	double sum = 0, t = start;
271	for p to periods: {
272	// Subtract an expected trough from an expected neighboring peak and add to overall sum.
273	// Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature!
274	// By the use of which we automatically get around nasty complications like DC offsets in the input data.
275
276	sum += sample.sampleSum(channel, t, t+period/2)
277	- sample.sampleSum(channel, t+period/2, t+period);
278
279	t += period;
280	}
281	end = t;
282	ret sum;
283	}
284
285	// alternate calculation adjusted for duration
286	double sumDividedByDuration() {
287	ret rawSum()/(end-start);
288	}
289	}
290
291	// divided by duration
292	Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
293	double duration = sample.sampleRate()/freq*periods;
294	ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration);
295	}
296
297	// Not divided by duration - this seems like the best frequency detector at this point.
298	// As in a proper FFT/DCT, we return a complex value to represent phase.
299	// Call abs() to get the desired intensity value.
300	Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
301	SumOfVibrations sum = new(sample, channel, start, freq, periods);
302	double re = sum.rawSum();
303	sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle
304	double im = sum.rawSum();
305	ret Complex(re, im);
306	}
307	}