AudioRecognizer [converts audio to integral "image"] [1032403]

// A base class for ultra-fast audio recognition.
// [Part of the Ultrafa.st framework]

// Idea: We do NOT spend time on a full FFT/DCT in the early stages
// of the recognition.

// Instead we stay in the time domain, turn the sample data into pixels
// and then convert that verybig*1px image into an "integral" image.

// Then we use the integral image access functions to probe for
// various frequencies and wavelets we come up with during the
// recognition of whatever we are currently listening to (that's
// what the higher-level algorithms based on this class do).

// Note we want a full 16 bit range for each pixel's value to make
// this truly hi-fi, so we actually reserve a whole 6 bytes for each 
// cell in the (1D) table.

// Stefan Reich, Gaz.AI, Sep 3 2021
//
// [Insert very liberal license here]

sclass AudioRecognizer {
  IAudioSample mainSample;
  
  // Sample rate (in hertz) that the constructors of this class
  // assign to raw PCM input.
  double defaultInputSampleRate() {
    return 44100;
  }
  
  // General read-only view of an "integrated" audio clip.
  // Implementations only have to answer four questions (channel count,
  // length, sample rate and sampleSum); rendering, analysis helpers
  // and the virtual transformations are all derived from those.
  interface IAudioSample {
    // Number of channels:
    // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
    int channels();
    
    // Length of the clip, counted in samples at sampleRate()
    double length();

    // Samples per second (hertz)
    double sampleRate();
    
    // The integral query: sum of the sample values of one channel
    // between the positions start and end.
    // channel is between 0 and channels()-1.
    // The result lies between -32768*(end-start) and 32767*(end-start)
    // unless gain has pushed values beyond that (nothing is clipped).
    double sampleSum(int channel, double start, double end);
    
    // Same query, but divided by the interval length and by 32768,
    // which puts un-gained audio into the range -1 to 1.
    default double getPixel(int channel, double start, double end) {
      double sum = sampleSum(channel, start, end);
      double fullScale = (end-start)*32768;
      return doubleRatio(sum, fullScale);
    }
    
    // RENDERING FUNCTIONS (visualize audio as BufferedImage)
    
    // Grayscale stripes: one horizontal band of height h per channel,
    // one image column per sample position.
    default BufferedImage stripes(int h default 50) {
      int width = iceil(length());
      int nChannels = channels();
      return imageFromFunction(width, h*nChannels, (x, y) -> {
        // each band of h rows belongs to one channel
        int band = y/h;
        double sum = sampleSum(band, x, x+1);
        
        // drop the lower 8 bits, then move -128..127 up to 0..255
        int gray = ifloor(sum/256)+128;
        return rgbIntFullAlpha(gray, gray, gray);
      });
    }
   
    // Graph rendering: one graph of height h per channel,
    // stacked on top of each other.
    default BufferedImage graph(int h default 100) {
      int width = iceil(length());
      return mergeBufferedImagesVertically(
        countIteratorToList(channels(), c ->
          simpleGraph(width, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
    }
    
    // Stripes (at half the height) placed above the graph -
    // the best way to look at a clip.
    default BufferedImage render(int h default 100) {
      BufferedImage stripesImage = stripes(h/2);
      BufferedImage graphImage = graph(h);
      return mergeBufferedImagesVertically(stripesImage, graphImage);
    }
    
    // END OF RENDERING FUNCTIONS
   
    // Largest absolute value found when stepping through all channels
    // one pixel (= interval of length 1) at a time. The result is
    // capped at 32767.
    // (A pixel is not necessarily one original sample: after speedUp()
    // it stands for the average of several of them.)
    default double maxAmplitude() {
      int pixels = iceil(length());
      int nChannels = channels();
      double peak = 0;
      for (int i = 0; i < pixels; i++)
        for (int c = 0; c < nChannels; c++) {
          double amplitude = abs(sampleSum(c, i, i+1));
          peak = max(peak, amplitude);
        }
      return min(32767, peak);
    }
    
    // VIRTUAL TRANSFORMATIONS
    // Gain, speed-up and time-shift wrap the clip instead of copying
    // it. They are affine in time and amplitude, so the wrapped clip
    // can still answer sampleSum() queries like an integral image.
    
    // Multiply all amplitudes by factor (factor 1 = no wrapper needed)
    default IAudioSample gain(double factor) {
      if (factor == 1) return this;
      return new Gain(factor, this);
    }
    
    // Apply the gain that brings maxAmplitude() to 32767, the value
    // we treat as the "clipping" limit (integral audio itself never clips).
    // NOTE(review): maxAmplitude() is capped at 32767, so the factor
    // computed here cannot drop below 1 for non-silent audio, i.e. an
    // over-gained clip is not attenuated - confirm this is intended.
    default IAudioSample normalize() {
      double peak = maxAmplitude();
      return gain(doubleRatio(32767, peak));
    }
    
    // Resample by a factor (factor 1 = no wrapper needed)
    public default IAudioSample speedUp(double factor) {
      if (factor == 1) return this;
      return new SpeedUp(factor, this);
    }
    
    // Resample to a target sample rate given in hertz
    public default IAudioSample sampleAt(double freq) {
      double factor = sampleRate()/freq;
      return speedUp(factor);
    }
    
    // Skip the first <shift> samples (shift 0 = no wrapper needed)
    public default IAudioSample timeShift aka shift(double shift) {
      if (shift == 0) return this;
      return new TimeShift(shift, this);
    }
    
    // For debug-printing: the first n pixels of the first channel,
    // each divided by 32768 (so signed, about -1 to 1 for un-gained audio).
    default L<Double> firstPixels(int n default 20) {
      double[] values = new double[n];
      for (int i = 0; i < n; i++)
        values[i] = sampleSum(0, i, i+1)/32768;
      return wrapDoubleArrayAsList(values);
    }
  } // end of IAudioSample
  
  // The core integral 1D image: a table of running sums over the raw
  // 16 bit samples. The sum over any interval is the difference of
  // two table entries, so every sampleSum() query costs two lookups.
  sclass AudioSample implements IAudioSample {
    int channels;
    double sampleRate;
    int length; // number of frames, i.e. samples per channel
    
    // Here they are: the partial sums of the 16 bit audio samples
    // in an array of 6-byte integers. Channels are stored interleaved
    // (entry for frame i, channel c is at index i*channels+c).
    HalfLongs data;
    
    public double sampleRate() { ret sampleRate; }
    public int channels() { ret channels; }
    public double length() { ret length; }
    
    // Sum of the samples of one channel in the interval start..end.
    // Both positions are rounded down to whole frames.
    // result is in the range -32768*(end-start) to 32767*(end-start)
    public double sampleSum(int channel, double start, double end) {
      // We could do linear interpolation here if we weren't so basic.
      int a = ifloor(start), b = ifloor(end);
      ret getEntry(channel, b-1)-getEntry(channel, a-1);
    }
    
    // Get an entry of the sum table - allow for out-of-bounds
    // requests (those just default to silence): positions before the
    // start yield 0, positions past the end yield the last entry
    // (so nothing is added beyond the end of the clip).
    long getEntry(int channel, int i) {
      // Clamp first, then test: for an empty clip length-1 is -1 and
      // must not be used as a table index.
      i = min(i, length-1);
      if (i < 0) ret 0;
      ret data.get(i*channels+channel);
    }
    
    // Perform the integration of the raw audio data.
    // samples: PCM data in chunks, channels interleaved
    // channels: number of channels (at least 1)
    // sampleRate: in hertz
    *(L<short[]> samples, int *channels, double *sampleRate) {
      if (channels < 1) fail("Need at least one channel: " + channels);
      
      // The loop below consumes <channels> shorts per frame, so the
      // frame count is the number of shorts divided by the channel count
      // (an incomplete trailing frame is dropped).
      // NOTE(review): assumes lengthLevel2_shortArrays returns the
      // total number of shorts over all chunks - confirm.
      length = lengthLevel2_shortArrays(samples)/channels;
      data = new HalfLongs(length*channels);
      long[] sums = new[channels]; // running sum per channel
      int iSample = 0, iChunk = 0, iInArray = 0;
      short[] chunk = null;
      for i to length:
        for c to channels: {
          // advance to the next chunk that still has data
          // (a loop, so that empty chunks are skipped)
          while (chunk == null || iInArray >= chunk.length) {
            chunk = samples.get(iChunk++);
            iInArray = 0;
          }
          data.set(iSample++, sums[c] += chunk[iInArray++]);
        }
    }
  }

  // Gain modifier: a view on another clip in which every sum
  // (and therefore every amplitude) is multiplied by <factor>.
  srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
    // channel layout and timing are passed through unchanged
    public int channels() { return original.channels(); }
    public double length() { return original.length(); }
    public double sampleRate() { return original.sampleRate(); }
    
    public double sampleSum(int channel, double start, double end) {
      double unscaled = original.sampleSum(channel, start, end);
      return unscaled*factor;
    }
    
    // Two gains in a row collapse into a single one that is applied
    // to the underlying clip directly.
    public IAudioSample gain(double factor) {
      double combined = this.factor*factor;
      return original.gain(combined);
    }
  }
  
  // Time-shift modifier: a view that begins <shift> samples into the
  // original clip, i.e. the audio moves to the left and its beginning
  // is cut off. The shift may be fractional - positions are doubles
  // throughout, a traditional pixel has no meaning here.
  srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
    public int channels() { return original.channels(); }
    public double sampleRate() { return original.sampleRate(); }
    
    // the part that was cut off no longer counts towards the length
    public double length() {
      return original.length()-shift;
    }
    
    public double sampleSum(int channel, double start, double end) {
      double from = start+shift, to = end+shift;
      return original.sampleSum(channel, from, to);
    }
    
    // Two time-shifts in a row collapse into a single one that is
    // applied to the underlying clip directly.
    public IAudioSample timeShift(double shift) {
      double combined = this.shift+shift;
      return original.timeShift(combined);
    }
  }
  
  // Speed-up modifier: a view in which every frequency f of the
  // original appears as f*factor. Purely a convenience - calling
  // sampleSum() on the original with larger intervals does the same.
  sclass SpeedUp implements IAudioSample {
    double factor, invFactor;
    IAudioSample original;

    // factor must be at least 1 (slowing down is rejected)
    *(double *factor, IAudioSample *original) {
      if (factor < 1) fail("Can't slow down. " + factor);
      invFactor = 1/factor;
    }
    
    public int channels() { return original.channels(); }
    
    // length and sample rate both shrink by the speed-up factor
    public double length() { return original.length()*invFactor; }
    public double sampleRate() { return original.sampleRate()*invFactor; }
    
    // Query the correspondingly stretched interval of the original and
    // scale the sum back down, so a pixel becomes the average of the
    // original samples it covers.
    public double sampleSum(int channel, double start, double end) {
      double stretchedSum = original.sampleSum(channel, start*factor, end*factor);
      return stretchedSum*invFactor;
    }
    
    // Two speed-ups in a row collapse into a single one that is
    // applied to the underlying clip directly.
    public IAudioSample speedUp(double factor) {
      double combined = this.factor*factor;
      return original.speedUp(combined);
    }
  }
  
  // Constructors from various types of PCM data (including rendered-on-the-spot)
  // All non-empty variants end up in the L<short[]> constructor.
  
  // Empty recognizer: mainSample is left unset.
  *() {}
  // From one array of 16 bit samples; ll(samples) presumably wraps
  // the array into a one-element list - verify against the helper.
  *(short[] samples, int channels) {
    this(ll(samples), channels);
  }
  
  // From a list of sample chunks. The data is integrated into an
  // AudioSample tagged with defaultInputSampleRate().
  *(L<short[]> samples, int channels) {
    mainSample = new AudioSample(samples, channels, defaultInputSampleRate());
  }
  
  // From a sound source that is converted to 16 bit chunks by
  // soundSourceToShortArrays (presumably rendering <seconds> of audio -
  // TODO confirm against the helper).
  *(double seconds, VF1<double[]> soundSource, int channels) {
    this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
  }
  
  // In-place modifiers for mainSample (convenience functions).
  // Each one replaces mainSample with the transformed view returned
  // by the IAudioSample method of the same purpose.
  
  void applyGain(double factor) {
    mainSample = mainSample.gain(factor);
  }

  void normalize() {
    mainSample = mainSample.normalize();
  }

  void speedUp(double factor) {
    mainSample = mainSample.speedUp(factor);
  }
  
  // Here come the actual analysis functions.
  
  // Probe for one frequency: looks at <periods> consecutive periods of
  // frequency <freq>, beginning at position <start> of one channel,
  // and returns an intensity value.
  // There is no phase adjustment in here, so a meaningful (complex)
  // result takes two calls with different start positions.
  srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    // Both are filled in by rawSum():
    // period = length of one period in samples, end = position after the last period
    double period, end;
    
    double rawSum() {
      period = sample.sampleRate()/freq;
      double halfPeriod = period/2;
      double total = 0, pos = start;
      for (int p = 0; p < periods; p++) {
        // First half of the period = expected peak, second half =
        // expected trough. Both halves have the same length, so their
        // difference is basically a Haar-like feature - which gets us
        // around nasty complications like DC offsets in the input data.
        double middle = pos+halfPeriod;
        double peak = sample.sampleSum(channel, pos, middle);
        double trough = sample.sampleSum(channel, middle, pos+period);
        total += peak-trough;
        
        pos += period;
      }
      end = pos;
      return total;
    }
    
    // alternate calculation adjusted for duration
    double sumDividedByDuration() {
      double sum = rawSum(); // must run first: it sets <end>
      return sum/(end-start);
    }
  }
  
  // Complex frequency probe, divided by the probed duration
  // (= length of <periods> periods of <freq>, in samples).
  Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    double samplesPerPeriod = sample.sampleRate()/freq;
    double duration = samplesPerPeriod*periods;
    Complex raw = complexSumOfVibrations_raw(sample, channel, start, freq, periods);
    return div(raw, duration);
  }
  
  // Complex frequency probe, not divided by duration - this seems
  // like the best frequency detector at this point.
  // As in a proper FFT/DCT the result is complex in order to carry
  // the phase; call abs() on it for the intensity.
  Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
    SumOfVibrations probe = new SumOfVibrations(sample, channel, start, freq, periods);
    double real = probe.rawSum();
    
    // Second probe a quarter period later (90° phase shift) to catch
    // the other half of the circle. probe.period is valid here because
    // the first rawSum() call has set it.
    probe.start = probe.start + probe.period/4;
    double imaginary = probe.rawSum();
    
    return Complex(real, imaginary);
  }
}

Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx

Snippet ID:	#1032403
Snippet name:	AudioRecognizer [converts audio to integral "image"]
Eternal ID of this version:	#1032403/139
Text MD5:	c32474ce0d23ac9e491d7e0880be4bf7
Transpilation MD5:	0c80b5fd7c725c15c44e7eb4a11f9e1e
Author:	stefan
Category:	javax / audio recognition
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2021-09-05 08:29:07
Source code size:	11923 bytes / 306 lines
Pitched / IR pitched:	No / No
Views / Downloads:	525 / 1030
Version history:	138 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1032403 // AudioRecognizer [converts audio to integral "image"]

JavaX fragment (include) [tags: use-pretranspiled]

Author comment

1	// A base class for ultra-fast audio recognition.
2	// [Part of the Ultrafa.st framework]
3
4	// Idea: We do NOT spend time on a full FFT/DCT in the early stages
5	// of the recognition.
6
7	// Instead we stay in the time domain, turn the sample data into pixels
8	// and then convert that verybig*1px image into an "integral" image.
9
10	// Then we use the integral image access functions to probe for
11	// various frequencies and wavelets we come up with during the
12	// recognition of whatever we are currently listening to (that's
13	// what the higher-level algorithms based on this class do).
14
15	// Note we want a full 16 bit range for each pixel's value to make
16	// this truly hi-fi, so we actually reserve a whole 6 bytes for each
17	// cell in the (1D) table.
18
19	// Stefan Reich, Gaz.AI, Sep 3 2021
20	//
21	// [Insert very liberal license here]
22
23	sclass AudioRecognizer {
24	IAudioSample mainSample;
25
26	double defaultInputSampleRate() { ret 44100; }
27
28	// It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample.
29	interface IAudioSample {
30	int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
31
32	double length(); // in samples according to sampleRate
33	double sampleRate(); // in hertz
34
35	// Query the integral.
36	// Result is in the range -32768*(end-start) to 32767*(end-start)...
37	// unless you applied too much gain (there is no clipping).
38	// channel is between 0 and channels()-1 from here on out
39	double sampleSum(int channel, double start, double end);
40
41	// Here the range is -1 to 1 just to spice things up
42	default double getPixel(int channel, double start, double end) {
43	ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768);
44	}
45
46	// RENDERING FUNCTIONS (visualize audio as BufferedImage)
47
48	// render audio as black-and-white (grayscale) stripes
49	// h = height per channel
50	default BufferedImage stripes(int h default 50) {
51	int w = iceil(length());
52	int channels = channels();
53	ret imageFromFunction(w, h*channels, (x, y) -> {
54	int channel = y/h;
55	double value = sampleSum(channel, x, x+1);
56
57	// lose lower 8 bits and shift to 0 to 255
58	int digital = ifloor(value/256)+128;
59	ret rgbIntFullAlpha(digital, digital, digital);
60	});
61	}
62
63	// render audio as graph
64	// h = height per channel
65	default BufferedImage graph(int h default 100) {
66	int w = iceil(length());
67	ret mergeBufferedImagesVertically(
68	countIteratorToList(channels(), c ->
69	simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
70	}
71
72	// render audio as stripes + graph (best way to look at it)
73	default BufferedImage render(int h default 100) {
74	ret mergeBufferedImagesVertically(stripes(h/2), graph(h));
75	}
76
77	// END OF RENDERING FUNCTIONS
78
79	// find maximum amplitude, going pixel-by-pixel
80	// (remember: This clip may already have been temporally
81	// scaled with speedUp(), so a "pixel" may represent the average
82	// of multiple audio samples.)
83	default double maxAmplitude() {
84	int n = iceil(length()), channels = channels();
85	double max = 0;
86	for i to n:
87	for c to channels:
88	max = max(max, abs(sampleSum(c, i, i+1)));
89	ret min(32767, max);
90	}
91
92	// There are various non-destructive virtual transformations
93	// which you can do on the audio clip (gain, speed-up and time-shift).
94	// All transformations are affine in time and amplitude and thus
95	// preserve the "integral image" property.
96
97	default IAudioSample gain(double factor) {
98	ret factor == 1 ? this : new Gain(factor, this);
99	}
100
101	// gain to maximum volume possible without clipping
102	// (even though clipping isn't even a thing in integral audio wonderland,
103	// so we just define "clipping" as exceeding the 32767 value we are used to from real audio.)
104	default IAudioSample normalize() {
105	ret gain(doubleRatio(32767, maxAmplitude()));
106	}
107
108	// resample with a factor
109	public default IAudioSample speedUp(double factor) {
110	ret factor == 1 ? this : new SpeedUp(factor, this);
111	}
112
113	// resample to a target frequency
114	public default IAudioSample sampleAt(double freq) {
115	ret speedUp(sampleRate()/freq);
116	}
117
118	public default IAudioSample timeShift aka shift(double shift) {
119	ret shift == 0 ? this : new TimeShift(shift, this);
120	}
121
122	// For debug-printing. Valued from 0 to 1 this time because why not. First channel only
123	default L<Double> firstPixels(int n default 20) {
124	double[] pixels = new[n];
125	for i to n:
126	pixels[i] = sampleSum(0, i, i+1)/32768;
127	ret wrapDoubleArrayAsList(pixels);
128	}
129	} // end of IAudioSample
130
131	// The core integral 1D image.
132	sclass AudioSample implements IAudioSample {
133	int channels;
134	double sampleRate;
135	int length;
136
137	// Here they are: the partial sums of the 16 bit audio samples
138	// in an array of 6-byte integers. Channels are stored interleaved.
139	HalfLongs data;
140
141	public double sampleRate() { ret sampleRate; }
142	public int channels() { ret channels; }
143	public double length() { ret length; }
144
145	// result is in the range -32768*(end-start) to 32767*(end-start)
146	public double sampleSum(int channel, double start, double end) {
147	// We could do linear interpolation here if we weren't so basic.
148	int a = ifloor(start), b = ifloor(end);
149	ret getEntry(channel, b-1)-getEntry(channel, a-1);
150	}
151
152	// Get an entry of the sum table - allow for out-of-bounds
153	// requests (those just default to silence).
154	long getEntry(int channel, int i) {
155	if (i < 0) ret 0;
156	i = min(i, length-1);
157	ret data.get(i*channels+channel);
158	}
159
160	// perform the integration of the raw audio data
161	*(L<short[]> samples, int *channels, double *sampleRate) {
162	length = lengthLevel2_shortArrays(samples);
163	data = new HalfLongs(length*channels);
164	long[] sums = new[channels];
165	int iSample = 0, iChunk = 0, iInArray = 0;
166	short[] chunk = null;
167	for i to length:
168	for c to channels: {
169	if (chunk == null || iInArray >= chunk.length) {
170	chunk = samples.get(iChunk++);
171	iInArray = 0;
172	}
173	data.set(iSample++, sums[c] += chunk[iInArray++]);
174	}
175	}
176	}
177
178	// implementation of gain modifier
179	srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
180	public double sampleRate() { ret original.sampleRate(); }
181	public int channels() { ret original.channels(); }
182	public double length() { ret original.length(); }
183
184	public double sampleSum(int channel, double start, double end) {
185	ret original.sampleSum(channel, start, end)*factor;
186	}
187
188	// coalesce consecutive gains
189	public IAudioSample gain(double factor) {
190	ret original.gain(this.factor*factor);
191	}
192	}
193
194	// Implementation of the time-shift modifier.
195	// moves the input <shift> samples to the left (cuts off beginning).
196	// Shift can be fractional - we're in integral image (audio) wonderland after all
197	// where a traditional pixel has no meaning.
198	srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
199	public double sampleRate() { ret original.sampleRate(); }
200	public int channels() { ret original.channels(); }
201	public double length() { ret original.length()-shift; }
202
203	public double sampleSum(int channel, double start, double end) {
204	ret original.sampleSum(channel, start+shift, end+shift);
205	}
206
207	// coalesce consecutive time-shifts
208	public IAudioSample timeShift(double shift) {
209	ret original.timeShift(this.shift+shift);
210	}
211	}
212
213	// Implementation of the speed-up modifier which transforms every frequency f to f*factor.
214	// This is for convenience, you could also just call sampleSum() directly with larger intervals.
215	sclass SpeedUp implements IAudioSample {
216	double factor, invFactor;
217	IAudioSample original;
218
219	*(double *factor, IAudioSample *original) {
220	if (factor < 1) fail("Can't slow down. " + factor);
221	invFactor = 1/factor;
222	}
223
224	public double sampleRate() { ret original.sampleRate()*invFactor; }
225	public int channels() { ret original.channels(); }
226	public double length() { ret original.length()*invFactor; }
227
228	public double sampleSum(int channel, double start, double end) {
229	ret original.sampleSum(channel, start*factor, end*factor)*invFactor;
230	}
231
232	// coalesce consecutive speed-ups
233	public IAudioSample speedUp(double factor) {
234	ret original.speedUp(this.factor*factor);
235	}
236	}
237
238	// Constructors from various types of PCM data (including rendered-on-the-spot)
239
240	*() {}
241	*(short[] samples, int channels) {
242	this(ll(samples), channels);
243	}
244
245	*(L<short[]> samples, int channels) {
246	mainSample = new AudioSample(samples, channels, defaultInputSampleRate());
247	}
248
249	*(double seconds, VF1<double[]> soundSource, int channels) {
250	this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
251	}
252
253	// in-place modifiers for mainSample (convenience functions)
254
255	void applyGain(double factor) { mainSample = mainSample.gain(factor); }
256	void normalize { mainSample = mainSample.normalize(); }
257	void speedUp(double factor) { mainSample = mainSample.speedUp(factor); }
258
259	// Here come the actual analysis functions.
260
261	// This looks at a number of periods of a given frequency starting at a certain time in the audio
262	// and returns an intensity value.
263	// No phase adjustment here, so you have to call this twice to get meaningful (complex) results.
264	srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
265	double period, end;
266
267	double rawSum() {
268	period = sample.sampleRate()/freq;
269	double sum = 0, t = start;
270	for p to periods: {
271	// Subtract an expected trough from an expected neighboring peak and add to overall sum.
272	// Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature!
273	// By the use of which we automatically get around nasty complications like DC offsets in the input data.
274
275	sum += sample.sampleSum(channel, t, t+period/2)
276	- sample.sampleSum(channel, t+period/2, t+period);
277
278	t += period;
279	}
280	end = t;
281	ret sum;
282	}
283
284	// alternate calculation adjusted for duration
285	double sumDividedByDuration() {
286	ret rawSum()/(end-start);
287	}
288	}
289
290	// divided by duration
291	Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
292	double duration = sample.sampleRate()/freq*periods;
293	ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration);
294	}
295
296	// Not divided by duration - this seems like the best frequency detector at this point.
297	// As in a proper FFT/DCT, we return a complex value to represent phase.
298	// Call abs() to get the desired intensity value.
299	Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
300	SumOfVibrations sum = new(sample, channel, start, freq, periods);
301	double re = sum.rawSum();
302	sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle
303	double im = sum.rawSum();
304	ret Complex(re, im);
305	}
306	}