Libraryless. Click here for Pure Java version (5810L/34K).
1 | // A base class for ultra-fast audio recognition. |
2 | // [Part of the Ultrafa.st framework] |
3 | |
4 | // Idea: We do NOT spend time on a full FFT/DCT in the early stages |
5 | // of the recognition. |
6 | |
7 | // Instead we stay in the time domain, turn the sample data into pixels |
8 | // and then convert that verybig*1px image into an "integral" image. |
9 | |
10 | // Then we use the integral image access functions to probe for |
11 | // various frequencies and wavelets we come up with during the |
12 | // recognition of whatever we are currently listening to (that's |
13 | // what the higher-level algorithms based on this class do). |
14 | |
15 | // Note we want a full 16 bit range for each pixel's value to make |
16 | // this truly hi-fi, so we actually reserve a whole 6 bytes for each |
17 | // cell in the (1D) table. |
18 | |
19 | // Stefan Reich, Gaz.AI, Sep 3 2021 |
20 | // |
21 | // [Insert very liberal license here] |
22 | |
23 | sclass AudioRecognizer { |
24 | IAudioSample mainSample; |
25 | |
26 | double defaultInputSampleRate() { ret 44100; } |
27 | |
28 | // It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample. |
29 | interface IAudioSample { |
30 | int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer |
31 | |
32 | double length(); // in samples according to sampleRate |
33 | double sampleRate(); // in hertz |
34 | |
35 | // Query the integral. |
36 | // Result is in the range -32768*(end-start) to 32767*(end-start)... |
37 | // unless you applied too much gain (there is no clipping). |
38 | // channel is between 0 and channels()-1 from here on out |
39 | double sampleSum(int channel, double start, double end); |
40 | |
41 | // Here the range is -1 to 1 just to spice things up |
42 | default double getPixel(int channel, double start, double end) { |
43 | ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768); |
44 | } |
45 | |
46 | // RENDERING FUNCTIONS (visualize audio as BufferedImage) |
47 | |
48 | // render audio as black-and-white (grayscale) stripes |
49 | // h = height per channel |
50 | default BufferedImage stripes(int h default 50) { |
51 | int w = iceil(length()); |
52 | int channels = channels(); |
53 | ret imageFromFunction(w, h*channels, (x, y) -> { |
54 | int channel = y/h; |
55 | double value = sampleSum(channel, x, x+1); |
56 | |
57 | // lose lower 8 bits and shift to 0 to 255 |
58 | int digital = ifloor(value/256)+128; |
59 | ret rgbIntFullAlpha(digital, digital, digital); |
60 | }); |
61 | } |
62 | |
63 | // render audio as graph |
64 | // h = height per channel |
65 | default BufferedImage graph(int h default 100) { |
66 | int w = iceil(length()); |
67 | ret mergeBufferedImagesVertically( |
68 | countIteratorToList(channels(), c -> |
69 | simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767))); |
70 | } |
71 | |
72 | // render audio as stripes + graph (best way to look at it) |
73 | default BufferedImage render(int h default 100) { |
74 | ret mergeBufferedImagesVertically(stripes(h/2), graph(h)); |
75 | } |
76 | |
77 | // END OF RENDERING FUNCTIONS |
78 | |
79 | // find maximum amplitude, going pixel-by-pixel |
80 | // (remember: This clip may already have been temporally |
81 | // scaled with speedUp(), so a "pixel" may represent the average |
82 | // of multiple audio samples.) |
83 | default double maxAmplitude() { |
84 | int n = iceil(length()), channels = channels(); |
85 | double max = 0; |
86 | for i to n: |
87 | for c to channels: |
88 | max = max(max, abs(sampleSum(c, i, i+1))); |
89 | ret min(32767, max); |
90 | } |
91 | |
92 | // There are various non-destructive virtual transformations |
93 | // which you can do on the audio clip (gain, speed-up and time-shift). |
94 | // All transformations are affine in time and amplitude and thus |
95 | // preserve the "integral image" property. |
96 | |
97 | default IAudioSample gain(double factor) { |
98 | ret factor == 1 ? this : new Gain(factor, this); |
99 | } |
100 | |
101 | // gain to maximum volume possible without clipping |
102 | // (even though clipping isn't even a thing in integral audio wonderland, |
103 | // so we just define "clipping" as exceeding the 32767 value we are used to from real audio.) |
104 | default IAudioSample normalize() { |
105 | ret gain(doubleRatio(32767, maxAmplitude())); |
106 | } |
107 | |
108 | // resample with a factor |
109 | public default IAudioSample speedUp(double factor) { |
110 | ret factor == 1 ? this : new SpeedUp(factor, this); |
111 | } |
112 | |
113 | // resample to a target frequency |
114 | public default IAudioSample sampleAt(double freq) { |
115 | ret speedUp(sampleRate()/freq); |
116 | } |
117 | |
118 | public default IAudioSample timeShift aka shift(double shift) { |
119 | ret shift == 0 ? this : new TimeShift(shift, this); |
120 | } |
121 | |
122 | // For debug-printing. Valued from 0 to 1 this time because why not. First channel only |
123 | default L<Double> firstPixels(int n default 20) { |
124 | double[] pixels = new[n]; |
125 | for i to n: |
126 | pixels[i] = sampleSum(0, i, i+1)/32768; |
127 | ret wrapDoubleArrayAsList(pixels); |
128 | } |
129 | } // end of IAudioSample |
130 | |
131 | // The core integral 1D image. |
132 | sclass AudioSample implements IAudioSample { |
133 | int channels; |
134 | double sampleRate; |
135 | int length; |
136 | |
137 | // Here they are: the partial sums of the 16 bit audio samples |
138 | // in an array of 6-byte integers. Channels are stored interleaved. |
139 | HalfLongs data; |
140 | |
141 | public double sampleRate() { ret sampleRate; } |
142 | public int channels() { ret channels; } |
143 | public double length() { ret length; } |
144 | |
145 | // result is in the range -32768*(end-start) to 32767*(end-start) |
146 | public double sampleSum(int channel, double start, double end) { |
147 | // We could do linear interpolation here if we weren't so basic. |
148 | int a = ifloor(start), b = ifloor(end); |
149 | ret getEntry(channel, b-1)-getEntry(channel, a-1); |
150 | } |
151 | |
152 | // Get an entry of the sum table - allow for out-of-bounds |
153 | // requests (those just default to silence). |
154 | long getEntry(int channel, int i) { |
155 | if (i < 0) ret 0; |
156 | i = min(i, length-1); |
157 | ret data.get(i*channels+channel); |
158 | } |
159 | |
160 | // perform the integration of the raw audio data |
161 | *(L<short[]> samples, int *channels, double *sampleRate) { |
162 | length = lengthLevel2_shortArrays(samples); |
163 | data = new HalfLongs(length*channels); |
164 | long[] sums = new[channels]; |
165 | int iSample = 0, iChunk = 0, iInArray = 0; |
166 | short[] chunk = null; |
167 | for i to length: |
168 | for c to channels: { |
169 | if (chunk == null || iInArray >= chunk.length) { |
170 | chunk = samples.get(iChunk++); |
171 | iInArray = 0; |
172 | } |
173 | data.set(iSample++, sums[c] += chunk[iInArray++]); |
174 | } |
175 | } |
176 | } |
177 | |
178 | // implementation of gain modifier |
179 | srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample { |
180 | public double sampleRate() { ret original.sampleRate(); } |
181 | public int channels() { ret original.channels(); } |
182 | public double length() { ret original.length(); } |
183 | |
184 | public double sampleSum(int channel, double start, double end) { |
185 | ret original.sampleSum(channel, start, end)*factor; |
186 | } |
187 | |
188 | // coalesce consecutive gains |
189 | public IAudioSample gain(double factor) { |
190 | ret original.gain(this.factor*factor); |
191 | } |
192 | } |
193 | |
194 | // Implementation of the time-shift modifier. |
195 | // moves the input <shift> samples to the left (cuts off beginning). |
196 | // Shift can be fractional - we're in integral image (audio) wonderland after all |
197 | // where a traditional pixel has no meaning. |
198 | srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample { |
199 | public double sampleRate() { ret original.sampleRate(); } |
200 | public int channels() { ret original.channels(); } |
201 | public double length() { ret original.length()-shift; } |
202 | |
203 | public double sampleSum(int channel, double start, double end) { |
204 | ret original.sampleSum(channel, start+shift, end+shift); |
205 | } |
206 | |
207 | // coalesce consecutive time-shifts |
208 | public IAudioSample timeShift(double shift) { |
209 | ret original.timeShift(this.shift+shift); |
210 | } |
211 | } |
212 | |
213 | // Implementation of the speed-up modifier which transforms every frequency f to f*factor. |
214 | // This is for convenience, you could also just call sampleSum() directly with larger intervals. |
215 | sclass SpeedUp implements IAudioSample { |
216 | double factor, invFactor; |
217 | IAudioSample original; |
218 | |
219 | *(double *factor, IAudioSample *original) { |
220 | if (factor < 1) fail("Can't slow down. " + factor); |
221 | invFactor = 1/factor; |
222 | } |
223 | |
224 | public double sampleRate() { ret original.sampleRate()*invFactor; } |
225 | public int channels() { ret original.channels(); } |
226 | public double length() { ret original.length()*invFactor; } |
227 | |
228 | public double sampleSum(int channel, double start, double end) { |
229 | ret original.sampleSum(channel, start*factor, end*factor)*invFactor; |
230 | } |
231 | |
232 | // coalesce consecutive speed-ups |
233 | public IAudioSample speedUp(double factor) { |
234 | ret original.speedUp(this.factor*factor); |
235 | } |
236 | } |
237 | |
238 | // Constructors from various types of PCM data (including rendered-on-the-spot) |
239 | |
240 | *() {} |
241 | *(short[] samples, int channels) { |
242 | this(ll(samples), channels); |
243 | } |
244 | |
245 | *(L<short[]> samples, int channels) { |
246 | mainSample = new AudioSample(samples, channels, defaultInputSampleRate()); |
247 | } |
248 | |
249 | *(double seconds, VF1<double[]> soundSource, int channels) { |
250 | this(soundSourceToShortArrays(seconds, soundSource, channels), channels); |
251 | } |
252 | |
253 | // in-place modifiers for mainSample (convenience functions) |
254 | |
255 | void applyGain(double factor) { mainSample = mainSample.gain(factor); } |
256 | void normalize { mainSample = mainSample.normalize(); } |
257 | void speedUp(double factor) { mainSample = mainSample.speedUp(factor); } |
258 | |
259 | // Here come the actual analysis functions. |
260 | |
261 | // This looks at a number of periods of a given frequency starting at a certain time in the audio |
262 | // and returns an intensity value. |
263 | // No phase adjustment here, so you have to call this twice to get meaningful (complex) results. |
264 | srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { |
265 | double period, end; |
266 | |
267 | double rawSum() { |
268 | period = sample.sampleRate()/freq; |
269 | double sum = 0, t = start; |
270 | for p to periods: { |
271 | // Subtract an expected trough from an expected neighboring peak and add to overall sum. |
272 | // Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature! |
273 | // By the use of which we automatically get around nasty complications like DC offsets in the input data. |
274 | |
275 | sum += sample.sampleSum(channel, t, t+period/2) |
276 | - sample.sampleSum(channel, t+period/2, t+period); |
277 | |
278 | t += period; |
279 | } |
280 | end = t; |
281 | ret sum; |
282 | } |
283 | |
284 | // alternate calculation adjusted for duration |
285 | double sumDividedByDuration() { |
286 | ret rawSum()/(end-start); |
287 | } |
288 | } |
289 | |
290 | // divided by duration |
291 | Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { |
292 | double duration = sample.sampleRate()/freq*periods; |
293 | ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration); |
294 | } |
295 | |
296 | // Not divided by duration - this seems like the best frequency detector at this point. |
297 | // As in a proper FFT/DCT, we return a complex value to represent phase. |
298 | // Call abs() to get the desired intensity value. |
299 | Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) { |
300 | SumOfVibrations sum = new(sample, channel, start, freq, periods); |
301 | double re = sum.rawSum(); |
302 | sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle |
303 | double im = sum.rawSum(); |
304 | ret Complex(re, im); |
305 | } |
306 | } |
Began life as a copy of #1032199
download show line numbers debug dex old transpilations
Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx
No comments. add comment
Snippet ID: | #1032403 |
Snippet name: | AudioRecognizer [converts audio to integral "image"] |
Eternal ID of this version: | #1032403/139 |
Text MD5: | c32474ce0d23ac9e491d7e0880be4bf7 |
Transpilation MD5: | 0c80b5fd7c725c15c44e7eb4a11f9e1e |
Author: | stefan |
Category: | javax / audio recognition |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2021-09-05 08:29:07 |
Source code size: | 11923 bytes / 306 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 533 / 1040 |
Version history: | 138 change(s) |
Referenced in: | [show references] |