Libraryless. Click here for Pure Java version (5770L/34K).
1 | // A base class for ultra-fast audio recognition. |
2 | // [Part of the Ultrafa.st framework] |
3 | |
4 | // Idea: We do NOT spend time on a full FFT/DCT in the early stages |
5 | // of the recognition. |
6 | |
7 | // Instead we stay in the time domain, turn the sample data into pixels |
8 | // and then convert that verybig*1px image into an "integral" image. |
9 | |
10 | // Then we use the integral image access functions to probe for |
11 | // various frequencies and wavelets we come up with during the |
12 | // recognition of whatever we are currently listening to (that's |
13 | // what the higher-level algorithms based on this class do). |
14 | |
15 | // Note we want a full 16 bit range for each pixel's value to make |
16 | // this truly hi-fi, so we actually reserve a whole 8 bytes for each |
17 | // cell in the (1D) table (could make that 6 but that's annoying to |
18 | // handle). |
19 | |
20 | // Stefan Reich, Gaz.AI, Sep 3 2021 |
21 | // |
22 | // [Insert very liberal license here] |
23 | |
24 | sclass AudioRecognizer { |
25 | IAudioSample mainSample; |
26 | |
27 | double defaultInputSampleRate() { ret 44100; } // hertz; the sample rate assigned to raw PCM data given to the constructors |
28 | |
29 | // It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample. |
30 | interface IAudioSample { |
31 | int channels(); // number of channels: 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer |
32 | |
33 | double length(); // clip length in samples (at sampleRate()); can be fractional for time-shifted or sped-up clips |
34 | double sampleRate(); // samples per second (hertz) |
35 | |
36 | // Query the integral: the summed sample values of one channel between positions start and end. |
37 | // For unmodified 16 bit input the result is in the range -32768*(end-start) to 32767*(end-start); |
38 | // it can leave that range once gain has been applied because nothing is ever clipped. |
39 | // channel is between 0 and channels()-1 (this holds for every method taking a channel) |
40 | double sampleSum(int channel, double start, double end); |
41 | |
42 | // Average sample value between start and end, scaled to the range -1 to 1 (the sum divided by (end-start)*32768) |
43 | default double getPixel(int channel, double start, double end) { |
44 | ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768); |
45 | } |
46 | |
47 | // RENDERING FUNCTIONS (visualize audio as BufferedImage) |
48 | |
49 | // Draw the clip as grayscale stripes: one pixel column per sample position, |
50 | // one horizontal band of h pixel rows per channel. |
51 | default BufferedImage stripes(int h default 50) { |
52 | int width = iceil(length()); |
53 | int height = h*channels(); |
54 | ret imageFromFunction(width, height, (x, y) -> { |
55 | // y/h picks the channel whose band this pixel row belongs to |
56 | double sum = sampleSum(y/h, x, x+1); |
57 | |
58 | // drop the lower 8 bits, then move -128..127 up to 0..255 |
59 | int gray = ifloor(sum/256)+128; |
60 | ret rgbIntFullAlpha(gray, gray, gray); |
61 | }); |
62 | } |
63 | |
64 | // render audio as graphs: one graph per channel over the value range -32768 to 32767, merged vertically |
65 | // h = height per channel |
66 | default BufferedImage graph(int h default 100) { |
67 | int w = iceil(length()); |
68 | ret mergeBufferedImagesVertically( |
69 | countIteratorToList(channels(), c -> |
70 | simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767))); |
71 | } |
72 | |
73 | // render audio as stripes (h/2 pixel rows per channel) + graphs (h pixel rows per channel) merged into one image - the best way to look at it |
74 | default BufferedImage render(int h default 100) { |
75 | ret mergeBufferedImagesVertically(stripes(h/2), graph(h)); |
76 | } |
77 | |
78 | // END OF RENDERING FUNCTIONS |
79 | |
80 | // Largest absolute pixel value across all channels, capped at 32767. |
81 | // Scans the clip one pixel at a time. (Remember: this clip may already |
82 | // have been temporally scaled with speedUp(), in which case one "pixel" |
83 | // stands for the average of several audio samples.) |
84 | default double maxAmplitude() { |
85 | int pixels = iceil(length()), nChannels = channels(); |
86 | double peak = 0; |
87 | for (int x = 0; x < pixels; x++) |
88 | for (int ch = 0; ch < nChannels; ch++) |
89 | peak = max(peak, abs(sampleSum(ch, x, x+1))); |
90 | ret min(32767, peak); |
91 | } |
92 | |
93 | // There are various non-destructive virtual transformations |
94 | // which you can do on the audio clip (gain, speed-up and time-shift). |
95 | // All transformations are affine in time and amplitude and thus |
96 | // preserve the "integral image" property. |
97 | |
98 | default IAudioSample gain(double factor) { // virtual volume change: wraps this clip in a Gain that multiplies all sums by factor |
99 | ret factor == 1 ? this : new Gain(factor, this); // a factor of 1 changes nothing, so no wrapper is created |
100 | } |
101 | |
102 | // gain to maximum volume possible without clipping |
103 | // (even though clipping isn't even a thing in integral audio wonderland, |
104 | // so we just define "clipping" as exceeding the 32767 value we are used to from real audio.) |
105 | default IAudioSample normalize() { |
106 | ret gain(doubleRatio(32767, maxAmplitude())); // NOTE(review): maxAmplitude() never exceeds 32767, so (assuming doubleRatio is a plain division) the factor is never below 1 - clips that are already too loud are not attenuated |
107 | } |
108 | |
109 | // resample with a factor (virtually: wraps this clip in a SpeedUp unless the factor is 1) |
110 | public default IAudioSample speedUp(double factor) { |
111 | ret factor == 1 ? this : new SpeedUp(factor, this); |
112 | } |
113 | |
114 | // resample to a target frequency (in hertz) by speeding up with the factor sampleRate()/freq |
115 | public default IAudioSample sampleAt(double freq) { |
116 | ret speedUp(sampleRate()/freq); // NOTE: SpeedUp rejects factors below 1, so a freq above sampleRate() will generally fail |
117 | } |
118 | |
119 | public default IAudioSample timeShift aka shift(double shift) { // virtually move the clip <shift> samples to the left |
120 | ret shift == 0 ? this : new TimeShift(shift, this); // a zero shift changes nothing, so no wrapper is created |
121 | } |
122 | |
123 | // For debug-printing: the first n pixels of the first channel, each divided by 32768 (so -1 to 1 for plain 16 bit input) |
124 | default L<Double> firstPixels(int n default 20) { |
125 | double[] values = new double[n]; |
126 | for (int x = 0; x < n; x++) |
127 | values[x] = sampleSum(0, x, x+1)/32768; |
128 | ret wrapDoubleArrayAsList(values); |
129 | } |
130 | } // end of IAudioSample |
131 | |
132 | // The core integral 1D image: running totals of the raw PCM samples, kept per channel. |
133 | sclass AudioSample implements IAudioSample { |
134 | int channels; |
135 | double sampleRate; |
136 | int length; // number of sample frames (one frame = one value per channel) |
137 | |
138 | // Here they are: the partial sums of the 16 bit audio samples. |
139 | // Channels are stored interleaved: entry i of channel c lives at data[i*channels+c]. |
140 | long[] data; |
141 | |
142 | public double sampleRate() { ret sampleRate; } |
143 | public int channels() { ret channels; } |
144 | public double length() { ret length; } |
145 | |
146 | // Sum of samples floor(start) .. floor(end)-1 of one channel; result is in the range -32768*(end-start) to 32767*(end-start) |
147 | public double sampleSum(int channel, double start, double end) { |
148 | // We could do linear interpolation here if we weren't so basic. |
149 | int a = ifloor(start), b = ifloor(end); |
150 | ret getEntry(channel, b-1)-getEntry(channel, a-1); |
151 | } |
152 | |
153 | // Get an entry of the sum table (= total of samples 0..i) - allow for out-of-bounds |
154 | // requests (those just default to silence). |
155 | long getEntry(int channel, int i) { |
156 | i = min(i, length-1); // past the end: stick to the final total, so differences there come out as 0 |
157 | if (i < 0) ret 0; // before the start - or the clip is empty and the table has no entries at all |
158 | ret data[i*channels+channel]; |
159 | } |
160 | |
161 | // Perform the integration of the raw audio data (interleaved 16 bit samples, possibly split over several chunks). |
162 | *(L<short[]> samples, int *channels, double *sampleRate) { |
163 | length = lengthLevel2_shortArrays(samples)/max(1, channels); // frames, not individual shorts (assumes the helper returns the total number of shorts in all chunks); an incomplete trailing frame is dropped |
164 | data = new long[length*channels]; |
165 | long[] sums = new[channels]; // running total per channel |
166 | int iSample = 0, iChunk = 0, iInArray = 0; |
167 | short[] chunk = null; |
168 | for i to length: |
169 | for c to channels: { |
170 | while (chunk == null || iInArray >= chunk.length) { // "while" rather than "if" so that empty chunks get skipped |
171 | chunk = samples.get(iChunk++); |
172 | iInArray = 0; |
173 | } |
174 | data[iSample++] = (sums[c] += chunk[iInArray++]); |
175 | } |
176 | } |
177 | } |
178 | |
179 | // Implementation of the gain modifier: a view on <original> whose sums are multiplied by <factor>. |
180 | srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample { |
181 | public double sampleRate() { ret original.sampleRate(); } |
182 | public int channels() { ret original.channels(); } |
183 | public double length() { ret original.length(); } |
184 | |
185 | public double sampleSum(int channel, double start, double end) { |
186 | ret original.sampleSum(channel, start, end)*factor; |
187 | } |
188 | |
189 | // coalesce consecutive gains: hand the combined factor to the original instead of wrapping this Gain again |
190 | public IAudioSample gain(double factor) { |
191 | ret original.gain(this.factor*factor); |
192 | } |
193 | } |
194 | |
195 | // Implementation of the time-shift modifier. |
196 | // Moves the input <shift> samples to the left (cuts off the beginning). |
197 | // Shift can be fractional - we're in integral image (audio) wonderland after all |
198 | // where a traditional pixel has no meaning. |
199 | srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample { |
200 | public double sampleRate() { ret original.sampleRate(); } |
201 | public int channels() { ret original.channels(); } |
202 | public double length() { ret original.length()-shift; } // reported length shrinks by the shift |
203 | |
204 | public double sampleSum(int channel, double start, double end) { |
205 | ret original.sampleSum(channel, start+shift, end+shift); // position t here is position t+shift in the original |
206 | } |
207 | |
208 | // coalesce consecutive time-shifts: hand the summed shift to the original instead of wrapping this TimeShift again |
209 | public IAudioSample timeShift(double shift) { |
210 | ret original.timeShift(this.shift+shift); |
211 | } |
212 | } |
213 | |
214 | // Implementation of the speed-up modifier which transforms every frequency f to f*factor. |
215 | // This is for convenience, you could also just call sampleSum() directly with larger intervals. |
216 | sclass SpeedUp implements IAudioSample { |
217 | double factor, invFactor; // invFactor = 1/factor, computed once in the constructor |
218 | IAudioSample original; |
219 | |
220 | *(double *factor, IAudioSample *original) { |
221 | if (factor < 1) fail("Can't slow down. " + factor); // factors below 1 are rejected |
222 | invFactor = 1/factor; |
223 | } |
224 | |
225 | public double sampleRate() { ret original.sampleRate()*invFactor; } |
226 | public int channels() { ret original.channels(); } |
227 | public double length() { ret original.length()*invFactor; } |
228 | |
229 | public double sampleSum(int channel, double start, double end) { |
230 | ret original.sampleSum(channel, start*factor, end*factor)*invFactor; // sum over the stretched interval in the original, scaled back down by 1/factor |
231 | } |
232 | |
233 | // coalesce consecutive speed-ups: hand the combined factor to the original instead of wrapping this SpeedUp again |
234 | public IAudioSample speedUp(double factor) { |
235 | ret original.speedUp(this.factor*factor); |
236 | } |
237 | } |
238 | |
239 | // Constructors from various types of PCM data (including rendered-on-the-spot). |
240 | // PCM data is wrapped in an AudioSample using defaultInputSampleRate() and stored in mainSample. |
241 | *() {} // leaves mainSample unset |
242 | *(short[] samples, int channels) { |
243 | this(ll(samples), channels); // delegates to the list-based constructor |
244 | } |
245 | |
246 | *(L<short[]> samples, int channels) { |
247 | mainSample = new AudioSample(samples, channels, defaultInputSampleRate()); |
248 | } |
249 | |
250 | *(double seconds, VF1<double[]> soundSource, int channels) { |
251 | this(soundSourceToShortArrays(seconds, soundSource, channels), channels); // converts the sound source to short arrays first |
252 | } |
253 | |
254 | // in-place modifiers for mainSample (convenience functions): each replaces mainSample with the transformed clip |
255 | |
256 | void applyGain(double factor) { mainSample = mainSample.gain(factor); } |
257 | void normalize { mainSample = mainSample.normalize(); } |
258 | void speedUp(double factor) { mainSample = mainSample.speedUp(factor); } |
259 | |
260 | // Here come the actual analysis functions. |
261 | |
262 | // This looks at a number of periods of a given frequency starting at a certain time in the audio |
263 | // and returns an intensity value. |
264 | // No phase adjustment here, so you have to call this twice to get meaningful (complex) results. |
265 | srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { |
266 | double period, end; // both are set by rawSum(): period = length of one cycle in samples, end = position just after the last probed period |
267 | |
268 | double rawSum() { |
269 | period = sample.sampleRate()/freq; |
270 | double sum = 0, t = start; |
271 | for p to periods: { |
272 | // Subtract an expected trough from an expected neighboring peak and add to overall sum. |
273 | // Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature! |
274 | // By the use of which we automatically get around nasty complications like DC offsets in the input data. |
275 | |
276 | sum += sample.sampleSum(channel, t, t+period/2) |
277 | - sample.sampleSum(channel, t+period/2, t+period); |
278 | |
279 | t += period; |
280 | } |
281 | end = t; |
282 | ret sum; |
283 | } |
284 | |
285 | // alternate calculation adjusted for duration (in samples); rawSum() is evaluated first and sets <end> |
286 | double sumDividedByDuration() { |
287 | ret rawSum()/(end-start); |
288 | } |
289 | } |
290 | |
291 | // Same as complexSumOfVibrations_raw, but divided by the duration of the probed interval (<periods> cycles, measured in samples) |
292 | Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) { |
293 | double duration = sample.sampleRate()/freq*periods; |
294 | ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration); |
295 | } |
296 | |
297 | // Not divided by duration - this seems like the best frequency detector at this point. |
298 | // As in a proper FFT/DCT, we return a complex value to represent phase. |
299 | // Call abs() to get the desired intensity value. |
300 | Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) { |
301 | SumOfVibrations sum = new(sample, channel, start, freq, periods); |
302 | double re = sum.rawSum(); // also sets sum.period, which the next line relies on |
303 | sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle |
304 | double im = sum.rawSum(); |
305 | ret Complex(re, im); |
306 | } |
307 | } |
Began life as a copy of #1032403
download show line numbers debug dex old transpilations
Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx
No comments. add comment
Snippet ID: | #1032438 |
Snippet name: | AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry] |
Eternal ID of this version: | #1032438/1 |
Text MD5: | cabe295b99480bb1fedbc48347cd8802 |
Transpilation MD5: | 2c9c0805c0bcb6b2b4b520642d95cc2a |
Author: | stefan |
Category: | javax / audio recognition |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2021-09-05 06:21:13 |
Source code size: | 11932 bytes / 307 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 167 / 220 |
Referenced in: | [show references] |