Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

307
LINES

< > BotCompany Repo | #1032438 // AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (5770L/34K).

1  
// A base class for ultra-fast audio recognition.
2  
// [Part of the Ultrafa.st framework]
3  
4  
// Idea: We do NOT spend time on a full FFT/DCT in the early stages
5  
// of the recognition.
6  
7  
// Instead we stay in the time domain, turn the sample data into pixels
8  
// and then convert that verybig*1px image into an "integral" image.
9  
10  
// Then we use the integral image access functions to probe for
11  
// various frequencies and wavelets we come up with during the
12  
// recognition of whatever we are currently listening to (that's
13  
// what the higher-level algorithms based on this class do).
14  
15  
// Note we want a full 16 bit range for each pixel's value to make
16  
// this truly hi-fi, so we actually reserve a whole 8 bytes for each 
17  
// cell in the (1D) table (could make that 6 but that's annoying to
18  
// handle).
19  
20  
// Stefan Reich, Gaz.AI, Sep 3 2021
21  
//
22  
// [Insert very liberal license here]
23  
24  
sclass AudioRecognizer {
25  
  IAudioSample mainSample;
26  
  
27  
  double defaultInputSampleRate() { ret 44100; }
28  
  
29  
  // It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample.
30  
  interface IAudioSample {
31  
    int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
32  
    
33  
    double length();     // in samples according to sampleRate
34  
    double sampleRate(); // in hertz
35  
    
36  
    // Query the integral.
37  
    // Result is in the range -32768*(end-start) to 32767*(end-start)...
38  
    // unless you applied too much gain (there is no clipping).
39  
    // channel is between 0 and channels()-1 from here on out
40  
    double sampleSum(int channel, double start, double end);
41  
    
42  
    // Here the range is -1 to 1 just to spice things up
43  
    default double getPixel(int channel, double start, double end) {
44  
      ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768);
45  
    }
46  
    
47  
    // RENDERING FUNCTIONS (visualize audio as BufferedImage)
48  
    
49  
    // render audio as black-and-white (grayscale) stripes
50  
    // h = height per channel
51  
    default BufferedImage stripes(int h default 50) {
52  
      int w = iceil(length());
53  
      int channels = channels();
54  
      ret imageFromFunction(w, h*channels, (x, y) -> {
55  
        int channel = y/h;
56  
        double value = sampleSum(channel, x, x+1);
57  
        
58  
        // lose lower 8 bits and shift to 0 to 255
59  
        int digital = ifloor(value/256)+128;
60  
        ret rgbIntFullAlpha(digital, digital, digital);
61  
      });
62  
    }
63  
   
64  
    // render audio as graph
65  
    // h = height per channel
66  
    default BufferedImage graph(int h default 100) {
67  
      int w = iceil(length());
68  
      ret mergeBufferedImagesVertically(
69  
        countIteratorToList(channels(), c ->
70  
          simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
71  
    }
72  
    
73  
    // render audio as stripes + graph (best way to look at it)
74  
    default BufferedImage render(int h default 100) {
75  
      ret mergeBufferedImagesVertically(stripes(h/2), graph(h));
76  
    }
77  
    
78  
    // END OF RENDERING FUNCTIONS
79  
   
80  
    // find maximum amplitude, going pixel-by-pixel
81  
    // (remember: This clip may already have been temporally
82  
    // scaled with speedUp(), so a "pixel" may represent the average
83  
    // of multiple audio samples.)
84  
    default double maxAmplitude() {
85  
      int n = iceil(length()), channels = channels();
86  
      double max = 0;
87  
      for i to n:
88  
        for c to channels: 
89  
          max = max(max, abs(sampleSum(c, i, i+1)));
90  
      ret min(32767, max);
91  
    }
92  
    
93  
    // There are various non-destructive virtual transformations
94  
    // which you can do on the audio clip (gain, speed-up and time-shift).
95  
    // All transformations are affine in time and amplitude and thus
96  
    // preserve the "integral image" property.
97  
    
98  
    // Virtual amplification: wraps this clip in a Gain view.
    // A factor of exactly 1 returns this clip itself.
    default IAudioSample gain(double factor) {
      if (factor == 1) ret this;
      ret new Gain(factor, this);
    }
101  
    
102  
    // gain to maximum volume possible without clipping
103  
    // (even though clipping isn't even a thing in integral audio wonderland,
104  
    // so we just define "clipping" as exceeding the 32767 value we are used to from real audio.)
105  
    default IAudioSample normalize() {
106  
      ret gain(doubleRatio(32767, maxAmplitude()));
107  
    }
108  
    
109  
    // Resample with a factor: wraps this clip in a SpeedUp view.
    // A factor of exactly 1 returns this clip itself.
    public default IAudioSample speedUp(double factor) {
      if (factor == 1) ret this;
      ret new SpeedUp(factor, this);
    }
113  
    
114  
    // Resample to a target frequency (in hertz):
    // speeds up by the ratio of the current sample rate to freq.
    public default IAudioSample sampleAt(double freq) {
      double factor = sampleRate()/freq;
      ret speedUp(factor);
    }
118  
    
119  
    // Virtual time shift: wraps this clip in a TimeShift view.
    // A shift of exactly 0 returns this clip itself.
    public default IAudioSample timeShift aka shift(double shift) {
      if (shift == 0) ret this;
      ret new TimeShift(shift, this);
    }
122  
    
123  
    // For debug-printing. Valued from 0 to 1 this time because why not. First channel only
124  
    default L<Double> firstPixels(int n default 20) {
125  
      double[] pixels = new[n];
126  
      for i to n:
127  
        pixels[i] = sampleSum(0, i, i+1)/32768;
128  
      ret wrapDoubleArrayAsList(pixels);
129  
    }
130  
  } // end of IAudioSample
131  
  
132  
  // The core integral 1D image.
133  
  sclass AudioSample implements IAudioSample {
134  
    int channels;
135  
    double sampleRate;
136  
    int length;
137  
    
138  
    // Here they are: the partial sums of the 16 bit audio samples.
139  
    // Channels are stored interleaved
140  
    long[] data;
141  
    
142  
    public double sampleRate() { ret sampleRate; }
143  
    public int channels() { ret channels; }
144  
    public double length() { ret length; }
145  
    
146  
    // result is in the range -32768*(end-start) to 32767*(end-start)
147  
    public double sampleSum(int channel, double start, double end) {
148  
      // We could do linear interpolation here if we weren't so basic.
149  
      int a = ifloor(start), b = ifloor(end);
150  
      ret getEntry(channel, b-1)-getEntry(channel, a-1);
151  
    }
152  
    
153  
    // Get an entry of the sum table - allow for out-of-bounds
154  
    // requests (those just default to silence).
155  
    long getEntry(int channel, int i) {
156  
      if (i < 0) ret 0;
157  
      i = min(i, length-1);
158  
      ret data[i*channels+channel];
159  
    }
160  
    
161  
    // perform the integration of the raw audio data
162  
    *(L<short[]> samples, int *channels, double *sampleRate) {
163  
      length = lengthLevel2_shortArrays(samples);
164  
      data = new long[length*channels];
165  
      long[] sums = new[channels];
166  
      int iSample = 0, iChunk = 0, iInArray = 0;
167  
      short[] chunk = null;
168  
      for i to length:
169  
        for c to channels: {
170  
          if (chunk == null || iInArray >= chunk.length) {
171  
            chunk = samples.get(iChunk++);
172  
            iInArray = 0;
173  
          }
174  
          data[iSample++] = (sums[c] += chunk[iInArray++]);
175  
        }
176  
    }
177  
  }
178  
179  
  // Gain modifier: a view on another clip whose sums are multiplied by factor.
  srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
    // channel count, length and sample rate are passed through unchanged
    public int channels() { ret original.channels(); }
    public double length() { ret original.length(); }
    public double sampleRate() { ret original.sampleRate(); }

    // the underlying sum, scaled by the gain factor
    public double sampleSum(int channel, double start, double end) {
      double unscaled = original.sampleSum(channel, start, end);
      ret unscaled*factor;
    }

    // Coalesce consecutive gains: apply the product of both
    // factors to the original clip instead of nesting two Gain views.
    public IAudioSample gain(double factor) {
      double combined = this.factor*factor;
      ret original.gain(combined);
    }
  }
194  
  
195  
  // Time-shift modifier: a view on another clip that moves the input
  // <shift> samples to the left (cuts off the beginning).
  // Shift can be fractional - we're in integral image (audio) wonderland after all
  // where a traditional pixel has no meaning.
  srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
    // channel count and sample rate are passed through unchanged
    public int channels() { ret original.channels(); }
    public double sampleRate() { ret original.sampleRate(); }

    // the view is shorter than the original by the shift amount
    public double length() { ret original.length()-shift; }

    // query the original at positions moved by the shift
    public double sampleSum(int channel, double start, double end) {
      double from = start+shift;
      double to = end+shift;
      ret original.sampleSum(channel, from, to);
    }

    // Coalesce consecutive time-shifts: apply the sum of both
    // shifts to the original clip instead of nesting two views.
    public IAudioSample timeShift(double shift) {
      double combined = this.shift+shift;
      ret original.timeShift(combined);
    }
  }
213  
  
214  
  // Speed-up modifier: a view on another clip which transforms every frequency f to f*factor.
  // This is for convenience, you could also just call sampleSum() directly with larger intervals.
  sclass SpeedUp implements IAudioSample {
    IAudioSample original;
    double factor, invFactor; // invFactor = 1/factor, set in the constructor

    // Factors below 1 (slowing down) are rejected.
    *(double *factor, IAudioSample *original) {
      if (factor < 1) fail("Can't slow down. " + factor);
      invFactor = 1/factor;
    }

    public int channels() { ret original.channels(); }

    // length and sample rate both shrink by the speed-up factor
    public double length() { ret original.length()*invFactor; }
    public double sampleRate() { ret original.sampleRate()*invFactor; }

    // Query the correspondingly wider interval of the original and
    // scale the sum back down by the factor.
    public double sampleSum(int channel, double start, double end) {
      double from = start*factor;
      double to = end*factor;
      double wideSum = original.sampleSum(channel, from, to);
      ret wideSum*invFactor;
    }

    // Coalesce consecutive speed-ups: apply the product of both
    // factors to the original clip instead of nesting two views.
    public IAudioSample speedUp(double factor) {
      double combined = this.factor*factor;
      ret original.speedUp(combined);
    }
  }
238  
  
239  
  // Constructors from various types of PCM data (including rendered-on-the-spot)
240  
  
241  
  // Empty recognizer - mainSample is left unset.
  *() {}

  // From a single array of PCM samples (wrapped into a one-element list).
  *(short[] samples, int channels) {
    this(ll(samples), channels);
  }

  // From a list of PCM chunks; the data is integrated at defaultInputSampleRate().
  *(L<short[]> samples, int channels) {
    double rate = defaultInputSampleRate();
    mainSample = new AudioSample(samples, channels, rate);
  }

  // From a sound source rendered on the spot via soundSourceToShortArrays.
  *(double seconds, VF1<double[]> soundSource, int channels) {
    this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
  }
253  
  
254  
  // in-place modifiers for mainSample (convenience functions)
255  
  
256  
  // Replace mainSample with a gained view of itself.
  void applyGain(double factor) {
    mainSample = mainSample.gain(factor);
  }

  // Replace mainSample with a normalized view of itself.
  void normalize {
    mainSample = mainSample.normalize();
  }

  // Replace mainSample with a sped-up view of itself.
  void speedUp(double factor) {
    mainSample = mainSample.speedUp(factor);
  }
259  
  
260  
  // Here come the actual analysis functions.
261  
  
262  
  // This looks at a number of periods of a given frequency starting at a certain time in the audio
263  
  // and returns an intensity value.
264  
  // No phase adjustment here, so you have to call this twice to get meaningful (complex) results.
265  
  srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
266  
    double period, end;
267  
    
268  
    double rawSum() {
269  
      period = sample.sampleRate()/freq;
270  
      double sum = 0, t = start;
271  
      for p to periods: {
272  
        // Subtract an expected trough from an expected neighboring peak and add to overall sum.
273  
        // Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature!
274  
        // By the use of which we automatically get around nasty complications like DC offsets in the input data.
275  
        
276  
        sum += sample.sampleSum(channel, t, t+period/2)
277  
             - sample.sampleSum(channel, t+period/2, t+period);
278  
          
279  
        t += period;
280  
      }
281  
      end = t;
282  
      ret sum;
283  
    }
284  
    
285  
    // alternate calculation adjusted for duration
286  
    double sumDividedByDuration() {
287  
      ret rawSum()/(end-start);
288  
    }
289  
  }
290  
  
291  
  // divided by duration
292  
  Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
293  
    double duration = sample.sampleRate()/freq*periods;
294  
    ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration);
295  
  }
296  
  
297  
  // Not divided by duration - this seems like the best frequency detector at this point.
298  
  // As in a proper FFT/DCT, we return a complex value to represent phase.
299  
  // Call abs() to get the desired intensity value.
300  
  Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
301  
    SumOfVibrations sum = new(sample, channel, start, freq, periods);
302  
    double re = sum.rawSum();
303  
    sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle
304  
    double im = sum.rawSum();
305  
    ret Complex(re, im);
306  
  }
307  
}

Author comment

Began life as a copy of #1032403

download  show line numbers  debug dex  old transpilations   

Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx

No comments. add comment

Snippet ID: #1032438
Snippet name: AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry]
Eternal ID of this version: #1032438/1
Text MD5: cabe295b99480bb1fedbc48347cd8802
Transpilation MD5: 2c9c0805c0bcb6b2b4b520642d95cc2a
Author: stefan
Category: javax / audio recognition
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2021-09-05 06:21:13
Source code size: 11932 bytes / 307 lines
Pitched / IR pitched: No / No
Views / Downloads: 170 / 224
Referenced in: [show references]