Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

306
LINES

< > BotCompany Repo | #1032403 // AudioRecognizer [converts audio to integral "image"]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (5810L/34K).

1  
// A base class for ultra-fast audio recognition.
2  
// [Part of the Ultrafa.st framework]
3  
4  
// Idea: We do NOT spend time on a full FFT/DCT in the early stages
5  
// of the recognition.
6  
7  
// Instead we stay in the time domain, turn the sample data into pixels
8  
// and then convert that verybig*1px image into an "integral" image.
9  
10  
// Then we use the integral image access functions to probe for
11  
// various frequencies and wavelets we come up with during the
12  
// recognition of whatever we are currently listening to (that's
13  
// what the higher-level algorithms based on this class do).
14  
15  
// Note we want a full 16 bit range for each pixel's value to make
16  
// this truly hi-fi, so we actually reserve a whole 6 bytes for each 
17  
// cell in the (1D) table.
18  
19  
// Stefan Reich, Gaz.AI, Sep 3 2021
20  
//
21  
// [Insert very liberal license here]
22  
23  
sclass AudioRecognizer {
24  
  IAudioSample mainSample;
25  
  
26  
  // Sample rate (in hertz) that the constructors below assign to raw PCM input.
  double defaultInputSampleRate() {
    ret 44100;
  }
27  
  
28  
  // It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample.
29  
  interface IAudioSample {
30  
    int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
31  
    
32  
    double length();     // in samples according to sampleRate
33  
    double sampleRate(); // in hertz
34  
    
35  
    // Query the integral.
36  
    // Result is in the range -32768*(end-start) to 32767*(end-start)...
37  
    // unless you applied too much gain (there is no clipping).
38  
    // channel is between 0 and channels()-1 from here on out
39  
    double sampleSum(int channel, double start, double end);
40  
    
41  
    // Here the range is -1 to 1 just to spice things up
42  
    default double getPixel(int channel, double start, double end) {
43  
      ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768);
44  
    }
45  
    
46  
    // RENDERING FUNCTIONS (visualize audio as BufferedImage)
47  
    
48  
    // render audio as black-and-white (grayscale) stripes
49  
    // h = height per channel
50  
    default BufferedImage stripes(int h default 50) {
51  
      int w = iceil(length());
52  
      int channels = channels();
53  
      ret imageFromFunction(w, h*channels, (x, y) -> {
54  
        int channel = y/h;
55  
        double value = sampleSum(channel, x, x+1);
56  
        
57  
        // lose lower 8 bits and shift to 0 to 255
58  
        int digital = ifloor(value/256)+128;
59  
        ret rgbIntFullAlpha(digital, digital, digital);
60  
      });
61  
    }
62  
   
63  
    // render audio as graph
64  
    // h = height per channel
65  
    default BufferedImage graph(int h default 100) {
66  
      int w = iceil(length());
67  
      ret mergeBufferedImagesVertically(
68  
        countIteratorToList(channels(), c ->
69  
          simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
70  
    }
71  
    
72  
    // render audio as stripes + graph (best way to look at it)
73  
    default BufferedImage render(int h default 100) {
74  
      ret mergeBufferedImagesVertically(stripes(h/2), graph(h));
75  
    }
76  
    
77  
    // END OF RENDERING FUNCTIONS
78  
   
79  
    // find maximum amplitude, going pixel-by-pixel
80  
    // (remember: This clip may already have been temporally
81  
    // scaled with speedUp(), so a "pixel" may represent the average
82  
    // of multiple audio samples.)
83  
    default double maxAmplitude() {
84  
      int n = iceil(length()), channels = channels();
85  
      double max = 0;
86  
      for i to n:
87  
        for c to channels: 
88  
          max = max(max, abs(sampleSum(c, i, i+1)));
89  
      ret min(32767, max);
90  
    }
91  
    
92  
    // There are various non-destructive virtual transformations
93  
    // which you can do on the audio clip (gain, speed-up and time-shift).
94  
    // All transformations are affine in time and amplitude and thus
95  
    // preserve the "integral image" property.
96  
    
97  
    default IAudioSample gain(double factor) {
98  
      ret factor == 1 ? this : new Gain(factor, this);
99  
    }
100  
    
101  
    // gain to maximum volume possible without clipping
102  
    // (even though clipping isn't even a thing in integral audio wonderland,
103  
    // so we just define "clipping" as exceeding the 32767 value we are used to from real audio.)
104  
    default IAudioSample normalize() {
105  
      ret gain(doubleRatio(32767, maxAmplitude()));
106  
    }
107  
    
108  
    // resample with a factor
109  
    public default IAudioSample speedUp(double factor) {
110  
      ret factor == 1 ? this : new SpeedUp(factor, this);
111  
    }
112  
    
113  
    // resample to a target frequency
114  
    public default IAudioSample sampleAt(double freq) {
115  
      ret speedUp(sampleRate()/freq);
116  
    }
117  
    
118  
    public default IAudioSample timeShift aka shift(double shift) {
119  
      ret shift == 0 ? this : new TimeShift(shift, this);
120  
    }
121  
    
122  
    // For debug-printing. Valued from 0 to 1 this time because why not. First channel only
123  
    default L<Double> firstPixels(int n default 20) {
124  
      double[] pixels = new[n];
125  
      for i to n:
126  
        pixels[i] = sampleSum(0, i, i+1)/32768;
127  
      ret wrapDoubleArrayAsList(pixels);
128  
    }
129  
  } // end of IAudioSample
130  
  
131  
  // The core integral 1D image.
132  
  sclass AudioSample implements IAudioSample {
133  
    int channels;
134  
    double sampleRate;
135  
    int length;
136  
    
137  
    // Here they are: the partial sums of the 16 bit audio samples
138  
    // in an array of 6-byte integers. Channels are stored interleaved.
139  
    HalfLongs data;
140  
    
141  
    public double sampleRate() { ret sampleRate; }
142  
    public int channels() { ret channels; }
143  
    public double length() { ret length; }
144  
    
145  
    // result is in the range -32768*(end-start) to 32767*(end-start)
146  
    public double sampleSum(int channel, double start, double end) {
147  
      // We could do linear interpolation here if we weren't so basic.
148  
      int a = ifloor(start), b = ifloor(end);
149  
      ret getEntry(channel, b-1)-getEntry(channel, a-1);
150  
    }
151  
    
152  
    // Get an entry of the sum table - allow for out-of-bounds
153  
    // requests (those just default to silence).
154  
    long getEntry(int channel, int i) {
155  
      if (i < 0) ret 0;
156  
      i = min(i, length-1);
157  
      ret data.get(i*channels+channel);
158  
    }
159  
    
160  
    // perform the integration of the raw audio data
161  
    *(L<short[]> samples, int *channels, double *sampleRate) {
162  
      length = lengthLevel2_shortArrays(samples);
163  
      data = new HalfLongs(length*channels);
164  
      long[] sums = new[channels];
165  
      int iSample = 0, iChunk = 0, iInArray = 0;
166  
      short[] chunk = null;
167  
      for i to length:
168  
        for c to channels: {
169  
          if (chunk == null || iInArray >= chunk.length) {
170  
            chunk = samples.get(iChunk++);
171  
            iInArray = 0;
172  
          }
173  
          data.set(iSample++, sums[c] += chunk[iInArray++]);
174  
        }
175  
    }
176  
  }
177  
178  
  // implementation of gain modifier
179  
  // Wraps a clip and multiplies every queried sum by a constant factor.
  srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
    // Rate, channel count and length are taken from the wrapped clip unchanged.
    public double sampleRate() {
      ret original.sampleRate();
    }

    public int channels() {
      ret original.channels();
    }

    public double length() {
      ret original.length();
    }

    // The wrapped clip's sum over the same interval, times the gain factor.
    public double sampleSum(int channel, double start, double end) {
      double unscaled = original.sampleSum(channel, start, end);
      ret unscaled*factor;
    }

    // Coalesce consecutive gains: apply the combined factor
    // to the wrapped clip instead of stacking another wrapper on this one.
    public IAudioSample gain(double factor) {
      double combined = this.factor*factor;
      ret original.gain(combined);
    }
  }
193  
  
194  
  // Implementation of the time-shift modifier.
195  
  // moves the input <shift> samples to the left (cuts off beginning).
196  
  // Shift can be fractional - we're in integral image (audio) wonderland after all
197  
  // where a traditional pixel has no meaning.
198  
  // Wraps a clip and offsets every queried interval by <shift> samples.
  srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
    // Rate and channel count are taken from the wrapped clip unchanged.
    public double sampleRate() {
      ret original.sampleRate();
    }

    public int channels() {
      ret original.channels();
    }

    // The shifted-away part no longer counts towards the length.
    public double length() {
      double fullLength = original.length();
      ret fullLength-shift;
    }

    // Query the wrapped clip with both interval bounds moved by the shift.
    public double sampleSum(int channel, double start, double end) {
      double from = start+shift;
      double to = end+shift;
      ret original.sampleSum(channel, from, to);
    }

    // Coalesce consecutive time-shifts: apply the summed shift
    // to the wrapped clip instead of stacking another wrapper on this one.
    public IAudioSample timeShift(double shift) {
      double combined = this.shift+shift;
      ret original.timeShift(combined);
    }
  }
212  
  
213  
  // Implementation of the speed-up modifier which transforms every frequency f to f*factor.
214  
  // This is for convenience, you could also just call sampleSum() directly with larger intervals.
215  
  // Wraps a clip and compresses it in time by a constant factor (>= 1).
  sclass SpeedUp implements IAudioSample {
    // speed-up factor as passed to the constructor
    double factor;

    // 1/factor, computed once in the constructor
    double invFactor;

    // the clip being sped up
    IAudioSample original;

    // Fails for factors below 1 (slowing down is not supported).
    *(double *factor, IAudioSample *original) {
      if (factor < 1) fail("Can't slow down. " + factor);
      invFactor = 1/factor;
    }

    // Rate and length both shrink by the speed-up factor.
    public double sampleRate() {
      double originalRate = original.sampleRate();
      ret originalRate*invFactor;
    }

    public int channels() {
      ret original.channels();
    }

    public double length() {
      double originalLength = original.length();
      ret originalLength*invFactor;
    }

    // Query the correspondingly wider interval of the wrapped clip
    // and scale the sum back down by 1/factor.
    public double sampleSum(int channel, double start, double end) {
      double widened = original.sampleSum(channel, start*factor, end*factor);
      ret widened*invFactor;
    }

    // Coalesce consecutive speed-ups: apply the combined factor
    // to the wrapped clip instead of stacking another wrapper on this one.
    public IAudioSample speedUp(double factor) {
      double combined = this.factor*factor;
      ret original.speedUp(combined);
    }
  }
237  
  
238  
  // Constructors from various types of PCM data (including rendered-on-the-spot)
239  
  
240  
  // Empty constructor - does not set mainSample.
  *() {}

  // From a single array of PCM samples.
  *(short[] samples, int channels) {
    this(ll(samples), channels);
  }

  // From a list of PCM sample arrays.
  // The input is taken to be at defaultInputSampleRate().
  *(L<short[]> samples, int channels) {
    double rate = defaultInputSampleRate();
    mainSample = new AudioSample(samples, channels, rate);
  }

  // From a sound source that is converted to PCM data
  // via soundSourceToShortArrays, for the given number of seconds.
  *(double seconds, VF1<double[]> soundSource, int channels) {
    this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
  }
252  
  
253  
  // in-place modifiers for mainSample (convenience functions)
254  
  
255  
  void applyGain(double factor) { mainSample = mainSample.gain(factor); }
256  
  void normalize                { mainSample = mainSample.normalize(); }
257  
  void speedUp(double factor)   { mainSample = mainSample.speedUp(factor); }
258  
  
259  
  // Here come the actual analysis functions.
260  
  
261  
  // This looks at a number of periods of a given frequency starting at a certain time in the audio
262  
  // and returns an intensity value.
263  
  // No phase adjustment here, so you have to call this twice to get meaningful (complex) results.
264  
  srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
265  
    double period, end;
266  
    
267  
    double rawSum() {
268  
      period = sample.sampleRate()/freq;
269  
      double sum = 0, t = start;
270  
      for p to periods: {
271  
        // Subtract an expected trough from an expected neighboring peak and add to overall sum.
272  
        // Nota bene: Trough and peak have the same area (=length), so this is basically a Haar-like feature!
273  
        // By the use of which we automatically get around nasty complications like DC offsets in the input data.
274  
        
275  
        sum += sample.sampleSum(channel, t, t+period/2)
276  
             - sample.sampleSum(channel, t+period/2, t+period);
277  
          
278  
        t += period;
279  
      }
280  
      end = t;
281  
      ret sum;
282  
    }
283  
    
284  
    // alternate calculation adjusted for duration
285  
    double sumDividedByDuration() {
286  
      ret rawSum()/(end-start);
287  
    }
288  
  }
289  
  
290  
  // divided by duration
291  
  // The raw complex sum of vibrations, divided by the duration
  // (in samples) of the analyzed window.
  Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    // length of one period in samples, times the number of periods
    double samplesPerPeriod = sample.sampleRate()/freq;
    double duration = samplesPerPeriod*periods;

    Complex raw = complexSumOfVibrations_raw(sample, channel, start, freq, periods);
    ret div(raw, duration);
  }
295  
  
296  
  // Not divided by duration - this seems like the best frequency detector at this point.
297  
  // As in a proper FFT/DCT, we return a complex value to represent phase.
298  
  // Call abs() to get the desired intensity value.
299  
  // Runs SumOfVibrations twice - once at <start> and once a quarter period later -
  // and returns the two raw sums as real and imaginary part.
  Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
    SumOfVibrations vibrations = new SumOfVibrations(sample, channel, start, freq, periods);

    // first pass; this also fills in vibrations.period
    double realPart = vibrations.rawSum();

    // 90° phase shift to catch the other half of the circle
    vibrations.start = vibrations.start + vibrations.period/4;
    double imaginaryPart = vibrations.rawSum();

    ret Complex(realPart, imaginaryPart);
  }
306  
}

Author comment

Began life as a copy of #1032199

download  show line numbers  debug dex  old transpilations   

Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx

No comments. add comment

Snippet ID: #1032403
Snippet name: AudioRecognizer [converts audio to integral "image"]
Eternal ID of this version: #1032403/139
Text MD5: c32474ce0d23ac9e491d7e0880be4bf7
Transpilation MD5: 0c80b5fd7c725c15c44e7eb4a11f9e1e
Author: stefan
Category: javax / audio recognition
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2021-09-05 08:29:07
Source code size: 11923 bytes / 306 lines
Pitched / IR pitched: No / No
Views / Downloads: 533 / 1040
Version history: 138 change(s)
Referenced in: [show references]