

BotCompany Repo | #1032438 // AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Pure Java version available (5770L/34K).

// A base class for ultra-fast audio recognition.
// [Part of the Ultrafa.st framework]

// Idea: We do NOT spend time on a full FFT/DCT in the early stages
// of the recognition.

// Instead we stay in the time domain, turn the sample data into pixels
// and then convert that very-long*1px image into an "integral" image.

// Then we use the integral image access functions to probe for
// various frequencies and wavelets we come up with during the
// recognition of whatever we are currently listening to (that's
// what the higher-level algorithms based on this class do).

// Note we want a full 16 bit range for each pixel's value to make
// this truly hi-fi, so we actually reserve a whole 8 bytes for each 
// cell in the (1D) table (could make that 6 but that's annoying to
// handle).
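//
// Worked example (illustration only): for mono samples s[0..n-1] we store
// the running sums S[i] = s[0] + ... + s[i]. The sum over any window [a, b)
// is then simply S[b-1] - S[a-1], i.e. two table lookups no matter how wide
// the window is. That is the whole point of the integral image.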

// Stefan Reich, Gaz.AI, Sep 3 2021
//
// [Insert very liberal license here]

sclass AudioRecognizer {
  IAudioSample mainSample;
  
  double defaultInputSampleRate() { ret 44100; }
  
  // It works like this: There is a general interface for accessing an "integrated" audio clip - IAudioSample.
  interface IAudioSample {
    int channels(); // 1 for mono, 2 for left+right, 3 for center+left+right... or whatever channel model you prefer
    
    double length();     // in samples according to sampleRate
    double sampleRate(); // in hertz
    
    // Query the integral.
    // Result is in the range -32768*(end-start) to 32767*(end-start)...
    // unless you applied too much gain (there is no clipping).
    // channel is between 0 and channels()-1 from here on out
    double sampleSum(int channel, double start, double end);
    
    // Here the range is -1 to 1 just to spice things up
    default double getPixel(int channel, double start, double end) {
      ret doubleRatio(sampleSum(channel, start, end), (end-start)*32768);
    }
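
    // Usage sketch (made-up numbers): average level of channel 0 over
    // pixels 100..200, as a value between -1 and 1:
    //
    //   double level = sample.getPixel(0, 100, 200);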
    
    // RENDERING FUNCTIONS (visualize audio as BufferedImage)
    
    // render audio as black-and-white (grayscale) stripes
    // h = height per channel
    default BufferedImage stripes(int h default 50) {
      int w = iceil(length());
      int channels = channels();
      ret imageFromFunction(w, h*channels, (x, y) -> {
        int channel = y/h;
        double value = sampleSum(channel, x, x+1);
        
        // lose lower 8 bits and shift to 0 to 255
        int digital = ifloor(value/256)+128;
        ret rgbIntFullAlpha(digital, digital, digital);
      });
    }
   
    // render audio as graph
    // h = height per channel
    default BufferedImage graph(int h default 100) {
      int w = iceil(length());
      ret mergeBufferedImagesVertically(
        countIteratorToList(channels(), c ->
          simpleGraph(w, h, x -> sampleSum(c, x, x+1), -32768, 32767)));
    }
    
    // render audio as stripes + graph (best way to look at it)
    default BufferedImage render(int h default 100) {
      ret mergeBufferedImagesVertically(stripes(h/2), graph(h));
    }
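
    // Rendering usage sketch: get a BufferedImage of the whole clip and
    // display or save it with any viewer you like (names are made up):
    //
    //   BufferedImage img = recognizer.mainSample.render();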
    
    // END OF RENDERING FUNCTIONS
   
    // find maximum amplitude, going pixel-by-pixel
    // (remember: This clip may already have been temporally
    // scaled with speedUp(), so a "pixel" may represent the average
    // of multiple audio samples.)
    default double maxAmplitude() {
      int n = iceil(length()), channels = channels();
      double max = 0;
      for i to n:
        for c to channels: 
          max = max(max, abs(sampleSum(c, i, i+1)));
      ret min(32767, max);
    }
    
    // There are various non-destructive virtual transformations
    // which you can do on the audio clip (gain, speed-up and time-shift).
    // All transformations are affine in time and amplitude and thus
    // preserve the "integral image" property.
    
    default IAudioSample gain(double factor) {
      ret factor == 1 ? this : new Gain(factor, this);
    }
    
    // gain to maximum volume possible without clipping
    // (even though clipping isn't even a thing in integral audio wonderland,
    // so we just define "clipping" as exceeding the 32767 value we are used to from real audio.)
    default IAudioSample normalize() {
      ret gain(doubleRatio(32767, maxAmplitude()));
    }
    
    // resample with a factor
    public default IAudioSample speedUp(double factor) {
      ret factor == 1 ? this : new SpeedUp(factor, this);
    }
    
    // resample to a target frequency
    public default IAudioSample sampleAt(double freq) {
      ret speedUp(sampleRate()/freq);
    }
    
    public default IAudioSample timeShift aka shift(double shift) {
      ret shift == 0 ? this : new TimeShift(shift, this);
    }
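
    // Chaining sketch: all modifiers return lightweight views, so e.g.
    //
    //   IAudioSample view = sample.gain(0.5).sampleAt(8000).shift(100);
    //
    // copies no data - sampleSum() calls on the view are just rescaled and
    // re-indexed on the fly.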
    
    // For debug-printing. Valued from -1 to 1 this time because why not. First channel only
    default L<Double> firstPixels(int n default 20) {
      double[] pixels = new[n];
      for i to n:
        pixels[i] = sampleSum(0, i, i+1)/32768;
      ret wrapDoubleArrayAsList(pixels);
    }
  } // end of IAudioSample
  
  // The core integral 1D image.
  sclass AudioSample implements IAudioSample {
    int channels;
    double sampleRate;
    int length;
    
    // Here they are: the partial sums of the 16 bit audio samples.
    // Channels are stored interleaved
    long[] data;
    
    public double sampleRate() { ret sampleRate; }
    public int channels() { ret channels; }
    public double length() { ret length; }
    
    // result is in the range -32768*(end-start) to 32767*(end-start)
    public double sampleSum(int channel, double start, double end) {
      // We could do linear interpolation here if we weren't so basic.
      int a = ifloor(start), b = ifloor(end);
      ret getEntry(channel, b-1)-getEntry(channel, a-1);
    }
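
    // Worked example (illustrative numbers): sampleSum(0, 3, 5) becomes
    // getEntry(0, 4) - getEntry(0, 2), i.e. (s[0]+...+s[4]) - (s[0]+...+s[2])
    // = s[3] + s[4], the sum of samples 3 and 4 of channel 0.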
    
    // Get an entry of the sum table - allow for out-of-bounds
    // requests (those just default to silence).
    long getEntry(int channel, int i) {
      if (i < 0) ret 0;
      i = min(i, length-1);
      ret data[i*channels+channel];
    }
    
    // perform the integration of the raw audio data
    *(L<short[]> samples, int *channels, double *sampleRate) {
      length = lengthLevel2_shortArrays(samples);
      data = new long[length*channels];
      long[] sums = new[channels];
      int iSample = 0, iChunk = 0, iInArray = 0;
      short[] chunk = null;
      for i to length:
        for c to channels: {
          if (chunk == null || iInArray >= chunk.length) {
            chunk = samples.get(iChunk++);
            iInArray = 0;
          }
          data[iSample++] = (sums[c] += chunk[iInArray++]);
        }
    }
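
    // Layout sketch (stereo example, made-up values): for interleaved input
    // samples L0 R0 L1 R1 ... the table ends up as
    //   data = [ L0, R0, L0+L1, R0+R1, L0+L1+L2, R0+R1+R2, ... ]
    // i.e. one running sum per channel, interleaved just like the input.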
  }

  // implementation of gain modifier
  srecord noeq Gain(double factor, IAudioSample original) implements IAudioSample {
    public double sampleRate() { ret original.sampleRate(); }
    public int channels() { ret original.channels(); }
    public double length() { ret original.length(); }
    
    public double sampleSum(int channel, double start, double end) {
      ret original.sampleSum(channel, start, end)*factor;
    }
    
    // coalesce consecutive gains
    public IAudioSample gain(double factor) {
      ret original.gain(this.factor*factor);
    }
  }
  
  // Implementation of the time-shift modifier.
  // moves the input <shift> samples to the left (cuts off beginning).
  // Shift can be fractional - we're in integral image (audio) wonderland after all
  // where a traditional pixel has no meaning.
  srecord noeq TimeShift(double shift, IAudioSample original) implements IAudioSample {
    public double sampleRate() { ret original.sampleRate(); }
    public int channels() { ret original.channels(); }
    public double length() { ret original.length()-shift; }
    
    public double sampleSum(int channel, double start, double end) {
      ret original.sampleSum(channel, start+shift, end+shift);
    }
    
    // coalesce consecutive time-shifts
    public IAudioSample timeShift(double shift) {
      ret original.timeShift(this.shift+shift);
    }
  }
  
  // Implementation of the speed-up modifier which transforms every frequency f to f*factor.
  // This is for convenience, you could also just call sampleSum() directly with larger intervals.
  sclass SpeedUp implements IAudioSample {
    double factor, invFactor;
    IAudioSample original;

    *(double *factor, IAudioSample *original) {
      if (factor < 1) fail("Can't slow down. " + factor);
      invFactor = 1/factor;
    }
    
    public double sampleRate() { ret original.sampleRate()*invFactor; }
    public int channels() { ret original.channels(); }
    public double length() { ret original.length()*invFactor; }
    
    public double sampleSum(int channel, double start, double end) {
      ret original.sampleSum(channel, start*factor, end*factor)*invFactor;
    }
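
    // Why the extra *invFactor: a window of width (end-start) in sped-up
    // coordinates covers (end-start)*factor original samples, so we scale
    // the sum down by 1/factor to keep it in the usual
    // -32768*(end-start) to 32767*(end-start) range.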
    
    // coalesce consecutive speed-ups
    public IAudioSample speedUp(double factor) {
      ret original.speedUp(this.factor*factor);
    }
  }
  
  // Constructors from various types of PCM data (including rendered-on-the-spot)
  
  *() {}
  *(short[] samples, int channels) {
    this(ll(samples), channels);
  }
  
  *(L<short[]> samples, int channels) {
    mainSample = new AudioSample(samples, channels, defaultInputSampleRate());
  }
  
  *(double seconds, VF1<double[]> soundSource, int channels) {
    this(soundSourceToShortArrays(seconds, soundSource, channels), channels);
  }
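
  // Construction sketch (made-up parameters): one second of a 440 Hz mono
  // sine at the default 44100 Hz input rate:
  //
  //   short[] sine = new short[44100];
  //   for i to 44100: sine[i] = (short) (Math.sin(i*2*Math.PI*440/44100)*10000);
  //   AudioRecognizer rec = new AudioRecognizer(sine, 1);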
  
  // in-place modifiers for mainSample (convenience functions)
  
  void applyGain(double factor) { mainSample = mainSample.gain(factor); }
  void normalize                { mainSample = mainSample.normalize(); }
  void speedUp(double factor)   { mainSample = mainSample.speedUp(factor); }
  
  // Here come the actual analysis functions.
  
  // This looks at a number of periods of a given frequency starting at a certain time in the audio
  // and returns an intensity value.
  // No phase adjustment here, so you have to call this twice to get meaningful (complex) results.
  srecord noeq SumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    double period, end;
    
    double rawSum() {
      period = sample.sampleRate()/freq;
      double sum = 0, t = start;
      for p to periods: {
        // Subtract an expected trough from an expected neighboring peak and add to overall sum.
        // Nota bene: The trough and peak windows have the same length, so this is basically a Haar-like feature!
        // That automatically gets us around nasty complications like DC offsets in the input data.
        
        sum += sample.sampleSum(channel, t, t+period/2)
             - sample.sampleSum(channel, t+period/2, t+period);
          
        t += period;
      }
      end = t;
      ret sum;
    }
    
    // alternate calculation adjusted for duration
    double sumDividedByDuration() {
      ret rawSum()/(end-start);
    }
  }
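
  // In formula form, rawSum() computes
  //   sum over p in [0, periods) of
  //     [ sampleSum(t_p, t_p + T/2) - sampleSum(t_p + T/2, t_p + T) ]
  // with T = sampleRate()/freq and t_p = start + p*T, i.e. a correlation of
  // the signal with a square wave of frequency freq, costing only two
  // sampleSum() calls (four table lookups) per period.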
  
  // divided by duration
  Complex complexSumOfVibrations(IAudioSample sample, int channel, double start, double freq, int periods) {
    double duration = sample.sampleRate()/freq*periods;
    ret div(complexSumOfVibrations_raw(sample, channel, start, freq, periods), duration);
  }
  
  // Not divided by duration - this seems like the best frequency detector at this point.
  // As in a proper FFT/DCT, we return a complex value to represent phase.
  // Call abs() to get the desired intensity value.
  Complex complexSumOfVibrations_raw(IAudioSample sample, int channel, double start, double freq, int periods) {
    SumOfVibrations sum = new(sample, channel, start, freq, periods);
    double re = sum.rawSum();
    sum.start += sum.period/4; // 90° phase shift to catch the other half of the circle
    double im = sum.rawSum();
    ret Complex(re, im);
  }
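
  // Usage sketch (hypothetical helper, not part of the original API): probe
  // how strongly a given frequency is present at a given start position.
  double frequencyIntensity(IAudioSample sample, int channel, double start, double freq, int periods) {
    ret abs(complexSumOfVibrations(sample, channel, start, freq, periods));
  }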
}

Author comment

Began life as a copy of #1032403


Travelled to 4 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj, pyentgdyhuwx


Snippet ID: #1032438
Snippet name: AudioRecognizer [converts audio to integral "image", original version with 8 byte per entry]
Eternal ID of this version: #1032438/1
Text MD5: cabe295b99480bb1fedbc48347cd8802
Transpilation MD5: 2c9c0805c0bcb6b2b4b520642d95cc2a
Author: stefan
Category: javax / audio recognition
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2021-09-05 06:21:13
Source code size: 11932 bytes / 307 lines
Pitched / IR pitched: No / No
Views / Downloads: 90 / 122