Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

179
LINES

< > BotCompany Repo | #1028182 - LineCompReader - read LINECOMP format

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (6206L/39K).

// Reads (and can re-save) a file in the LINECOMP / BYTECOMP text format.
//
// File layout as parsed by load(BufferedReader):
//   1. header line: "LINECOMP <n>" or "BYTECOMP <n>" (n = number of literals)
//   2. n literal lines (in BYTECOMP mode each line is a hex-encoded char)
//   3. pair lines "<a> <b>" (two decimal ints), until EOF or a line containing "="
//   4. version lines "<name>=<idx> <idx> ..."
//
// A "pointer" is an int index: values below l(literals) refer to a literal,
// larger values refer to pairs.get(idx - l(literals)), which in turn holds
// two pointers.
sclass LineCompReader {
  new LS literals;
  IVF1<Long> onPair; // if not null, don't save pairs, but stream them to onPair instead
  int[] literalOffsets; // where they start in file (one extra entry at the end: offset after last literal)
  new LongBuffer pairs;
  new LinkedHashMap<S, L<Int>> versions; // version name -> list of pointers
  bool byteMode; // true if the file header was BYTECOMP
  
  // internal, optional (only set by load(File); used for progress percentage)
  CountingInputStream countingInputStream;
  long fileSize;
  
  *() {}
  
  // takes text or gzipped input file
  *(File f) { load(f); }
  
  *(InputStream in) { load(rawByteReader(in, 128*1024)); }
  *(BufferedReader reader) { load(reader); }
  
  // Loads from a file; gzip is detected via isGZipFile.
  // The raw stream is wrapped in a CountingInputStream so load(BufferedReader)
  // can print how far into the file it is.
  void load(File f) {
    fileSize = fileSize(f);
    countingInputStream = new CountingInputStream(bufferedFileInputStream(f));
    // NOTE(review): "temp" presumably closes the reader at end of scope - confirm
    temp BufferedReader reader = isGZipFile(f)
      ? rawByteReader(gzipInputStream(countingInputStream))
      : rawByteReader(countingInputStream);
      //rawByteReader_possiblyGZipped(f);
    load(reader);
  }
  
  // Parses header, literals, pairs and versions from reader.
  // Fails with "Not a LINECOMP file" if the header is neither LINECOMP nor BYTECOMP.
  // NOTE(review): "ctex" presumably wraps checked exceptions - confirm
  void load(BufferedReader reader) ctex {
    new StringBuilder lineBuf;
    S s = readLineIgnoreCR(reader, lineBuf);
    int ofs = l(s)+1; // offset of first literal = header length + line break
    new Matches m;
    if (startsWith(s, "BYTECOMP ", m)) set byteMode;
    else if (!startsWith(s, "LINECOMP ", m))
      fail("Not a LINECOMP file");
    int nLiterals = parseInt(m.rest());
    
    // literals section; track where each literal line starts
    new IntBuffer offsets;
    for i to nLiterals: {
      S line = readLineIgnoreCR(reader, lineBuf);
      assertNotNull(line);
      literals.add(byteMode ? str(charFromHex(line)) : line);
      offsets.add(ofs);
      ofs += l(line)+1;
    }
    offsets.add(ofs);
    literalOffsets = offsets.toArray();
    
    // pairs section; ends at EOF or at the first version line (contains "=")
    int n = 0; // number of pairs read successfully so far
    while licensed {
      s = readLineIgnoreCR(reader, lineBuf);
      if (s == null || contains(s, "=")) break;
      try {
        int iSpace = s.indexOf(' ');
        long pair = twoIntsToLong(
          Int.parseInt(s, 0, iSpace, 10),
          Int.parseInt(s, iSpace+1, l(s), 10));
        if (onPair != null) onPair.get(pair);
        else pairs.add(pair);
        if (((++n) % oneMillion()) == 0) {
          S percentage = "";
          if (fileSize != 0 && countingInputStream != null)
            percentage = " (" + intPercentRatio(countingInputStream.getFilePointer(), fileSize) + "%)";
          print(nPairs(n) + " read" + percentage);
        }
      } on fail {
        // Use the running counter n rather than l(pairs): when pairs are
        // streamed to onPair, the pairs buffer stays empty.
        // 1-based file line = header (1) + literals + pairs already read + 1
        print("On line " + (nLiterals + n + 2));
      }
    }
    pairs.trimToSize();
    
    // versions section: name=pointer pointer ...
    while (contains(s, "=")) {
      int i = indexOf(s, '=');
      versions.put(takeFirst(s, i), compactIntList(parseInts(splitAtSpace(substring(s, i+1)))));
      s = readLineIgnoreCR(reader, lineBuf);
    }
  }
  
  // names of all versions, in file order
  Set<S> versions() { ret keys(versions); }
  
  S getText(S version) { ret textForVersion(version); }
  
  // Reconstructs the full text of a version; null if the version is unknown.
  // Requires pairs to have been stored (i.e. onPair was null during load).
  S textForVersion(S version) {
    L<Int> encoded = versions.get(version);
    if (encoded == null) ret null;
    new LS buf;
    for (int idx : encoded)
      decode(idx, buf);
    ret myFromLines(buf);
  }
  
  // name of first (or only) file
  S firstFile() { ret first(versions()); }
  
  // text for first (or only) file
  S text() { ret getText(firstFile()); }
  
  // pointer list of first (or only) file
  L<Int> encoding() { ret versions.get(firstFile()); }
  
  // byte mode: literals are concatenated directly; line mode: joined as lines
  S myFromLines(LS l) {
    ret byteMode
      ? join(l)
      : fromLines_rtrim(l);
  }
  
  // Appends all literals that pointer idx expands to (in order) to buf.
  // Recursive: depth follows the nesting depth of the pairs.
  void decode(int idx, LS buf) {
    if (idx < l(literals))
      buf.add(literals.get(idx));
    else {
      long p = pairs.get(idx-l(literals));
      decode(firstIntFromLong(p), buf);
      decode(secondIntFromLong(p), buf);
    }
  }
  
  // That was it! The rest of this file is just for calculating some stats.
  
  // caches, keyed by pointer
  new Map<Int> lineCountsForPairs;
  new Map<Int, Long> byteCountsForPairs;

  // number of literals a pointer expands to
  int lineCountForPointer(int idx) {
    ret idx < l(literals) ? 1 : lineCountForPair(idx);
  }
  
  // Length of the text a pointer expands to.
  // Line mode counts one extra char per literal for the line break;
  // byte mode has no separators (see myFromLines).
  long byteCountForPointer(int idx) {
    ret idx < l(literals)
      ? l(literals.get(idx)) + (byteMode ? 0 : 1)
      : byteCountForPair(idx);
  }
  
  int lineCountForPair(int idx) {
    Int c = lineCountsForPairs.get(idx);
    if (c == null) {
      long p = pairs.get(idx-l(literals));
      c = lineCountForPointer(firstIntFromLong(p)) + lineCountForPointer(secondIntFromLong(p));
      lineCountsForPairs.put(idx, c);
    }
    ret c;
  }
  
  long byteCountForPair(int idx) {
    Long c = byteCountsForPairs.get(idx);
    if (c == null) {
      long p = pairs.get(idx-l(literals));
      c = byteCountForPointer(firstIntFromLong(p)) + byteCountForPointer(secondIntFromLong(p));
      byteCountsForPairs.put(idx, c);
    }
    ret c;
  }
  
  // number of literals in a version; 0 if the version is unknown
  int lineCountForVersion(S version) {
    L<Int> encoded = versions.get(version);
    if (encoded == null) ret 0;
    int n = 0;
    for (int i : encoded) n += lineCountForPointer(i);
    ret n;
  }
  
  // Text length of a version; 0 if the version is unknown.
  // In line mode the final line break is not counted (hence n-1).
  long byteCountForVersion(S version) {
    L<Int> encoded = versions.get(version);
    if (encoded == null) ret 0;
    long n = 0;
    for (int i : encoded) n += byteCountForPointer(i);
    ret byteMode ? n : max(0, n-1);
  }
  
  long totalByteCount() {
    ret longSum(lambdaMap byteCountForVersion(versions()));
  }
  
  // now we can also save again
  
  // Writes header, literals, stored pairs and versions in the same layout
  // that load() reads. Pairs that were streamed to onPair are not in
  // "pairs" and are therefore not written.
  void save(PrintWriter out) {
    out.println((byteMode ? "BYTECOMP " : "LINECOMP ") + l(literals));
    for (S s : literals)
      out.println(byteMode ? charToHex(first(s)) : s);
    for (long p : pairs)
      out.println(firstIntFromLong(p) + " " + secondIntFromLong(p));
    for (S id, L<Int> l : versions)
      out.println(id + "=" + joinWithSpace(l));
  }  
}

download  show line numbers  debug dex   

Travelled to 6 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028182
Snippet name: LineCompReader - read LINECOMP format
Eternal ID of this version: #1028182/59
Text MD5: 34176f987e66773e35d6946d5c139f00
Transpilation MD5: f50c04c11824feba0e4ed0b675fa5957
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-08-06 15:05:30
Source code size: 5454 bytes / 179 lines
Pitched / IR pitched: No / No
Views / Downloads: 236 / 583
Version history: 58 change(s)
Referenced in: [show references]

Formerly at http://tinybrain.de/1028182 & http://1028182.tinybrain.de