Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

101
LINES

< > BotCompany Repo | #1029042 // DeepWordPairIndex [dev.]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3730L/23K).

// Index over texts keyed by pairs of adjacent words.
// For every indexed item (identified by an id of type A), each pair of
// neighbouring words in its text is recorded together with the character
// offset at which the pair's first word starts. Lookups intersect the
// position lists of all usable word pairs of a query.
transient sclass DeepWordPairIndex<A> {
  // Pattern defining what counts as a word (see wordRanges)
  S regexp = "\\w+";

  // When set, texts and queries are upper-cased before indexing/lookup
  // and each entry's position map is a HashMap.
  bool useHashMaps = true; // makes it case-sensitive and doesn't allow partial word searches

  // id -> entry
  new Map<A, Entry<A>> entries;

  // word pair -> all entries whose text contains that pair
  new MultiSetMap<PairS, Entry<A>> entriesByPair;
  
  // One indexed item: holds the id (as the Var's value) plus the positions
  // of every word pair found in the item's text.
  sclass Entry<A> extends Var<A> {
    // word pair -> start offsets of the pair's first word in the text
    new Map<PairS, int[]> pairPositions; // int array is sorted
    
    *(A id) { super(id); }
  }
  
  // Switches the index to hash map mode and replaces entriesByPair with a
  // fresh, empty MultiSetMap.
  void useHashMaps() {
    set useHashMaps;
    entriesByPair = new MultiSetMap;
  }
  
  // Returns the character ranges of all matches of regexp in text.
  L<IntRange> wordRanges(S text) {
    ret regexpFindRanges(regexp, text);
  }
   
  // Adds the item a with the given text to the index.
  // Fails with "Double insertion" if a was already added. The duplicate
  // check runs before any state is modified, so a failed call leaves the
  // index untouched.
  void add(A a, S text) {
    if (entries.containsKey(a)) fail("Double insertion");
    Entry<A> e = new Entry<A>(a);
    if (useHashMaps) {
      e.pairPositions = new HashMap;
      // NOTE(review): positions below refer to the upper-cased text
      text = upper(text);
    }
    entries.put(a, e);

    // Collect positions per pair first, then freeze them into int arrays
    new MultiMap<PairS, Int> pairPositions;
    for (Pair<IntRange> p : overlappingPairs(wordRanges(text))) {
      PairS pair = pair(substring(text, p.a), substring(text, p.b));
      pairPositions.put(pair, p.a.start); // offset of the pair's first word
      entriesByPair.put(pair, e);
    }
    for (PairS pair : keys(pairPositions))
      e.pairPositions.put(pair, toIntArray(pairPositions.get(pair)));
  }
  
  // Returns whatever entriesByPair holds for the given word pair.
  Set<Entry<A>> get(PairS pair) { ret entriesByPair.get(pair); }
  
  // Number of keys (word pairs) in entriesByPair.
  int numPairs() { ret entriesByPair.keysSize(); }
  
  // Looks up query in the index.
  // Optional parameter: debug (bool) - print the pairs and their hit counts.
  //
  // Returns
  //  -null if the query contains no words or does not contain at least one
  //   pair of "complete" words (see below),
  //  -an empty list if one of the examined pairs has no entries at all,
  //  -otherwise one (id, positions) pair per entry that survived the
  //   intersection of all examined pairs.
  Iterable<Pair<A, Cl<Int>>>lookupString_withPositions(S query, O... _) {
    optPar bool debug;
    if (useHashMaps) query = upper(query);
    S _query = query; // effectively final copy for the lambda below
    L<IntRange> ranges = wordRanges(query);
    if (empty(ranges)) null;
    int nRanges = l(ranges);

    // A word touching the start or end of the query is not treated as
    // complete and is left out of the pair search.
    int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
    int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
    --iLastComplete; // because pairs
    
    LS words = map(ranges, r -> substring(_query, r));
    LPairS pairs = overlappingPairs(words);
    L<Set<Entry<A>>> entriesAtIndex = map(pairs, pair -> entriesByPair.get(pair));
      
    if (iLastComplete >= iFirstComplete+1) {
      // Find the pair with the fewest entries; bail out if any pair has none
      int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
      if (nBest == 0) ret emptyList();
      for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
        int n = l(entriesAtIndex.get(iWord));
        if (n == 0) ret emptyList();
        if (n < nBest) {
          shortest = iWord;
          nBest = n;
        }
      }
      
      if (debug) print("pairs: " + zipTwoLists(pairs, lmap l(entriesAtIndex)));
      
      Set<Entry<A>> entries = entriesAtIndex.get(shortest);
      int startShortest = ranges.get(shortest).start;
      PairS shortestPair = pairs.get(shortest); // not the shortest pair, but the pair with the shortest result list
      new IntBuffer intBuffer;
      
      new LPair<A, Cl<Int>> out;
      entrySearch: for (Entry<A> entry : entries) {
        int[] positions = entry.pairPositions.get(shortestPair);

        // Narrow the candidate positions using every other examined pair
        for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
          continue if iWord == shortest;
          IntRange r2 = ranges.get(iWord);
          PairS pair2 = pairs.get(iWord);
          int[] positions2 = entry.pairPositions.get(pair2);
          if (positions2 == null) continue entrySearch;
          // distance (within the query) between the two pairs' first words
          int ofs = startShortest-r2.start;
          //if (debug) print("Intersecting " + asList(positions) + "/" + asList(positions2) + " with ofs " + ofs);
          positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
          //if (debug) print("Got " + asList(positions));
          if (empty(positions)) continue entrySearch;
        }
        
        // presumably shifts positions by -startShortest so they refer to
        // where the query itself would start in the text - TODO confirm
        out.add(pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest)));
      }
      ret out;
    }
    
    null;
  }
}

Author comment

Began life as a copy of #1029024

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029042
Snippet name: DeepWordPairIndex [dev.]
Eternal ID of this version: #1029042/13
Text MD5: 031dd215db5e9dc11c14d88ea68dca35
Transpilation MD5: 2cd245965f98520999330cd9f397c38a
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-17 16:29:02
Source code size: 3776 bytes / 101 lines
Pitched / IR pitched: No / No
Views / Downloads: 257 / 555
Version history: 12 change(s)
Referenced in: [show references]