Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

123
LINES

< > BotCompany Repo | #1029024 // DeepWordIndex

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3999L/25K).

// Word index over texts, keyed by an arbitrary id type A.
// For every indexed text it remembers which words occur and at which
// character offsets, so that multi-word queries can be answered together
// with the positions of the matches.
//
// Usage: set the option fields, call add() for every text, then query with
// lookupString_withPositions().
transient sclass DeepWordIndex<A> {
  // regular expression defining what counts as a word
  S regexp = "\\w+";
  bool useHashMaps; // makes it case-sensitive and doesn't allow partial word searches
                    // (add() and the lookup upper-case their input in this mode)
  bool sortEntries; // if A implements Comparable
  // all entries by id
  new Map<A, Entry<A>> entries;
  //new L<Entry<A>> entriesList;
  // word -> entries containing that word (created in init())
  MultiSetMap<S, Entry<A>> entriesByWord;
  // list-valued snapshot of entriesByWord, built in doneAdding()
  // and discarded again by add()
  Map<S, L<Entry<A>>> entriesByWord_lists;
  
  // One indexed text: holds the id (as the Var's value) plus the
  // positions of every word occurring in the text.
  sclass Entry<A> extends Var<A> implements Comparable<Entry<A>> {
    Map<S, int[]> wordPositions = ciMap(); // int array is sorted
    
    *(A id) { super(id); }
    
    // Orders entries by id; only valid if A implements Comparable.
    public int compareTo(Entry<A> e) {
      ret ((Comparable) get()).compareTo(e!);
    }
    
    // Entries are equal iff their ids are equal.
    public bool equals(O o) {
      ret o instanceof Entry && get().equals(((Entry) o)!);
    }
    
    // Kept consistent with equals() since entries are stored in sets.
    public int hashCode() {
      A id = get();
      ret id == null ? 0 : id.hashCode();
    }
  }
  
  // Lazily creates entriesByWord according to the option fields.
  // Does nothing if the map already exists.
  void init() {
    if (entriesByWord != null) ret;
    entriesByWord = useHashMaps
      ? sortEntries ? multiSetMap_innerTreeSet() : new MultiSetMap
      : sortEntries ? ciMultiSetMap_innerTreeSet() : ciMultiSetMap();
  }
  
  // Character ranges of all matches of regexp in text.
  L<IntRange> wordRanges(S text) {
    ret regexpFindRanges(regexp, text);
  }
   
  // Indexes text under the id a.
  // Fails with "Double insertion" if a was added before; in that case
  // the index is left unchanged.
  void add(A a, S text) {
    init();
    // check before mutating so a rejected call cannot replace
    // the entry that is already indexed
    if (entries.containsKey(a)) fail("Double insertion");
    Entry<A> e = new Entry<A>(a);
    if (useHashMaps) {
      e.wordPositions = new HashMap;
      text = upper(text);
    }
    entries.put(a, e);
    // the list snapshot no longer reflects entriesByWord;
    // drop it so the next lookup rebuilds it in doneAdding()
    entriesByWord_lists = null;
    MultiMap<S, Int> wordPositions = ciMultiMap();
    for (IntRange r : wordRanges(text)) {
      S word = substring(text, r);
      wordPositions.put(word, r.start);
      entriesByWord.put(word, e);
    }
    for (S word : keys(wordPositions))
      e.wordPositions.put(word, toIntArray(wordPositions.get(word)));
  }
  
  // All entries containing word. Requires a prior add() or init().
  Set<Entry<A>> get(S word) { ret entriesByWord.get(word); }
  
  // Number of distinct words in the index. Requires a prior add() or init().
  int numWords() { ret entriesByWord.keysSize(); }

  // Builds the list-valued snapshot used by lookups.
  // Does nothing if a snapshot exists. Requires a prior add() or init().
  void doneAdding() {
    if (entriesByWord_lists != null) ret;
    entriesByWord_lists = mapValues asList(entriesByWord.data);
    // TODO: release entriesByWord
  }
  
  // Finds the entries containing all "complete" words of query at matching
  // relative offsets.
  //
  // A query word is treated as complete unless it starts at offset 0 of the
  // query or ends at the end of the query (presumably because such a word
  // may be a fragment of a longer word in the text - TODO confirm).
  // Incomplete words are not checked here.
  //
  // Optional parameter: debug (bool) - print diagnostics.
  //
  // Returns
  // - null if the query contains no words or no complete word,
  // - an empty list if some complete word occurs in no entry,
  // - otherwise pairs of (id, positions); the positions are those of the
  //   word with the fewest entries, shifted back by that word's offset in
  //   the query.
  Iterable<Pair<A, Cl<Int>>> lookupString_withPositions(S query, O... _) {
    optPar bool debug;
    doneAdding();
    if (useHashMaps) query = upper(query);
    S _query = query;
    L<IntRange> ranges = wordRanges(query);
    if (empty(ranges)) null;
    int nRanges = l(ranges);
    // complete words are those at indices [iFirstComplete, iLastComplete)
    int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
    int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
    
    LS words = map(ranges, r -> substring(_query, r));
    LL<Entry<A>> entriesAtIndex = map(words, word -> entriesByWord_lists.get(word));
      
    if (iLastComplete >= iFirstComplete+1) {
      // find the complete word with the fewest entries;
      // bail out as soon as any complete word has none
      int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
      if (nBest == 0) { /*print("No results for " + words.get(shortest));*/ ret emptyList(); }
      for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
        int n = l(entriesAtIndex.get(iWord));
        if (n == 0) { /*print("No results for " + words.get(iWord));*/ ret emptyList(); }
        if (n < nBest) {
          shortest = iWord;
          nBest = n;
        }
      }
      int _shortest = shortest;
      
      // candidates: intersection of all lists if they are sorted,
      // otherwise just the smallest list
      Iterable<Entry<A>> entries = sortEntries
        ? intersectMultipleSortedCollectionsI(subList(entriesAtIndex, iFirstComplete, iLastComplete))
        : entriesAtIndex.get(shortest);
        
      int startShortest = ranges.get(shortest).start;
      S shortestWord = words.get(shortest); // not the shortest word, but the word with the shortest result list
      if (debug) print("shortest: " + shortestWord + ", words: " + zipTwoLists(words, lmap l(entriesAtIndex)));
      
      new IntBuffer intBuffer;
      
      ret mapI_nonNulls_if1(entries, entry -> {
        int[] positions = entry.wordPositions.get(shortestWord);

        // narrow the positions down using every other complete word
        for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
          continue if iWord == _shortest;
          IntRange r2 = ranges.get(iWord);
          S word2 = words.get(iWord);
          int[] positions2 = entry.wordPositions.get(word2);
          if (positions2 == null) null;
          // distance between the two words within the query
          int ofs = startShortest-r2.start;
          int len = l(positions);
          positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
          if (debug) print("Intersected " + len + "/" + l(positions2) + " => " + l(positions));
          //if (debug) print("Got " + asList(positions));
          if (empty(positions)) null;
        }
        
        ret pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest));
      });
    }
    
    null;
  }
}

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029024
Snippet name: DeepWordIndex
Eternal ID of this version: #1029024/57
Text MD5: 92af6cd15ba2d76d30e8974f29f55d4b
Transpilation MD5: ea371a277ae8de4cd8f6e19b5081c335
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-17 19:18:50
Source code size: 4432 bytes / 123 lines
Pitched / IR pitched: No / No
Views / Downloads: 299 / 738
Version history: 56 change(s)
Referenced in: #1029042 - DeepWordPairIndex [dev.]
#1034167 - Standard Classes + Interfaces (LIVE, continuation of #1003674)