Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

89
LINES

< > BotCompany Repo | #1028159 // greedySplitIntoWordsCI with choices Spike, shortened [OK]

JavaX source code (Dynamic Module) [tags: use-pretranspiled] - run with: Stefan's OS

Uses 911K of libraries. Click here for Pure Java version (5718L/29K).

cmodule AModule {
  // Text that gets split into words; root() hands it to the splitter as its input.
  switchable S input = "STATICALAFLATTENLISTOFPAIRSLPAIRALLAOUTEMPTYLISTLLFORPAIRAPUNNULLLOUTADDPAOUTADDPBRETOUT";
  // Vocabulary as one space-separated string; root() splits it at spaces.
  switchable S words = "ret ll ls static out list pair pairs flatten null of if else while for";
  
  // Space-joined first split result; assigned in the start-thread block.
  S firstResult;
  // Combination count produced by combinationsForPositionalParser in the
  // start-thread block. Declared transient.
  transient BigInt totalCombinations;

  // One state of the word-splitting search.
  // States are treated as immutable by convention: every step works on a fresh
  // shallow copy, so states can be evaluated in parallel and in any order.
  // Calling get() yields either the follow-up states (A) or the finished
  // list of pieces (B).
  class GreedySplitIntoWordsCI_Multi implements IF0<Either<Iterable<GreedySplitIntoWordsCI_Multi>, LS>> {
    S s; // text that is being split
    TreeSet<S> wordsSet; // vocabulary, built with asCISet
    Map<Int> longestMatchMap; // AutoMap: position -> length of longest prefix found in wordsSet
    int i = 0; // current scan position within s
    int last = 0; // start of the text that has not been moved to out yet
    ReverseChain<S> out; // pieces emitted so far
    
    // No-arg constructor; used for the empty instances passed to shallowClone.
    *() {}

    // Sets up a state at position 0 for the given text and vocabulary.
    *(S s, Cl<S> words) {
      this.s = s;
      wordsSet = asCISet(words);
      longestMatchMap = new AutoMap<Int>(pos ->
        dontPrint("longest match at " + pos + ": ",
          lengthOfLongestPrefixInCISet(substring(s, pos), wordsSet)));
    }
    
    // All vocabulary words that start at position i, longest candidates first.
    Cl<S> wordsAtPosition(int i) {
      int maxLength = longestMatchMap.get(i);
      ret mapNonNulls(countBackwardsTo1(maxLength), candidateLength -> {
        S candidate = substring(s, i, i+candidateLength);
        if (!contains(wordsSet, candidate)) ret null;
        ret candidate;
      });
    }

    // Fresh shallow copy of this state.
    GreedySplitIntoWordsCI_Multi cloneState() {
      ret shallowClone(this, new GreedySplitIntoWordsCI_Multi);
    }
      
    // Returns the finished result when the input is used up, otherwise the
    // list of possible next states: one per word matching at the current
    // position, followed by one that just advances by a single character.
    Either<Iterable<GreedySplitIntoWordsCI_Multi>, LS> get() {
      if (l(s) <= i) ret done();

      L<GreedySplitIntoWordsCI_Multi> choices = map(wordsAtPosition(i), wordMatched -> {
        GreedySplitIntoWordsCI_Multi branch = cloneState();
        // move pending unmatched text to the output before adding the word
        branch.flush();
        int wordEnd = i+l(wordMatched);
        branch.out = revChainPlus(branch.out, substring(s, i, wordEnd));
        branch.i = wordEnd;
        branch.last = wordEnd;
        ret branch;
      });

      // fallback choice: leave the current character unmatched
      GreedySplitIntoWordsCI_Multi advanced = cloneState();
      advanced.i++;

      ret eitherA(listPlus(choices, advanced));
    }

    // Final result: flushes pending text and wraps the pieces as the B value.
    Either<Iterable<GreedySplitIntoWordsCI_Multi>, LS> done() {
      flush();
      LS pieces = asList(out);
      ret eitherB(pieces);
    }
    
    // Text between the last emitted piece and the current position.
    S unflushed() {
      ret substring(s, last, i);
    }

    // Appends the pending text (if any) to out.
    // Note: this modifies the object itself, contrary to the convention above.
    void flush() {
      if (i > last) {
        out = revChainPlus(out, unflushed());
        last = i;
      }
    }
    
    public S toString() {
      S pending = prependIfNempty("|", unflushed());
      ret asList(out) + pending + ", " + i + "/" + l(s);
    }
    
    // Emitted pieces plus pending text, joined with spaces (empty parts dropped).
    S sentence() {
      LS pieces = asList(out);
      ret joinNemptiesWithSpace(listPlus(pieces, unflushed()));
    }
  }
  
  // Creates the initial search state from the current input and vocabulary fields.
  GreedySplitIntoWordsCI_Multi root() {
    S text = input;
    LS vocabulary = splitAtSpace(words);
    ret new GreedySplitIntoWordsCI_Multi(text, vocabulary);
  }
  
  // Module UI. Contains two labelled values ("Total combinations:" with the
  // label made by dm_label for totalCombinations, "Preferred result:" with the
  // one for firstResult) and a tree created by jDynamicEitherTree from root().
  // valueToText turns a tree value into text: an A value (an intermediate
  // splitter state) is shown as its sentence() followed by "...", a B value
  // (a list of strings) is shown joined with spaces.
  // NOTE(review): actual layout is decided by northAndCenterWithMargins and
  // jvstackWithSpacing, which are defined elsewhere - confirm there.
  visual northAndCenterWithMargins(
    jvstackWithSpacing(
      withLabel("Total combinations:", dm_label totalCombinations()),
      withLabel("Preferred result:", dm_label firstResult())),
    jDynamicEitherTree(root(),
      valueToText := (IF1<Either<GreedySplitIntoWordsCI_Multi, LS>, S>) x
        -> isEitherA(x) ? eitherAOpt(x).sentence() + "..." : joinWithSpace(eitherBOpt(x))));

  // Module worker. Calls dm_reloadOnFieldChange for the words and input fields,
  // stores and prints the first split found (timed), then stores the number of
  // combinations computed by combinationsForPositionalParser.
  start-thread {
    dm_reloadOnFieldChange('words, 'input);
    time {
      // Keep the result in a local so the log line shows the result text itself.
      // Previously the return value of setField(...) was concatenated into the
      // message instead of the computed string.
      S result = joinWithSpace(getFirstResultOfEitherTree(root()!));
      setField(firstResult := result);
      print("First result: " + result);
    }
    
    GreedySplitIntoWordsCI_Multi root = root();
    // For each position the parser is given the lengths of all words matching
    // there plus the value 1 (presumably the single-character skip - confirm
    // against combinationsForPositionalParser).
    // TODO: the count is probably too high. combinationsAtPosition(i-l(word))
    // should likely be subtracted for each word, as it seems to be counted twice.
    setField(totalCombinations := combinationsForPositionalParser(
      l(input), i -> listPlus(lambdaMap l(root.wordsAtPosition(i)), 1), debug := true);
  }
}

Author comment

Began life as a copy of #1028153

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028159
Snippet name: greedySplitIntoWordsCI with choices Spike, shortened [OK]
Eternal ID of this version: #1028159/12
Text MD5: 0b05f6aa10b1bab2de898d163f91fe32
Transpilation MD5: 973fe94cf68116abf1ed4e6a46e133b0
Author: stefan
Category: javax / stefan's os / nlp
Type: JavaX source code (Dynamic Module)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-26 14:12:21
Source code size: 3487 bytes / 89 lines
Pitched / IR pitched: No / No
Views / Downloads: 162 / 1073
Version history: 11 change(s)
Referenced in: [show references]