sclass ProbabilisticParser1 { transient TreeSetWithDuplicates doneStates = new(byProbability()); transient TreeSetWithDuplicates states = new(byProbability()); transient TreeSetWithDuplicates steppableStates = new(byProbability()); transient TreeSetWithDuplicates droppedStates = new(byProbability()); transient int stateCount; double cutoffPercentage = 50; Comparator byProbability() { ret (a, b) -> cmp(b.probability, a.probability); } abstract class Action { abstract void run(State state); State prepareClone(State state) { new State s; copyFields(state, s, 'tok, 'iNextToken, 'probability, 'matches); s.prev = state; s.remainingRule = optCast BasicLogicRule(state.remainingRule.rhs); ret s; } } abstract class Consumer extends Action { // override this or the next method double calcProbabilityForMatchedText(S s) { throw overrideMe(); } // tok is CNC starting & ending with code token double calcProbabilityForMatchedTokens(LS tok) { ret calcProbabilityForMatchedText(join(tok)); } void run(State state) { int maxTokensToConsume = state.remainingTokens(); for (int n = 0; n <= maxTokensToConsume; n++) { State s = prepareClone(state); s.iNextToken += n*2; LS tok = subList(state.tok, state.iNextToken, s.iNextToken-1); s.probability = multiplyPercentages(s.probability, calcProbabilityForMatchedTokens(tok)); s.matches = revChainPlus(s.matches, pair(state, tok)); addState(s); } } } noeq record ConsumeToken(S token) extends Consumer { double calcProbabilityForMatchedText(S s) { ret empty(s) ? 50 : levenSimilarityIntIC(s, token); } } noeq record Any extends Consumer { double calcProbabilityForMatchedText(S s) { ret 90; } } noeq record Filler extends Consumer { double calcProbabilityForMatchedTokens(LS tok) { ret 100-countCodeTokensInReversedCNC(tok)*10; } } noeq record EndOfInput extends Action { void run(State state) { State s = prepareClone(state); if (!state.endOfInput()) s.probability /= 2; s.matches = revChainPlus(s.matches, pair(state, subList(s.tok, s.iNextToken))); addState(s); } } class State { int number = ++stateCount; State prev; double probability = 100; LS tok; // CNC int iNextToken = 1; BasicLogicRule remainingRule; ReverseChain> matches; // values: reversed CNC toString { ret toStringWithFields(this, "number", "probability", "iNextToken") + stringIf(done(), " (done)" + " matches: " + matchesFromAction()); } LPair matchesFromAction() { ret mapPairsA(s -> s.action(), matches); } bool done() { ret remainingRule == null; } bool endOfInput() { ret iNextToken >= l(tok); } int remainingTokens() { ret (l(tok)-iNextToken)/2+1; } S nextToken() { ret get(tok, iNextToken); } Action action() { ret remainingRule == null ? null : (Action) remainingRule.lhs; } void step { if (!done()) action().run(this); } } void addState(State s) { if (s.probability < cutoffPercentage) ret with droppedStates.add(s); addToCollections(s, states, steppableStates); if (s.done()) doneStates.add(s); } bool stepFirstUnstepped() { State s = popFirst(steppableStates), ret false if null; ret true with s.step(); } BasicLogicRule patternToRule(S pattern) { ret curryLHS(BasicLogicRule( makeAnd(listPlus( mapWithIndex(javaTok(pattern), (i, t) -> even(i) ? new Filler : eq(t, "*") ? new Any : new ConsumeToken(t)), new EndOfInput)), formatFrag("parsed"))); } void reset { clearAll(doneStates, states, steppableStates, droppedStates); stateCount = 0; } // pattern e.g.: "Das * hat *."; void parse(S pattern, S input) { reset(); BasicLogicRule rule = patternToRule(pattern); print(rule); new State state; state.tok = javaTok(input); state.remainingRule = rule; addState(state); while ping (stepFirstUnstepped()) {} } L bestStates(int n) { ret takeFirst(n, doneStates); } Matches stateToMatches(State state) { if (state == null) null; new LS out; for (Pair p : state.matchesFromAction()) if (p.a instanceof Any) out.add(join(p.b)); ret matches(out); } Matches bestMatches() { ret stateToMatches(first(doneStates)); } }