Libraryless. Click here for Pure Java version (4078L/26K).
1 | sclass ProbabilisticParser1 { |
2 | transient TreeSetWithDuplicates<State> doneStates = new(byProbability()); |
3 | transient TreeSetWithDuplicates<State> states = new(byProbability()); |
4 | transient TreeSetWithDuplicates<State> steppableStates = new(byProbability()); |
5 | transient TreeSetWithDuplicates<State> droppedStates = new(byProbability()); |
6 | transient int stateCount; |
7 | |
8 | double cutoffPercentage = 50; |
9 | |
10 | Comparator<State> byProbability() { ret (a, b) -> cmp(b.probability, a.probability); } |
11 | |
12 | abstract class Action { |
13 | abstract void run(State state); |
14 | |
15 | State prepareClone(State state) { |
16 | new State s; |
17 | copyFields(state, s, 'tok, 'iNextToken, 'probability, 'matches); |
18 | s.prev = state; |
19 | s.remainingRule = optCast BasicLogicRule(state.remainingRule.rhs); |
20 | ret s; |
21 | } |
22 | } |
23 | |
24 | abstract class Consumer extends Action { |
25 | // override this or the next method |
26 | double calcProbabilityForMatchedText(S s) { throw overrideMe(); } |
27 | // tok is CNC starting & ending with code token |
28 | double calcProbabilityForMatchedTokens(LS tok) { |
29 | ret calcProbabilityForMatchedText(join(tok)); |
30 | } |
31 | |
32 | void run(State state) { |
33 | int maxTokensToConsume = state.remainingTokens(); |
34 | |
35 | for (int n = 0; n <= maxTokensToConsume; n++) { |
36 | State s = prepareClone(state); |
37 | s.iNextToken += n*2; |
38 | LS tok = subList(state.tok, state.iNextToken, s.iNextToken-1); |
39 | s.probability = multiplyPercentages(s.probability, calcProbabilityForMatchedTokens(tok)); |
40 | s.matches = revChainPlus(s.matches, pair(state, tok)); |
41 | addState(s); |
42 | } |
43 | } |
44 | } |
45 | |
46 | noeq record ConsumeToken(S token) extends Consumer { |
47 | double calcProbabilityForMatchedText(S s) { |
48 | ret empty(s) ? 50 : levenSimilarityIntIC(s, token); |
49 | } |
50 | } |
51 | |
52 | noeq record Any extends Consumer { |
53 | double calcProbabilityForMatchedText(S s) { |
54 | ret 90; |
55 | } |
56 | } |
57 | |
58 | noeq record Filler extends Consumer { |
59 | double calcProbabilityForMatchedTokens(LS tok) { |
60 | ret 100-countCodeTokensInReversedCNC(tok)*10; |
61 | } |
62 | } |
63 | |
64 | noeq record EndOfInput extends Action { |
65 | void run(State state) { |
66 | State s = prepareClone(state); |
67 | if (!state.endOfInput()) s.probability /= 2; |
68 | s.matches = revChainPlus(s.matches, pair(state, subList(s.tok, s.iNextToken))); |
69 | addState(s); |
70 | } |
71 | } |
72 | |
73 | class State { |
74 | int number = ++stateCount; |
75 | State prev; |
76 | double probability = 100; |
77 | LS tok; // CNC |
78 | int iNextToken = 1; |
79 | BasicLogicRule remainingRule; |
80 | ReverseChain<Pair<State, LS>> matches; // values: reversed CNC |
81 | |
82 | toString { |
83 | ret toStringWithFields(this, "number", "probability", "iNextToken") + stringIf(done(), " (done)" + " matches: " + matchesFromAction()); |
84 | } |
85 | |
86 | LPair<Action, LS> matchesFromAction() { ret mapPairsA(s -> s.action(), matches); } |
87 | |
88 | bool done() { ret remainingRule == null; } |
89 | |
90 | bool endOfInput() { ret iNextToken >= l(tok); } |
91 | int remainingTokens() { ret (l(tok)-iNextToken)/2+1; } |
92 | S nextToken() { ret get(tok, iNextToken); } |
93 | |
94 | Action action() { ret remainingRule == null ? null : (Action) remainingRule.lhs; } |
95 | |
96 | void step { if (!done()) action().run(this); } |
97 | } |
98 | |
99 | void addState(State s) { |
100 | if (s.probability < cutoffPercentage) ret with droppedStates.add(s); |
101 | addToCollections(s, states, steppableStates); |
102 | if (s.done()) doneStates.add(s); |
103 | } |
104 | |
105 | bool stepFirstUnstepped() { |
106 | State s = popFirst(steppableStates), ret false if null; |
107 | ret true with s.step(); |
108 | } |
109 | |
110 | BasicLogicRule patternToRule(S pattern) { |
111 | ret curryLHS(BasicLogicRule( |
112 | makeAnd(listPlus( |
113 | mapWithIndex(javaTok(pattern), (i, t) -> even(i) |
114 | ? new Filler |
115 | : eq(t, "*") ? new Any : new ConsumeToken(t)), |
116 | new EndOfInput)), |
117 | formatFrag("parsed"))); |
118 | } |
119 | |
120 | void reset { |
121 | clearAll(doneStates, states, steppableStates, droppedStates); |
122 | stateCount = 0; |
123 | } |
124 | |
125 | // pattern e.g.: "Das * hat *."; |
126 | void parse(S pattern, S input) { |
127 | reset(); |
128 | BasicLogicRule rule = patternToRule(pattern); |
129 | print(rule); |
130 | |
131 | new State state; |
132 | state.tok = javaTok(input); |
133 | state.remainingRule = rule; |
134 | addState(state); |
135 | while ping (stepFirstUnstepped()) {} |
136 | } |
137 | |
138 | L<State> bestStates(int n) { |
139 | ret takeFirst(n, doneStates); |
140 | } |
141 | |
142 | Matches stateToMatches(State state) { |
143 | if (state == null) null; |
144 | new LS out; |
145 | for (Pair<Action, LS> p : state.matchesFromAction()) |
146 | if (p.a instanceof Any) |
147 | out.add(join(p.b)); |
148 | ret matches(out); |
149 | } |
150 | |
151 | Matches bestMatches() { ret stateToMatches(first(doneStates)); } |
152 | } |
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1027937 |
Snippet name: | ProbabilisticParser1 (before shortening) |
Eternal ID of this version: | #1027937/10 |
Text MD5: | a13434d08ef0fa5a9650a1bf6d46537d |
Transpilation MD5: | be801c469d5bed4d2a544f28694105a4 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-04-20 12:02:45 |
Source code size: | 4731 bytes / 152 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 210 / 320 |
Version history: | 9 change(s) |
Referenced in: | [show references] |