Libraryless. Click here for Pure Java version (7911L/54K).
// parses & translates
// main probabilistic functions: patternToRule + tokenToAction

// Holds a list of productions (pattern => class => rewrite), loaded from a
// textual rule set, and uses ProbabilisticParser1 to match input sentences
// against them. Quoted parts of a translation are then re-parsed
// recursively against the remaining productions.
sclass RecursiveProbabilisticParser1 {
  // example rule:
  //   "Ein/eine/der/die/das <ding> führt (nach/in/hin zu/ins) <ort>" => satz => $1 leads to $2

  // Turns a tokenized pattern into a rule for the given parser.
  // The rule's first argument is makeAnd(...) over one action per token
  // (see tokenToAction) followed by an EndOfInput action; the second
  // argument is formatFrag("parsed " + <joined tokens>).
  BasicLogicRule patternToRule(ProbabilisticParser1 parser, LS tok) {
    ret BasicLogicRule(makeAnd(listPlus(mapWithIndex(tok, (i, t) -> tokenToAction(parser, i, t)),
      parser.new EndOfInput)), formatFrag("parsed " + join(tok)));
  }

  // Chooses the parser action for token t at index i of a pattern.
  // Checked in this order:
  //   1. even index, or t is one of fillerClasses      => Filler
  //      (presumably even indices are the non-code tokens of the
  //      tokenizer output - TODO confirm)
  //   2. angle-bracketed, or "*" with starsAreWildcards => Any(t) with
  //      minTokensToConsume set to minTokensForAny
  //   3. contains a slash                               => ConsumeOneOfTokens
  //      over the alternatives returned by splitAtSlash_keepAll
  //   4. "?"                                            => ConsumeToken("?")
  //      with emptyProbability set to 95.0
  //   5. anything else                                  => ConsumeToken(t)
  // Note: in case 2, && binds tighter than ||, so starsAreWildcards only
  // guards the "*" test, not the angle-bracket test.
  swappable ProbabilisticParser1.Action tokenToAction(ProbabilisticParser1 parser, int i, S t) {
    ret
      even(i) || contains(fillerClasses, t)
        ? parser.new Filler
      : isAngleBracketed(t) || starsAreWildcards && eq(t, "*")
        ? set(parser.new Any(t), minTokensToConsume := minTokensForAny)
      : containsSlash(t)
        ? parser.new ConsumeOneOfTokens(asCISet(splitAtSlash_keepAll(t)))
      : eq(t, "?")
        ? set(parser.new ConsumeToken("?"), emptyProbability := 95.0)
      : parser.new ConsumeToken(t);
  }

  // One loaded rule: tokenized pattern, the class it produces, and the
  // rewrite template. rewritten comes from third(...) of the rule parts in
  // loadRules (presumably null for a two-part rule - TODO confirm).
  srecord Production(LS tok, S outClass, S rewritten) {}

  new L<Production> productions; // filled by loadRules
  Set<S> fillerClasses = litciset("<filler>", "<füller>"); // tokens mapped to Filler by tokenToAction
  S sentenceClass = "sentence"; // outClass of the productions tried on whole sentences
  SS simpleTranslations = ciMap(); // passed to translatePhrases in parse()
  LS sentences; // result of splitIntoSentences in the last parse()
  LS translations, translations2; // first-pass / final results of the last parse()
  bool requireQuotedProductions = true; // loadRules skips rules whose LHS is not quoted
  bool useRoundBrackets; // instead of nested quotes
  bool printSubtranslations; // print each accepted sub-translation in translateQuotedParts
  bool productionsStartWithClass; // swap first & second entry in production definitions
  bool starsAreWildcards; // treat "*" like an angle-bracketed token in tokenToAction
  int minTokensForAny = 1; // set to 0 to allow Any to consume 0 tokens
  bool verbose; // forwarded to the parser; also enables the skip message in loadRules

  L<Production> otherProductions; // productions whose outClass is not sentenceClass; set by parse()

  int recursionLevels = 10; // recursion limit handed to translateQuotedParts by parse()

  // Loads productions from rule text and appends them to productions.
  // rules can contain Java-style comments
  // Each entry is split at double arrows into 2 or 3 parts; entries with
  // any other number of parts are skipped.
  void loadRules(S rules) {
    for (LS l : lambdaMap splitAtDoubleArrow(tlft_j(rules))) {
      continue unless isBetween(l(l), 2, 3);
      S lhs = first(l), rhs = second(l), rewritten = third(l);
      if (productionsStartWithClass) swap S lhs, rhs;
      if (requireQuotedProductions && !isQuoted(lhs)) {
        if (verbose) print("Production LHS not quoted", l);
        continue;
      }
      // Tokenize the unquoted pattern after removing every "/..." from it
      LS tok = javaTokWithAllPlusAngleBrackets(replaceWithNothing("/...", tok_unquote(lhs)));
      tok = tok_combineSpacelessSlashCombos(tok);
      tok = mapCodeTokens tok_deRoundBracket(tok);
      //printStruct(codeTokens(tok));
      //print(patternToRule(new ProbabilisticParser1, tok));
      productions.add(new Production(tok, deAngleBracket(rhs), rewritten));
    }
  }

  // Sets sentenceClass, then parses text.
  // returns list of translated sentences
  LS parse(S text, S sentenceClass) {
    this.sentenceClass = sentenceClass;
    ret parse(text);
  }

  // returns list of translated sentences. Can be called multiple times
  // Updates the fields sentences, translations, otherProductions and
  // translations2 (all through setField).
  LS parse(S text) {
    setField(sentences := splitIntoSentences(text));

    // The sentence-level productions do not change while looping,
    // so look them up once instead of once per sentence.
    L<Production> prods = objectsWhereIC(productions, outClass := sentenceClass);

    new LS translations;
    for (S sentence : sentences) {
      print("Trying " + nProductions(prods) + " on sentence", sentence);
      ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(sentence),
        prods).bestDoneState();
      print("State", s);
      S translated = getTranslation(s);
      addIfNempty(translations, translated);
      //printIfNempty(" => ", translated);
    }

    setField(+translations);

    setField(otherProductions := objectsWhereNotIC(productions, outClass := sentenceClass));

    // Second pass: translate the quoted parts of every first-pass result
    LS translations2 = map(translations, line -> translateQuotedParts(line, recursionLevels));
    //translations2 = lambdaMap recursiveUnquoteStartingAtLevel2(translations2);
    translations2 = map(line -> join(translatePhrases(javaTokNoQuotes(line), simpleTranslations)), translations2);
    if (useRoundBrackets) translations2 = lambdaMap recursiveQuotesToRoundBrackets(translations2);
    setField(+translations2);
    //pnl(translations2);
    ret translations2;
  }

  // The first-pass translations with "<quoted>" replaced by "*" (via jreplace).
  LS topLevelPatterns() {
    ret map(translations, s -> jreplace(s, "<quoted>", "*"));
  }

  // Creates a parser, adds one state per production for the input tokens
  // (the production is stored as the state's userObject) and runs
  // parser.pm.think() before returning the parser.
  // also parses
  ProbabilisticParser1 parserForInputAndProductions(LS tok, Cl<Production> productions) {
    new ProbabilisticParser1 parser;
    parser.verbose = parser.pm.verbose = verbose;
    //parser.pm.cutoffPercentage = 10;
    for (Production p : productions)
      parser.addState(tok, patternToRule(parser, p.tok)).userObject = p;
    parser.pm.think();
    ret parser;
  }

  // Expands the rewrite template of the state's production with the
  // matches of that state. Returns null for a null state.
  S getTranslation(ProbabilisticParser1.State state) {
    if (state == null) ret null;
    Production prod = cast state.userObject;
    Matches m = state.parser().stateToMatches(state);
    //print(+m);
    ret expandDollarRefsToMatches_alwaysQuote(prod.rewritten, m);
  }

  // Re-parses every quoted token of line against otherProductions. When the
  // best done state has probability >= 80, the token is replaced by the
  // quoted translation, which is first translated recursively while
  // recursionLimit > 0.
  // NOTE(review): reads otherProductions, which this class only assigns in
  // parse() - confirm no caller invokes this before parse().
  S translateQuotedParts(S line, int recursionLimit) {
    LS tok = javaTok(line);
    for (int i : indicesOfQuoted(tok)) {
      S x = unquote(tok.get(i));
      ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(x),
        otherProductions).bestDoneState();
      //printVars(+x, +s);
      if (s != null && s.probability >= 80) {
        S y = getTranslation(s);
        if (printSubtranslations) printVars_str(+x, +y, +s);
        //printVars(+x, +y, probability := s.probability);
        if (recursionLimit > 0)
          y = translateQuotedParts(y, recursionLimit-1);
        tok.set(i, quote(y));
      }
    }
    ret join(tok);
  }

  event change; !include #1027843 // setField

  // Sentence splitter used by parse(); swappable so it can be replaced.
  swappable LS splitIntoSentences(S s) {
    ret sentences_dropExclam(s);
  }
}
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1027974 |
Snippet name: | RecursiveProbabilisticParser1 |
Eternal ID of this version: | #1027974/55 |
Text MD5: | e22c09a59a5ab687fa82484bf7f01c3f |
Transpilation MD5: | 84ebf4f7243cfed0b5e445ce09375d25 |
Author: | stefan |
Category: | |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-26 14:26:48 |
Source code size: | 5664 bytes / 150 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 391 / 1002 |
Version history: | 54 change(s) |
Referenced in: | [show references] |