Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

150
LINES

< > BotCompany Repo | #1027974 // RecursiveProbabilisticParser1

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (7911L/54K).

1  
// parses & translates
2  
// main probabilistic functions: patternToRule + tokenToAction
3  
4  
// Pattern-driven parser/translator layered on ProbabilisticParser1.
// Productions ("pattern" => class => rewritten form) are loaded with loadRules;
// parse() then matches whole sentences against the productions of sentenceClass
// and recursively translates the quoted sub-parts with all other productions.
sclass RecursiveProbabilisticParser1 {
  // example rule:
  // "Ein/eine/der/die/das <ding> führt (nach/in/hin zu/ins) <ort>" => satz => $1 leads to $2

  // Builds the rule for one production pattern: every token of tok is turned
  // into a parser action (see tokenToAction), an EndOfInput action is appended,
  // and all actions are combined with makeAnd. The second constructor argument
  // is a fragment made from "parsed " followed by the joined pattern tokens.
  BasicLogicRule patternToRule(ProbabilisticParser1 parser, LS tok) {
    ret BasicLogicRule(makeAnd(listPlus(mapWithIndex(tok, (i, t) -> tokenToAction(parser, i, t)),
      parser.new EndOfInput)), formatFrag("parsed " + join(tok)));
  }

  // Chooses the parser action for the pattern token t at index i.
  // Checked in this order:
  //   1. even index, or t is one of fillerClasses         -> Filler
  //      (presumably even indices are the non-code tokens of the
  //       javaTok-style token list - confirm against the tokenizer)
  //   2. angle-bracketed, or "*" when starsAreWildcards   -> Any(t), with
  //      minTokensToConsume set to minTokensForAny
  //   3. t contains a slash                               -> ConsumeOneOfTokens
  //      over the alternatives returned by splitAtSlash_keepAll
  //   4. t is "?"                                         -> ConsumeToken("?")
  //      with emptyProbability set to 95.0
  //   5. anything else                                    -> ConsumeToken(t)
  // Declared swappable, so callers can substitute their own implementation.
  swappable ProbabilisticParser1.Action tokenToAction(ProbabilisticParser1 parser, int i, S t) {
    ret
      even(i) || contains(fillerClasses, t)
        ? parser.new Filler
      : isAngleBracketed(t) || starsAreWildcards && eq(t, "*")
          ? set(parser.new Any(t), minTokensToConsume := minTokensForAny)
      : containsSlash(t)
        ? parser.new ConsumeOneOfTokens(asCISet(splitAtSlash_keepAll(t)))
      : eq(t, "?")
        ? set(parser.new ConsumeToken("?"), emptyProbability := 95.0)
      : parser.new ConsumeToken(t);
  }

  // One loaded production: the tokenized pattern, the class it produces
  // (stored without angle brackets by loadRules) and the rewritten form
  // that getTranslation expands.
  srecord Production(LS tok, S outClass, S rewritten) {}

  // all productions added by loadRules
  new L<Production> productions;
  // pattern tokens that are treated as Filler by tokenToAction
  Set<S> fillerClasses = litciset("<filler>", "<füller>");
  // output class a production must have to be tried on whole sentences in parse()
  S sentenceClass = "sentence";
  // phrase translations applied to the final lines in parse()
  SS simpleTranslations = ciMap();
  // sentences of the most recent parse() input
  LS sentences;
  // results of the most recent parse(): sentence-level translations, and the
  // fully processed lines that parse() returns
  LS translations, translations2;
  // when true, loadRules skips productions whose pattern side is not quoted
  bool requireQuotedProductions = true;
  bool useRoundBrackets; // instead of nested quotes
  // when true, translateQuotedParts prints every sub-translation it applies
  bool printSubtranslations;
  bool productionsStartWithClass; // swap first & second entry in production definitions
  // when true, a "*" pattern token is treated like an angle-bracketed token
  bool starsAreWildcards;
  int minTokensForAny = 1; // set to 0 to allow Any to consume 0 tokens
  // enables diagnostics in loadRules and is copied to each created parser
  bool verbose;

  // productions whose outClass is not sentenceClass; assigned by parse()
  // and used by translateQuotedParts
  L<Production> otherProductions;

  // recursion limit that parse() passes to translateQuotedParts
  int recursionLevels = 10;

  // Loads production definitions from rules and appends them to productions.
  // Each definition is split at double arrows into 2 or 3 parts (pattern,
  // class and an optional rewritten form); definitions with any other number
  // of parts are skipped silently.
  // rules can contain Java-style comments
  void loadRules(S rules) {
    for (LS l : lambdaMap splitAtDoubleArrow(tlft_j(rules))) {
      continue unless isBetween(l(l), 2, 3);
      // NOTE(review): for 2-part definitions third(l) presumably yields null - confirm
      S lhs = first(l), rhs = second(l), rewritten = third(l);
      if (productionsStartWithClass) swap S lhs, rhs;
      if (requireQuotedProductions && !isQuoted(lhs)) {
        if (verbose) print("Production LHS not quoted", l);
        continue;
      }
      // Tokenize the unquoted pattern after removing every "/..." from it
      LS tok = javaTokWithAllPlusAngleBrackets(replaceWithNothing("/...", tok_unquote(lhs)));
      tok = tok_combineSpacelessSlashCombos(tok);
      tok = mapCodeTokens tok_deRoundBracket(tok);
      //printStruct(codeTokens(tok));
      //print(patternToRule(new ProbabilisticParser1, tok));
      productions.add(new Production(tok, deAngleBracket(rhs), rewritten));
    }
  }

  // Sets sentenceClass (the change persists for later calls), then parses text.
  // returns list of translated sentences
  LS parse(S text, S sentenceClass) {
    this.sentenceClass = sentenceClass;
    ret parse(text);
  }

  // Splits text into sentences, translates each sentence with the productions
  // of sentenceClass, then post-processes every translation: quoted parts are
  // translated recursively (up to recursionLevels), simpleTranslations are
  // applied, and nested quotes are converted to round brackets if
  // useRoundBrackets is set. Sentences without a non-empty translation are
  // dropped. The fields sentences, translations, otherProductions and
  // translations2 are updated through setField.
  // returns list of translated sentences. Can be called multiple times
  LS parse(S text) {
    setField(sentences := splitIntoSentences(text));

    // local list; published to the field of the same name further down
    new LS translations;
    for (S sentence : sentences) {
      L<Production> prods = objectsWhereIC(productions, outClass := sentenceClass);
      // printed unconditionally, independent of the verbose flag
      print("Trying " + nProductions(prods) + " on sentence", sentence);
      ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(sentence),
        prods).bestDoneState();
      print("State", s);
      S translated = getTranslation(s);
      addIfNempty(translations, translated);
      //printIfNempty("  => ", translated);
    }

    setField(+translations);

    // must be assigned before translateQuotedParts runs, which reads it
    setField(otherProductions := objectsWhereNotIC(productions, outClass := sentenceClass));

    LS translations2 = map(translations, line -> translateQuotedParts(line, recursionLevels));
    //translations2 = lambdaMap recursiveUnquoteStartingAtLevel2(translations2);
    translations2 = map(line -> join(translatePhrases(javaTokNoQuotes(line), simpleTranslations)), translations2);
    if (useRoundBrackets) translations2 = lambdaMap recursiveQuotesToRoundBrackets(translations2);
    setField(+translations2);
    //pnl(translations2);
    ret translations2;
  }

  // Returns the sentence-level translations of the last parse() with every
  // "<quoted>" pattern replaced by "*" (via jreplace).
  LS topLevelPatterns() {
    ret map(translations, s -> jreplace(s, "<quoted>", "*"));
  }

  // Creates a fresh parser, adds one state per production for the input
  // tokens tok (storing the production as the state's userObject), runs
  // parser.pm.think() and returns the parser.
  // also parses
  ProbabilisticParser1 parserForInputAndProductions(LS tok, Cl<Production> productions) {
    new ProbabilisticParser1 parser;
    parser.verbose = parser.pm.verbose = verbose;
    //parser.pm.cutoffPercentage = 10;
    for (Production p : productions)
      parser.addState(tok, patternToRule(parser, p.tok)).userObject = p;
    parser.pm.think();
    ret parser;
  }

  // Expands the rewritten form of the production attached to state, using the
  // matches the parser derives from that state. Returns null for a null state.
  S getTranslation(ProbabilisticParser1.State state) {
    if (state == null) ret null;
    Production prod = cast state.userObject;
    Matches m = state.parser().stateToMatches(state);
    //print(+m);
    ret expandDollarRefsToMatches_alwaysQuote(prod.rewritten, m);
  }

  // Translates every quoted token of line with otherProductions. A quoted part
  // is replaced only when a finished parse state with probability >= 80
  // exists; its translation is itself processed again while recursionLimit
  // is above 0. Returns the line with the replaced parts re-quoted.
  S translateQuotedParts(S line, int recursionLimit) {
    LS tok = javaTok(line);
    for (int i : indicesOfQuoted(tok)) {
      S x = unquote(tok.get(i));
      ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(x),
        otherProductions).bestDoneState();
      //printVars(+x, +s);
      if (s != null && s.probability >= 80) {
        S y = getTranslation(s);
        if (printSubtranslations) printVars_str(+x, +y, +s);
        //printVars(+x, +y, probability := s.probability);
        if (recursionLimit > 0)
          y = translateQuotedParts(y, recursionLimit-1);
        tok.set(i, quote(y));
      }
    }
    ret join(tok);
  }

  event change; !include #1027843 // setField

  // Sentence splitter used by parse(); swappable so callers can replace it.
  swappable LS splitIntoSentences(S s) {
    ret sentences_dropExclam(s);
  }
}

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1027974
Snippet name: RecursiveProbabilisticParser1
Eternal ID of this version: #1027974/55
Text MD5: e22c09a59a5ab687fa82484bf7f01c3f
Transpilation MD5: 84ebf4f7243cfed0b5e445ce09375d25
Author: stefan
Category:
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-26 14:26:48
Source code size: 5664 bytes / 150 lines
Pitched / IR pitched: No / No
Views / Downloads: 264 / 839
Version history: 54 change(s)
Referenced in: [show references]