// parses & translates // main probabilistic functions: patternToRule + tokenToAction sclass RecursiveProbabilisticParser1 { // example rule: // "Ein/eine/der/die/das führt (nach/in/hin zu/ins) " => satz => $1 leads to $2 BasicLogicRule patternToRule(ProbabilisticParser1 parser, LS tok) { ret BasicLogicRule(makeAnd(listPlus(mapWithIndex(tok, (i, t) -> tokenToAction(parser, i, t)), parser.new EndOfInput)), formatFrag("parsed " + join(tok))); } swappable ProbabilisticParser1.Action tokenToAction(ProbabilisticParser1 parser, int i, S t) { ret even(i) || contains(fillerClasses, t) ? parser.new Filler : isAngleBracketed(t) || starsAreWildcards && eq(t, "*") ? set(parser.new Any(t), minTokensToConsume := minTokensForAny) : containsSlash(t) ? parser.new ConsumeOneOfTokens(asCISet(splitAtSlash_keepAll(t))) : eq(t, "?") ? set(parser.new ConsumeToken("?"), emptyProbability := 95.0) : parser.new ConsumeToken(t); } srecord Production(LS tok, S outClass, S rewritten) {} new L productions; Set fillerClasses = litciset("", ""); S sentenceClass = "sentence"; SS simpleTranslations = ciMap(); LS sentences; LS translations, translations2; bool requireQuotedProductions = true; bool useRoundBrackets; // instead of nested quotes bool printSubtranslations; bool productionsStartWithClass; // swap first & second entry in production definitions bool starsAreWildcards; int minTokensForAny = 1; // set to 0 to allow Any to consume 0 tokens bool verbose; L otherProductions; int recursionLevels = 10; // rules can contain Java-style comments void loadRules(S rules) { for (LS l : lambdaMap splitAtDoubleArrow(tlft_j(rules))) { continue unless isBetween(l(l), 2, 3); S lhs = first(l), rhs = second(l), rewritten = third(l); if (productionsStartWithClass) swap S lhs, rhs; if (requireQuotedProductions && !isQuoted(lhs)) { if (verbose) print("Production LHS not quoted", l); continue; } LS tok = javaTokWithAllPlusAngleBrackets(replaceWithNothing("/...", tok_unquote(lhs))); tok = tok_combineSpacelessSlashCombos(tok); tok = mapCodeTokens tok_deRoundBracket(tok); //printStruct(codeTokens(tok)); //print(patternToRule(new ProbabilisticParser1, tok)); productions.add(new Production(tok, deAngleBracket(rhs), rewritten)); } } // returns list of translated sentences LS parse(S text, S sentenceClass) { this.sentenceClass = sentenceClass; ret parse(text); } // returns list of translated sentences. Can be called multiple times LS parse(S text) { setField(sentences := splitIntoSentences(text)); new LS translations; for (S sentence : sentences) { L prods = objectsWhereIC(productions, outClass := sentenceClass); print("Trying " + nProductions(prods) + " on sentence", sentence); ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(sentence), prods).bestDoneState(); print("State", s); S translated = getTranslation(s); addIfNempty(translations, translated); //printIfNempty(" => ", translated); } setField(+translations); setField(otherProductions := objectsWhereNotIC(productions, outClass := sentenceClass)); LS translations2 = map(translations, line -> translateQuotedParts(line, recursionLevels)); //translations2 = lambdaMap recursiveUnquoteStartingAtLevel2(translations2); translations2 = map(line -> join(translatePhrases(javaTokNoQuotes(line), simpleTranslations)), translations2); if (useRoundBrackets) translations2 = lambdaMap recursiveQuotesToRoundBrackets(translations2); setField(+translations2); //pnl(translations2); ret translations2; } LS topLevelPatterns() { ret map(translations, s -> jreplace(s, "", "*")); } // also parses ProbabilisticParser1 parserForInputAndProductions(LS tok, Cl productions) { new ProbabilisticParser1 parser; parser.verbose = parser.pm.verbose = verbose; //parser.pm.cutoffPercentage = 10; for (Production p : productions) parser.addState(tok, patternToRule(parser, p.tok)).userObject = p; parser.pm.think(); ret parser; } S getTranslation(ProbabilisticParser1.State state) { if (state == null) null; Production prod = cast state.userObject; Matches m = state.parser().stateToMatches(state); //print(+m); ret expandDollarRefsToMatches_alwaysQuote(prod.rewritten, m); } S translateQuotedParts(S line, int recursionLimit) { LS tok = javaTok(line); for (int i : indicesOfQuoted(tok)) { S x = unquote(tok.get(i)); ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(x), otherProductions).bestDoneState(); //printVars(+x, +s); if (s != null && s.probability >= 80) { S y = getTranslation(s); if (printSubtranslations) printVars_str(+x, +y, +s); //printVars(+x, +y, probability := s.probability); if (recursionLimit > 0) { S yy = y; y = translateQuotedParts(y, recursionLimit-1); //if (neq(y, yy)) printVars(+yy, +y); } tok.set(i, quote(y)); } } ret join(tok); } event change; !include #1027843 // setField swappable LS splitIntoSentences(S s) { ret sentences_dropExclam(s); } }