Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

150
LINES

< > BotCompany Repo | #1027974 // RecursiveProbabilisticParser1

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (7911L/54K).

// parses & translates
// main probabilistic functions: patternToRule + tokenToAction

sclass RecursiveProbabilisticParser1 {
  // example rule:
  // "Ein/eine/der/die/das <ding> führt (nach/in/hin zu/ins) <ort>" => satz => $1 leads to $2

  // Turns one tokenized production pattern into a logic rule:
  // every token becomes a parser action (see tokenToAction), an EndOfInput
  // action is appended, and all actions are combined with makeAnd.
  BasicLogicRule patternToRule(ProbabilisticParser1 parser, LS tok) {
    ret BasicLogicRule(makeAnd(listPlus(mapWithIndex(tok, (i, t) -> tokenToAction(parser, i, t)),
      parser.new EndOfInput)), formatFrag("parsed " + join(tok)));
  }
  
  // Chooses the parser action for token t at index i of a pattern.
  // Checks are made in this order:
  //   1. even index, or t listed in fillerClasses  -> Filler
  //      (NOTE(review): even indices are presumably the non-code slots of the
  //       javaTok-style token list - confirm)
  //   2. angle-bracketed token, or "*" when starsAreWildcards is set
  //                                                -> Any, consuming at least minTokensForAny tokens
  //   3. token containing a slash                  -> ConsumeOneOfTokens over the slash-separated alternatives
  //   4. "?"                                       -> ConsumeToken("?") with emptyProbability 95.0
  //   5. anything else                             -> ConsumeToken(t)
  swappable ProbabilisticParser1.Action tokenToAction(ProbabilisticParser1 parser, int i, S t) {
    ret
      even(i) || contains(fillerClasses, t)
        ? parser.new Filler
      : isAngleBracketed(t) || starsAreWildcards && eq(t, "*")
          ? set(parser.new Any(t), minTokensToConsume := minTokensForAny)
      : containsSlash(t)
        ? parser.new ConsumeOneOfTokens(asCISet(splitAtSlash_keepAll(t)))
      : eq(t, "?")
        ? set(parser.new ConsumeToken("?"), emptyProbability := 95.0)
      : parser.new ConsumeToken(t);
  }

  // One loaded rule: tokenized pattern, the class it produces, and the
  // rewrite template (null when the rule text had only two parts).
  srecord Production(LS tok, S outClass, S rewritten) {}

  // all productions added by loadRules
  new L<Production> productions;
  // pattern tokens in this set are turned into Filler actions
  Set<S> fillerClasses = litciset("<filler>", "<füller>");
  // productions with this outClass are applied to whole sentences in parse()
  S sentenceClass = "sentence";
  // handed to translatePhrases in the last stage of parse()
  SS simpleTranslations = ciMap();
  // result of splitIntoSentences for the last parsed text
  LS sentences;
  // results of the last parse(): sentence-level translations, and the same
  // lines after quoted parts and phrases have been translated
  LS translations, translations2;
  // when set, loadRules skips rules whose left-hand side is not quoted
  bool requireQuotedProductions = true;
  bool useRoundBrackets; // instead of nested quotes
  // when set, translateQuotedParts prints every accepted sub-translation
  bool printSubtranslations;
  bool productionsStartWithClass; // swap first & second entry in production definitions
  // when set, a "*" pattern token is treated like an angle-bracketed token
  bool starsAreWildcards;
  int minTokensForAny = 1; // set to 0 to allow Any to consume 0 tokens
  // forwarded to the parsers created here; also logs rules skipped by loadRules
  bool verbose;

  // productions whose outClass is not sentenceClass; assigned by parse()
  // and used by translateQuotedParts. Stays null until parse() has run.
  L<Production> otherProductions;

  // recursion limit handed to translateQuotedParts by parse()
  int recursionLevels = 10;

  // minimum probability a parse of a quoted part must reach before its
  // translation replaces that part (previously hard-coded as 80;
  // NOTE(review): scale is presumably percent - confirm)
  double minSubtranslationProbability = 80;

  // Loads production rules from text and appends them to productions.
  // Each rule is split at its double arrows; rules that do not yield
  // 2 or 3 parts are skipped. With 2 parts, rewritten is null.
  // rules can contain Java-style comments
  void loadRules(S rules) {
    for (LS l : lambdaMap splitAtDoubleArrow(tlft_j(rules))) {
      continue unless isBetween(l(l), 2, 3);
      S lhs = first(l), rhs = second(l), rewritten = third(l);
      if (productionsStartWithClass) swap S lhs, rhs;
      if (requireQuotedProductions && !isQuoted(lhs)) {
        if (verbose) print("Production LHS not quoted", l);
        continue;
      }
      // tokenize the unquoted pattern after dropping "/..." markers
      LS tok = javaTokWithAllPlusAngleBrackets(replaceWithNothing("/...", tok_unquote(lhs)));
      tok = tok_combineSpacelessSlashCombos(tok);
      tok = mapCodeTokens tok_deRoundBracket(tok);
      //printStruct(codeTokens(tok));
      //print(patternToRule(new ProbabilisticParser1, tok));
      productions.add(new Production(tok, deAngleBracket(rhs), rewritten));
    }
  }
  
  // returns list of translated sentences
  // Note: the given sentenceClass is stored in the field and stays in
  // effect for later parse(text) calls.
  LS parse(S text, S sentenceClass) {
    this.sentenceClass = sentenceClass;
    ret parse(text);
  }

  // returns list of translated sentences. Can be called multiple times
  // Stages:
  //   1. split text into sentences
  //   2. parse every sentence with the productions of sentenceClass and keep
  //      the non-empty translations (field translations)
  //   3. translate quoted parts with all remaining productions, then apply
  //      simpleTranslations, then optionally convert quotes to round brackets
  //      (field translations2, which is also the return value)
  LS parse(S text) {
    setField(sentences := splitIntoSentences(text));
    
    // the sentence-level productions do not depend on the sentence,
    // so look them up once instead of once per sentence
    L<Production> prods = objectsWhereIC(productions, outClass := sentenceClass);

    new LS translations;
    for (S sentence : sentences) {
      print("Trying " + nProductions(prods) + " on sentence", sentence);
      ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(sentence),
        prods).bestDoneState();
      print("State", s);
      S translated = getTranslation(s);
      addIfNempty(translations, translated);
      //printIfNempty("  => ", translated);
    }

    setField(+translations);

    // must be set before translateQuotedParts is called below
    setField(otherProductions := objectsWhereNotIC(productions, outClass := sentenceClass));

    LS translations2 = map(translations, line -> translateQuotedParts(line, recursionLevels));
    //translations2 = lambdaMap recursiveUnquoteStartingAtLevel2(translations2);
    translations2 = map(line -> join(translatePhrases(javaTokNoQuotes(line), simpleTranslations)), translations2);
    if (useRoundBrackets) translations2 = lambdaMap recursiveQuotesToRoundBrackets(translations2);
    setField(+translations2);
    //pnl(translations2);
    ret translations2;
  }

  // The sentence-level translations of the last parse() with every
  // "<quoted>" replaced by "*" (via jreplace).
  LS topLevelPatterns() {
    ret map(translations, s -> jreplace(s, "<quoted>", "*"));
  }

  // also parses
  // Creates a fresh parser, adds one state per production for the given
  // input tokens (remembering the production in the state's userObject),
  // runs parser.pm.think() and returns the parser.
  ProbabilisticParser1 parserForInputAndProductions(LS tok, Cl<Production> productions) {
    new ProbabilisticParser1 parser;
    parser.verbose = parser.pm.verbose = verbose;
    //parser.pm.cutoffPercentage = 10;
    for (Production p : productions)
      parser.addState(tok, patternToRule(parser, p.tok)).userObject = p;
    parser.pm.think();
    ret parser;
  }

  // Expands the rewrite template of the production attached to state,
  // using the matches extracted from that state.
  // Returns null when state is null.
  // NOTE(review): prod.rewritten is null for two-part rules; what
  // expandDollarRefsToMatches_alwaysQuote does with null is not visible here.
  S getTranslation(ProbabilisticParser1.State state) {
    if (state == null) ret null;
    Production prod = cast state.userObject;
    Matches m = state.parser().stateToMatches(state);
    //print(+m);
    ret expandDollarRefsToMatches_alwaysQuote(prod.rewritten, m);
  }

  // Replaces every quoted token of line whose content can be parsed with
  // otherProductions at a probability of at least
  // minSubtranslationProbability by its (quoted) translation.
  // Translations are processed again recursively while recursionLimit > 0.
  // Quoted tokens that do not parse well enough are left untouched.
  S translateQuotedParts(S line, int recursionLimit) {
    LS tok = javaTok(line);

    // otherProductions is only assigned by parse(); if this method is called
    // earlier there is nothing to apply (and iterating null would throw)
    if (otherProductions == null) ret join(tok);

    for (int i : indicesOfQuoted(tok)) {
      S x = unquote(tok.get(i));
      ProbabilisticParser1.State s = parserForInputAndProductions(javaTok(x),
        otherProductions).bestDoneState();
      //printVars(+x, +s);
      if (s != null && s.probability >= minSubtranslationProbability) {
        S y = getTranslation(s);
        if (printSubtranslations) printVars_str(+x, +y, +s);
        //printVars(+x, +y, probability := s.probability);
        if (recursionLimit > 0)
          y = translateQuotedParts(y, recursionLimit-1);
        tok.set(i, quote(y));
      }
    }
    ret join(tok);
  }

  event change; !include #1027843 // setField
  
  // Sentence splitter used by parse(); swappable so callers can replace it.
  swappable LS splitIntoSentences(S s) {
    ret sentences_dropExclam(s);
  }
}

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1027974
Snippet name: RecursiveProbabilisticParser1
Eternal ID of this version: #1027974/55
Text MD5: e22c09a59a5ab687fa82484bf7f01c3f
Transpilation MD5: 84ebf4f7243cfed0b5e445ce09375d25
Author: stefan
Category:
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-26 14:26:48
Source code size: 5664 bytes / 150 lines
Pitched / IR pitched: No / No
Views / Downloads: 258 / 833
Version history: 54 change(s)
Referenced in: [show references]