Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

99
LINES

< > BotCompany Repo | #1028365 // WordDocumentTextReplacer

JavaX fragment (include) [tags: use-pretranspiled]

Uses 16250K of libraries. Click here for Pure Java version (3159L/20K).

import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;

// Reads a .docx file, replaces every match of regularExpression in the
// text of its body paragraphs and writes the result to a new .docx file.
//
// A match may span several runs of a paragraph; the runs touched by a match
// are collapsed so that the replacement text (and any text following the
// match inside those runs) uses the formatting of the first matched run.
//
// Only what this code copies ends up in the output: the body's section
// properties and, per paragraph from doc.getParagraphs(), the runs' text
// plus color, font family, font size, bold, italic and underline.
sclass WordDocumentTextReplacer {
  replace Run with XWPFRun.
  replace Paragraph with XWPFParagraph.
  
  // input document and output document (both must be set before run())
  File inFile, outFile;
  
  // pattern to search for (must be non-empty before run())
  S regularExpression;

  // Computes the replacement for one match. Swappable so callers can plug
  // in their own logic.
  //   text   = the matched text
  //   groups = result of regexpFirstGroups(regularExpression, text)
  swappable S getReplacement(S text, LS groups) { ret "[REPLACED]"; }

  // Performs the whole load -> replace -> save cycle.
  run {
    assertNempty(+regularExpression);
    assertNotNull(+inFile);
    assertNotNull(+outFile);

    XWPFDocument doc = loadDocx(print("Loading", inFile));
    print("Document loaded");
    
    // output document starts empty; carry over the section properties
    new XWPFDocument docOut;
    CTBody body = doc.getDocument().getBody();
    CTSectPr sectPr = body.getSectPr();
    CTBody bodyOut = docOut.getDocument().getBody();
    bodyOut.setSectPr(sectPr);
    
    L<Paragraph> paragraphs = doc.getParagraphs();

    for (Paragraph para : paragraphs) {
      Paragraph paraOut = docOut.createParagraph();
      L<Run> runs = para.getRuns();
      //print(n2(runs, "run"));
      
      // working list: (source run providing the formatting, text to emit)
      new LPair<Run, S> runs2;
      for (Run r : runs)
        addPair(runs2, r, unnull(r.getText(0)));
      S fullText = join(pairsB(runs2));
      //print(quote(fullText));
      //printIfNempty(regexpExtractAll(regexp, fullText));
      
      // Try every window of consecutive runs [i, j) and look for a match
      // in the concatenated text of that window.
      for (int i = 0; i < l(runs2); i++) {
        for (int j = i+1; j <= l(runs2); j++) {
          S text = join(pairsB(subList(runs2, i, j)));
          print(+text);
          IntRange range = regexpFindRange(regularExpression, text);
          if (range == null) continue; // no match
          //print("Match: " + substring(text, range));
          // we have a match, find out run indices
          
          // skip runs left of match
          while (i < l(runs2) && range.start >= l(runs2.get(i).b)) {
            range = shiftIntRange(range, -l(runs2.get(i).b));
            i++;
          }
          
          // text now starts with the first run that contains the match
          text = join(pairsB(subList(runs2, i, j)));
          print("Found match: " + substring(text, range));
          
          // replace all matched runs with one or two runs at i
          removeSubList(runs2, i+1, j);
          Run run = runs2.get(i).a;
          S found = substring(text, range);
          LS groups = regexpFirstGroups(regularExpression, found);
          S replacement = getReplacement(found, groups);
          print("Replacing with: " + replacement);
          
          // text before the match (within the first run) + replacement
          S text1 = takeFirst(text, range.start) + replacement;
          if (nempty(text1)) {
            runs2.add(i, pair(run, text1));
            ++i;
          }
          
          // text after the match stays in the original entry at i
          S rest = substring(text, range.end);
          if (empty(rest)) {
            runs2.remove(i--);
            // If run 0 was consumed completely (empty replacement, nothing
            // before or after the match), i is now -1. Continuing the j loop
            // would call runs2.get(-1) on the next match, so leave the inner
            // loop; the outer i++ resumes scanning at index 0.
            if (i < 0) break;
          } else
            runs2.get(i).b = rest;
        }
      }
      
      //for (int k = l(runs)-1; k >= 0; k--) para.removeRun(k);
      //print("Adding " + nRuns(runs2));
      for (Pair<Run, S> p : runs2) {
        //paraOut.addRun(run);
        Run run = p.a;
        Run runOut = paraOut.createRun();
        runOut.setText(p.b);
        
        // copy run attributes
        // Color, font family and font size are only copied when the source
        // run actually defines them: the getters signal "not set" with
        // null / -1 (see POI XWPFRun docs), and passing those values to the
        // setters would write bogus properties (e.g. a negative size).
        S color = run.getColor();
        if (color != null) runOut.setColor(color);
        S fontFamily = run.getFontFamily();
        if (fontFamily != null) runOut.setFontFamily(fontFamily);
        int fontSize = run.getFontSize();
        if (fontSize > 0) runOut.setFontSize(fontSize);
        runOut.setBold(run.isBold());
        runOut.setItalic(run.isItalic());
        runOut.setUnderline(run.getUnderline());
        paraOut.addRun(runOut);
      }
    }

    saveDocx(docOut, outFile);
    printFileInfo(outFile);
  }
}

Author comment

Began life as a copy of #1028318

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028365
Snippet name: WordDocumentTextReplacer
Eternal ID of this version: #1028365/5
Text MD5: 4d0f037aa386ef25f9aca226b779ee28
Transpilation MD5: bed581516cdac5033b4aa6994150cacf
Author: stefan
Category: javax / io
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-06-13 17:21:53
Source code size: 3377 bytes / 99 lines
Pitched / IR pitched: No / No
Views / Downloads: 210 / 512
Version history: 4 change(s)
Referenced in: #1028390 - WordDocumentTextReplacer2 [allows multiple replacement patterns & post processing]
#1034167 - Standard Classes + Interfaces (LIVE, continuation of #1003674)