!636 !629 // standard functions !1000300 // class Lexicon class SCWS { static List<String> tokenize(String src) tex { Lex lex = new Lex(); src = src.replace("\r\n", "\n"); LineNumberReader source = new LineNumberReader(new StringReader(src)); int lineNr = source.getLineNumber()+1; List<T> list = new ArrayList<T>(); for (Object a; (a = lex.grab(source)) != lex.$;) { String word = lex.word(); String q = main.quote(word); //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q); lineNr = source.getLineNumber()+1; T t = new T(a, word); boolean isSpace = t.isSpace(); if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace()) list.get(list.size()-1).word += word; // merge spaces else list.add(t); } List<String> cnc = new ArrayList<String>(); for (int i = 0; i < list.size(); ) { T t = list.get(i); boolean shouldBeSpace = (cnc.size() % 2) == 0; boolean isSpace = t.isSpace(); if (shouldBeSpace == isSpace) { cnc.add(t.word); ++i; } else if (shouldBeSpace) cnc.add(""); else { //System.out.println(cncToLines(cnc)); throw new RuntimeException("TILT at " + cnc.size() + ": " + main.quote(t.word)); } } if ((cnc.size() % 2) == 0) cnc.add(""); return cnc; } static class T { Object a; String word; T(Object a, String word) { this.a = a; this.word = word; } boolean isSpace() { return a.equals("WHITE_SPACE") || a.equals("COMMENT"); } } static class Lex extends Lexicon { Lex() { /* * TERMINAL - all letters uppercase */ int INFINITY = -1; /** * 19.3 Terminals from section 3.6: White Space: [[:space:]] */ put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY)); /** * 19.3 Terminals from section 3.7: Comment */ put("COMMENT", new Union( // // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/ // new Concatenation( new Singleton("/*"), new Concatenation( new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation( new Repetition( new Concatenation( new Singleton("*"), new Repetition(new Concatenation( new NonMatch("*/"), new Repetition(new NonMatch("*"), 0, INFINITY) ), 0, 1) ), 0, INFINITY ), new Singleton("*/") ))), new Union( /** * End Of Line Comment: //[^\n]*\n */ new Concatenation( new Singleton("//"), new Concatenation( new Repetition(new NonMatch("\n"), 0, INFINITY), new Singleton("\n") )), // // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/ // new Concatenation( new Singleton("/**"), new Concatenation( new Repetition( new Concatenation( new Repetition(new Concatenation( new NonMatch("*/"), new Repetition(new NonMatch("*"), 0, INFINITY) ), 0, 1), new Singleton("*") ), 0, INFINITY ), new Singleton("/") )) ))); put("IDENTIFIER", new Concatenation( new Union( PosixClass.alpha(), new Match("_$") ), new Repetition( new Union( PosixClass.alnum(), new Match("_$") ), 0, INFINITY ) )); /** * 19.3 Terminals from section 3.10.5: String Literal */ put("STRING_LITERAL", new Concatenation( new Singleton("\""), new Concatenation( new Repetition( new Union( /** * Single Character: [^\r\n"\\] */ new NonMatch("\r\n\"\\"), /** * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) */ new Concatenation( new Singleton("\\"), new Union( new Match("btnfr\"'\\"), new Concatenation( new Repetition(new Range('0', '3'), 0, 1), new Repetition(new Range('0', '7'), 1, 2) ) ) ) ), 0, INFINITY ), new Singleton("\"") ))); // Single-character catch-all production so we can parse anything. put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time } } // class Lex } // class SCWS main { psvm { String src = takeInput(args, null); List<String> cnc = SCWS.tokenize(src); saveTextFile("output/output.txt", cncToLines(cnc)); } static String takeInput(String[] args, String def) tex { if (args.length != 0) return loadSnippet(args[0]); return loadTextFile("input/input.txt", def); } }
Began life as a copy of #1000323
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000324 |
Snippet name: | Spaces, comments, words, strings (Tokenizer, embeddable, developing 2) |
Eternal ID of this version: | #1000324/1 |
Text MD5: | d221241170f48eaa45dd14b213ea5964 |
Author: | stefan |
Category: | javax |
Type: | JavaX (input.txt to output.txt) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-07-06 18:47:25 |
Source code size: | 4646 bytes / 183 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 745 / 607 |
Referenced in: | [show references] |