!636 //1000300 // Lexicon //1000515 // Lexicon, fixing !quicknew class JavaTok { static String join(List<String> cnc) { new StringBuilder buf; for (String s : cnc) buf.append(s); return buf.toString(); } static List<String> split(String src) { Java20 lex = new Java20(); src = src.replace("\r\n", "\n"); LineNumberReader source = new LineNumberReader(new StringReader(src)); int lineNr = source.getLineNumber()+1; List<T> list = new ArrayList<T>(); try { for (Object a; (a = lex.grab(source)) != lex.$;) { String word = lex.word(); String q = quote(word); //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q); lineNr = source.getLineNumber()+1; T t = new T(a, word); boolean isSpace = t.isSpace(); if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace()) list.get(list.size()-1).word += word; // merge spaces else list.add(t); } } catch (Lexicon.Exception e) { throw new RuntimeException(e); } List<String> cnc = new ArrayList<String>(); for (int i = 0; i < list.size(); ) { T t = list.get(i); boolean shouldBeSpace = (cnc.size() % 2) == 0; boolean isSpace = t.isSpace(); if (shouldBeSpace == isSpace) { cnc.add(t.word); ++i; } else if (shouldBeSpace) cnc.add(""); else { System.out.println(cncToLines(cnc)); throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word)); } } if ((cnc.size() % 2) == 0) cnc.add(""); return cnc; } static class T { Object a; String word; T(Object a, String word) { this.a = a; this.word = word; } boolean isSpace() { return a.equals("WHITE_SPACE") || a.equals("COMMENT"); } } static String cncToLines(List<String> cnc) { StringBuilder out = new StringBuilder(); for (String token : cnc) out.append(quote(token) + "\n"); return out.toString(); } public static String quote(String s) { if (s == null) return "null"; return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\""; } static class Java20 extends Lexicon { Java20() { /** * Grammar for Java 2.0. * * Nonterminal - first letter uppercase * TERMINAL - all letters uppercase * keyword - all letters lowercase */ int INFINITY = -1; /** * 19.3 Terminals from section 3.6: White Space: [[:space:]] */ put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY)); /** * 19.3 Terminals from section 3.7: Comment */ put("COMMENT", new Union( // // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/ // new Concatenation( new Singleton("/*"), new Concatenation( new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation( new Repetition( new Concatenation( new Singleton("*"), new Repetition(new Concatenation( new NonMatch("*/"), new Repetition(new NonMatch("*"), 0, INFINITY) ), 0, 1) ), 0, INFINITY ), new Singleton("*/") ))), new Union( /** * End Of Line Comment: //[^\n]*\n */ new Concatenation( new Singleton("//"), new Concatenation( new Repetition(new NonMatch("\n"), 0, INFINITY), new Singleton("\n") )), // // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/ // new Concatenation( new Singleton("/**"), new Concatenation( new Repetition( new Concatenation( new Repetition(new Concatenation( new NonMatch("*/"), new Repetition(new NonMatch("*"), 0, INFINITY) ), 0, 1), new Singleton("*") ), 0, INFINITY ), new Singleton("/") )) ))); put("IDENTIFIER", new Concatenation( new Union( PosixClass.alpha(), new Match("_$") ), new Repetition( new Union( PosixClass.alnum(), new Match("_$") ), 0, INFINITY ) )); /** * 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar) */ put("KEYWORD", new Union( new Singleton("const"), new Singleton("goto") )); /** * 19.3 Terminals from section 3.10.1: Integer Literal */ put("INTEGER_LITERAL", new Concatenation( new Union( /** * Decimal Integer Literal: 0|[1-9][[:digit:]]* */ new Singleton("0"), new Union( new Concatenation( new Range('1', '9'), new Repetition(PosixClass.digit(), 0, INFINITY) ), new Union( /** * Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+ */ new Concatenation( new Singleton("0"), new Concatenation( new Match("xX"), new Repetition(PosixClass.xdigit(), 1, INFINITY) )), /** * Octal Integer Literal: 0[0-7]+ */ new Concatenation( new Singleton("0"), new Repetition(new Range('0', '7'), 1, INFINITY) ) ))), new Repetition(new Match("lL"), 0, 1) )); /** * 19.3 Terminals from section 3.10.2: Floating-Point Literal */ put("FLOATING_POINT_LITERAL", new Union( /** * [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]? */ new Concatenation( new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( new Singleton("."), new Concatenation( new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation( new Repetition(new Concatenation( new Match("eE"), new Concatenation( new Repetition(new Match("-+"), 0, 1), new Repetition(PosixClass.digit(), 1, INFINITY) )), 0, 1), new Repetition(new Match("fFdD"), 0, 1) )))), new Union( /** * \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]? */ new Concatenation( new Singleton("."), new Concatenation( new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( new Repetition(new Concatenation( new Match("eE"), new Concatenation( new Repetition(new Match("-+"), 0, 1), new Repetition(PosixClass.digit(), 1, INFINITY) )), 0, 1), new Repetition(new Match("fFdD"), 0, 1) ))), new Union( /** * [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]? */ new Concatenation( new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( new Match("eE"), new Concatenation( new Repetition(new Match("-+"), 0, 1), new Concatenation( new Repetition(PosixClass.digit(), 1, INFINITY), new Repetition(new Match("fFdD"), 0, 1) )))), /** * [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD] */ new Concatenation( new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( new Repetition(new Concatenation( new Match("eE"), new Concatenation( new Repetition(new Match("-+"), 0, 1), new Repetition(PosixClass.digit(), 1, INFINITY) )), 0, 1), new Match("fFdD") )) )))); /** * 19.3 Terminals from section 3.10.3: Boolean Literal */ put("BOOLEAN_LITERAL", new Union( new Singleton("true"), new Singleton("false") )); /** * 19.3 Terminals from section 3.10.4: Character Literal */ put("CHARACTER_LITERAL", new Concatenation( new Singleton("'"), new Concatenation( new Union( /** * Single Character: [^\r\n'\\] */ new NonMatch("\r\n'\\"), /** * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) */ new Concatenation( new Singleton("\\"), new Union( new Match("btnfr\"'\\"), new Concatenation( new Repetition(new Range('0', '3'), 0, 1), new Repetition(new Range('0', '7'), 1, 2) ) ) ) ), new Singleton("'") ))); put("MULTILINE_LITERAL", new Concatenation( new Singleton("[["), new Concatenation( new Repetition( new Union( new NonMatch("]"), new Concatenation( new Singleton("]"), new NonMatch("]")) ), 0, INFINITY ), new Singleton("]]") ))); put("MULTILINE_LITERAL2", new Concatenation( new Singleton("[=["), new Concatenation( new Repetition( new Union( new NonMatch("]"), new Concatenation(new Singleton("]"), new Union( new NonMatch("="), new Concatenation(new Singleton("="), new NonMatch("]")))) ), 0, INFINITY ), new Singleton("]=]") ))); /** * 19.3 Terminals from section 3.10.5: String Literal */ put("STRING_LITERAL", new Concatenation( new Singleton("\""), new Concatenation( new Repetition( new Union( /** * Single Character: [^\r\n"\\] */ new NonMatch("\r\n\"\\"), /** * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) */ new Concatenation( new Singleton("\\"), new Union( new Match("btnfr\"'\\"), new Union( new Concatenation( new Repetition(new Range('0', '3'), 0, 1), new Repetition(new Range('0', '7'), 1, 2) ), new Concatenation( new Singleton("u"), new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4) ) ) ) ) ), 0, INFINITY ), new Singleton("\"") ))); /** * 19.3 Terminals section 3.10.7: Null Literal */ put("NULL_LITERAL", new Singleton("null")); // OK, it seems we have to add some more stuff... //put("OTHER1", new Match(";{}=,<>[]().+-:|&!")); //put("OTHER1", new NonMatch("")); // catch anything, one character at a time put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time } } // class Java20 } !include #1000300 // Lexicon
Began life as a copy of #1000353
Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000516 |
Snippet name: | Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static |
Eternal ID of this version: | #1000516/1 |
Text MD5: | 11dd188dfda60a264725ec4c5a479a9e |
Author: | stefan |
Category: | javax |
Type: | Document |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-08-09 22:58:18 |
Source code size: | 9896 bytes / 377 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 603 / 129 |
Referenced in: | [show references] |