Not logged in.  Login/Logout/Register | List snippets | Create snippet | Upload image | Upload data

376
LINES

< > BotCompany Repo | #1000353 // Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings

Document

!636
//1000300 // Lexicon
//1000515 // Lexicon, fixing
!quicknew

class JavaTok {
  static String join(List<String> cnc) {
    new StringBuilder buf;
    for (String s : cnc) buf.append(s);
    return buf.toString();
  }
  
  /**
   * Tokenizes source text into a list that alternates between "space" entries
   * and "code" entries.
   *
   * Even indices (0, 2, 4, ...) hold whitespace/comment text (possibly ""),
   * odd indices hold a single non-space token. The list always starts and ends
   * with a space entry, so its size is always odd; an input that yields no
   * tokens produces a list containing just "".
   *
   * "\r\n" sequences in the input are replaced by "\n" before lexing.
   * Adjacent whitespace/comment tokens are merged into one entry.
   *
   * @param src the source text to tokenize
   * @return the alternating space/code list described above
   * @throws RuntimeException wrapping a Lexicon.Exception raised by the lexer,
   *         or with a "TILT" message if a space token shows up where a code
   *         token is expected (the partial list is printed to System.out first)
   */
  static List<String> split(String src) {
    Java20 lex = new Java20();
    src = src.replace("\r\n", "\n");
    LineNumberReader source = new LineNumberReader(new StringReader(src));

    // Pass 1: pull raw tokens from the lexer, merging runs of space tokens.
    List<T> list = new ArrayList<T>();
    try {
      for (Object a; (a = lex.grab(source)) != lex.$;) {
        String word = lex.word();
        T t = new T(a, word);
        boolean isSpace = t.isSpace();
        if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
          list.get(list.size()-1).word += word; // merge spaces
        else
          list.add(t);
      }
    } catch (Lexicon.Exception e) {
      throw new RuntimeException(e);
    }

    // Pass 2: lay the tokens out so that even slots are space and odd slots are code,
    // inserting "" wherever two code tokens are directly adjacent.
    List<String> cnc = new ArrayList<String>();
    for (int i = 0; i < list.size(); ) {
      T t = list.get(i);
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
      boolean isSpace = t.isSpace();
      if (shouldBeSpace == isSpace) {
        cnc.add(t.word);
        ++i;
      } else if (shouldBeSpace)
        cnc.add(""); // code token where space is expected: pad, then retry same token
      else {
        // Space token where code is expected; pass 1 merges adjacent spaces,
        // so this is a consistency check rather than an expected path.
        System.out.println(cncToLines(cnc));
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
      }
    }
    // Guarantee a trailing space entry (also covers the empty-input case).
    if ((cnc.size() % 2) == 0)
      cnc.add("");

    return cnc;
  }
  
  /**
   * A raw lexer token: the token type object returned by the lexer together
   * with the matched text.
   */
  static class T {
    /** Token type as returned by the lexer's grab call. */
    Object a;
    /** Matched text; mutable because split() appends to it when merging spaces. */
    String word;
    
    T(Object a, String word) { this.a = a; this.word = word; }
    
    /**
     * @return true if this token's type equals "WHITE_SPACE" or "COMMENT";
     *         false for any other type, including a null type (previously a
     *         null type caused a NullPointerException)
     */
    boolean isSpace() {
      return a != null && (a.equals("WHITE_SPACE") || a.equals("COMMENT"));
    }
  }
  
  /**
   * Renders a token list for debugging: each entry is passed through
   * quote() and written on its own line (every line ends with "\n").
   *
   * @param cnc the entries to render
   * @return the quoted entries, one per line; "" for an empty list
   */
  static String cncToLines(List<String> cnc) {
    StringBuilder lines = new StringBuilder();
    for (String entry : cnc) {
      lines.append(quote(entry)).append("\n");
    }
    return lines.toString();
  }
  
  /**
   * Wraps a string in double quotes, escaping backslash, double quote,
   * carriage return and line feed as \\, \", \r and \n respectively.
   *
   * @param s the text to quote; may be null
   * @return the quoted text, or the four-character string null (without
   *         quotes) when s is null
   */
  public static String quote(String s) {
    if (s == null) return "null";
    StringBuilder quoted = new StringBuilder(s.length() + 2);
    quoted.append('"');
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      switch (c) {
        case '\\': quoted.append("\\\\"); break;
        case '"':  quoted.append("\\\""); break;
        case '\r': quoted.append("\\r"); break;
        case '\n': quoted.append("\\n"); break;
        default:   quoted.append(c);
      }
    }
    quoted.append('"');
    return quoted.toString();
  }
  
  /**
   * Lexicon subclass whose constructor registers the terminal definitions used
   * by JavaTok.split: Java terminals (white space, comments, identifiers,
   * literals) plus two bracket-delimited multi-line literal forms and a
   * one-character catch-all.
   *
   * Each put(name, expression) call registers one terminal under a string name.
   * JavaTok.T.isSpace() only distinguishes "WHITE_SPACE" and "COMMENT" from all
   * other names.
   *
   * Union and Concatenation are always used with exactly two arguments here, so
   * longer alternations and sequences are written as right-nested pairs.
   */
  static class Java20 extends Lexicon {

	Java20() {
		/**
		* Grammar for Java 2.0.
		*
		* Nonterminal - first letter uppercase
		* TERMINAL - all letters uppercase
		* keyword - all letters lowercase
		*/
		// Passed as the upper bound of Repetition wherever the regex comments
		// below show an unbounded repetition (* or +).
		int INFINITY = -1;

		/**
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
		* (registered as one or more space() characters)
		*/
		put("WHITE_SPACE", new Repetition(space(), 1, INFINITY));

		/**
		* 19.3 Terminals from section 3.7: Comment
		* Union of three alternatives: traditional, end-of-line, documentation.
		*/
		put("COMMENT", new Union(

			//
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
			// Note the [^*]+ : at least one non-'*' character must follow the
			// opening "/*", so text such as /**/ can only match the
			// documentation-comment alternative further down.
			//
			new Concatenation(
				new Singleton("/*"), new Concatenation(
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
				new Repetition(
					new Concatenation(
						new Singleton("*"),
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1)
					), 0, INFINITY
				),
				new Singleton("*/")
			))), new Union(

			/**
			* End Of Line Comment: //[^\n]*\n
			* NOTE(review): the trailing "\n" is mandatory in this expression, so a
			* "//" comment at the very end of the input without a final newline
			* presumably is not recognized as COMMENT - confirm against Lexicon.
			*/
			new Concatenation(
				new Singleton("//"), new Concatenation(
				new Repetition(new NonMatch("\n"), 0, INFINITY),
				new Singleton("\n")
			)),

			//
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
			//
			new Concatenation(
				new Singleton("/**"), new Concatenation(
				new Repetition(
					new Concatenation(
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1),
						new Singleton("*")
					), 0, INFINITY
				),
				new Singleton("/")
			))
		)));

		// Identifier: first character is alpha() or one of "_$", followed by
		// zero or more characters that are alnum() or one of "_$".
		put("IDENTIFIER", new Concatenation(
			new Union(
				alpha(),
				new Match("_$")
			),
			new Repetition(
				new Union(
					alnum(),
					new Match("_$")
				), 0, INFINITY
			)
		));

		/**
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
		* NOTE(review): "const" and "goto" also fit the IDENTIFIER expression; which
		* terminal wins is decided inside Lexicon and is not visible here.
		*/
		put("KEYWORD", new Union(
			new Singleton("const"),
			new Singleton("goto")
		));

		/**
		* 19.3 Terminals from section 3.10.1: Integer Literal
		* (decimal | hexadecimal | octal), followed by an optional l or L suffix.
		*/
		put("INTEGER_LITERAL", new Concatenation(
			new Union(
				/**
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
				*/
				new Singleton("0"), new Union(

				new Concatenation(
					new Range('1', '9'),
					new Repetition(digit(), 0, INFINITY)
				), new Union(

				/**
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
				*/
				new Concatenation(
					new Singleton("0"), new Concatenation(
					new Match("xX"),
					new Repetition(xdigit(), 1, INFINITY)
				)),

				/**
				* Octal Integer Literal: 0[0-7]+
				*/
				new Concatenation(
					new Singleton("0"),
					new Repetition(new Range('0', '7'), 1, INFINITY)
				)
			))),
			new Repetition(new Match("lL"), 0, 1)
		));

		/**
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
		* Union of the four forms documented individually below.
		*/
		put("FLOATING_POINT_LITERAL", new Union(

			/**
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
			*/
			new Concatenation(
				new Repetition(digit(), 1, INFINITY), new Concatenation(
				new Singleton("."), new Concatenation(
				new Repetition(digit(), 0, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(digit(), 1, INFINITY)
				)), 0, 1),
				new Repetition(new Match("fFdD"), 0, 1)
			)))), new Union(

			/**
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
			*/
			new Concatenation(
				new Singleton("."), new Concatenation(
				new Repetition(digit(), 1, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(digit(), 1, INFINITY)
				)), 0, 1),
				new Repetition(new Match("fFdD"), 0, 1)
			))), new Union(

			/**
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
			*/
			new Concatenation(
				new Repetition(digit(), 1, INFINITY), new Concatenation(
				new Match("eE"), new Concatenation(
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
				new Repetition(digit(), 1, INFINITY),
				new Repetition(new Match("fFdD"), 0, 1)
			)))),

			/**
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
			*/
			new Concatenation(
				new Repetition(digit(), 1, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(digit(), 1, INFINITY)
				)), 0, 1),
				new Match("fFdD")
			))
		))));

		/**
		* 19.3 Terminals from section 3.10.3: Boolean Literal
		*/
		put("BOOLEAN_LITERAL", new Union(
			new Singleton("true"),
			new Singleton("false")
		));

		/**
		* 19.3 Terminals from section 3.10.4: Character Literal
		* A single quote, exactly one character or escape sequence, a single quote.
		* NOTE(review): unlike STRING_LITERAL below, the escape alternatives here do
		* not include the u + four hex digits form.
		*/
		put("CHARACTER_LITERAL", new Concatenation(
			new Singleton("'"), new Concatenation(
			new Union(

				/**
				* Single Character: [^\r\n'\\]
				*/
				new NonMatch("\r\n'\\"),

				/**
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
				*/
				new Concatenation(
					new Singleton("\\"),
					new Union(
						new Match("btnfr\"'\\"),
						new Concatenation(
							new Repetition(new Range('0', '3'), 0, 1),
							new Repetition(new Range('0', '7'), 1, 2)
						)
					)
				)
			),
			new Singleton("'")
		)));

		// Multi-line literal, form 1: "[[" ... "]]".
		// The body is zero or more of: a character other than ']', or a ']'
		// followed by a character other than ']'.
		put("MULTILINE_LITERAL", new Concatenation(
			new Singleton("[["), new Concatenation(
			new Repetition(
				new Union(
					new NonMatch("]"),
					new Concatenation(
					  new Singleton("]"), new NonMatch("]"))
			  ), 0, INFINITY
			),
			new Singleton("]]")
		)));

		// Multi-line literal, form 2: "[=[" ... "]=]".
		// The body is zero or more of: a character other than ']', or a ']'
		// followed by either a character other than '=', or '=' and then a
		// character other than ']'.
		put("MULTILINE_LITERAL2", new Concatenation(
			new Singleton("[=["), new Concatenation(
			new Repetition(
				new Union(
					new NonMatch("]"),
					new Concatenation(new Singleton("]"), new Union(
				    new NonMatch("="),
				    new Concatenation(new Singleton("="), new NonMatch("]"))))
			  ), 0, INFINITY
			),
			new Singleton("]=]")
		)));

		/**
		* 19.3 Terminals from section 3.10.5: String Literal
		* A double quote, zero or more characters or escape sequences, a double quote.
		*/
		put("STRING_LITERAL", new Concatenation(
			new Singleton("\""), new Concatenation(
			new Repetition(
				new Union(

					/**
					* Single Character: [^\r\n"\\]
					*/
					new NonMatch("\r\n\"\\"),

					/**
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}|u[0-9a-fA-F]{4})
					* (the u + exactly four hex digits alternative is the last Concatenation below)
					*/
					new Concatenation(
						new Singleton("\\"),
						new Union(
							new Match("btnfr\"'\\"),
							new Union(
  							new Concatenation(
  								new Repetition(new Range('0', '3'), 0, 1),
  								new Repetition(new Range('0', '7'), 1, 2)
  							),
  							new Concatenation(
  							  new Singleton("u"),
  							  new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
  							)
  						)
						)
					)
				), 0, INFINITY
			),
			new Singleton("\"")
		)));

		/**
		* 19.3 Terminals section 3.10.7: Null Literal
		*/
		put("NULL_LITERAL", new Singleton("null"));
		
		// OK, it seems we have to add some more stuff...
		// Catch-all so that operators, separators and any other character not
		// covered above still produce a token. Earlier attempts are kept below
		// for reference.
		
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time

	}
} // class Java20
}

!include #1000514 // Lexicon

Author comment

Began life as a copy of #1000348

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

Comments [hide]

ID Author/Program Comment Date
149 #1000604 (pitcher) 2015-08-18 00:47:55

add comment

Snippet ID: #1000353
Snippet name: Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings
Eternal ID of this version: #1000353/1
Text MD5: 2fc40222a3253afa884ef1e7c6c4ec9d
Author: stefan
Category: javax
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-08-09 23:02:17
Source code size: 9740 bytes / 376 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 1123 / 2266
Referenced in: [show references]