Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

344
LINES

< > BotCompany Repo | #1000348 // Official Java tokenizer (embeddable)

Document

!636
!629 // standard functions
!1000300 // class Lexicon
!quicknew

class JavaTok {
  static String join(List<String> cnc) {
    new StringBuilder buf;
    for (String s : cnc) buf.append(s);
    return buf.toString();
  }
  
  static List<String> split(String src) {
    Java20 lex = new Java20();
    src = src.replace("\r\n", "\n");
    LineNumberReader source = new LineNumberReader(new StringReader(src));
    int lineNr = source.getLineNumber()+1;
    List<T> list = new ArrayList<T>();
    try {
      for (Object a; (a = lex.grab(source)) != lex.$;) {
        String word = lex.word();
        String q = quote(word);
        //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
        lineNr = source.getLineNumber()+1;
        
        T t = new T(a, word);
        boolean isSpace = t.isSpace();
        if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
          list.get(list.size()-1).word += word; // merge spaces
        else
          list.add(t);
      }
    } catch (Lexicon.Exception e) {
      throw new RuntimeException(e);
    }
    
    List<String> cnc = new ArrayList<String>();
    for (int i = 0; i < list.size(); ) {
      T t = list.get(i);
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
      boolean isSpace = t.isSpace();
      if (shouldBeSpace == isSpace) {
        cnc.add(t.word);
        ++i;
      } else if (shouldBeSpace)
        cnc.add("");
      else {
        System.out.println(cncToLines(cnc));
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
      }
    }
    if ((cnc.size() % 2) == 0)
      cnc.add("");

    return cnc;
  }
  
  static class T {
    Object a; String word;
    
    T(Object a, String word) { this.a = a; this.word = word; }
    
    boolean isSpace() {
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
    }
  }
  
  static String cncToLines(List<String> cnc) {
    StringBuilder out = new StringBuilder();
    for (String token : cnc)
      out.append(quote(token) + "\n");
    return out.toString();
  }
  
  public static String quote(String s) {
    if (s == null) return "null";
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
  }
  
  static class Java20 extends Lexicon {

	Java20() {

		/**
		* Grammar for Java 2.0.
		*
		* Nonterminal - first letter uppercase
		* TERMINAL - all letters uppercase
		* keyword - all letters lowercase
		*/
		int INFINITY = -1;

		/**
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
		*/
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));

		/**
		* 19.3 Terminals from section 3.7: Comment
		*/
		put("COMMENT", new Union(

			//
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
			//
			new Concatenation(
				new Singleton("/*"), new Concatenation(
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
				new Repetition(
					new Concatenation(
						new Singleton("*"),
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1)
					), 0, INFINITY
				),
				new Singleton("*/")
			))), new Union(

			/**
			* End Of Line Comment: //[^\n]*\n
			*/
			new Concatenation(
				new Singleton("//"), new Concatenation(
				new Repetition(new NonMatch("\n"), 0, INFINITY),
				new Singleton("\n")
			)),

			//
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
			//
			new Concatenation(
				new Singleton("/**"), new Concatenation(
				new Repetition(
					new Concatenation(
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1),
						new Singleton("*")
					), 0, INFINITY
				),
				new Singleton("/")
			))
		)));

		put("IDENTIFIER", new Concatenation(
			new Union(
				PosixClass.alpha(),
				new Match("_$")
			),
			new Repetition(
				new Union(
					PosixClass.alnum(),
					new Match("_$")
				), 0, INFINITY
			)
		));

		/**
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
		*/
		put("KEYWORD", new Union(
			new Singleton("const"),
			new Singleton("goto")
		));

		/**
		* 19.3 Terminals from section 3.10.1: Integer Literal
		*/
		put("INTEGER_LITERAL", new Concatenation(
			new Union(
				/**
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
				*/
				new Singleton("0"), new Union(

				new Concatenation(
					new Range('1', '9'),
					new Repetition(PosixClass.digit(), 0, INFINITY)
				), new Union(

				/**
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
				*/
				new Concatenation(
					new Singleton("0"), new Concatenation(
					new Match("xX"),
					new Repetition(PosixClass.xdigit(), 1, INFINITY)
				)),

				/**
				* Octal Integer Literal: 0[0-7]+
				*/
				new Concatenation(
					new Singleton("0"),
					new Repetition(new Range('0', '7'), 1, INFINITY)
				)
			))),
			new Repetition(new Match("lL"), 0, 1)
		));

		/**
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
		*/
		put("FLOATING_POINT_LITERAL", new Union(

			/**
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
			*/
			new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Singleton("."), new Concatenation(
				new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(PosixClass.digit(), 1, INFINITY)
				)), 0, 1),
				new Repetition(new Match("fFdD"), 0, 1)
			)))), new Union(

			/**
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
			*/
			new Concatenation(
				new Singleton("."), new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(PosixClass.digit(), 1, INFINITY)
				)), 0, 1),
				new Repetition(new Match("fFdD"), 0, 1)
			))), new Union(

			/**
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
			*/
			new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Match("eE"), new Concatenation(
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY),
				new Repetition(new Match("fFdD"), 0, 1)
			)))),

			/**
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
			*/
			new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(PosixClass.digit(), 1, INFINITY)
				)), 0, 1),
				new Match("fFdD")
			))
		))));

		/**
		* 19.3 Terminals from section 3.10.3: Boolean Literal
		*/
		put("BOOLEAN_LITERAL", new Union(
			new Singleton("true"),
			new Singleton("false")
		));

		/**
		* 19.3 Terminals from section 3.10.4: Character Literal
		*/
		put("CHARACTER_LITERAL", new Concatenation(
			new Singleton("'"), new Concatenation(
			new Union(

				/**
				* Single Character: [^\r\n'\\]
				*/
				new NonMatch("\r\n'\\"),

				/**
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
				*/
				new Concatenation(
					new Singleton("\\"),
					new Union(
						new Match("btnfr\"'\\"),
						new Concatenation(
							new Repetition(new Range('0', '3'), 0, 1),
							new Repetition(new Range('0', '7'), 1, 2)
						)
					)
				)
			),
			new Singleton("'")
		)));

		/**
		* 19.3 Terminals from section 3.10.5: String Literal
		*/
		put("STRING_LITERAL", new Concatenation(
			new Singleton("\""), new Concatenation(
			new Repetition(
				new Union(

					/**
					* Single Character: [^\r\n"\\]
					*/
					new NonMatch("\r\n\"\\"),

					/**
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
					*/
					new Concatenation(
						new Singleton("\\"),
						new Union(
							new Match("btnfr\"'\\"),
							new Concatenation(
								new Repetition(new Range('0', '3'), 0, 1),
								new Repetition(new Range('0', '7'), 1, 2)
							)
						)
					)
				), 0, INFINITY
			),
			new Singleton("\"")
		)));

		/**
		* 19.3 Terminals section 3.10.7: Null Literal
		*/
		put("NULL_LITERAL", new Singleton("null"));
		
		// OK, it seems we have to add some more stuff...
		
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time

	}
} // class Java20
}

Author comment

Began life as a copy of #651

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000348
Snippet name: Official Java tokenizer (embeddable)
Eternal ID of this version: #1000348/1
Text MD5: f45fb032e61d337b04c614258a35ff40
Author: stefan
Category: javax
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-07-27 14:11:13
Source code size: 9031 bytes / 344 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 603 / 456
Referenced in: [show references]