Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

377
LINES

< > BotCompany Repo | #1000516 // Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static

Document

!636
//1000300 // Lexicon
//1000515 // Lexicon, fixing
!quicknew

class JavaTok {
  static String join(List<String> cnc) {
    new StringBuilder buf;
    for (String s : cnc) buf.append(s);
    return buf.toString();
  }
  
  /**
   * Tokenizes source text with the Java20 lexer and returns the tokens as an
   * alternating list: even indexes hold white space/comments (possibly the
   * empty string), odd indexes hold all other tokens. The returned list
   * always has an odd number of elements, so it starts and ends with a
   * (possibly empty) space entry.
   *
   * @param src the source text; "\r\n" sequences are converted to "\n" first
   * @return the alternating space/token list described above
   * @throws RuntimeException wrapping a Lexicon.Exception raised by the lexer,
   *         or with a "TILT" message if the alternation cannot be built
   */
  static List<String> split(String src) {
    Java20 lex = new Java20();
    src = src.replace("\r\n", "\n");
    LineNumberReader source = new LineNumberReader(new StringReader(src));
    List<T> list = new ArrayList<T>();
    try {
      // lex.$ is presumably the lexer's end-of-input marker (defined in Lexicon).
      for (Object a; (a = lex.grab(source)) != lex.$;) {
        String word = lex.word();
        T t = new T(a, word);
        boolean isSpace = t.isSpace();
        if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
          list.get(list.size()-1).word += word; // merge adjacent space/comment tokens
        else
          list.add(t);
      }
    } catch (Lexicon.Exception e) {
      throw new RuntimeException(e);
    }
    
    List<String> cnc = new ArrayList<String>();
    for (int i = 0; i < list.size(); ) {
      T t = list.get(i);
      // Even positions of the result are reserved for space entries.
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
      boolean isSpace = t.isSpace();
      if (shouldBeSpace == isSpace) {
        cnc.add(t.word);
        ++i;
      } else if (shouldBeSpace)
        cnc.add(""); // no space between two tokens: insert an empty placeholder
      else {
        // A space where a token is expected; dump what was built so far.
        System.out.println(cncToLines(cnc));
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
      }
    }
    // Make sure the list ends with a space entry.
    if ((cnc.size() % 2) == 0)
      cnc.add("");

    return cnc;
  }
  
  /**
   * A single lexed token: the token kind object handed back by the lexer
   * together with the matched text.
   */
  static class T {
    /** Token kind as returned by the lexer. */
    Object a;
    /** Matched text; split() appends to it when merging adjacent spaces. */
    String word;

    T(Object a, String word) {
      this.a = a;
      this.word = word;
    }

    /** @return true if this token's kind is "WHITE_SPACE" or "COMMENT" */
    boolean isSpace() {
      if (a.equals("WHITE_SPACE")) {
        return true;
      }
      return a.equals("COMMENT");
    }
  }
  
  /**
   * Renders a token list for debugging: each element is passed through
   * quote() and written on its own line.
   *
   * @param cnc the token strings to dump
   * @return the quoted elements, each followed by a newline
   */
  static String cncToLines(List<String> cnc) {
    StringBuilder dump = new StringBuilder();
    for (int i = 0; i < cnc.size(); i++) {
      dump.append(quote(cnc.get(i))).append("\n");
    }
    return dump.toString();
  }
  
  /**
   * Wraps a string in double quotes, escaping backslash, double quote,
   * carriage return and line feed. All other characters are copied as-is.
   *
   * @param s the text to quote; may be null
   * @return the quoted text, or the unquoted word null for a null argument
   */
  public static String quote(String s) {
    if (s == null) {
      return "null";
    }
    StringBuilder out = new StringBuilder(s.length() + 2);
    out.append('"');
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      switch (c) {
        case '\\':
          out.append("\\\\");
          break;
        case '"':
          out.append("\\\"");
          break;
        case '\r':
          out.append("\\r");
          break;
        case '\n':
          out.append("\\n");
          break;
        default:
          out.append(c);
      }
    }
    out.append('"');
    return out.toString();
  }
  
  /**
   * Lexicon subclass whose constructor registers the token definitions used
   * by JavaTok.split: white space, comments, identifiers, the Java literal
   * kinds, two bracket-delimited multi-line literal forms, and a
   * one-character catch-all ("OTHER1").
   *
   * NOTE(review): put, Repetition, Union, Concatenation, Singleton, Match,
   * NonMatch, Range and PosixClass are not defined in this class; presumably
   * they come from Lexicon (#1000300) - confirm their exact semantics there.
   */
  static class Java20 extends Lexicon {

	Java20() {

		/**
		* Grammar for Java 2.0.
		*
		* Nonterminal - first letter uppercase
		* TERMINAL - all letters uppercase
		* keyword - all letters lowercase
		*/
		// Passed as the upper bound of Repetition wherever the regex comments
		// below show "*" or "+"; presumably -1 means "no upper limit" - confirm
		// against Lexicon.
		int INFINITY = -1;

		/**
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
		*/
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));

		/**
		* 19.3 Terminals from section 3.7: Comment
		*/
		put("COMMENT", new Union(

			//
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
			//
			// NOTE(review): as written, at least one non-"*" character is
			// required right after "/*".
			new Concatenation(
				new Singleton("/*"), new Concatenation(
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
				new Repetition(
					new Concatenation(
						new Singleton("*"),
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1)
					), 0, INFINITY
				),
				new Singleton("*/")
			))), new Union(

			/**
			* End Of Line Comment: //[^\n]*\n
			*/
			// NOTE(review): the pattern ends with a mandatory "\n", so a "//"
			// comment on the last line without a trailing newline presumably
			// does not match as COMMENT - confirm.
			new Concatenation(
				new Singleton("//"), new Concatenation(
				new Repetition(new NonMatch("\n"), 0, INFINITY),
				new Singleton("\n")
			)),

			//
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
			//
			new Concatenation(
				new Singleton("/**"), new Concatenation(
				new Repetition(
					new Concatenation(
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1),
						new Singleton("*")
					), 0, INFINITY
				),
				new Singleton("/")
			))
		)));

		// Identifier: a letter, "_" or "$", followed by any number of letters,
		// digits, "_" or "$" (assuming Match("_$") matches either character,
		// as the [xX]-style comments below suggest).
		put("IDENTIFIER", new Concatenation(
			new Union(
				PosixClass.alpha(),
				new Match("_$")
			),
			new Repetition(
				new Union(
					PosixClass.alnum(),
					new Match("_$")
				), 0, INFINITY
			)
		));

		/**
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
		*/
		put("KEYWORD", new Union(
			new Singleton("const"),
			new Singleton("goto")
		));

		/**
		* 19.3 Terminals from section 3.10.1: Integer Literal
		*/
		put("INTEGER_LITERAL", new Concatenation(
			new Union(
				/**
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
				*/
				new Singleton("0"), new Union(

				new Concatenation(
					new Range('1', '9'),
					new Repetition(PosixClass.digit(), 0, INFINITY)
				), new Union(

				/**
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
				*/
				new Concatenation(
					new Singleton("0"), new Concatenation(
					new Match("xX"),
					new Repetition(PosixClass.xdigit(), 1, INFINITY)
				)),

				/**
				* Octal Integer Literal: 0[0-7]+
				*/
				new Concatenation(
					new Singleton("0"),
					new Repetition(new Range('0', '7'), 1, INFINITY)
				)
			))),
			// optional long suffix
			new Repetition(new Match("lL"), 0, 1)
		));

		/**
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
		*/
		put("FLOATING_POINT_LITERAL", new Union(

			/**
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
			*/
			new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Singleton("."), new Concatenation(
				new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(PosixClass.digit(), 1, INFINITY)
				)), 0, 1),
				new Repetition(new Match("fFdD"), 0, 1)
			)))), new Union(

			/**
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
			*/
			new Concatenation(
				new Singleton("."), new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(PosixClass.digit(), 1, INFINITY)
				)), 0, 1),
				new Repetition(new Match("fFdD"), 0, 1)
			))), new Union(

			/**
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
			*/
			new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Match("eE"), new Concatenation(
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY),
				new Repetition(new Match("fFdD"), 0, 1)
			)))),

			/**
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
			*/
			new Concatenation(
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
				new Repetition(new Concatenation(
					new Match("eE"), new Concatenation(
					new Repetition(new Match("-+"), 0, 1),
					new Repetition(PosixClass.digit(), 1, INFINITY)
				)), 0, 1),
				new Match("fFdD")
			))
		))));

		/**
		* 19.3 Terminals from section 3.10.3: Boolean Literal
		*/
		put("BOOLEAN_LITERAL", new Union(
			new Singleton("true"),
			new Singleton("false")
		));

		/**
		* 19.3 Terminals from section 3.10.4: Character Literal
		*/
		put("CHARACTER_LITERAL", new Concatenation(
			new Singleton("'"), new Concatenation(
			new Union(

				/**
				* Single Character: [^\r\n'\\]
				*/
				new NonMatch("\r\n'\\"),

				/**
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
				*/
				new Concatenation(
					new Singleton("\\"),
					new Union(
						new Match("btnfr\"'\\"),
						new Concatenation(
							new Repetition(new Range('0', '3'), 0, 1),
							new Repetition(new Range('0', '7'), 1, 2)
						)
					)
				)
			),
			new Singleton("'")
		)));

		// Multi-line literal, form 1: "[[" ... "]]". The body is built from
		// single characters other than "]" and from "]" followed by a
		// character other than "]".
		put("MULTILINE_LITERAL", new Concatenation(
			new Singleton("[["), new Concatenation(
			new Repetition(
				new Union(
					new NonMatch("]"),
					new Concatenation(
					  new Singleton("]"), new NonMatch("]"))
			  ), 0, INFINITY
			),
			new Singleton("]]")
		)));

		// Multi-line literal, form 2: "[=[" ... "]=]". The body may contain
		// "]" as long as it is not the start of the closing "]=]".
		put("MULTILINE_LITERAL2", new Concatenation(
			new Singleton("[=["), new Concatenation(
			new Repetition(
				new Union(
					new NonMatch("]"),
					new Concatenation(new Singleton("]"), new Union(
				    new NonMatch("="),
				    new Concatenation(new Singleton("="), new NonMatch("]"))))
			  ), 0, INFINITY
			),
			new Singleton("]=]")
		)));

		/**
		* 19.3 Terminals from section 3.10.5: String Literal
		*/
		put("STRING_LITERAL", new Concatenation(
			new Singleton("\""), new Concatenation(
			new Repetition(
				new Union(

					/**
					* Single Character: [^\r\n"\\]
					*/
					new NonMatch("\r\n\"\\"),

					/**
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
					*/
					new Concatenation(
						new Singleton("\\"),
						new Union(
							new Match("btnfr\"'\\"),
							new Union(
  							new Concatenation(
  								new Repetition(new Range('0', '3'), 0, 1),
  								new Repetition(new Range('0', '7'), 1, 2)
  							),
  							// Unicode escape: "u" followed by exactly four hex digits
  							new Concatenation(
  							  new Singleton("u"),
  							  new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
  							)
  						)
						)
					)
				), 0, INFINITY
			),
			new Singleton("\"")
		)));

		/**
		* 19.3 Terminals section 3.10.7: Null Literal
		*/
		put("NULL_LITERAL", new Singleton("null"));
		
		// OK, it seems we have to add some more stuff...
		
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time

	}
} // class Java20
}

!include #1000300 // Lexicon

Author comment

Began life as a copy of #1000353

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000516
Snippet name: Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static
Eternal ID of this version: #1000516/1
Text MD5: 11dd188dfda60a264725ec4c5a479a9e
Author: stefan
Category: javax
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-08-09 22:58:18
Source code size: 9896 bytes / 377 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 605 / 129
Referenced in: #3000190 - Answer for stefanreich(>> t 20 questions)
#3000382 - Answer for ferdie (>> t = 1, f = 0)
#3000383 - Answer for funkoverflow (>> t=1, f=0 okay)