Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

344
LINES

< > BotCompany Repo | #1000348 // Official Java tokenizer (embeddable)

Document

1  
!636
2  
!629 // standard functions
3  
!1000300 // class Lexicon
4  
!quicknew
5  
6  
class JavaTok {
7  
  static String join(List<String> cnc) {
8  
    new StringBuilder buf;
9  
    for (String s : cnc) buf.append(s);
10  
    return buf.toString();
11  
  }
12  
  
13  
  static List<String> split(String src) {
14  
    Java20 lex = new Java20();
15  
    src = src.replace("\r\n", "\n");
16  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
17  
    int lineNr = source.getLineNumber()+1;
18  
    List<T> list = new ArrayList<T>();
19  
    try {
20  
      for (Object a; (a = lex.grab(source)) != lex.$;) {
21  
        String word = lex.word();
22  
        String q = quote(word);
23  
        //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
24  
        lineNr = source.getLineNumber()+1;
25  
        
26  
        T t = new T(a, word);
27  
        boolean isSpace = t.isSpace();
28  
        if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
29  
          list.get(list.size()-1).word += word; // merge spaces
30  
        else
31  
          list.add(t);
32  
      }
33  
    } catch (Lexicon.Exception e) {
34  
      throw new RuntimeException(e);
35  
    }
36  
    
37  
    List<String> cnc = new ArrayList<String>();
38  
    for (int i = 0; i < list.size(); ) {
39  
      T t = list.get(i);
40  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
41  
      boolean isSpace = t.isSpace();
42  
      if (shouldBeSpace == isSpace) {
43  
        cnc.add(t.word);
44  
        ++i;
45  
      } else if (shouldBeSpace)
46  
        cnc.add("");
47  
      else {
48  
        System.out.println(cncToLines(cnc));
49  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
50  
      }
51  
    }
52  
    if ((cnc.size() % 2) == 0)
53  
      cnc.add("");
54  
55  
    return cnc;
56  
  }
57  
  
58  
  static class T {
59  
    Object a; String word;
60  
    
61  
    T(Object a, String word) { this.a = a; this.word = word; }
62  
    
63  
    boolean isSpace() {
64  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
65  
    }
66  
  }
67  
  
68  
  static String cncToLines(List<String> cnc) {
69  
    StringBuilder out = new StringBuilder();
70  
    for (String token : cnc)
71  
      out.append(quote(token) + "\n");
72  
    return out.toString();
73  
  }
74  
  
75  
  public static String quote(String s) {
76  
    if (s == null) return "null";
77  
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
78  
  }
79  
  
80  
  static class Java20 extends Lexicon {
81  
82  
	Java20() {
83  
84  
		/**
85  
		* Grammar for Java 2.0.
86  
		*
87  
		* Nonterminal - first letter uppercase
88  
		* TERMINAL - all letters uppercase
89  
		* keyword - all letters lowercase
90  
		*/
91  
		int INFINITY = -1;
92  
93  
		/**
94  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
95  
		*/
96  
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
97  
98  
		/**
99  
		* 19.3 Terminals from section 3.7: Comment
100  
		*/
101  
		put("COMMENT", new Union(
102  
103  
			//
104  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
105  
			//
106  
			new Concatenation(
107  
				new Singleton("/*"), new Concatenation(
108  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
109  
				new Repetition(
110  
					new Concatenation(
111  
						new Singleton("*"),
112  
						new Repetition(new Concatenation(
113  
							new NonMatch("*/"),
114  
							new Repetition(new NonMatch("*"), 0, INFINITY)
115  
						), 0, 1)
116  
					), 0, INFINITY
117  
				),
118  
				new Singleton("*/")
119  
			))), new Union(
120  
121  
			/**
122  
			* End Of Line Comment: //[^\n]*\n
123  
			*/
124  
			new Concatenation(
125  
				new Singleton("//"), new Concatenation(
126  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
127  
				new Singleton("\n")
128  
			)),
129  
130  
			//
131  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
132  
			//
133  
			new Concatenation(
134  
				new Singleton("/**"), new Concatenation(
135  
				new Repetition(
136  
					new Concatenation(
137  
						new Repetition(new Concatenation(
138  
							new NonMatch("*/"),
139  
							new Repetition(new NonMatch("*"), 0, INFINITY)
140  
						), 0, 1),
141  
						new Singleton("*")
142  
					), 0, INFINITY
143  
				),
144  
				new Singleton("/")
145  
			))
146  
		)));
147  
148  
		put("IDENTIFIER", new Concatenation(
149  
			new Union(
150  
				PosixClass.alpha(),
151  
				new Match("_$")
152  
			),
153  
			new Repetition(
154  
				new Union(
155  
					PosixClass.alnum(),
156  
					new Match("_$")
157  
				), 0, INFINITY
158  
			)
159  
		));
160  
161  
		/**
162  
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
163  
		*/
164  
		put("KEYWORD", new Union(
165  
			new Singleton("const"),
166  
			new Singleton("goto")
167  
		));
168  
169  
		/**
170  
		* 19.3 Terminals from section 3.10.1: Integer Literal
171  
		*/
172  
		put("INTEGER_LITERAL", new Concatenation(
173  
			new Union(
174  
				/**
175  
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
176  
				*/
177  
				new Singleton("0"), new Union(
178  
179  
				new Concatenation(
180  
					new Range('1', '9'),
181  
					new Repetition(PosixClass.digit(), 0, INFINITY)
182  
				), new Union(
183  
184  
				/**
185  
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
186  
				*/
187  
				new Concatenation(
188  
					new Singleton("0"), new Concatenation(
189  
					new Match("xX"),
190  
					new Repetition(PosixClass.xdigit(), 1, INFINITY)
191  
				)),
192  
193  
				/**
194  
				* Octal Integer Literal: 0[0-7]+
195  
				*/
196  
				new Concatenation(
197  
					new Singleton("0"),
198  
					new Repetition(new Range('0', '7'), 1, INFINITY)
199  
				)
200  
			))),
201  
			new Repetition(new Match("lL"), 0, 1)
202  
		));
203  
204  
		/**
205  
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
206  
		*/
207  
		put("FLOATING_POINT_LITERAL", new Union(
208  
209  
			/**
210  
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
211  
			*/
212  
			new Concatenation(
213  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
214  
				new Singleton("."), new Concatenation(
215  
				new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation(
216  
				new Repetition(new Concatenation(
217  
					new Match("eE"), new Concatenation(
218  
					new Repetition(new Match("-+"), 0, 1),
219  
					new Repetition(PosixClass.digit(), 1, INFINITY)
220  
				)), 0, 1),
221  
				new Repetition(new Match("fFdD"), 0, 1)
222  
			)))), new Union(
223  
224  
			/**
225  
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
226  
			*/
227  
			new Concatenation(
228  
				new Singleton("."), new Concatenation(
229  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
230  
				new Repetition(new Concatenation(
231  
					new Match("eE"), new Concatenation(
232  
					new Repetition(new Match("-+"), 0, 1),
233  
					new Repetition(PosixClass.digit(), 1, INFINITY)
234  
				)), 0, 1),
235  
				new Repetition(new Match("fFdD"), 0, 1)
236  
			))), new Union(
237  
238  
			/**
239  
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
240  
			*/
241  
			new Concatenation(
242  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
243  
				new Match("eE"), new Concatenation(
244  
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
245  
				new Repetition(PosixClass.digit(), 1, INFINITY),
246  
				new Repetition(new Match("fFdD"), 0, 1)
247  
			)))),
248  
249  
			/**
250  
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
251  
			*/
252  
			new Concatenation(
253  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
254  
				new Repetition(new Concatenation(
255  
					new Match("eE"), new Concatenation(
256  
					new Repetition(new Match("-+"), 0, 1),
257  
					new Repetition(PosixClass.digit(), 1, INFINITY)
258  
				)), 0, 1),
259  
				new Match("fFdD")
260  
			))
261  
		))));
262  
263  
		/**
264  
		* 19.3 Terminals from section 3.10.3: Boolean Literal
265  
		*/
266  
		put("BOOLEAN_LITERAL", new Union(
267  
			new Singleton("true"),
268  
			new Singleton("false")
269  
		));
270  
271  
		/**
272  
		* 19.3 Terminals from section 3.10.4: Character Literal
273  
		*/
274  
		put("CHARACTER_LITERAL", new Concatenation(
275  
			new Singleton("'"), new Concatenation(
276  
			new Union(
277  
278  
				/**
279  
				* Single Character: [^\r\n'\\]
280  
				*/
281  
				new NonMatch("\r\n'\\"),
282  
283  
				/**
284  
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
285  
				*/
286  
				new Concatenation(
287  
					new Singleton("\\"),
288  
					new Union(
289  
						new Match("btnfr\"'\\"),
290  
						new Concatenation(
291  
							new Repetition(new Range('0', '3'), 0, 1),
292  
							new Repetition(new Range('0', '7'), 1, 2)
293  
						)
294  
					)
295  
				)
296  
			),
297  
			new Singleton("'")
298  
		)));
299  
300  
		/**
301  
		* 19.3 Terminals from section 3.10.5: String Literal
302  
		*/
303  
		put("STRING_LITERAL", new Concatenation(
304  
			new Singleton("\""), new Concatenation(
305  
			new Repetition(
306  
				new Union(
307  
308  
					/**
309  
					* Single Character: [^\r\n"\\]
310  
					*/
311  
					new NonMatch("\r\n\"\\"),
312  
313  
					/**
314  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
315  
					*/
316  
					new Concatenation(
317  
						new Singleton("\\"),
318  
						new Union(
319  
							new Match("btnfr\"'\\"),
320  
							new Concatenation(
321  
								new Repetition(new Range('0', '3'), 0, 1),
322  
								new Repetition(new Range('0', '7'), 1, 2)
323  
							)
324  
						)
325  
					)
326  
				), 0, INFINITY
327  
			),
328  
			new Singleton("\"")
329  
		)));
330  
331  
		/**
332  
		* 19.3 Terminals section 3.10.7: Null Literal
333  
		*/
334  
		put("NULL_LITERAL", new Singleton("null"));
335  
		
336  
		// OK, it seems we have to add some more stuff...
337  
		
338  
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
339  
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
340  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
341  
342  
	}
343  
} // class Java20
344  
}

Author comment

Began life as a copy of #651

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000348
Snippet name: Official Java tokenizer (embeddable)
Eternal ID of this version: #1000348/1
Text MD5: f45fb032e61d337b04c614258a35ff40
Author: stefan
Category: javax
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-07-27 14:11:13
Source code size: 9031 bytes / 344 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 610 / 457
Referenced in: [show references]