Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

377
LINES

< > BotCompany Repo | #1000516 // Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static

Document

1  
!636
2  
//1000300 // Lexicon
3  
//1000515 // Lexicon, fixing
4  
!quicknew
5  
6  
class JavaTok {
7  
  static String join(List<String> cnc) {
8  
    new StringBuilder buf;
9  
    for (String s : cnc) buf.append(s);
10  
    return buf.toString();
11  
  }
12  
  
13  
  static List<String> split(String src) {
14  
    Java20 lex = new Java20();
15  
    src = src.replace("\r\n", "\n");
16  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
17  
    int lineNr = source.getLineNumber()+1;
18  
    List<T> list = new ArrayList<T>();
19  
    try {
20  
      for (Object a; (a = lex.grab(source)) != lex.$;) {
21  
        String word = lex.word();
22  
        String q = quote(word);
23  
        //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
24  
        lineNr = source.getLineNumber()+1;
25  
        
26  
        T t = new T(a, word);
27  
        boolean isSpace = t.isSpace();
28  
        if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
29  
          list.get(list.size()-1).word += word; // merge spaces
30  
        else
31  
          list.add(t);
32  
      }
33  
    } catch (Lexicon.Exception e) {
34  
      throw new RuntimeException(e);
35  
    }
36  
    
37  
    List<String> cnc = new ArrayList<String>();
38  
    for (int i = 0; i < list.size(); ) {
39  
      T t = list.get(i);
40  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
41  
      boolean isSpace = t.isSpace();
42  
      if (shouldBeSpace == isSpace) {
43  
        cnc.add(t.word);
44  
        ++i;
45  
      } else if (shouldBeSpace)
46  
        cnc.add("");
47  
      else {
48  
        System.out.println(cncToLines(cnc));
49  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
50  
      }
51  
    }
52  
    if ((cnc.size() % 2) == 0)
53  
      cnc.add("");
54  
55  
    return cnc;
56  
  }
57  
  
58  
  static class T {
59  
    Object a; String word;
60  
    
61  
    T(Object a, String word) { this.a = a; this.word = word; }
62  
    
63  
    boolean isSpace() {
64  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
65  
    }
66  
  }
67  
  
68  
  static String cncToLines(List<String> cnc) {
69  
    StringBuilder out = new StringBuilder();
70  
    for (String token : cnc)
71  
      out.append(quote(token) + "\n");
72  
    return out.toString();
73  
  }
74  
  
75  
  public static String quote(String s) {
76  
    if (s == null) return "null";
77  
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
78  
  }
79  
  
80  
  static class Java20 extends Lexicon {
81  
82  
	Java20() {
83  
84  
		/**
85  
		* Grammar for Java 2.0.
86  
		*
87  
		* Nonterminal - first letter uppercase
88  
		* TERMINAL - all letters uppercase
89  
		* keyword - all letters lowercase
90  
		*/
91  
		int INFINITY = -1;
92  
93  
		/**
94  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
95  
		*/
96  
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
97  
98  
		/**
99  
		* 19.3 Terminals from section 3.7: Comment
100  
		*/
101  
		put("COMMENT", new Union(
102  
103  
			//
104  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
105  
			//
106  
			new Concatenation(
107  
				new Singleton("/*"), new Concatenation(
108  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
109  
				new Repetition(
110  
					new Concatenation(
111  
						new Singleton("*"),
112  
						new Repetition(new Concatenation(
113  
							new NonMatch("*/"),
114  
							new Repetition(new NonMatch("*"), 0, INFINITY)
115  
						), 0, 1)
116  
					), 0, INFINITY
117  
				),
118  
				new Singleton("*/")
119  
			))), new Union(
120  
121  
			/**
122  
			* End Of Line Comment: //[^\n]*\n
123  
			*/
124  
			new Concatenation(
125  
				new Singleton("//"), new Concatenation(
126  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
127  
				new Singleton("\n")
128  
			)),
129  
130  
			//
131  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
132  
			//
133  
			new Concatenation(
134  
				new Singleton("/**"), new Concatenation(
135  
				new Repetition(
136  
					new Concatenation(
137  
						new Repetition(new Concatenation(
138  
							new NonMatch("*/"),
139  
							new Repetition(new NonMatch("*"), 0, INFINITY)
140  
						), 0, 1),
141  
						new Singleton("*")
142  
					), 0, INFINITY
143  
				),
144  
				new Singleton("/")
145  
			))
146  
		)));
147  
148  
		put("IDENTIFIER", new Concatenation(
149  
			new Union(
150  
				PosixClass.alpha(),
151  
				new Match("_$")
152  
			),
153  
			new Repetition(
154  
				new Union(
155  
					PosixClass.alnum(),
156  
					new Match("_$")
157  
				), 0, INFINITY
158  
			)
159  
		));
160  
161  
		/**
162  
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
163  
		*/
164  
		put("KEYWORD", new Union(
165  
			new Singleton("const"),
166  
			new Singleton("goto")
167  
		));
168  
169  
		/**
170  
		* 19.3 Terminals from section 3.10.1: Integer Literal
171  
		*/
172  
		put("INTEGER_LITERAL", new Concatenation(
173  
			new Union(
174  
				/**
175  
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
176  
				*/
177  
				new Singleton("0"), new Union(
178  
179  
				new Concatenation(
180  
					new Range('1', '9'),
181  
					new Repetition(PosixClass.digit(), 0, INFINITY)
182  
				), new Union(
183  
184  
				/**
185  
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
186  
				*/
187  
				new Concatenation(
188  
					new Singleton("0"), new Concatenation(
189  
					new Match("xX"),
190  
					new Repetition(PosixClass.xdigit(), 1, INFINITY)
191  
				)),
192  
193  
				/**
194  
				* Octal Integer Literal: 0[0-7]+
195  
				*/
196  
				new Concatenation(
197  
					new Singleton("0"),
198  
					new Repetition(new Range('0', '7'), 1, INFINITY)
199  
				)
200  
			))),
201  
			new Repetition(new Match("lL"), 0, 1)
202  
		));
203  
204  
		/**
205  
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
206  
		*/
207  
		put("FLOATING_POINT_LITERAL", new Union(
208  
209  
			/**
210  
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
211  
			*/
212  
			new Concatenation(
213  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
214  
				new Singleton("."), new Concatenation(
215  
				new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation(
216  
				new Repetition(new Concatenation(
217  
					new Match("eE"), new Concatenation(
218  
					new Repetition(new Match("-+"), 0, 1),
219  
					new Repetition(PosixClass.digit(), 1, INFINITY)
220  
				)), 0, 1),
221  
				new Repetition(new Match("fFdD"), 0, 1)
222  
			)))), new Union(
223  
224  
			/**
225  
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
226  
			*/
227  
			new Concatenation(
228  
				new Singleton("."), new Concatenation(
229  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
230  
				new Repetition(new Concatenation(
231  
					new Match("eE"), new Concatenation(
232  
					new Repetition(new Match("-+"), 0, 1),
233  
					new Repetition(PosixClass.digit(), 1, INFINITY)
234  
				)), 0, 1),
235  
				new Repetition(new Match("fFdD"), 0, 1)
236  
			))), new Union(
237  
238  
			/**
239  
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
240  
			*/
241  
			new Concatenation(
242  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
243  
				new Match("eE"), new Concatenation(
244  
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
245  
				new Repetition(PosixClass.digit(), 1, INFINITY),
246  
				new Repetition(new Match("fFdD"), 0, 1)
247  
			)))),
248  
249  
			/**
250  
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
251  
			*/
252  
			new Concatenation(
253  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
254  
				new Repetition(new Concatenation(
255  
					new Match("eE"), new Concatenation(
256  
					new Repetition(new Match("-+"), 0, 1),
257  
					new Repetition(PosixClass.digit(), 1, INFINITY)
258  
				)), 0, 1),
259  
				new Match("fFdD")
260  
			))
261  
		))));
262  
263  
		/**
264  
		* 19.3 Terminals from section 3.10.3: Boolean Literal
265  
		*/
266  
		put("BOOLEAN_LITERAL", new Union(
267  
			new Singleton("true"),
268  
			new Singleton("false")
269  
		));
270  
271  
		/**
272  
		* 19.3 Terminals from section 3.10.4: Character Literal
273  
		*/
274  
		put("CHARACTER_LITERAL", new Concatenation(
275  
			new Singleton("'"), new Concatenation(
276  
			new Union(
277  
278  
				/**
279  
				* Single Character: [^\r\n'\\]
280  
				*/
281  
				new NonMatch("\r\n'\\"),
282  
283  
				/**
284  
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
285  
				*/
286  
				new Concatenation(
287  
					new Singleton("\\"),
288  
					new Union(
289  
						new Match("btnfr\"'\\"),
290  
						new Concatenation(
291  
							new Repetition(new Range('0', '3'), 0, 1),
292  
							new Repetition(new Range('0', '7'), 1, 2)
293  
						)
294  
					)
295  
				)
296  
			),
297  
			new Singleton("'")
298  
		)));
299  
300  
		put("MULTILINE_LITERAL", new Concatenation(
301  
			new Singleton("[["), new Concatenation(
302  
			new Repetition(
303  
				new Union(
304  
					new NonMatch("]"),
305  
					new Concatenation(
306  
					  new Singleton("]"), new NonMatch("]"))
307  
			  ), 0, INFINITY
308  
			),
309  
			new Singleton("]]")
310  
		)));
311  
312  
		put("MULTILINE_LITERAL2", new Concatenation(
313  
			new Singleton("[=["), new Concatenation(
314  
			new Repetition(
315  
				new Union(
316  
					new NonMatch("]"),
317  
					new Concatenation(new Singleton("]"), new Union(
318  
				    new NonMatch("="),
319  
				    new Concatenation(new Singleton("="), new NonMatch("]"))))
320  
			  ), 0, INFINITY
321  
			),
322  
			new Singleton("]=]")
323  
		)));
324  
325  
		/**
326  
		* 19.3 Terminals from section 3.10.5: String Literal
327  
		*/
328  
		put("STRING_LITERAL", new Concatenation(
329  
			new Singleton("\""), new Concatenation(
330  
			new Repetition(
331  
				new Union(
332  
333  
					/**
334  
					* Single Character: [^\r\n"\\]
335  
					*/
336  
					new NonMatch("\r\n\"\\"),
337  
338  
					/**
339  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
340  
					*/
341  
					new Concatenation(
342  
						new Singleton("\\"),
343  
						new Union(
344  
							new Match("btnfr\"'\\"),
345  
							new Union(
346  
  							new Concatenation(
347  
  								new Repetition(new Range('0', '3'), 0, 1),
348  
  								new Repetition(new Range('0', '7'), 1, 2)
349  
  							),
350  
  							new Concatenation(
351  
  							  new Singleton("u"),
352  
  							  new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
353  
  							)
354  
  						)
355  
						)
356  
					)
357  
				), 0, INFINITY
358  
			),
359  
			new Singleton("\"")
360  
		)));
361  
362  
		/**
363  
		* 19.3 Terminals section 3.10.7: Null Literal
364  
		*/
365  
		put("NULL_LITERAL", new Singleton("null"));
366  
		
367  
		// OK, it seems we have to add some more stuff...
368  
		
369  
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
370  
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
371  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
372  
373  
	}
374  
} // class Java20
375  
}
376  
377  
!include #1000300 // Lexicon

Author comment

Began life as a copy of #1000353

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000516
Snippet name: Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static
Eternal ID of this version: #1000516/1
Text MD5: 11dd188dfda60a264725ec4c5a479a9e
Author: stefan
Category: javax
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-08-09 22:58:18
Source code size: 9896 bytes / 377 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 542 / 107
Referenced in: [show references]