1 | !636 |
2 | //1000300 // Lexicon |
3 | //1000515 // Lexicon, fixing |
4 | !quicknew |
5 | |
6 | class JavaTok {
|
7 | static String join(List<String> cnc) {
|
8 | new StringBuilder buf; |
9 | for (String s : cnc) buf.append(s); |
10 | return buf.toString(); |
11 | } |
12 | |
13 | static List<String> split(String src) {
|
14 | Java20 lex = new Java20(); |
15 | src = src.replace("\r\n", "\n");
|
16 | LineNumberReader source = new LineNumberReader(new StringReader(src)); |
17 | int lineNr = source.getLineNumber()+1; |
18 | List<T> list = new ArrayList<T>(); |
19 | try {
|
20 | for (Object a; (a = lex.grab(source)) != lex.$;) {
|
21 | String word = lex.word(); |
22 | String q = quote(word); |
23 | //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
|
24 | lineNr = source.getLineNumber()+1; |
25 | |
26 | T t = new T(a, word); |
27 | boolean isSpace = t.isSpace(); |
28 | if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace()) |
29 | list.get(list.size()-1).word += word; // merge spaces |
30 | else |
31 | list.add(t); |
32 | } |
33 | } catch (Lexicon.Exception e) {
|
34 | throw new RuntimeException(e); |
35 | } |
36 | |
37 | List<String> cnc = new ArrayList<String>(); |
38 | for (int i = 0; i < list.size(); ) {
|
39 | T t = list.get(i); |
40 | boolean shouldBeSpace = (cnc.size() % 2) == 0; |
41 | boolean isSpace = t.isSpace(); |
42 | if (shouldBeSpace == isSpace) {
|
43 | cnc.add(t.word); |
44 | ++i; |
45 | } else if (shouldBeSpace) |
46 | cnc.add("");
|
47 | else {
|
48 | System.out.println(cncToLines(cnc)); |
49 | throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
|
50 | } |
51 | } |
52 | if ((cnc.size() % 2) == 0) |
53 | cnc.add("");
|
54 | |
55 | return cnc; |
56 | } |
57 | |
58 | static class T {
|
59 | Object a; String word; |
60 | |
61 | T(Object a, String word) { this.a = a; this.word = word; }
|
62 | |
63 | boolean isSpace() {
|
64 | return a.equals("WHITE_SPACE") || a.equals("COMMENT");
|
65 | } |
66 | } |
67 | |
68 | static String cncToLines(List<String> cnc) {
|
69 | StringBuilder out = new StringBuilder(); |
70 | for (String token : cnc) |
71 | out.append(quote(token) + "\n"); |
72 | return out.toString(); |
73 | } |
74 | |
75 | public static String quote(String s) {
|
76 | if (s == null) return "null"; |
77 | return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
|
78 | } |
79 | |
80 | static class Java20 extends Lexicon {
|
81 | |
82 | Java20() {
|
83 | |
84 | /** |
85 | * Grammar for Java 2.0. |
86 | * |
87 | * Nonterminal - first letter uppercase |
88 | * TERMINAL - all letters uppercase |
89 | * keyword - all letters lowercase |
90 | */ |
91 | int INFINITY = -1; |
92 | |
93 | /** |
94 | * 19.3 Terminals from section 3.6: White Space: [[:space:]] |
95 | */ |
96 | put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
|
97 | |
98 | /** |
99 | * 19.3 Terminals from section 3.7: Comment |
100 | */ |
101 | put("COMMENT", new Union(
|
102 | |
103 | // |
104 | // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/ |
105 | // |
106 | new Concatenation( |
107 | new Singleton("/*"), new Concatenation(
|
108 | new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
|
109 | new Repetition( |
110 | new Concatenation( |
111 | new Singleton("*"),
|
112 | new Repetition(new Concatenation( |
113 | new NonMatch("*/"),
|
114 | new Repetition(new NonMatch("*"), 0, INFINITY)
|
115 | ), 0, 1) |
116 | ), 0, INFINITY |
117 | ), |
118 | new Singleton("*/")
|
119 | ))), new Union( |
120 | |
121 | /** |
122 | * End Of Line Comment: //[^\n]*\n |
123 | */ |
124 | new Concatenation( |
125 | new Singleton("//"), new Concatenation(
|
126 | new Repetition(new NonMatch("\n"), 0, INFINITY),
|
127 | new Singleton("\n")
|
128 | )), |
129 | |
130 | // |
131 | // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/ |
132 | // |
133 | new Concatenation( |
134 | new Singleton("/**"), new Concatenation(
|
135 | new Repetition( |
136 | new Concatenation( |
137 | new Repetition(new Concatenation( |
138 | new NonMatch("*/"),
|
139 | new Repetition(new NonMatch("*"), 0, INFINITY)
|
140 | ), 0, 1), |
141 | new Singleton("*")
|
142 | ), 0, INFINITY |
143 | ), |
144 | new Singleton("/")
|
145 | )) |
146 | ))); |
147 | |
148 | put("IDENTIFIER", new Concatenation(
|
149 | new Union( |
150 | PosixClass.alpha(), |
151 | new Match("_$")
|
152 | ), |
153 | new Repetition( |
154 | new Union( |
155 | PosixClass.alnum(), |
156 | new Match("_$")
|
157 | ), 0, INFINITY |
158 | ) |
159 | )); |
160 | |
161 | /** |
162 | * 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar) |
163 | */ |
164 | put("KEYWORD", new Union(
|
165 | new Singleton("const"),
|
166 | new Singleton("goto")
|
167 | )); |
168 | |
169 | /** |
170 | * 19.3 Terminals from section 3.10.1: Integer Literal |
171 | */ |
172 | put("INTEGER_LITERAL", new Concatenation(
|
173 | new Union( |
174 | /** |
175 | * Decimal Integer Literal: 0|[1-9][[:digit:]]* |
176 | */ |
177 | new Singleton("0"), new Union(
|
178 | |
179 | new Concatenation( |
180 | new Range('1', '9'),
|
181 | new Repetition(PosixClass.digit(), 0, INFINITY) |
182 | ), new Union( |
183 | |
184 | /** |
185 | * Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+ |
186 | */ |
187 | new Concatenation( |
188 | new Singleton("0"), new Concatenation(
|
189 | new Match("xX"),
|
190 | new Repetition(PosixClass.xdigit(), 1, INFINITY) |
191 | )), |
192 | |
193 | /** |
194 | * Octal Integer Literal: 0[0-7]+ |
195 | */ |
196 | new Concatenation( |
197 | new Singleton("0"),
|
198 | new Repetition(new Range('0', '7'), 1, INFINITY)
|
199 | ) |
200 | ))), |
201 | new Repetition(new Match("lL"), 0, 1)
|
202 | )); |
203 | |
204 | /** |
205 | * 19.3 Terminals from section 3.10.2: Floating-Point Literal |
206 | */ |
207 | put("FLOATING_POINT_LITERAL", new Union(
|
208 | |
209 | /** |
210 | * [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]? |
211 | */ |
212 | new Concatenation( |
213 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
214 | new Singleton("."), new Concatenation(
|
215 | new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation( |
216 | new Repetition(new Concatenation( |
217 | new Match("eE"), new Concatenation(
|
218 | new Repetition(new Match("-+"), 0, 1),
|
219 | new Repetition(PosixClass.digit(), 1, INFINITY) |
220 | )), 0, 1), |
221 | new Repetition(new Match("fFdD"), 0, 1)
|
222 | )))), new Union( |
223 | |
224 | /** |
225 | * \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]? |
226 | */ |
227 | new Concatenation( |
228 | new Singleton("."), new Concatenation(
|
229 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
230 | new Repetition(new Concatenation( |
231 | new Match("eE"), new Concatenation(
|
232 | new Repetition(new Match("-+"), 0, 1),
|
233 | new Repetition(PosixClass.digit(), 1, INFINITY) |
234 | )), 0, 1), |
235 | new Repetition(new Match("fFdD"), 0, 1)
|
236 | ))), new Union( |
237 | |
238 | /** |
239 | * [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]? |
240 | */ |
241 | new Concatenation( |
242 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
243 | new Match("eE"), new Concatenation(
|
244 | new Repetition(new Match("-+"), 0, 1), new Concatenation(
|
245 | new Repetition(PosixClass.digit(), 1, INFINITY), |
246 | new Repetition(new Match("fFdD"), 0, 1)
|
247 | )))), |
248 | |
249 | /** |
250 | * [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD] |
251 | */ |
252 | new Concatenation( |
253 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
254 | new Repetition(new Concatenation( |
255 | new Match("eE"), new Concatenation(
|
256 | new Repetition(new Match("-+"), 0, 1),
|
257 | new Repetition(PosixClass.digit(), 1, INFINITY) |
258 | )), 0, 1), |
259 | new Match("fFdD")
|
260 | )) |
261 | )))); |
262 | |
263 | /** |
264 | * 19.3 Terminals from section 3.10.3: Boolean Literal |
265 | */ |
266 | put("BOOLEAN_LITERAL", new Union(
|
267 | new Singleton("true"),
|
268 | new Singleton("false")
|
269 | )); |
270 | |
271 | /** |
272 | * 19.3 Terminals from section 3.10.4: Character Literal |
273 | */ |
274 | put("CHARACTER_LITERAL", new Concatenation(
|
275 | new Singleton("'"), new Concatenation(
|
276 | new Union( |
277 | |
278 | /** |
279 | * Single Character: [^\r\n'\\] |
280 | */ |
281 | new NonMatch("\r\n'\\"),
|
282 | |
283 | /** |
284 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
|
285 | */ |
286 | new Concatenation( |
287 | new Singleton("\\"),
|
288 | new Union( |
289 | new Match("btnfr\"'\\"),
|
290 | new Concatenation( |
291 | new Repetition(new Range('0', '3'), 0, 1),
|
292 | new Repetition(new Range('0', '7'), 1, 2)
|
293 | ) |
294 | ) |
295 | ) |
296 | ), |
297 | new Singleton("'")
|
298 | ))); |
299 | |
300 | put("MULTILINE_LITERAL", new Concatenation(
|
301 | new Singleton("[["), new Concatenation(
|
302 | new Repetition( |
303 | new Union( |
304 | new NonMatch("]"),
|
305 | new Concatenation( |
306 | new Singleton("]"), new NonMatch("]"))
|
307 | ), 0, INFINITY |
308 | ), |
309 | new Singleton("]]")
|
310 | ))); |
311 | |
312 | put("MULTILINE_LITERAL2", new Concatenation(
|
313 | new Singleton("[=["), new Concatenation(
|
314 | new Repetition( |
315 | new Union( |
316 | new NonMatch("]"),
|
317 | new Concatenation(new Singleton("]"), new Union(
|
318 | new NonMatch("="),
|
319 | new Concatenation(new Singleton("="), new NonMatch("]"))))
|
320 | ), 0, INFINITY |
321 | ), |
322 | new Singleton("]=]")
|
323 | ))); |
324 | |
325 | /** |
326 | * 19.3 Terminals from section 3.10.5: String Literal |
327 | */ |
328 | put("STRING_LITERAL", new Concatenation(
|
329 | new Singleton("\""), new Concatenation(
|
330 | new Repetition( |
331 | new Union( |
332 | |
333 | /** |
334 | * Single Character: [^\r\n"\\] |
335 | */ |
336 | new NonMatch("\r\n\"\\"),
|
337 | |
338 | /** |
339 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
|
340 | */ |
341 | new Concatenation( |
342 | new Singleton("\\"),
|
343 | new Union( |
344 | new Match("btnfr\"'\\"),
|
345 | new Union( |
346 | new Concatenation( |
347 | new Repetition(new Range('0', '3'), 0, 1),
|
348 | new Repetition(new Range('0', '7'), 1, 2)
|
349 | ), |
350 | new Concatenation( |
351 | new Singleton("u"),
|
352 | new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
|
353 | ) |
354 | ) |
355 | ) |
356 | ) |
357 | ), 0, INFINITY |
358 | ), |
359 | new Singleton("\"")
|
360 | ))); |
361 | |
362 | /** |
363 | * 19.3 Terminals section 3.10.7: Null Literal |
364 | */ |
365 | put("NULL_LITERAL", new Singleton("null"));
|
366 | |
367 | // OK, it seems we have to add some more stuff... |
368 | |
369 | //put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
|
370 | //put("OTHER1", new NonMatch("")); // catch anything, one character at a time
|
371 | put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
|
372 | |
373 | } |
374 | } // class Java20 |
375 | } |
376 | |
377 | !include #1000300 // Lexicon |
Began life as a copy of #1000353
Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1000516 |
| Snippet name: | Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static |
| Eternal ID of this version: | #1000516/1 |
| Text MD5: | 11dd188dfda60a264725ec4c5a479a9e |
| Author: | stefan |
| Category: | javax |
| Type: | Document |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2015-08-09 22:58:18 |
| Source code size: | 9896 bytes / 377 lines |
| Pitched / IR pitched: | No / Yes |
| Views / Downloads: | 808 / 180 |
| Referenced in: | [show references] |