1 | !636 |
2 | //1000300 // Lexicon |
3 | //1000515 // Lexicon, fixing |
4 | !quicknew |
5 | |
6 | class JavaTok { |
7 | static String join(List<String> cnc) { |
8 | new StringBuilder buf; |
9 | for (String s : cnc) buf.append(s); |
10 | return buf.toString(); |
11 | } |
12 | |
13 | static List<String> split(String src) { |
14 | Java20 lex = new Java20(); |
15 | src = src.replace("\r\n", "\n"); |
16 | LineNumberReader source = new LineNumberReader(new StringReader(src)); |
17 | int lineNr = source.getLineNumber()+1; |
18 | List<T> list = new ArrayList<T>(); |
19 | try { |
20 | for (Object a; (a = lex.grab(source)) != lex.$;) { |
21 | String word = lex.word(); |
22 | String q = quote(word); |
23 | //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q); |
24 | lineNr = source.getLineNumber()+1; |
25 | |
26 | T t = new T(a, word); |
27 | boolean isSpace = t.isSpace(); |
28 | if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace()) |
29 | list.get(list.size()-1).word += word; // merge spaces |
30 | else |
31 | list.add(t); |
32 | } |
33 | } catch (Lexicon.Exception e) { |
34 | throw new RuntimeException(e); |
35 | } |
36 | |
37 | List<String> cnc = new ArrayList<String>(); |
38 | for (int i = 0; i < list.size(); ) { |
39 | T t = list.get(i); |
40 | boolean shouldBeSpace = (cnc.size() % 2) == 0; |
41 | boolean isSpace = t.isSpace(); |
42 | if (shouldBeSpace == isSpace) { |
43 | cnc.add(t.word); |
44 | ++i; |
45 | } else if (shouldBeSpace) |
46 | cnc.add(""); |
47 | else { |
48 | System.out.println(cncToLines(cnc)); |
49 | throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word)); |
50 | } |
51 | } |
52 | if ((cnc.size() % 2) == 0) |
53 | cnc.add(""); |
54 | |
55 | return cnc; |
56 | } |
57 | |
58 | static class T { |
59 | Object a; String word; |
60 | |
61 | T(Object a, String word) { this.a = a; this.word = word; } |
62 | |
63 | boolean isSpace() { |
64 | return a.equals("WHITE_SPACE") || a.equals("COMMENT"); |
65 | } |
66 | } |
67 | |
68 | static String cncToLines(List<String> cnc) { |
69 | StringBuilder out = new StringBuilder(); |
70 | for (String token : cnc) |
71 | out.append(quote(token) + "\n"); |
72 | return out.toString(); |
73 | } |
74 | |
75 | public static String quote(String s) { |
76 | if (s == null) return "null"; |
77 | return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\""; |
78 | } |
79 | |
80 | static class Java20 extends Lexicon { |
81 | |
82 | Java20() { |
83 | |
84 | /** |
85 | * Grammar for Java 2.0. |
86 | * |
87 | * Nonterminal - first letter uppercase |
88 | * TERMINAL - all letters uppercase |
89 | * keyword - all letters lowercase |
90 | */ |
91 | int INFINITY = -1; |
92 | |
93 | /** |
94 | * 19.3 Terminals from section 3.6: White Space: [[:space:]] |
95 | */ |
96 | put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY)); |
97 | |
98 | /** |
99 | * 19.3 Terminals from section 3.7: Comment |
100 | */ |
101 | put("COMMENT", new Union( |
102 | |
103 | // |
104 | // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/ |
105 | // |
106 | new Concatenation( |
107 | new Singleton("/*"), new Concatenation( |
108 | new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation( |
109 | new Repetition( |
110 | new Concatenation( |
111 | new Singleton("*"), |
112 | new Repetition(new Concatenation( |
113 | new NonMatch("*/"), |
114 | new Repetition(new NonMatch("*"), 0, INFINITY) |
115 | ), 0, 1) |
116 | ), 0, INFINITY |
117 | ), |
118 | new Singleton("*/") |
119 | ))), new Union( |
120 | |
121 | /** |
122 | * End Of Line Comment: //[^\n]*\n |
123 | */ |
124 | new Concatenation( |
125 | new Singleton("//"), new Concatenation( |
126 | new Repetition(new NonMatch("\n"), 0, INFINITY), |
127 | new Singleton("\n") |
128 | )), |
129 | |
130 | // |
131 | // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/ |
132 | // |
133 | new Concatenation( |
134 | new Singleton("/**"), new Concatenation( |
135 | new Repetition( |
136 | new Concatenation( |
137 | new Repetition(new Concatenation( |
138 | new NonMatch("*/"), |
139 | new Repetition(new NonMatch("*"), 0, INFINITY) |
140 | ), 0, 1), |
141 | new Singleton("*") |
142 | ), 0, INFINITY |
143 | ), |
144 | new Singleton("/") |
145 | )) |
146 | ))); |
147 | |
148 | put("IDENTIFIER", new Concatenation( |
149 | new Union( |
150 | PosixClass.alpha(), |
151 | new Match("_$") |
152 | ), |
153 | new Repetition( |
154 | new Union( |
155 | PosixClass.alnum(), |
156 | new Match("_$") |
157 | ), 0, INFINITY |
158 | ) |
159 | )); |
160 | |
161 | /** |
162 | * 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar) |
163 | */ |
164 | put("KEYWORD", new Union( |
165 | new Singleton("const"), |
166 | new Singleton("goto") |
167 | )); |
168 | |
169 | /** |
170 | * 19.3 Terminals from section 3.10.1: Integer Literal |
171 | */ |
172 | put("INTEGER_LITERAL", new Concatenation( |
173 | new Union( |
174 | /** |
175 | * Decimal Integer Literal: 0|[1-9][[:digit:]]* |
176 | */ |
177 | new Singleton("0"), new Union( |
178 | |
179 | new Concatenation( |
180 | new Range('1', '9'), |
181 | new Repetition(PosixClass.digit(), 0, INFINITY) |
182 | ), new Union( |
183 | |
184 | /** |
185 | * Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+ |
186 | */ |
187 | new Concatenation( |
188 | new Singleton("0"), new Concatenation( |
189 | new Match("xX"), |
190 | new Repetition(PosixClass.xdigit(), 1, INFINITY) |
191 | )), |
192 | |
193 | /** |
194 | * Octal Integer Literal: 0[0-7]+ |
195 | */ |
196 | new Concatenation( |
197 | new Singleton("0"), |
198 | new Repetition(new Range('0', '7'), 1, INFINITY) |
199 | ) |
200 | ))), |
201 | new Repetition(new Match("lL"), 0, 1) |
202 | )); |
203 | |
204 | /** |
205 | * 19.3 Terminals from section 3.10.2: Floating-Point Literal |
206 | */ |
207 | put("FLOATING_POINT_LITERAL", new Union( |
208 | |
209 | /** |
210 | * [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]? |
211 | */ |
212 | new Concatenation( |
213 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
214 | new Singleton("."), new Concatenation( |
215 | new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation( |
216 | new Repetition(new Concatenation( |
217 | new Match("eE"), new Concatenation( |
218 | new Repetition(new Match("-+"), 0, 1), |
219 | new Repetition(PosixClass.digit(), 1, INFINITY) |
220 | )), 0, 1), |
221 | new Repetition(new Match("fFdD"), 0, 1) |
222 | )))), new Union( |
223 | |
224 | /** |
225 | * \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]? |
226 | */ |
227 | new Concatenation( |
228 | new Singleton("."), new Concatenation( |
229 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
230 | new Repetition(new Concatenation( |
231 | new Match("eE"), new Concatenation( |
232 | new Repetition(new Match("-+"), 0, 1), |
233 | new Repetition(PosixClass.digit(), 1, INFINITY) |
234 | )), 0, 1), |
235 | new Repetition(new Match("fFdD"), 0, 1) |
236 | ))), new Union( |
237 | |
238 | /** |
239 | * [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]? |
240 | */ |
241 | new Concatenation( |
242 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
243 | new Match("eE"), new Concatenation( |
244 | new Repetition(new Match("-+"), 0, 1), new Concatenation( |
245 | new Repetition(PosixClass.digit(), 1, INFINITY), |
246 | new Repetition(new Match("fFdD"), 0, 1) |
247 | )))), |
248 | |
249 | /** |
250 | * [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD] |
251 | */ |
252 | new Concatenation( |
253 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
254 | new Repetition(new Concatenation( |
255 | new Match("eE"), new Concatenation( |
256 | new Repetition(new Match("-+"), 0, 1), |
257 | new Repetition(PosixClass.digit(), 1, INFINITY) |
258 | )), 0, 1), |
259 | new Match("fFdD") |
260 | )) |
261 | )))); |
262 | |
263 | /** |
264 | * 19.3 Terminals from section 3.10.3: Boolean Literal |
265 | */ |
266 | put("BOOLEAN_LITERAL", new Union( |
267 | new Singleton("true"), |
268 | new Singleton("false") |
269 | )); |
270 | |
271 | /** |
272 | * 19.3 Terminals from section 3.10.4: Character Literal |
273 | */ |
274 | put("CHARACTER_LITERAL", new Concatenation( |
275 | new Singleton("'"), new Concatenation( |
276 | new Union( |
277 | |
278 | /** |
279 | * Single Character: [^\r\n'\\] |
280 | */ |
281 | new NonMatch("\r\n'\\"), |
282 | |
283 | /** |
284 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) |
285 | */ |
286 | new Concatenation( |
287 | new Singleton("\\"), |
288 | new Union( |
289 | new Match("btnfr\"'\\"), |
290 | new Concatenation( |
291 | new Repetition(new Range('0', '3'), 0, 1), |
292 | new Repetition(new Range('0', '7'), 1, 2) |
293 | ) |
294 | ) |
295 | ) |
296 | ), |
297 | new Singleton("'") |
298 | ))); |
299 | |
300 | put("MULTILINE_LITERAL", new Concatenation( |
301 | new Singleton("[["), new Concatenation( |
302 | new Repetition( |
303 | new Union( |
304 | new NonMatch("]"), |
305 | new Concatenation( |
306 | new Singleton("]"), new NonMatch("]")) |
307 | ), 0, INFINITY |
308 | ), |
309 | new Singleton("]]") |
310 | ))); |
311 | |
312 | put("MULTILINE_LITERAL2", new Concatenation( |
313 | new Singleton("[=["), new Concatenation( |
314 | new Repetition( |
315 | new Union( |
316 | new NonMatch("]"), |
317 | new Concatenation(new Singleton("]"), new Union( |
318 | new NonMatch("="), |
319 | new Concatenation(new Singleton("="), new NonMatch("]")))) |
320 | ), 0, INFINITY |
321 | ), |
322 | new Singleton("]=]") |
323 | ))); |
324 | |
325 | /** |
326 | * 19.3 Terminals from section 3.10.5: String Literal |
327 | */ |
328 | put("STRING_LITERAL", new Concatenation( |
329 | new Singleton("\""), new Concatenation( |
330 | new Repetition( |
331 | new Union( |
332 | |
333 | /** |
334 | * Single Character: [^\r\n"\\] |
335 | */ |
336 | new NonMatch("\r\n\"\\"), |
337 | |
338 | /** |
339 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) |
340 | */ |
341 | new Concatenation( |
342 | new Singleton("\\"), |
343 | new Union( |
344 | new Match("btnfr\"'\\"), |
345 | new Union( |
346 | new Concatenation( |
347 | new Repetition(new Range('0', '3'), 0, 1), |
348 | new Repetition(new Range('0', '7'), 1, 2) |
349 | ), |
350 | new Concatenation( |
351 | new Singleton("u"), |
352 | new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4) |
353 | ) |
354 | ) |
355 | ) |
356 | ) |
357 | ), 0, INFINITY |
358 | ), |
359 | new Singleton("\"") |
360 | ))); |
361 | |
362 | /** |
363 | * 19.3 Terminals section 3.10.7: Null Literal |
364 | */ |
365 | put("NULL_LITERAL", new Singleton("null")); |
366 | |
367 | // OK, it seems we have to add some more stuff... |
368 | |
369 | //put("OTHER1", new Match(";{}=,<>[]().+-:|&!")); |
370 | //put("OTHER1", new NonMatch("")); // catch anything, one character at a time |
371 | put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time |
372 | |
373 | } |
374 | } // class Java20 |
375 | } |
376 | |
377 | !include #1000300 // Lexicon |
Began life as a copy of #1000353
Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000516 |
Snippet name: | Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings - pre-static |
Eternal ID of this version: | #1000516/1 |
Text MD5: | 11dd188dfda60a264725ec4c5a479a9e |
Author: | stefan |
Category: | javax |
Type: | Document |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-08-09 22:58:18 |
Source code size: | 9896 bytes / 377 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 604 / 129 |
Referenced in: | [show references] |