1 | !636
|
2 | //1000300 // Lexicon
|
3 | //1000515 // Lexicon, fixing
|
4 | !quicknew
|
5 |
|
6 | class JavaTok {
|
7 | static String join(List<String> cnc) {
|
8 | new StringBuilder buf;
|
9 | for (String s : cnc) buf.append(s);
|
10 | return buf.toString();
|
11 | }
|
12 |
|
13 | static List<String> split(String src) {
|
14 | Java20 lex = new Java20();
|
15 | src = src.replace("\r\n", "\n");
|
16 | LineNumberReader source = new LineNumberReader(new StringReader(src));
|
17 | int lineNr = source.getLineNumber()+1;
|
18 | List<T> list = new ArrayList<T>();
|
19 | try {
|
20 | for (Object a; (a = lex.grab(source)) != lex.$;) {
|
21 | String word = lex.word();
|
22 | String q = quote(word);
|
23 | //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
|
24 | lineNr = source.getLineNumber()+1;
|
25 |
|
26 | T t = new T(a, word);
|
27 | boolean isSpace = t.isSpace();
|
28 | if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
|
29 | list.get(list.size()-1).word += word; // merge spaces
|
30 | else
|
31 | list.add(t);
|
32 | }
|
33 | } catch (Lexicon.Exception e) {
|
34 | throw new RuntimeException(e);
|
35 | }
|
36 |
|
37 | List<String> cnc = new ArrayList<String>();
|
38 | for (int i = 0; i < list.size(); ) {
|
39 | T t = list.get(i);
|
40 | boolean shouldBeSpace = (cnc.size() % 2) == 0;
|
41 | boolean isSpace = t.isSpace();
|
42 | if (shouldBeSpace == isSpace) {
|
43 | cnc.add(t.word);
|
44 | ++i;
|
45 | } else if (shouldBeSpace)
|
46 | cnc.add("");
|
47 | else {
|
48 | System.out.println(cncToLines(cnc));
|
49 | throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
|
50 | }
|
51 | }
|
52 | if ((cnc.size() % 2) == 0)
|
53 | cnc.add("");
|
54 |
|
55 | return cnc;
|
56 | }
|
57 |
|
58 | static class T {
|
59 | Object a; String word;
|
60 |
|
61 | T(Object a, String word) { this.a = a; this.word = word; }
|
62 |
|
63 | boolean isSpace() {
|
64 | return a.equals("WHITE_SPACE") || a.equals("COMMENT");
|
65 | }
|
66 | }
|
67 |
|
68 | static String cncToLines(List<String> cnc) {
|
69 | StringBuilder out = new StringBuilder();
|
70 | for (String token : cnc)
|
71 | out.append(quote(token) + "\n");
|
72 | return out.toString();
|
73 | }
|
74 |
|
75 | public static String quote(String s) {
|
76 | if (s == null) return "null";
|
77 | return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
|
78 | }
|
79 |
|
80 | static class Java20 extends Lexicon {
|
81 |
|
82 | Java20() {
|
83 | /**
|
84 | * Grammar for Java 2.0.
|
85 | *
|
86 | * Nonterminal - first letter uppercase
|
87 | * TERMINAL - all letters uppercase
|
88 | * keyword - all letters lowercase
|
89 | */
|
90 | int INFINITY = -1;
|
91 |
|
92 | /**
|
93 | * 19.3 Terminals from section 3.6: White Space: [[:space:]]
|
94 | */
|
95 | put("WHITE_SPACE", new Repetition(space(), 1, INFINITY));
|
96 |
|
97 | /**
|
98 | * 19.3 Terminals from section 3.7: Comment
|
99 | */
|
100 | put("COMMENT", new Union(
|
101 |
|
102 | //
|
103 | // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
|
104 | //
|
105 | new Concatenation(
|
106 | new Singleton("/*"), new Concatenation(
|
107 | new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
|
108 | new Repetition(
|
109 | new Concatenation(
|
110 | new Singleton("*"),
|
111 | new Repetition(new Concatenation(
|
112 | new NonMatch("*/"),
|
113 | new Repetition(new NonMatch("*"), 0, INFINITY)
|
114 | ), 0, 1)
|
115 | ), 0, INFINITY
|
116 | ),
|
117 | new Singleton("*/")
|
118 | ))), new Union(
|
119 |
|
120 | /**
|
121 | * End Of Line Comment: //[^\n]*\n
|
122 | */
|
123 | new Concatenation(
|
124 | new Singleton("//"), new Concatenation(
|
125 | new Repetition(new NonMatch("\n"), 0, INFINITY),
|
126 | new Singleton("\n")
|
127 | )),
|
128 |
|
129 | //
|
130 | // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
|
131 | //
|
132 | new Concatenation(
|
133 | new Singleton("/**"), new Concatenation(
|
134 | new Repetition(
|
135 | new Concatenation(
|
136 | new Repetition(new Concatenation(
|
137 | new NonMatch("*/"),
|
138 | new Repetition(new NonMatch("*"), 0, INFINITY)
|
139 | ), 0, 1),
|
140 | new Singleton("*")
|
141 | ), 0, INFINITY
|
142 | ),
|
143 | new Singleton("/")
|
144 | ))
|
145 | )));
|
146 |
|
147 | put("IDENTIFIER", new Concatenation(
|
148 | new Union(
|
149 | alpha(),
|
150 | new Match("_$")
|
151 | ),
|
152 | new Repetition(
|
153 | new Union(
|
154 | alnum(),
|
155 | new Match("_$")
|
156 | ), 0, INFINITY
|
157 | )
|
158 | ));
|
159 |
|
160 | /**
|
161 | * 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
|
162 | */
|
163 | put("KEYWORD", new Union(
|
164 | new Singleton("const"),
|
165 | new Singleton("goto")
|
166 | ));
|
167 |
|
168 | /**
|
169 | * 19.3 Terminals from section 3.10.1: Integer Literal
|
170 | */
|
171 | put("INTEGER_LITERAL", new Concatenation(
|
172 | new Union(
|
173 | /**
|
174 | * Decimal Integer Literal: 0|[1-9][[:digit:]]*
|
175 | */
|
176 | new Singleton("0"), new Union(
|
177 |
|
178 | new Concatenation(
|
179 | new Range('1', '9'),
|
180 | new Repetition(digit(), 0, INFINITY)
|
181 | ), new Union(
|
182 |
|
183 | /**
|
184 | * Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
|
185 | */
|
186 | new Concatenation(
|
187 | new Singleton("0"), new Concatenation(
|
188 | new Match("xX"),
|
189 | new Repetition(xdigit(), 1, INFINITY)
|
190 | )),
|
191 |
|
192 | /**
|
193 | * Octal Integer Literal: 0[0-7]+
|
194 | */
|
195 | new Concatenation(
|
196 | new Singleton("0"),
|
197 | new Repetition(new Range('0', '7'), 1, INFINITY)
|
198 | )
|
199 | ))),
|
200 | new Repetition(new Match("lL"), 0, 1)
|
201 | ));
|
202 |
|
203 | /**
|
204 | * 19.3 Terminals from section 3.10.2: Floating-Point Literal
|
205 | */
|
206 | put("FLOATING_POINT_LITERAL", new Union(
|
207 |
|
208 | /**
|
209 | * [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
|
210 | */
|
211 | new Concatenation(
|
212 | new Repetition(digit(), 1, INFINITY), new Concatenation(
|
213 | new Singleton("."), new Concatenation(
|
214 | new Repetition(digit(), 0, INFINITY), new Concatenation(
|
215 | new Repetition(new Concatenation(
|
216 | new Match("eE"), new Concatenation(
|
217 | new Repetition(new Match("-+"), 0, 1),
|
218 | new Repetition(digit(), 1, INFINITY)
|
219 | )), 0, 1),
|
220 | new Repetition(new Match("fFdD"), 0, 1)
|
221 | )))), new Union(
|
222 |
|
223 | /**
|
224 | * \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
|
225 | */
|
226 | new Concatenation(
|
227 | new Singleton("."), new Concatenation(
|
228 | new Repetition(digit(), 1, INFINITY), new Concatenation(
|
229 | new Repetition(new Concatenation(
|
230 | new Match("eE"), new Concatenation(
|
231 | new Repetition(new Match("-+"), 0, 1),
|
232 | new Repetition(digit(), 1, INFINITY)
|
233 | )), 0, 1),
|
234 | new Repetition(new Match("fFdD"), 0, 1)
|
235 | ))), new Union(
|
236 |
|
237 | /**
|
238 | * [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
|
239 | */
|
240 | new Concatenation(
|
241 | new Repetition(digit(), 1, INFINITY), new Concatenation(
|
242 | new Match("eE"), new Concatenation(
|
243 | new Repetition(new Match("-+"), 0, 1), new Concatenation(
|
244 | new Repetition(digit(), 1, INFINITY),
|
245 | new Repetition(new Match("fFdD"), 0, 1)
|
246 | )))),
|
247 |
|
248 | /**
|
249 | * [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
|
250 | */
|
251 | new Concatenation(
|
252 | new Repetition(digit(), 1, INFINITY), new Concatenation(
|
253 | new Repetition(new Concatenation(
|
254 | new Match("eE"), new Concatenation(
|
255 | new Repetition(new Match("-+"), 0, 1),
|
256 | new Repetition(digit(), 1, INFINITY)
|
257 | )), 0, 1),
|
258 | new Match("fFdD")
|
259 | ))
|
260 | ))));
|
261 |
|
262 | /**
|
263 | * 19.3 Terminals from section 3.10.3: Boolean Literal
|
264 | */
|
265 | put("BOOLEAN_LITERAL", new Union(
|
266 | new Singleton("true"),
|
267 | new Singleton("false")
|
268 | ));
|
269 |
|
270 | /**
|
271 | * 19.3 Terminals from section 3.10.4: Character Literal
|
272 | */
|
273 | put("CHARACTER_LITERAL", new Concatenation(
|
274 | new Singleton("'"), new Concatenation(
|
275 | new Union(
|
276 |
|
277 | /**
|
278 | * Single Character: [^\r\n'\\]
|
279 | */
|
280 | new NonMatch("\r\n'\\"),
|
281 |
|
282 | /**
|
283 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
|
284 | */
|
285 | new Concatenation(
|
286 | new Singleton("\\"),
|
287 | new Union(
|
288 | new Match("btnfr\"'\\"),
|
289 | new Concatenation(
|
290 | new Repetition(new Range('0', '3'), 0, 1),
|
291 | new Repetition(new Range('0', '7'), 1, 2)
|
292 | )
|
293 | )
|
294 | )
|
295 | ),
|
296 | new Singleton("'")
|
297 | )));
|
298 |
|
299 | put("MULTILINE_LITERAL", new Concatenation(
|
300 | new Singleton("[["), new Concatenation(
|
301 | new Repetition(
|
302 | new Union(
|
303 | new NonMatch("]"),
|
304 | new Concatenation(
|
305 | new Singleton("]"), new NonMatch("]"))
|
306 | ), 0, INFINITY
|
307 | ),
|
308 | new Singleton("]]")
|
309 | )));
|
310 |
|
311 | put("MULTILINE_LITERAL2", new Concatenation(
|
312 | new Singleton("[=["), new Concatenation(
|
313 | new Repetition(
|
314 | new Union(
|
315 | new NonMatch("]"),
|
316 | new Concatenation(new Singleton("]"), new Union(
|
317 | new NonMatch("="),
|
318 | new Concatenation(new Singleton("="), new NonMatch("]"))))
|
319 | ), 0, INFINITY
|
320 | ),
|
321 | new Singleton("]=]")
|
322 | )));
|
323 |
|
324 | /**
|
325 | * 19.3 Terminals from section 3.10.5: String Literal
|
326 | */
|
327 | put("STRING_LITERAL", new Concatenation(
|
328 | new Singleton("\""), new Concatenation(
|
329 | new Repetition(
|
330 | new Union(
|
331 |
|
332 | /**
|
333 | * Single Character: [^\r\n"\\]
|
334 | */
|
335 | new NonMatch("\r\n\"\\"),
|
336 |
|
337 | /**
|
338 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
|
339 | */
|
340 | new Concatenation(
|
341 | new Singleton("\\"),
|
342 | new Union(
|
343 | new Match("btnfr\"'\\"),
|
344 | new Union(
|
345 | new Concatenation(
|
346 | new Repetition(new Range('0', '3'), 0, 1),
|
347 | new Repetition(new Range('0', '7'), 1, 2)
|
348 | ),
|
349 | new Concatenation(
|
350 | new Singleton("u"),
|
351 | new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
|
352 | )
|
353 | )
|
354 | )
|
355 | )
|
356 | ), 0, INFINITY
|
357 | ),
|
358 | new Singleton("\"")
|
359 | )));
|
360 |
|
361 | /**
|
362 | * 19.3 Terminals section 3.10.7: Null Literal
|
363 | */
|
364 | put("NULL_LITERAL", new Singleton("null"));
|
365 |
|
366 | // OK, it seems we have to add some more stuff...
|
367 |
|
368 | //put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
|
369 | //put("OTHER1", new NonMatch("")); // catch anything, one character at a time
|
370 | put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
|
371 |
|
372 | }
|
373 | } // class Java20
|
374 | }
|
375 |
|
376 | !include #1000514 // Lexicon |