1 | !636 |
2 | !629 // standard functions |
3 | !1000300 // class Lexicon |
4 | |
5 | main { |
6 | psvm { |
7 | String src = takeInput(args, null); |
8 | Java20 lex = new Java20(); |
9 | src = src.replace("\r\n", "\n"); |
10 | LineNumberReader source = new LineNumberReader(new StringReader(src)); |
11 | int lineNr = source.getLineNumber()+1; |
12 | List<T> list = new ArrayList<T>(); |
13 | for (Object a; (a = lex.grab(source)) != lex.$;) { |
14 | String word = lex.word(); |
15 | String q = quote(word); |
16 | //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q); |
17 | lineNr = source.getLineNumber()+1; |
18 | |
19 | T t = new T(a, word); |
20 | boolean isSpace = t.isSpace(); |
21 | if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace()) |
22 | list.get(list.size()-1).word += word; // merge spaces |
23 | else |
24 | list.add(t); |
25 | } |
26 | |
27 | List<String> cnc = new ArrayList<String>(); |
28 | for (int i = 0; i < list.size(); ) { |
29 | T t = list.get(i); |
30 | boolean shouldBeSpace = (cnc.size() % 2) == 0; |
31 | boolean isSpace = t.isSpace(); |
32 | if (shouldBeSpace == isSpace) { |
33 | cnc.add(t.word); |
34 | ++i; |
35 | } else if (shouldBeSpace) |
36 | cnc.add(""); |
37 | else { |
38 | System.out.println(cncToLines(cnc)); |
39 | throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word)); |
40 | } |
41 | } |
42 | if ((cnc.size() % 2) == 0) |
43 | cnc.add(""); |
44 | |
45 | saveTextFile("output/output.txt", cncToLines(cnc)); |
46 | } |
47 | |
48 | static class T { |
49 | Object a; String word; |
50 | |
51 | T(Object a, String word) { this.a = a; this.word = word; } |
52 | |
53 | boolean isSpace() { |
54 | return a.equals("WHITE_SPACE") || a.equals("COMMENT"); |
55 | } |
56 | } |
57 | |
58 | static String cncToLines(List<String> cnc) { |
59 | StringBuilder out = new StringBuilder(); |
60 | for (String token : cnc) |
61 | out.append(quote(token) + "\n"); |
62 | return out.toString(); |
63 | } |
64 | |
65 | static String takeInput(String[] args, String def) tex { |
66 | if (args.length != 0) return loadSnippet(args[0]); |
67 | return loadTextFile("input/input.txt", def); |
68 | } |
69 | |
70 | public static String quote(String s) { |
71 | if (s == null) return "null"; |
72 | return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\""; |
73 | } |
74 | |
// Lexer definition: a Lexicon subclass whose constructor registers, via put(name, expression),
// one expression per terminal (WHITE_SPACE, COMMENT, IDENTIFIER, KEYWORD, the literal kinds, OTHER1).
// The psvm entry point drives it through grab()/word(); T.isSpace() depends on the exact
// names "WHITE_SPACE" and "COMMENT" registered here.
75 | static class Java20 extends Lexicon { |
76 | |
// Builds the whole token grammar. Most put() calls below carry a comment giving the
// regular expression the nested expression objects are meant to encode.
77 | Java20() { |
78 | |
79 | /** |
80 | * Grammar for Java 2.0. |
81 | * |
82 | * Nonterminal - first letter uppercase |
83 | * TERMINAL - all letters uppercase |
84 | * keyword - all letters lowercase |
85 | */ |
// Upper bound handed to Repetition wherever the regex comments show an unbounded '*' or '+'.
// Presumably Lexicon treats -1 as "no limit" -- TODO confirm against class Lexicon (#1000300).
86 | int INFINITY = -1; |
87 | |
88 | /** |
89 | * 19.3 Terminals from section 3.6: White Space: [[:space:]] |
90 | */ |
91 | put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY)); |
92 | |
93 | /** |
94 | * 19.3 Terminals from section 3.7: Comment |
95 | */ |
96 | put("COMMENT", new Union( |
97 | |
98 | // |
99 | // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/ |
100 | // |
// NOTE(review): the [^*]+ part needs at least one non-'*' character right after "/*",
// so "/**/" and anything starting with "/**" cannot match this alternative; such input
// is left to the Documentation Comment alternative further down.
101 | new Concatenation( |
102 | new Singleton("/*"), new Concatenation( |
103 | new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation( |
104 | new Repetition( |
105 | new Concatenation( |
106 | new Singleton("*"), |
107 | new Repetition(new Concatenation( |
108 | new NonMatch("*/"), |
109 | new Repetition(new NonMatch("*"), 0, INFINITY) |
110 | ), 0, 1) |
111 | ), 0, INFINITY |
112 | ), |
113 | new Singleton("*/") |
114 | ))), new Union( |
115 | |
116 | /** |
117 | * End Of Line Comment: //[^\n]*\n |
118 | */ |
// NOTE(review): the trailing "\n" is mandatory and becomes part of the token, so a
// "//" comment on a final line that lacks a newline does not match this alternative.
119 | new Concatenation( |
120 | new Singleton("//"), new Concatenation( |
121 | new Repetition(new NonMatch("\n"), 0, INFINITY), |
122 | new Singleton("\n") |
123 | )), |
124 | |
125 | // |
126 | // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/ |
127 | // |
128 | new Concatenation( |
129 | new Singleton("/**"), new Concatenation( |
130 | new Repetition( |
131 | new Concatenation( |
132 | new Repetition(new Concatenation( |
133 | new NonMatch("*/"), |
134 | new Repetition(new NonMatch("*"), 0, INFINITY) |
135 | ), 0, 1), |
136 | new Singleton("*") |
137 | ), 0, INFINITY |
138 | ), |
139 | new Singleton("/") |
140 | )) |
141 | ))); |
142 | |
// Identifier, in the notation of the other comments: ([[:alpha:]]|[_$])([[:alnum:]]|[_$])*
// (assuming Match("_$") accepts any one of the listed characters, as Match("xX") does for [xX] below).
// NOTE(review): the KEYWORD, BOOLEAN_LITERAL and NULL_LITERAL words also fit this pattern;
// how Lexicon chooses between overlapping terminals is not visible here -- confirm.
143 | put("IDENTIFIER", new Concatenation( |
144 | new Union( |
145 | PosixClass.alpha(), |
146 | new Match("_$") |
147 | ), |
148 | new Repetition( |
149 | new Union( |
150 | PosixClass.alnum(), |
151 | new Match("_$") |
152 | ), 0, INFINITY |
153 | ) |
154 | )); |
155 | |
156 | /** |
157 | * 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar) |
158 | */ |
159 | put("KEYWORD", new Union( |
160 | new Singleton("const"), |
161 | new Singleton("goto") |
162 | )); |
163 | |
164 | /** |
165 | * 19.3 Terminals from section 3.10.1: Integer Literal |
166 | */ |
// Shape: (decimal | hexadecimal | octal) followed by an optional [lL] suffix.
167 | put("INTEGER_LITERAL", new Concatenation( |
168 | new Union( |
169 | /** |
170 | * Decimal Integer Literal: 0|[1-9][[:digit:]]* |
171 | */ |
172 | new Singleton("0"), new Union( |
173 | |
174 | new Concatenation( |
175 | new Range('1', '9'), |
176 | new Repetition(PosixClass.digit(), 0, INFINITY) |
177 | ), new Union( |
178 | |
179 | /** |
180 | * Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+ |
181 | */ |
182 | new Concatenation( |
183 | new Singleton("0"), new Concatenation( |
184 | new Match("xX"), |
185 | new Repetition(PosixClass.xdigit(), 1, INFINITY) |
186 | )), |
187 | |
188 | /** |
189 | * Octal Integer Literal: 0[0-7]+ |
190 | */ |
191 | new Concatenation( |
192 | new Singleton("0"), |
193 | new Repetition(new Range('0', '7'), 1, INFINITY) |
194 | ) |
195 | ))), |
196 | new Repetition(new Match("lL"), 0, 1) |
197 | )); |
198 | |
199 | /** |
200 | * 19.3 Terminals from section 3.10.2: Floating-Point Literal |
201 | */ |
// Four alternatives, each documented with its regular expression below.
202 | put("FLOATING_POINT_LITERAL", new Union( |
203 | |
204 | /** |
205 | * [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]? |
206 | */ |
207 | new Concatenation( |
208 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
209 | new Singleton("."), new Concatenation( |
210 | new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation( |
211 | new Repetition(new Concatenation( |
212 | new Match("eE"), new Concatenation( |
213 | new Repetition(new Match("-+"), 0, 1), |
214 | new Repetition(PosixClass.digit(), 1, INFINITY) |
215 | )), 0, 1), |
216 | new Repetition(new Match("fFdD"), 0, 1) |
217 | )))), new Union( |
218 | |
219 | /** |
220 | * \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]? |
221 | */ |
222 | new Concatenation( |
223 | new Singleton("."), new Concatenation( |
224 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
225 | new Repetition(new Concatenation( |
226 | new Match("eE"), new Concatenation( |
227 | new Repetition(new Match("-+"), 0, 1), |
228 | new Repetition(PosixClass.digit(), 1, INFINITY) |
229 | )), 0, 1), |
230 | new Repetition(new Match("fFdD"), 0, 1) |
231 | ))), new Union( |
232 | |
233 | /** |
234 | * [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]? |
235 | */ |
236 | new Concatenation( |
237 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
238 | new Match("eE"), new Concatenation( |
239 | new Repetition(new Match("-+"), 0, 1), new Concatenation( |
240 | new Repetition(PosixClass.digit(), 1, INFINITY), |
241 | new Repetition(new Match("fFdD"), 0, 1) |
242 | )))), |
243 | |
244 | /** |
245 | * [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD] |
246 | */ |
247 | new Concatenation( |
248 | new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation( |
249 | new Repetition(new Concatenation( |
250 | new Match("eE"), new Concatenation( |
251 | new Repetition(new Match("-+"), 0, 1), |
252 | new Repetition(PosixClass.digit(), 1, INFINITY) |
253 | )), 0, 1), |
254 | new Match("fFdD") |
255 | )) |
256 | )))); |
257 | |
258 | /** |
259 | * 19.3 Terminals from section 3.10.3: Boolean Literal |
260 | */ |
261 | put("BOOLEAN_LITERAL", new Union( |
262 | new Singleton("true"), |
263 | new Singleton("false") |
264 | )); |
265 | |
266 | /** |
267 | * 19.3 Terminals from section 3.10.4: Character Literal |
268 | */ |
// Shape: a single quote, exactly one plain character or escape sequence, a single quote.
269 | put("CHARACTER_LITERAL", new Concatenation( |
270 | new Singleton("'"), new Concatenation( |
271 | new Union( |
272 | |
273 | /** |
274 | * Single Character: [^\r\n'\\] |
275 | */ |
276 | new NonMatch("\r\n'\\"), |
277 | |
278 | /** |
279 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) |
280 | */ |
281 | new Concatenation( |
282 | new Singleton("\\"), |
283 | new Union( |
284 | new Match("btnfr\"'\\"), |
285 | new Concatenation( |
286 | new Repetition(new Range('0', '3'), 0, 1), |
287 | new Repetition(new Range('0', '7'), 1, 2) |
288 | ) |
289 | ) |
290 | ) |
291 | ), |
292 | new Singleton("'") |
293 | ))); |
294 | |
295 | /** |
296 | * 19.3 Terminals from section 3.10.5: String Literal |
297 | */ |
// Shape: a double quote, zero or more plain characters or escape sequences, a double quote.
298 | put("STRING_LITERAL", new Concatenation( |
299 | new Singleton("\""), new Concatenation( |
300 | new Repetition( |
301 | new Union( |
302 | |
303 | /** |
304 | * Single Character: [^\r\n"\\] |
305 | */ |
306 | new NonMatch("\r\n\"\\"), |
307 | |
308 | /** |
309 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) |
310 | */ |
311 | new Concatenation( |
312 | new Singleton("\\"), |
313 | new Union( |
314 | new Match("btnfr\"'\\"), |
315 | new Concatenation( |
316 | new Repetition(new Range('0', '3'), 0, 1), |
317 | new Repetition(new Range('0', '7'), 1, 2) |
318 | ) |
319 | ) |
320 | ) |
321 | ), 0, INFINITY |
322 | ), |
323 | new Singleton("\"") |
324 | ))); |
325 | |
326 | /** |
327 | * 19.3 Terminals section 3.10.7: Null Literal |
328 | */ |
329 | put("NULL_LITERAL", new Singleton("null")); |
330 | |
331 | // OK, it seems we have to add some more stuff... |
332 | |
// Catch-all terminal for operators, separators and anything else not covered above.
// The two commented-out lines are earlier variants that were tried.
333 | //put("OTHER1", new Match(";{}=,<>[]().+-:|&!")); |
334 | //put("OTHER1", new NonMatch("")); // catch anything, one character at a time |
335 | put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time |
336 | |
337 | } |
338 | } // class Java20 |
339 | } |
Began life as a copy of #648
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #651 |
Snippet name: | Official Java tokenizer |
Eternal ID of this version: | #651/1 |
Text MD5: | 6fb38b25def5b2c4b9574b4126255ea9 |
Author: | stefan |
Category: | javax |
Type: | JavaX (input.txt to output.txt) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-06-27 18:01:15 |
Source code size: | 8967 bytes / 339 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 736 / 585 |
Referenced in: | [show references] |