Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

339
LINES

< > BotCompany Repo | #651 // Official Java tokenizer

JavaX (input.txt to output.txt)

!636
!629 // standard functions
!1000300 // class Lexicon

main {
  psvm {
7  
    String src = takeInput(args, null);
8  
    Java20 lex = new Java20();
9  
    src = src.replace("\r\n", "\n");
10  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
11  
    int lineNr = source.getLineNumber()+1;
12  
    List<T> list = new ArrayList<T>();
13  
    for (Object a; (a = lex.grab(source)) != lex.$;) {
14  
      String word = lex.word();
15  
      String q = quote(word);
16  
      //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
17  
      lineNr = source.getLineNumber()+1;
18  
      
19  
      T t = new T(a, word);
20  
      boolean isSpace = t.isSpace();
21  
      if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
22  
        list.get(list.size()-1).word += word; // merge spaces
23  
      else
24  
        list.add(t);
25  
    }
26  
    
27  
    List<String> cnc = new ArrayList<String>();
28  
    for (int i = 0; i < list.size(); ) {
29  
      T t = list.get(i);
30  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
31  
      boolean isSpace = t.isSpace();
32  
      if (shouldBeSpace == isSpace) {
33  
        cnc.add(t.word);
34  
        ++i;
35  
      } else if (shouldBeSpace)
36  
        cnc.add("");
37  
      else {
38  
        System.out.println(cncToLines(cnc));
39  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
40  
      }
41  
    }
42  
    if ((cnc.size() % 2) == 0)
43  
      cnc.add("");
44  
45  
    saveTextFile("output/output.txt", cncToLines(cnc));
46  
  }
  
  static class T {
49  
    Object a; String word;
50  
    
51  
    T(Object a, String word) { this.a = a; this.word = word; }
52  
    
53  
    boolean isSpace() {
54  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
55  
    }
56  
  }
  
  static String cncToLines(List<String> cnc) {
59  
    StringBuilder out = new StringBuilder();
60  
    for (String token : cnc)
61  
      out.append(quote(token) + "\n");
62  
    return out.toString();
63  
  }
  
  /**
   * Fetches the text to tokenize.
   *
   * If at least one command line argument is given, the first one is passed
   * to loadSnippet(); otherwise input/input.txt is read via loadTextFile().
   * NOTE(review): both helpers are defined outside this file; presumably
   * loadTextFile returns def when the file is missing - confirm.
   * The trailing tex is JavaX shorthand (presumably "throws Exception").
   *
   * @param args command line arguments; null is treated like an empty array
   * @param def  fallback handed to loadTextFile
   */
  static String takeInput(String[] args, String def) tex {
    if (args != null && args.length != 0) return loadSnippet(args[0]);
    return loadTextFile("input/input.txt", def);
  }
  
  public static String quote(String s) {
71  
    if (s == null) return "null";
72  
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
73  
  }
  
  static class Java20 extends Lexicon {
76  
77  
	Java20() {
78  
79  
		/**
80  
		* Grammar for Java 2.0.
81  
		*
82  
		* Nonterminal - first letter uppercase
83  
		* TERMINAL - all letters uppercase
84  
		* keyword - all letters lowercase
85  
		*/
86  
		int INFINITY = -1;
87  
88  
		/**
89  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
90  
		*/
91  
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
92  
93  
		/**
94  
		* 19.3 Terminals from section 3.7: Comment
95  
		*/
96  
		put("COMMENT", new Union(
97  
98  
			//
99  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
100  
			//
101  
			new Concatenation(
102  
				new Singleton("/*"), new Concatenation(
103  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
104  
				new Repetition(
105  
					new Concatenation(
106  
						new Singleton("*"),
107  
						new Repetition(new Concatenation(
108  
							new NonMatch("*/"),
109  
							new Repetition(new NonMatch("*"), 0, INFINITY)
110  
						), 0, 1)
111  
					), 0, INFINITY
112  
				),
113  
				new Singleton("*/")
114  
			))), new Union(
115  
116  
			/**
117  
			* End Of Line Comment: //[^\n]*\n
118  
			*/
119  
			new Concatenation(
120  
				new Singleton("//"), new Concatenation(
121  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
122  
				new Singleton("\n")
123  
			)),
124  
125  
			//
126  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
127  
			//
128  
			new Concatenation(
129  
				new Singleton("/**"), new Concatenation(
130  
				new Repetition(
131  
					new Concatenation(
132  
						new Repetition(new Concatenation(
133  
							new NonMatch("*/"),
134  
							new Repetition(new NonMatch("*"), 0, INFINITY)
135  
						), 0, 1),
136  
						new Singleton("*")
137  
					), 0, INFINITY
138  
				),
139  
				new Singleton("/")
140  
			))
141  
		)));
142  
143  
		put("IDENTIFIER", new Concatenation(
144  
			new Union(
145  
				PosixClass.alpha(),
146  
				new Match("_$")
147  
			),
148  
			new Repetition(
149  
				new Union(
150  
					PosixClass.alnum(),
151  
					new Match("_$")
152  
				), 0, INFINITY
153  
			)
154  
		));
155  
156  
		/**
157  
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
158  
		*/
159  
		put("KEYWORD", new Union(
160  
			new Singleton("const"),
161  
			new Singleton("goto")
162  
		));
163  
164  
		/**
165  
		* 19.3 Terminals from section 3.10.1: Integer Literal
166  
		*/
167  
		put("INTEGER_LITERAL", new Concatenation(
168  
			new Union(
169  
				/**
170  
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
171  
				*/
172  
				new Singleton("0"), new Union(
173  
174  
				new Concatenation(
175  
					new Range('1', '9'),
176  
					new Repetition(PosixClass.digit(), 0, INFINITY)
177  
				), new Union(
178  
179  
				/**
180  
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
181  
				*/
182  
				new Concatenation(
183  
					new Singleton("0"), new Concatenation(
184  
					new Match("xX"),
185  
					new Repetition(PosixClass.xdigit(), 1, INFINITY)
186  
				)),
187  
188  
				/**
189  
				* Octal Integer Literal: 0[0-7]+
190  
				*/
191  
				new Concatenation(
192  
					new Singleton("0"),
193  
					new Repetition(new Range('0', '7'), 1, INFINITY)
194  
				)
195  
			))),
196  
			new Repetition(new Match("lL"), 0, 1)
197  
		));
198  
199  
		/**
200  
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
201  
		*/
202  
		put("FLOATING_POINT_LITERAL", new Union(
203  
204  
			/**
205  
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
206  
			*/
207  
			new Concatenation(
208  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
209  
				new Singleton("."), new Concatenation(
210  
				new Repetition(PosixClass.digit(), 0, INFINITY), new Concatenation(
211  
				new Repetition(new Concatenation(
212  
					new Match("eE"), new Concatenation(
213  
					new Repetition(new Match("-+"), 0, 1),
214  
					new Repetition(PosixClass.digit(), 1, INFINITY)
215  
				)), 0, 1),
216  
				new Repetition(new Match("fFdD"), 0, 1)
217  
			)))), new Union(
218  
219  
			/**
220  
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
221  
			*/
222  
			new Concatenation(
223  
				new Singleton("."), new Concatenation(
224  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
225  
				new Repetition(new Concatenation(
226  
					new Match("eE"), new Concatenation(
227  
					new Repetition(new Match("-+"), 0, 1),
228  
					new Repetition(PosixClass.digit(), 1, INFINITY)
229  
				)), 0, 1),
230  
				new Repetition(new Match("fFdD"), 0, 1)
231  
			))), new Union(
232  
233  
			/**
234  
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
235  
			*/
236  
			new Concatenation(
237  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
238  
				new Match("eE"), new Concatenation(
239  
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
240  
				new Repetition(PosixClass.digit(), 1, INFINITY),
241  
				new Repetition(new Match("fFdD"), 0, 1)
242  
			)))),
243  
244  
			/**
245  
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
246  
			*/
247  
			new Concatenation(
248  
				new Repetition(PosixClass.digit(), 1, INFINITY), new Concatenation(
249  
				new Repetition(new Concatenation(
250  
					new Match("eE"), new Concatenation(
251  
					new Repetition(new Match("-+"), 0, 1),
252  
					new Repetition(PosixClass.digit(), 1, INFINITY)
253  
				)), 0, 1),
254  
				new Match("fFdD")
255  
			))
256  
		))));
257  
258  
		/**
259  
		* 19.3 Terminals from section 3.10.3: Boolean Literal
260  
		*/
261  
		put("BOOLEAN_LITERAL", new Union(
262  
			new Singleton("true"),
263  
			new Singleton("false")
264  
		));
265  
266  
		/**
267  
		* 19.3 Terminals from section 3.10.4: Character Literal
268  
		*/
269  
		put("CHARACTER_LITERAL", new Concatenation(
270  
			new Singleton("'"), new Concatenation(
271  
			new Union(
272  
273  
				/**
274  
				* Single Character: [^\r\n'\\]
275  
				*/
276  
				new NonMatch("\r\n'\\"),
277  
278  
				/**
279  
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
280  
				*/
281  
				new Concatenation(
282  
					new Singleton("\\"),
283  
					new Union(
284  
						new Match("btnfr\"'\\"),
285  
						new Concatenation(
286  
							new Repetition(new Range('0', '3'), 0, 1),
287  
							new Repetition(new Range('0', '7'), 1, 2)
288  
						)
289  
					)
290  
				)
291  
			),
292  
			new Singleton("'")
293  
		)));
294  
295  
		/**
296  
		* 19.3 Terminals from section 3.10.5: String Literal
297  
		*/
298  
		put("STRING_LITERAL", new Concatenation(
299  
			new Singleton("\""), new Concatenation(
300  
			new Repetition(
301  
				new Union(
302  
303  
					/**
304  
					* Single Character: [^\r\n"\\]
305  
					*/
306  
					new NonMatch("\r\n\"\\"),
307  
308  
					/**
309  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
310  
					*/
311  
					new Concatenation(
312  
						new Singleton("\\"),
313  
						new Union(
314  
							new Match("btnfr\"'\\"),
315  
							new Concatenation(
316  
								new Repetition(new Range('0', '3'), 0, 1),
317  
								new Repetition(new Range('0', '7'), 1, 2)
318  
							)
319  
						)
320  
					)
321  
				), 0, INFINITY
322  
			),
323  
			new Singleton("\"")
324  
		)));
325  
326  
		/**
327  
		* 19.3 Terminals section 3.10.7: Null Literal
328  
		*/
329  
		put("NULL_LITERAL", new Singleton("null"));
330  
		
331  
		// OK, it seems we have to add some more stuff...
332  
		
333  
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
334  
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
335  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
336  
337  
	}
338  
} // class Java20
}

Author comment

Began life as a copy of #648

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #651
Snippet name: Official Java tokenizer
Eternal ID of this version: #651/1
Text MD5: 6fb38b25def5b2c4b9574b4126255ea9
Author: stefan
Category: javax
Type: JavaX (input.txt to output.txt)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-06-27 18:01:15
Source code size: 8967 bytes / 339 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 736 / 585
Referenced in: [show references]