Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

376
LINES

< > BotCompany Repo | #1000353 // Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings

Document

1  
!636
2  
//1000300 // Lexicon
3  
//1000515 // Lexicon, fixing
4  
!quicknew
5  
6  
class JavaTok {
7  
  static String join(List<String> cnc) {
8  
    new StringBuilder buf;
9  
    for (String s : cnc) buf.append(s);
10  
    return buf.toString();
11  
  }
12  
  
13  
  static List<String> split(String src) {
14  
    Java20 lex = new Java20();
15  
    src = src.replace("\r\n", "\n");
16  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
17  
    int lineNr = source.getLineNumber()+1;
18  
    List<T> list = new ArrayList<T>();
19  
    try {
20  
      for (Object a; (a = lex.grab(source)) != lex.$;) {
21  
        String word = lex.word();
22  
        String q = quote(word);
23  
        //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
24  
        lineNr = source.getLineNumber()+1;
25  
        
26  
        T t = new T(a, word);
27  
        boolean isSpace = t.isSpace();
28  
        if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
29  
          list.get(list.size()-1).word += word; // merge spaces
30  
        else
31  
          list.add(t);
32  
      }
33  
    } catch (Lexicon.Exception e) {
34  
      throw new RuntimeException(e);
35  
    }
36  
    
37  
    List<String> cnc = new ArrayList<String>();
38  
    for (int i = 0; i < list.size(); ) {
39  
      T t = list.get(i);
40  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
41  
      boolean isSpace = t.isSpace();
42  
      if (shouldBeSpace == isSpace) {
43  
        cnc.add(t.word);
44  
        ++i;
45  
      } else if (shouldBeSpace)
46  
        cnc.add("");
47  
      else {
48  
        System.out.println(cncToLines(cnc));
49  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
50  
      }
51  
    }
52  
    if ((cnc.size() % 2) == 0)
53  
      cnc.add("");
54  
55  
    return cnc;
56  
  }
57  
  
58  
  static class T {
59  
    Object a; String word;
60  
    
61  
    T(Object a, String word) { this.a = a; this.word = word; }
62  
    
63  
    boolean isSpace() {
64  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
65  
    }
66  
  }
67  
  
68  
  static String cncToLines(List<String> cnc) {
69  
    StringBuilder out = new StringBuilder();
70  
    for (String token : cnc)
71  
      out.append(quote(token) + "\n");
72  
    return out.toString();
73  
  }
74  
  
75  
  public static String quote(String s) {
76  
    if (s == null) return "null";
77  
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
78  
  }
79  
  
80  
  static class Java20 extends Lexicon {
81  
82  
	Java20() {
83  
		/**
84  
		* Grammar for Java 2.0.
85  
		*
86  
		* Nonterminal - first letter uppercase
87  
		* TERMINAL - all letters uppercase
88  
		* keyword - all letters lowercase
89  
		*/
90  
		int INFINITY = -1;
91  
92  
		/**
93  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
94  
		*/
95  
		put("WHITE_SPACE", new Repetition(space(), 1, INFINITY));
96  
97  
		/**
98  
		* 19.3 Terminals from section 3.7: Comment
99  
		*/
100  
		put("COMMENT", new Union(
101  
102  
			//
103  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
104  
			//
105  
			new Concatenation(
106  
				new Singleton("/*"), new Concatenation(
107  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
108  
				new Repetition(
109  
					new Concatenation(
110  
						new Singleton("*"),
111  
						new Repetition(new Concatenation(
112  
							new NonMatch("*/"),
113  
							new Repetition(new NonMatch("*"), 0, INFINITY)
114  
						), 0, 1)
115  
					), 0, INFINITY
116  
				),
117  
				new Singleton("*/")
118  
			))), new Union(
119  
120  
			/**
121  
			* End Of Line Comment: //[^\n]*\n
122  
			*/
123  
			new Concatenation(
124  
				new Singleton("//"), new Concatenation(
125  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
126  
				new Singleton("\n")
127  
			)),
128  
129  
			//
130  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
131  
			//
132  
			new Concatenation(
133  
				new Singleton("/**"), new Concatenation(
134  
				new Repetition(
135  
					new Concatenation(
136  
						new Repetition(new Concatenation(
137  
							new NonMatch("*/"),
138  
							new Repetition(new NonMatch("*"), 0, INFINITY)
139  
						), 0, 1),
140  
						new Singleton("*")
141  
					), 0, INFINITY
142  
				),
143  
				new Singleton("/")
144  
			))
145  
		)));
146  
147  
		put("IDENTIFIER", new Concatenation(
148  
			new Union(
149  
				alpha(),
150  
				new Match("_$")
151  
			),
152  
			new Repetition(
153  
				new Union(
154  
					alnum(),
155  
					new Match("_$")
156  
				), 0, INFINITY
157  
			)
158  
		));
159  
160  
		/**
161  
		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
162  
		*/
163  
		put("KEYWORD", new Union(
164  
			new Singleton("const"),
165  
			new Singleton("goto")
166  
		));
167  
168  
		/**
169  
		* 19.3 Terminals from section 3.10.1: Integer Literal
170  
		*/
171  
		put("INTEGER_LITERAL", new Concatenation(
172  
			new Union(
173  
				/**
174  
				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
175  
				*/
176  
				new Singleton("0"), new Union(
177  
178  
				new Concatenation(
179  
					new Range('1', '9'),
180  
					new Repetition(digit(), 0, INFINITY)
181  
				), new Union(
182  
183  
				/**
184  
				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
185  
				*/
186  
				new Concatenation(
187  
					new Singleton("0"), new Concatenation(
188  
					new Match("xX"),
189  
					new Repetition(xdigit(), 1, INFINITY)
190  
				)),
191  
192  
				/**
193  
				* Octal Integer Literal: 0[0-7]+
194  
				*/
195  
				new Concatenation(
196  
					new Singleton("0"),
197  
					new Repetition(new Range('0', '7'), 1, INFINITY)
198  
				)
199  
			))),
200  
			new Repetition(new Match("lL"), 0, 1)
201  
		));
202  
203  
		/**
204  
		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
205  
		*/
206  
		put("FLOATING_POINT_LITERAL", new Union(
207  
208  
			/**
209  
			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
210  
			*/
211  
			new Concatenation(
212  
				new Repetition(digit(), 1, INFINITY), new Concatenation(
213  
				new Singleton("."), new Concatenation(
214  
				new Repetition(digit(), 0, INFINITY), new Concatenation(
215  
				new Repetition(new Concatenation(
216  
					new Match("eE"), new Concatenation(
217  
					new Repetition(new Match("-+"), 0, 1),
218  
					new Repetition(digit(), 1, INFINITY)
219  
				)), 0, 1),
220  
				new Repetition(new Match("fFdD"), 0, 1)
221  
			)))), new Union(
222  
223  
			/**
224  
			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
225  
			*/
226  
			new Concatenation(
227  
				new Singleton("."), new Concatenation(
228  
				new Repetition(digit(), 1, INFINITY), new Concatenation(
229  
				new Repetition(new Concatenation(
230  
					new Match("eE"), new Concatenation(
231  
					new Repetition(new Match("-+"), 0, 1),
232  
					new Repetition(digit(), 1, INFINITY)
233  
				)), 0, 1),
234  
				new Repetition(new Match("fFdD"), 0, 1)
235  
			))), new Union(
236  
237  
			/**
238  
			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
239  
			*/
240  
			new Concatenation(
241  
				new Repetition(digit(), 1, INFINITY), new Concatenation(
242  
				new Match("eE"), new Concatenation(
243  
				new Repetition(new Match("-+"), 0, 1), new Concatenation(
244  
				new Repetition(digit(), 1, INFINITY),
245  
				new Repetition(new Match("fFdD"), 0, 1)
246  
			)))),
247  
248  
			/**
249  
			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
250  
			*/
251  
			new Concatenation(
252  
				new Repetition(digit(), 1, INFINITY), new Concatenation(
253  
				new Repetition(new Concatenation(
254  
					new Match("eE"), new Concatenation(
255  
					new Repetition(new Match("-+"), 0, 1),
256  
					new Repetition(digit(), 1, INFINITY)
257  
				)), 0, 1),
258  
				new Match("fFdD")
259  
			))
260  
		))));
261  
262  
		/**
263  
		* 19.3 Terminals from section 3.10.3: Boolean Literal
264  
		*/
265  
		put("BOOLEAN_LITERAL", new Union(
266  
			new Singleton("true"),
267  
			new Singleton("false")
268  
		));
269  
270  
		/**
271  
		* 19.3 Terminals from section 3.10.4: Character Literal
272  
		*/
273  
		put("CHARACTER_LITERAL", new Concatenation(
274  
			new Singleton("'"), new Concatenation(
275  
			new Union(
276  
277  
				/**
278  
				* Single Character: [^\r\n'\\]
279  
				*/
280  
				new NonMatch("\r\n'\\"),
281  
282  
				/**
283  
				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
284  
				*/
285  
				new Concatenation(
286  
					new Singleton("\\"),
287  
					new Union(
288  
						new Match("btnfr\"'\\"),
289  
						new Concatenation(
290  
							new Repetition(new Range('0', '3'), 0, 1),
291  
							new Repetition(new Range('0', '7'), 1, 2)
292  
						)
293  
					)
294  
				)
295  
			),
296  
			new Singleton("'")
297  
		)));
298  
299  
		put("MULTILINE_LITERAL", new Concatenation(
300  
			new Singleton("[["), new Concatenation(
301  
			new Repetition(
302  
				new Union(
303  
					new NonMatch("]"),
304  
					new Concatenation(
305  
					  new Singleton("]"), new NonMatch("]"))
306  
			  ), 0, INFINITY
307  
			),
308  
			new Singleton("]]")
309  
		)));
310  
311  
		put("MULTILINE_LITERAL2", new Concatenation(
312  
			new Singleton("[=["), new Concatenation(
313  
			new Repetition(
314  
				new Union(
315  
					new NonMatch("]"),
316  
					new Concatenation(new Singleton("]"), new Union(
317  
				    new NonMatch("="),
318  
				    new Concatenation(new Singleton("="), new NonMatch("]"))))
319  
			  ), 0, INFINITY
320  
			),
321  
			new Singleton("]=]")
322  
		)));
323  
324  
		/**
325  
		* 19.3 Terminals from section 3.10.5: String Literal
326  
		*/
327  
		put("STRING_LITERAL", new Concatenation(
328  
			new Singleton("\""), new Concatenation(
329  
			new Repetition(
330  
				new Union(
331  
332  
					/**
333  
					* Single Character: [^\r\n"\\]
334  
					*/
335  
					new NonMatch("\r\n\"\\"),
336  
337  
					/**
338  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
339  
					*/
340  
					new Concatenation(
341  
						new Singleton("\\"),
342  
						new Union(
343  
							new Match("btnfr\"'\\"),
344  
							new Union(
345  
  							new Concatenation(
346  
  								new Repetition(new Range('0', '3'), 0, 1),
347  
  								new Repetition(new Range('0', '7'), 1, 2)
348  
  							),
349  
  							new Concatenation(
350  
  							  new Singleton("u"),
351  
  							  new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
352  
  							)
353  
  						)
354  
						)
355  
					)
356  
				), 0, INFINITY
357  
			),
358  
			new Singleton("\"")
359  
		)));
360  
361  
		/**
362  
		* 19.3 Terminals section 3.10.7: Null Literal
363  
		*/
364  
		put("NULL_LITERAL", new Singleton("null"));
365  
		
366  
		// OK, it seems we have to add some more stuff...
367  
		
368  
		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
369  
		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
370  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
371  
372  
	}
373  
} // class Java20
374  
}
375  
376  
!include #1000514 // Lexicon

Author comment

Began life as a copy of #1000348

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

Comments [hide]

ID Author/Program Comment Date
149 #1000604 (pitcher) 2015-08-18 00:47:55

add comment

Snippet ID: #1000353
Snippet name: Official Java tokenizer plus multi-line strings (embeddable) - partly enhanced with robust multiline strings
Eternal ID of this version: #1000353/1
Text MD5: 2fc40222a3253afa884ef1e7c6c4ec9d
Author: stefan
Category: javax
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-08-09 23:02:17
Source code size: 9740 bytes / 376 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 1028 / 2242
Referenced in: [show references]