!636
//1000300 // Lexicon
//1000515 // Lexicon, fixing
!quicknew

| 6   | class JavaTok {
 | 
| 7   |   static String join(List<String> cnc) {
 | 
| 8   |     new StringBuilder buf;
 | 
| 9   |     for (String s : cnc) buf.append(s);
 | 
| 10   |     return buf.toString();
 | 
| 11   |   }
 | 
| 12   |   
 | 
| 13   |   static List<String> split(String src) {
 | 
| 14   |     Java20 lex = new Java20();
 | 
| 15   |     src = src.replace("\r\n", "\n");
 | 
| 16   |     LineNumberReader source = new LineNumberReader(new StringReader(src));
 | 
| 17   |     int lineNr = source.getLineNumber()+1;
 | 
| 18   |     List<T> list = new ArrayList<T>();
 | 
| 19   |     try {
 | 
| 20   |       for (Object a; (a = lex.grab(source)) != lex.$;) {
 | 
| 21   |         String word = lex.word();
 | 
| 22   |         String q = quote(word);
 | 
| 23   |         //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
 | 
| 24   |         lineNr = source.getLineNumber()+1;
 | 
| 25   |         
 | 
| 26   |         T t = new T(a, word);
 | 
| 27   |         boolean isSpace = t.isSpace();
 | 
| 28   |         if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
 | 
| 29   |           list.get(list.size()-1).word += word; // merge spaces
 | 
| 30   |         else
 | 
| 31   |           list.add(t);
 | 
| 32   |       }
 | 
| 33   |     } catch (Lexicon.Exception e) {
 | 
| 34   |       throw new RuntimeException(e);
 | 
| 35   |     }
 | 
| 36   |     
 | 
| 37   |     List<String> cnc = new ArrayList<String>();
 | 
| 38   |     for (int i = 0; i < list.size(); ) {
 | 
| 39   |       T t = list.get(i);
 | 
| 40   |       boolean shouldBeSpace = (cnc.size() % 2) == 0;
 | 
| 41   |       boolean isSpace = t.isSpace();
 | 
| 42   |       if (shouldBeSpace == isSpace) {
 | 
| 43   |         cnc.add(t.word);
 | 
| 44   |         ++i;
 | 
| 45   |       } else if (shouldBeSpace)
 | 
| 46   |         cnc.add("");
 | 
| 47   |       else {
 | 
| 48   |         System.out.println(cncToLines(cnc));
 | 
| 49   |         throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
 | 
| 50   |       }
 | 
| 51   |     }
 | 
| 52   |     if ((cnc.size() % 2) == 0)
 | 
| 53   |       cnc.add("");
 | 
| 54   | 
 | 
| 55   |     return cnc;
 | 
| 56   |   }
 | 
| 57   |   
 | 
| 58   |   static class T {
 | 
| 59   |     Object a; String word;
 | 
| 60   |     
 | 
| 61   |     T(Object a, String word) { this.a = a; this.word = word; }
 | 
| 62   |     
 | 
| 63   |     boolean isSpace() {
 | 
| 64   |       return a.equals("WHITE_SPACE") || a.equals("COMMENT");
 | 
| 65   |     }
 | 
| 66   |   }
 | 
| 67   |   
 | 
| 68   |   static String cncToLines(List<String> cnc) {
 | 
| 69   |     StringBuilder out = new StringBuilder();
 | 
| 70   |     for (String token : cnc)
 | 
| 71   |       out.append(quote(token) + "\n");
 | 
| 72   |     return out.toString();
 | 
| 73   |   }
 | 
| 74   |   
 | 
| 75   |   public static String quote(String s) {
 | 
| 76   |     if (s == null) return "null";
 | 
| 77   |     return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
 | 
| 78   |   }
 | 
| 79   |   
 | 
| 80   |   static class Java20 extends Lexicon {
 | 
| 81   | 
 | 
| 82   | 	Java20() {
 | 
| 83   | 		/**
 | 
| 84   | 		* Grammar for Java 2.0.
 | 
| 85   | 		*
 | 
| 86   | 		* Nonterminal - first letter uppercase
 | 
| 87   | 		* TERMINAL - all letters uppercase
 | 
| 88   | 		* keyword - all letters lowercase
 | 
| 89   | 		*/
 | 
| 90   | 		int INFINITY = -1;
 | 
| 91   | 
 | 
| 92   | 		/**
 | 
| 93   | 		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
 | 
| 94   | 		*/
 | 
| 95   | 		put("WHITE_SPACE", new Repetition(space(), 1, INFINITY));
 | 
| 96   | 
 | 
| 97   | 		/**
 | 
| 98   | 		* 19.3 Terminals from section 3.7: Comment
 | 
| 99   | 		*/
 | 
| 100   | 		put("COMMENT", new Union(
 | 
| 101   | 
 | 
| 102   | 			//
 | 
| 103   | 			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
 | 
| 104   | 			//
 | 
| 105   | 			new Concatenation(
 | 
| 106   | 				new Singleton("/*"), new Concatenation(
 | 
| 107   | 				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
 | 
| 108   | 				new Repetition(
 | 
| 109   | 					new Concatenation(
 | 
| 110   | 						new Singleton("*"),
 | 
| 111   | 						new Repetition(new Concatenation(
 | 
| 112   | 							new NonMatch("*/"),
 | 
| 113   | 							new Repetition(new NonMatch("*"), 0, INFINITY)
 | 
| 114   | 						), 0, 1)
 | 
| 115   | 					), 0, INFINITY
 | 
| 116   | 				),
 | 
| 117   | 				new Singleton("*/")
 | 
| 118   | 			))), new Union(
 | 
| 119   | 
 | 
| 120   | 			/**
 | 
| 121   | 			* End Of Line Comment: //[^\n]*\n
 | 
| 122   | 			*/
 | 
| 123   | 			new Concatenation(
 | 
| 124   | 				new Singleton("//"), new Concatenation(
 | 
| 125   | 				new Repetition(new NonMatch("\n"), 0, INFINITY),
 | 
| 126   | 				new Singleton("\n")
 | 
| 127   | 			)),
 | 
| 128   | 
 | 
| 129   | 			//
 | 
| 130   | 			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
 | 
| 131   | 			//
 | 
| 132   | 			new Concatenation(
 | 
| 133   | 				new Singleton("/**"), new Concatenation(
 | 
| 134   | 				new Repetition(
 | 
| 135   | 					new Concatenation(
 | 
| 136   | 						new Repetition(new Concatenation(
 | 
| 137   | 							new NonMatch("*/"),
 | 
| 138   | 							new Repetition(new NonMatch("*"), 0, INFINITY)
 | 
| 139   | 						), 0, 1),
 | 
| 140   | 						new Singleton("*")
 | 
| 141   | 					), 0, INFINITY
 | 
| 142   | 				),
 | 
| 143   | 				new Singleton("/")
 | 
| 144   | 			))
 | 
| 145   | 		)));
 | 
| 146   | 
 | 
| 147   | 		put("IDENTIFIER", new Concatenation(
 | 
| 148   | 			new Union(
 | 
| 149   | 				alpha(),
 | 
| 150   | 				new Match("_$")
 | 
| 151   | 			),
 | 
| 152   | 			new Repetition(
 | 
| 153   | 				new Union(
 | 
| 154   | 					alnum(),
 | 
| 155   | 					new Match("_$")
 | 
| 156   | 				), 0, INFINITY
 | 
| 157   | 			)
 | 
| 158   | 		));
 | 
| 159   | 
 | 
| 160   | 		/**
 | 
| 161   | 		* 19.3 Terminals from section 3.9: Keyword (recognized but not in the Java grammar)
 | 
| 162   | 		*/
 | 
| 163   | 		put("KEYWORD", new Union(
 | 
| 164   | 			new Singleton("const"),
 | 
| 165   | 			new Singleton("goto")
 | 
| 166   | 		));
 | 
| 167   | 
 | 
| 168   | 		/**
 | 
| 169   | 		* 19.3 Terminals from section 3.10.1: Integer Literal
 | 
| 170   | 		*/
 | 
| 171   | 		put("INTEGER_LITERAL", new Concatenation(
 | 
| 172   | 			new Union(
 | 
| 173   | 				/**
 | 
| 174   | 				* Decimal Integer Literal: 0|[1-9][[:digit:]]*
 | 
| 175   | 				*/
 | 
| 176   | 				new Singleton("0"), new Union(
 | 
| 177   | 
 | 
| 178   | 				new Concatenation(
 | 
| 179   | 					new Range('1', '9'),
 | 
| 180   | 					new Repetition(digit(), 0, INFINITY)
 | 
| 181   | 				), new Union(
 | 
| 182   | 
 | 
| 183   | 				/**
 | 
| 184   | 				* Hexadecimal Integer Literal: 0[xX][[:xdigit:]]+
 | 
| 185   | 				*/
 | 
| 186   | 				new Concatenation(
 | 
| 187   | 					new Singleton("0"), new Concatenation(
 | 
| 188   | 					new Match("xX"),
 | 
| 189   | 					new Repetition(xdigit(), 1, INFINITY)
 | 
| 190   | 				)),
 | 
| 191   | 
 | 
| 192   | 				/**
 | 
| 193   | 				* Octal Integer Literal: 0[0-7]+
 | 
| 194   | 				*/
 | 
| 195   | 				new Concatenation(
 | 
| 196   | 					new Singleton("0"),
 | 
| 197   | 					new Repetition(new Range('0', '7'), 1, INFINITY)
 | 
| 198   | 				)
 | 
| 199   | 			))),
 | 
| 200   | 			new Repetition(new Match("lL"), 0, 1)
 | 
| 201   | 		));
 | 
| 202   | 
 | 
| 203   | 		/**
 | 
| 204   | 		* 19.3 Terminals from section 3.10.2: Floating-Point Literal
 | 
| 205   | 		*/
 | 
| 206   | 		put("FLOATING_POINT_LITERAL", new Union(
 | 
| 207   | 
 | 
| 208   | 			/**
 | 
| 209   | 			* [[:digit:]]+\.[[:digit:]]*([eE][-+]?[[:digit:]]+)?[fFdD]?
 | 
| 210   | 			*/
 | 
| 211   | 			new Concatenation(
 | 
| 212   | 				new Repetition(digit(), 1, INFINITY), new Concatenation(
 | 
| 213   | 				new Singleton("."), new Concatenation(
 | 
| 214   | 				new Repetition(digit(), 0, INFINITY), new Concatenation(
 | 
| 215   | 				new Repetition(new Concatenation(
 | 
| 216   | 					new Match("eE"), new Concatenation(
 | 
| 217   | 					new Repetition(new Match("-+"), 0, 1),
 | 
| 218   | 					new Repetition(digit(), 1, INFINITY)
 | 
| 219   | 				)), 0, 1),
 | 
| 220   | 				new Repetition(new Match("fFdD"), 0, 1)
 | 
| 221   | 			)))), new Union(
 | 
| 222   | 
 | 
| 223   | 			/**
 | 
| 224   | 			* \.[[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]?
 | 
| 225   | 			*/
 | 
| 226   | 			new Concatenation(
 | 
| 227   | 				new Singleton("."), new Concatenation(
 | 
| 228   | 				new Repetition(digit(), 1, INFINITY), new Concatenation(
 | 
| 229   | 				new Repetition(new Concatenation(
 | 
| 230   | 					new Match("eE"), new Concatenation(
 | 
| 231   | 					new Repetition(new Match("-+"), 0, 1),
 | 
| 232   | 					new Repetition(digit(), 1, INFINITY)
 | 
| 233   | 				)), 0, 1),
 | 
| 234   | 				new Repetition(new Match("fFdD"), 0, 1)
 | 
| 235   | 			))), new Union(
 | 
| 236   | 
 | 
| 237   | 			/**
 | 
| 238   | 			* [[:digit:]]+[eE][-+]?[[:digit:]]+[fFdD]?
 | 
| 239   | 			*/
 | 
| 240   | 			new Concatenation(
 | 
| 241   | 				new Repetition(digit(), 1, INFINITY), new Concatenation(
 | 
| 242   | 				new Match("eE"), new Concatenation(
 | 
| 243   | 				new Repetition(new Match("-+"), 0, 1), new Concatenation(
 | 
| 244   | 				new Repetition(digit(), 1, INFINITY),
 | 
| 245   | 				new Repetition(new Match("fFdD"), 0, 1)
 | 
| 246   | 			)))),
 | 
| 247   | 
 | 
| 248   | 			/**
 | 
| 249   | 			* [[:digit:]]+([eE][-+]?[[:digit:]]+)?[fFdD]
 | 
| 250   | 			*/
 | 
| 251   | 			new Concatenation(
 | 
| 252   | 				new Repetition(digit(), 1, INFINITY), new Concatenation(
 | 
| 253   | 				new Repetition(new Concatenation(
 | 
| 254   | 					new Match("eE"), new Concatenation(
 | 
| 255   | 					new Repetition(new Match("-+"), 0, 1),
 | 
| 256   | 					new Repetition(digit(), 1, INFINITY)
 | 
| 257   | 				)), 0, 1),
 | 
| 258   | 				new Match("fFdD")
 | 
| 259   | 			))
 | 
| 260   | 		))));
 | 
| 261   | 
 | 
| 262   | 		/**
 | 
| 263   | 		* 19.3 Terminals from section 3.10.3: Boolean Literal
 | 
| 264   | 		*/
 | 
| 265   | 		put("BOOLEAN_LITERAL", new Union(
 | 
| 266   | 			new Singleton("true"),
 | 
| 267   | 			new Singleton("false")
 | 
| 268   | 		));
 | 
| 269   | 
 | 
| 270   | 		/**
 | 
| 271   | 		* 19.3 Terminals from section 3.10.4: Character Literal
 | 
| 272   | 		*/
 | 
| 273   | 		put("CHARACTER_LITERAL", new Concatenation(
 | 
| 274   | 			new Singleton("'"), new Concatenation(
 | 
| 275   | 			new Union(
 | 
| 276   | 
 | 
| 277   | 				/**
 | 
| 278   | 				* Single Character: [^\r\n'\\]
 | 
| 279   | 				*/
 | 
| 280   | 				new NonMatch("\r\n'\\"),
 | 
| 281   | 
 | 
| 282   | 				/**
 | 
| 283   | 				* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
 | 
| 284   | 				*/
 | 
| 285   | 				new Concatenation(
 | 
| 286   | 					new Singleton("\\"),
 | 
| 287   | 					new Union(
 | 
| 288   | 						new Match("btnfr\"'\\"),
 | 
| 289   | 						new Concatenation(
 | 
| 290   | 							new Repetition(new Range('0', '3'), 0, 1),
 | 
| 291   | 							new Repetition(new Range('0', '7'), 1, 2)
 | 
| 292   | 						)
 | 
| 293   | 					)
 | 
| 294   | 				)
 | 
| 295   | 			),
 | 
| 296   | 			new Singleton("'")
 | 
| 297   | 		)));
 | 
| 298   | 
 | 
| 299   | 		put("MULTILINE_LITERAL", new Concatenation(
 | 
| 300   | 			new Singleton("[["), new Concatenation(
 | 
| 301   | 			new Repetition(
 | 
| 302   | 				new Union(
 | 
| 303   | 					new NonMatch("]"),
 | 
| 304   | 					new Concatenation(
 | 
| 305   | 					  new Singleton("]"), new NonMatch("]"))
 | 
| 306   | 			  ), 0, INFINITY
 | 
| 307   | 			),
 | 
| 308   | 			new Singleton("]]")
 | 
| 309   | 		)));
 | 
| 310   | 
 | 
| 311   | 		put("MULTILINE_LITERAL2", new Concatenation(
 | 
| 312   | 			new Singleton("[=["), new Concatenation(
 | 
| 313   | 			new Repetition(
 | 
| 314   | 				new Union(
 | 
| 315   | 					new NonMatch("]"),
 | 
| 316   | 					new Concatenation(new Singleton("]"), new Union(
 | 
| 317   | 				    new NonMatch("="),
 | 
| 318   | 				    new Concatenation(new Singleton("="), new NonMatch("]"))))
 | 
| 319   | 			  ), 0, INFINITY
 | 
| 320   | 			),
 | 
| 321   | 			new Singleton("]=]")
 | 
| 322   | 		)));
 | 
| 323   | 
 | 
| 324   | 		/**
 | 
| 325   | 		* 19.3 Terminals from section 3.10.5: String Literal
 | 
| 326   | 		*/
 | 
| 327   | 		put("STRING_LITERAL", new Concatenation(
 | 
| 328   | 			new Singleton("\""), new Concatenation(
 | 
| 329   | 			new Repetition(
 | 
| 330   | 				new Union(
 | 
| 331   | 
 | 
| 332   | 					/**
 | 
| 333   | 					* Single Character: [^\r\n"\\]
 | 
| 334   | 					*/
 | 
| 335   | 					new NonMatch("\r\n\"\\"),
 | 
| 336   | 
 | 
| 337   | 					/**
 | 
| 338   | 					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
 | 
| 339   | 					*/
 | 
| 340   | 					new Concatenation(
 | 
| 341   | 						new Singleton("\\"),
 | 
| 342   | 						new Union(
 | 
| 343   | 							new Match("btnfr\"'\\"),
 | 
| 344   | 							new Union(
 | 
| 345   |   							new Concatenation(
 | 
| 346   |   								new Repetition(new Range('0', '3'), 0, 1),
 | 
| 347   |   								new Repetition(new Range('0', '7'), 1, 2)
 | 
| 348   |   							),
 | 
| 349   |   							new Concatenation(
 | 
| 350   |   							  new Singleton("u"),
 | 
| 351   |   							  new Repetition(new Match("0123456789abcdefABCDEF"), 4, 4)
 | 
| 352   |   							)
 | 
| 353   |   						)
 | 
| 354   | 						)
 | 
| 355   | 					)
 | 
| 356   | 				), 0, INFINITY
 | 
| 357   | 			),
 | 
| 358   | 			new Singleton("\"")
 | 
| 359   | 		)));
 | 
| 360   | 
 | 
| 361   | 		/**
 | 
| 362   | 		* 19.3 Terminals section 3.10.7: Null Literal
 | 
| 363   | 		*/
 | 
| 364   | 		put("NULL_LITERAL", new Singleton("null"));
 | 
| 365   | 		
 | 
| 366   | 		// OK, it seems we have to add some more stuff...
 | 
| 367   | 		
 | 
| 368   | 		//put("OTHER1", new Match(";{}=,<>[]().+-:|&!"));
 | 
| 369   | 		//put("OTHER1", new NonMatch("")); // catch anything, one character at a time
 | 
| 370   | 		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
 | 
| 371   | 
 | 
| 372   | 	}
 | 
| 373   | } // class Java20
 | 
| 374   | }

!include #1000514 // Lexicon