Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

193
LINES

< > BotCompany Repo | #1000323 // Spaces, comments, words, strings (Tokenizer, embeddable, developing)

JavaX (input.txt to output.txt)

1  
!636
2  
!629 // standard functions
3  
!1000300 // class Lexicon
4  
5  
main {
6  
  psvm {
7  
    String src = takeInput(args, null);
8  
    List<String> cnc = tokenize(src);
9  
    saveTextFile("output/output.txt", cncToLines(cnc));
10  
  }
11  
  
12  
  static List<String> tokenize(String src) tex {
13  
    Lex lex = new Lex();
14  
    src = src.replace("\r\n", "\n");
15  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
16  
    int lineNr = source.getLineNumber()+1;
17  
    List<T> list = new ArrayList<T>();
18  
    for (Object a; (a = lex.grab(source)) != lex.$;) {
19  
      String word = lex.word();
20  
      String q = quote(word);
21  
      //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
22  
      lineNr = source.getLineNumber()+1;
23  
      
24  
      T t = new T(a, word);
25  
      boolean isSpace = t.isSpace();
26  
      if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
27  
        list.get(list.size()-1).word += word; // merge spaces
28  
      else
29  
        list.add(t);
30  
    }
31  
    
32  
    List<String> cnc = new ArrayList<String>();
33  
    for (int i = 0; i < list.size(); ) {
34  
      T t = list.get(i);
35  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
36  
      boolean isSpace = t.isSpace();
37  
      if (shouldBeSpace == isSpace) {
38  
        cnc.add(t.word);
39  
        ++i;
40  
      } else if (shouldBeSpace)
41  
        cnc.add("");
42  
      else {
43  
        System.out.println(cncToLines(cnc));
44  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
45  
      }
46  
    }
47  
    if ((cnc.size() % 2) == 0)
48  
      cnc.add("");
49  
    return cnc;
50  
  }
51  
  
52  
  static class T {
53  
    Object a; String word;
54  
    
55  
    T(Object a, String word) { this.a = a; this.word = word; }
56  
    
57  
    boolean isSpace() {
58  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
59  
    }
60  
  }
61  
  
62  
  static String cncToLines(List<String> cnc) {
63  
    StringBuilder out = new StringBuilder();
64  
    for (String token : cnc)
65  
      out.append(quote(token) + "\n");
66  
    return out.toString();
67  
  }
68  
  
69  
  /**
   * Chooses the input text: with at least one command-line argument the
   * first one is handed to loadSnippet (presumably a snippet ID - confirm
   * against that helper); otherwise input/input.txt is loaded with def
   * passed along as the default.
   */
  static String takeInput(String[] args, String def) tex {
    boolean haveArgument = args.length != 0;
    return haveArgument
      ? loadSnippet(args[0])
      : loadTextFile("input/input.txt", def);
  }
73  
  
74  
  public static String quote(String s) {
75  
    if (s == null) return "null";
76  
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
77  
  }
78  
  
79  
  static class Lex extends Lexicon {
80  
81  
	Lex() {
82  
83  
    /*
84  
		* TERMINAL - all letters uppercase
85  
		*/
86  
		int INFINITY = -1;
87  
88  
		/**
89  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
90  
		*/
91  
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
92  
93  
		/**
94  
		* 19.3 Terminals from section 3.7: Comment
95  
		*/
96  
		put("COMMENT", new Union(
97  
98  
			//
99  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
100  
			//
101  
			new Concatenation(
102  
				new Singleton("/*"), new Concatenation(
103  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
104  
				new Repetition(
105  
					new Concatenation(
106  
						new Singleton("*"),
107  
						new Repetition(new Concatenation(
108  
							new NonMatch("*/"),
109  
							new Repetition(new NonMatch("*"), 0, INFINITY)
110  
						), 0, 1)
111  
					), 0, INFINITY
112  
				),
113  
				new Singleton("*/")
114  
			))), new Union(
115  
116  
			/**
117  
			* End Of Line Comment: //[^\n]*\n
118  
			*/
119  
			new Concatenation(
120  
				new Singleton("//"), new Concatenation(
121  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
122  
				new Singleton("\n")
123  
			)),
124  
125  
			//
126  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
127  
			//
128  
			new Concatenation(
129  
				new Singleton("/**"), new Concatenation(
130  
				new Repetition(
131  
					new Concatenation(
132  
						new Repetition(new Concatenation(
133  
							new NonMatch("*/"),
134  
							new Repetition(new NonMatch("*"), 0, INFINITY)
135  
						), 0, 1),
136  
						new Singleton("*")
137  
					), 0, INFINITY
138  
				),
139  
				new Singleton("/")
140  
			))
141  
		)));
142  
143  
		put("IDENTIFIER", new Concatenation(
144  
			new Union(
145  
				PosixClass.alpha(),
146  
				new Match("_$")
147  
			),
148  
			new Repetition(
149  
				new Union(
150  
					PosixClass.alnum(),
151  
					new Match("_$")
152  
				), 0, INFINITY
153  
			)
154  
		));
155  
156  
		/**
157  
		* 19.3 Terminals from section 3.10.5: String Literal
158  
		*/
159  
		put("STRING_LITERAL", new Concatenation(
160  
			new Singleton("\""), new Concatenation(
161  
			new Repetition(
162  
				new Union(
163  
164  
					/**
165  
					* Single Character: [^\r\n"\\]
166  
					*/
167  
					new NonMatch("\r\n\"\\"),
168  
169  
					/**
170  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
171  
					*/
172  
					new Concatenation(
173  
						new Singleton("\\"),
174  
						new Union(
175  
							new Match("btnfr\"'\\"),
176  
							new Concatenation(
177  
								new Repetition(new Range('0', '3'), 0, 1),
178  
								new Repetition(new Range('0', '7'), 1, 2)
179  
							)
180  
						)
181  
					)
182  
				), 0, INFINITY
183  
			),
184  
			new Singleton("\"")
185  
		)));
186  
187  
		// Single-character catch-all production so we can parse anything.
188  
		
189  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
190  
191  
	}
192  
} // class Lex
193  
}

Author comment

Began life as a copy of #655

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000323
Snippet name: Spaces, comments, words, strings (Tokenizer, embeddable, developing)
Eternal ID of this version: #1000323/1
Text MD5: 1130cfc8b659aee0598f502ea9276c55
Author: stefan
Category: javax
Type: JavaX (input.txt to output.txt)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-07-06 18:39:38
Source code size: 4994 bytes / 193 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 669 / 576
Referenced in: [show references]