Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

189
LINES

< > BotCompany Repo | #655 // Spaces, comments, words, strings (Tokenizer)

JavaX (input.txt to output.txt)

1  
!636
2  
!629 // standard functions
3  
!1000300 // class Lexicon
4  
5  
main {
6  
  psvm {
7  
    String src = takeInput(args, null);
8  
    Lex lex = new Lex();
9  
    src = src.replace("\r\n", "\n");
10  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
11  
    int lineNr = source.getLineNumber()+1;
12  
    List<T> list = new ArrayList<T>();
13  
    for (Object a; (a = lex.grab(source)) != lex.$;) {
14  
      String word = lex.word();
15  
      String q = quote(word);
16  
      //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
17  
      lineNr = source.getLineNumber()+1;
18  
      
19  
      T t = new T(a, word);
20  
      boolean isSpace = t.isSpace();
21  
      if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
22  
        list.get(list.size()-1).word += word; // merge spaces
23  
      else
24  
        list.add(t);
25  
    }
26  
    
27  
    List<String> cnc = new ArrayList<String>();
28  
    for (int i = 0; i < list.size(); ) {
29  
      T t = list.get(i);
30  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
31  
      boolean isSpace = t.isSpace();
32  
      if (shouldBeSpace == isSpace) {
33  
        cnc.add(t.word);
34  
        ++i;
35  
      } else if (shouldBeSpace)
36  
        cnc.add("");
37  
      else {
38  
        System.out.println(cncToLines(cnc));
39  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + quote(t.word));
40  
      }
41  
    }
42  
    if ((cnc.size() % 2) == 0)
43  
      cnc.add("");
44  
45  
    saveTextFile("output/output.txt", cncToLines(cnc));
46  
  }
47  
  
48  
  static class T {
49  
    Object a; String word;
50  
    
51  
    T(Object a, String word) { this.a = a; this.word = word; }
52  
    
53  
    boolean isSpace() {
54  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
55  
    }
56  
  }
57  
  
58  
  static String cncToLines(List<String> cnc) {
59  
    StringBuilder out = new StringBuilder();
60  
    for (String token : cnc)
61  
      out.append(quote(token) + "\n");
62  
    return out.toString();
63  
  }
64  
  
65  
  /**
   * Obtains the text to tokenize.
   *
   * If at least one command line argument is given, the first argument is
   * passed to loadSnippet (presumably a snippet ID - confirm against the
   * standard functions include). Otherwise input/input.txt is loaded via
   * loadTextFile with def as the fallback argument.
   *
   * NOTE(review): "tex" is JavaX shorthand, presumably expanding to
   * "throws Exception" - confirm against the translator.
   *
   * @param args command line arguments
   * @param def  value handed to loadTextFile as its default
   */
  static String takeInput(String[] args, String def) tex {
    if (args.length != 0) {
      return loadSnippet(args[0]);
    }
    return loadTextFile("input/input.txt", def);
  }
69  
  
70  
  public static String quote(String s) {
71  
    if (s == null) return "null";
72  
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
73  
  }
74  
  
75  
  static class Lex extends Lexicon {
76  
77  
	Lex() {
78  
79  
    /*
80  
		* TERMINAL - all letters uppercase
81  
		*/
82  
		int INFINITY = -1;
83  
84  
		/**
85  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
86  
		*/
87  
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
88  
89  
		/**
90  
		* 19.3 Terminals from section 3.7: Comment
91  
		*/
92  
		put("COMMENT", new Union(
93  
94  
			//
95  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
96  
			//
97  
			new Concatenation(
98  
				new Singleton("/*"), new Concatenation(
99  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
100  
				new Repetition(
101  
					new Concatenation(
102  
						new Singleton("*"),
103  
						new Repetition(new Concatenation(
104  
							new NonMatch("*/"),
105  
							new Repetition(new NonMatch("*"), 0, INFINITY)
106  
						), 0, 1)
107  
					), 0, INFINITY
108  
				),
109  
				new Singleton("*/")
110  
			))), new Union(
111  
112  
			/**
113  
			* End Of Line Comment: //[^\n]*\n
114  
			*/
115  
			new Concatenation(
116  
				new Singleton("//"), new Concatenation(
117  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
118  
				new Singleton("\n")
119  
			)),
120  
121  
			//
122  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
123  
			//
124  
			new Concatenation(
125  
				new Singleton("/**"), new Concatenation(
126  
				new Repetition(
127  
					new Concatenation(
128  
						new Repetition(new Concatenation(
129  
							new NonMatch("*/"),
130  
							new Repetition(new NonMatch("*"), 0, INFINITY)
131  
						), 0, 1),
132  
						new Singleton("*")
133  
					), 0, INFINITY
134  
				),
135  
				new Singleton("/")
136  
			))
137  
		)));
138  
139  
		put("IDENTIFIER", new Concatenation(
140  
			new Union(
141  
				PosixClass.alpha(),
142  
				new Match("_$")
143  
			),
144  
			new Repetition(
145  
				new Union(
146  
					PosixClass.alnum(),
147  
					new Match("_$")
148  
				), 0, INFINITY
149  
			)
150  
		));
151  
152  
		/**
153  
		* 19.3 Terminals from section 3.10.5: String Literal
154  
		*/
155  
		put("STRING_LITERAL", new Concatenation(
156  
			new Singleton("\""), new Concatenation(
157  
			new Repetition(
158  
				new Union(
159  
160  
					/**
161  
					* Single Character: [^\r\n"\\]
162  
					*/
163  
					new NonMatch("\r\n\"\\"),
164  
165  
					/**
166  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
167  
					*/
168  
					new Concatenation(
169  
						new Singleton("\\"),
170  
						new Union(
171  
							new Match("btnfr\"'\\"),
172  
							new Concatenation(
173  
								new Repetition(new Range('0', '3'), 0, 1),
174  
								new Repetition(new Range('0', '7'), 1, 2)
175  
							)
176  
						)
177  
					)
178  
				), 0, INFINITY
179  
			),
180  
			new Singleton("\"")
181  
		)));
182  
183  
		// Single-character catch-all production so we can parse anything.
184  
		
185  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
186  
187  
	}
188  
} // class Lex
189  
}

Author comment

Began life as a copy of #651

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #655
Snippet name: Spaces, comments, words, strings (Tokenizer)
Eternal ID of this version: #655/1
Text MD5: 4c83a001a302a8beb62837e767f4fb28
Author: stefan
Category: javax
Type: JavaX (input.txt to output.txt)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-07-06 18:02:11
Source code size: 4881 bytes / 189 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 712 / 871
Referenced in: [show references]