1 | !636 |
2 | !629 // standard functions |
3 | !1000300 // class Lexicon |
4 | |
5 | class SCWS { |
6 | static List<String> tokenize(String src) tex { |
7 | Lex lex = new Lex(); |
8 | src = src.replace("\r\n", "\n"); |
9 | LineNumberReader source = new LineNumberReader(new StringReader(src)); |
10 | int lineNr = source.getLineNumber()+1; |
11 | List<T> list = new ArrayList<T>(); |
12 | for (Object a; (a = lex.grab(source)) != lex.$;) { |
13 | String word = lex.word(); |
14 | String q = main.quote(word); |
15 | //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q); |
16 | lineNr = source.getLineNumber()+1; |
17 | |
18 | T t = new T(a, word); |
19 | boolean isSpace = t.isSpace(); |
20 | if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace()) |
21 | list.get(list.size()-1).word += word; // merge spaces |
22 | else |
23 | list.add(t); |
24 | } |
25 | |
26 | List<String> cnc = new ArrayList<String>(); |
27 | for (int i = 0; i < list.size(); ) { |
28 | T t = list.get(i); |
29 | boolean shouldBeSpace = (cnc.size() % 2) == 0; |
30 | boolean isSpace = t.isSpace(); |
31 | if (shouldBeSpace == isSpace) { |
32 | cnc.add(t.word); |
33 | ++i; |
34 | } else if (shouldBeSpace) |
35 | cnc.add(""); |
36 | else { |
37 | //System.out.println(cncToLines(cnc)); |
38 | throw new RuntimeException("TILT at " + cnc.size() + ": " + main.quote(t.word)); |
39 | } |
40 | } |
41 | if ((cnc.size() % 2) == 0) |
42 | cnc.add(""); |
43 | return cnc; |
44 | } |
45 | |
46 | static class T { |
47 | Object a; String word; |
48 | |
49 | T(Object a, String word) { this.a = a; this.word = word; } |
50 | |
51 | boolean isSpace() { |
52 | return a.equals("WHITE_SPACE") || a.equals("COMMENT"); |
53 | } |
54 | } |
55 | |
56 | static class Lex extends Lexicon { |
57 | |
58 | Lex() { |
59 | |
60 | /* |
61 | * TERMINAL - all letters uppercase |
62 | */ |
63 | int INFINITY = -1; |
64 | |
65 | /** |
66 | * 19.3 Terminals from section 3.6: White Space: [[:space:]] |
67 | */ |
68 | put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY)); |
69 | |
70 | /** |
71 | * 19.3 Terminals from section 3.7: Comment |
72 | */ |
73 | put("COMMENT", new Union( |
74 | |
75 | // |
76 | // Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/ |
77 | // |
78 | new Concatenation( |
79 | new Singleton("/*"), new Concatenation( |
80 | new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation( |
81 | new Repetition( |
82 | new Concatenation( |
83 | new Singleton("*"), |
84 | new Repetition(new Concatenation( |
85 | new NonMatch("*/"), |
86 | new Repetition(new NonMatch("*"), 0, INFINITY) |
87 | ), 0, 1) |
88 | ), 0, INFINITY |
89 | ), |
90 | new Singleton("*/") |
91 | ))), new Union( |
92 | |
93 | /** |
94 | * End Of Line Comment: //[^\n]*\n |
95 | */ |
96 | new Concatenation( |
97 | new Singleton("//"), new Concatenation( |
98 | new Repetition(new NonMatch("\n"), 0, INFINITY), |
99 | new Singleton("\n") |
100 | )), |
101 | |
102 | // |
103 | // Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/ |
104 | // |
105 | new Concatenation( |
106 | new Singleton("/**"), new Concatenation( |
107 | new Repetition( |
108 | new Concatenation( |
109 | new Repetition(new Concatenation( |
110 | new NonMatch("*/"), |
111 | new Repetition(new NonMatch("*"), 0, INFINITY) |
112 | ), 0, 1), |
113 | new Singleton("*") |
114 | ), 0, INFINITY |
115 | ), |
116 | new Singleton("/") |
117 | )) |
118 | ))); |
119 | |
120 | put("IDENTIFIER", new Concatenation( |
121 | new Union( |
122 | PosixClass.alpha(), |
123 | new Match("_$") |
124 | ), |
125 | new Repetition( |
126 | new Union( |
127 | PosixClass.alnum(), |
128 | new Match("_$") |
129 | ), 0, INFINITY |
130 | ) |
131 | )); |
132 | |
133 | /** |
134 | * 19.3 Terminals from section 3.10.5: String Literal |
135 | */ |
136 | put("STRING_LITERAL", new Concatenation( |
137 | new Singleton("\""), new Concatenation( |
138 | new Repetition( |
139 | new Union( |
140 | |
141 | /** |
142 | * Single Character: [^\r\n"\\] |
143 | */ |
144 | new NonMatch("\r\n\"\\"), |
145 | |
146 | /** |
147 | * Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2}) |
148 | */ |
149 | new Concatenation( |
150 | new Singleton("\\"), |
151 | new Union( |
152 | new Match("btnfr\"'\\"), |
153 | new Concatenation( |
154 | new Repetition(new Range('0', '3'), 0, 1), |
155 | new Repetition(new Range('0', '7'), 1, 2) |
156 | ) |
157 | ) |
158 | ) |
159 | ), 0, INFINITY |
160 | ), |
161 | new Singleton("\"") |
162 | ))); |
163 | |
164 | // Single-character catch-all production so we can parse anything. |
165 | |
166 | put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time |
167 | |
168 | } |
169 | } // class Lex |
170 | } // class SCWS |
171 | |
main {
  // Entry point: tokenize the input text and write the token list to output/output.txt.
  psvm {
    List<String> tokens = SCWS.tokenize(takeInput(args, null));
    saveTextFile("output/output.txt", cncToLines(tokens));
  }

  // Picks the input text: with no arguments, reads input/input.txt (passing
  // def through to loadTextFile); otherwise loads the snippet named by args[0].
  static String takeInput(String[] args, String def) tex {
    if (args.length == 0)
      return loadTextFile("input/input.txt", def);
    return loadSnippet(args[0]);
  }
}
Began life as a copy of #1000323
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000324 |
Snippet name: | Spaces, comments, words, strings (Tokenizer, embeddable, developing 2) |
Eternal ID of this version: | #1000324/1 |
Text MD5: | d221241170f48eaa45dd14b213ea5964 |
Author: | stefan |
Category: | javax |
Type: | JavaX (input.txt to output.txt) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-07-06 18:47:25 |
Source code size: | 4646 bytes / 183 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 661 / 521 |
Referenced in: | [show references] |