Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

183
LINES

< > BotCompany Repo | #1000324 // Spaces, comments, words, strings (Tokenizer, embeddable, developing 2)

JavaX (input.txt to output.txt)

1  
!636
2  
!629 // standard functions
3  
!1000300 // class Lexicon
4  
5  
class SCWS {  
6  
  static List<String> tokenize(String src) tex {
7  
    Lex lex = new Lex();
8  
    src = src.replace("\r\n", "\n");
9  
    LineNumberReader source = new LineNumberReader(new StringReader(src));
10  
    int lineNr = source.getLineNumber()+1;
11  
    List<T> list = new ArrayList<T>();
12  
    for (Object a; (a = lex.grab(source)) != lex.$;) {
13  
      String word = lex.word();
14  
      String q = main.quote(word);
15  
      //System.out.println("grabbed at line " + lineNr + ": " + a + " " + q);
16  
      lineNr = source.getLineNumber()+1;
17  
      
18  
      T t = new T(a, word);
19  
      boolean isSpace = t.isSpace();
20  
      if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
21  
        list.get(list.size()-1).word += word; // merge spaces
22  
      else
23  
        list.add(t);
24  
    }
25  
    
26  
    List<String> cnc = new ArrayList<String>();
27  
    for (int i = 0; i < list.size(); ) {
28  
      T t = list.get(i);
29  
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
30  
      boolean isSpace = t.isSpace();
31  
      if (shouldBeSpace == isSpace) {
32  
        cnc.add(t.word);
33  
        ++i;
34  
      } else if (shouldBeSpace)
35  
        cnc.add("");
36  
      else {
37  
        //System.out.println(cncToLines(cnc));
38  
        throw new RuntimeException("TILT at " + cnc.size() + ": " + main.quote(t.word));
39  
      }
40  
    }
41  
    if ((cnc.size() % 2) == 0)
42  
      cnc.add("");
43  
    return cnc;
44  
  }
45  
  
46  
  static class T {
47  
    Object a; String word;
48  
    
49  
    T(Object a, String word) { this.a = a; this.word = word; }
50  
    
51  
    boolean isSpace() {
52  
      return a.equals("WHITE_SPACE") || a.equals("COMMENT");
53  
    }
54  
  }
55  
  
56  
  static class Lex extends Lexicon {
57  
58  
	Lex() {
59  
60  
    /*
61  
		* TERMINAL - all letters uppercase
62  
		*/
63  
		int INFINITY = -1;
64  
65  
		/**
66  
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
67  
		*/
68  
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));
69  
70  
		/**
71  
		* 19.3 Terminals from section 3.7: Comment
72  
		*/
73  
		put("COMMENT", new Union(
74  
75  
			//
76  
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
77  
			//
78  
			new Concatenation(
79  
				new Singleton("/*"), new Concatenation(
80  
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
81  
				new Repetition(
82  
					new Concatenation(
83  
						new Singleton("*"),
84  
						new Repetition(new Concatenation(
85  
							new NonMatch("*/"),
86  
							new Repetition(new NonMatch("*"), 0, INFINITY)
87  
						), 0, 1)
88  
					), 0, INFINITY
89  
				),
90  
				new Singleton("*/")
91  
			))), new Union(
92  
93  
			/**
94  
			* End Of Line Comment: //[^\n]*\n
95  
			*/
96  
			new Concatenation(
97  
				new Singleton("//"), new Concatenation(
98  
				new Repetition(new NonMatch("\n"), 0, INFINITY),
99  
				new Singleton("\n")
100  
			)),
101  
102  
			//
103  
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
104  
			//
105  
			new Concatenation(
106  
				new Singleton("/**"), new Concatenation(
107  
				new Repetition(
108  
					new Concatenation(
109  
						new Repetition(new Concatenation(
110  
							new NonMatch("*/"),
111  
							new Repetition(new NonMatch("*"), 0, INFINITY)
112  
						), 0, 1),
113  
						new Singleton("*")
114  
					), 0, INFINITY
115  
				),
116  
				new Singleton("/")
117  
			))
118  
		)));
119  
120  
		put("IDENTIFIER", new Concatenation(
121  
			new Union(
122  
				PosixClass.alpha(),
123  
				new Match("_$")
124  
			),
125  
			new Repetition(
126  
				new Union(
127  
					PosixClass.alnum(),
128  
					new Match("_$")
129  
				), 0, INFINITY
130  
			)
131  
		));
132  
133  
		/**
134  
		* 19.3 Terminals from section 3.10.5: String Literal
135  
		*/
136  
		put("STRING_LITERAL", new Concatenation(
137  
			new Singleton("\""), new Concatenation(
138  
			new Repetition(
139  
				new Union(
140  
141  
					/**
142  
					* Single Character: [^\r\n"\\]
143  
					*/
144  
					new NonMatch("\r\n\"\\"),
145  
146  
					/**
147  
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
148  
					*/
149  
					new Concatenation(
150  
						new Singleton("\\"),
151  
						new Union(
152  
							new Match("btnfr\"'\\"),
153  
							new Concatenation(
154  
								new Repetition(new Range('0', '3'), 0, 1),
155  
								new Repetition(new Range('0', '7'), 1, 2)
156  
							)
157  
						)
158  
					)
159  
				), 0, INFINITY
160  
			),
161  
			new Singleton("\"")
162  
		)));
163  
164  
		// Single-character catch-all production so we can parse anything.
165  
		
166  
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time
167  
168  
	}
169  
} // class Lex
170  
} // class SCWS
171  
172  
main {
  // Program entry: obtains the input text, tokenizes it with SCWS and
  // saves the result of cncToLines to output/output.txt.
  psvm {
    List<String> cnc = SCWS.tokenize(takeInput(args, null));
    saveTextFile("output/output.txt", cncToLines(cnc));
  }

  // Chooses the input: with no arguments, loadTextFile("input/input.txt", def)
  // is used; otherwise the first argument is handed to loadSnippet.
  static String takeInput(String[] args, String def) tex {
    if (args.length == 0) {
      return loadTextFile("input/input.txt", def);
    }
    return loadSnippet(args[0]);
  }
}

Author comment

Began life as a copy of #1000323

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000324
Snippet name: Spaces, comments, words, strings (Tokenizer, embeddable, developing 2)
Eternal ID of this version: #1000324/1
Text MD5: d221241170f48eaa45dd14b213ea5964
Author: stefan
Category: javax
Type: JavaX (input.txt to output.txt)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-07-06 18:47:25
Source code size: 4646 bytes / 183 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 658 / 516
Referenced in: [show references]