Not logged in.  Login/Logout/Register | List snippets | Create snippet | Upload image | Upload data

183
LINES

< > BotCompany Repo | #1000324 // Spaces, comments, words, strings (Tokenizer, embeddable, developing 2)

JavaX (input.txt to output.txt)

!636

!629 // standard functions
!1000300 // class Lexicon

class SCWS {  
  /**
   * Splits source text into a strictly alternating list of "space" and
   * "code" strings.
   *
   * Entries at even indices (0, 2, 4, ...) hold white space and/or comment
   * text and may be empty; entries at odd indices hold one non-space token
   * each. The list always starts and ends with a space entry, so its size
   * is odd; input that yields no tokens gives a single empty string.
   *
   * "\r\n" sequences are replaced with "\n" before lexing.
   *
   * @param src the text to tokenize (must not be null)
   * @return the alternating space/code list described above
   * @throws RuntimeException ("TILT") if a space token turns up where a
   *         code token is expected; this is an internal consistency check
   *         and is not expected to fire, because adjacent space tokens are
   *         merged in the first pass
   */
  static List<String> tokenize(String src) tex {
    Lex lex = new Lex();
    src = src.replace("\r\n", "\n");
    LineNumberReader source = new LineNumberReader(new StringReader(src));

    // Pass 1: pull tokens from the lexer until it returns the lex.$
    // sentinel (presumably end of input), folding runs of consecutive
    // white space / comment tokens into a single token.
    List<T> list = new ArrayList<T>();
    for (Object a; (a = lex.grab(source)) != lex.$;) {
      String word = lex.word();
      T t = new T(a, word);
      boolean isSpace = t.isSpace();
      if (isSpace && list.size() > 0 && list.get(list.size()-1).isSpace())
        list.get(list.size()-1).word += word; // merge spaces
      else
        list.add(t);
    }

    // Pass 2: build the alternating list. An even size of cnc means the
    // next slot is a space slot; when a code token arrives there instead,
    // an empty string is inserted so the alternation is preserved.
    List<String> cnc = new ArrayList<String>();
    for (int i = 0; i < list.size(); ) {
      T t = list.get(i);
      boolean shouldBeSpace = (cnc.size() % 2) == 0;
      boolean isSpace = t.isSpace();
      if (shouldBeSpace == isSpace) {
        cnc.add(t.word);
        ++i;
      } else if (shouldBeSpace)
        cnc.add(""); // pad; the same token is re-examined on the next iteration
      else
        throw new RuntimeException("TILT at " + cnc.size() + ": " + main.quote(t.word));
    }

    // Make sure the list ends with a (possibly empty) space entry.
    if ((cnc.size() % 2) == 0)
      cnc.add("");
    return cnc;
  }
  
  /**
   * A single lexed token: the terminal it was recognized as ({@code a})
   * together with the matched text ({@code word}). The text is mutable so
   * that neighbouring space tokens can be concatenated in place.
   */
  static class T {
    Object a;
    String word;

    T(Object a, String word) {
      this.word = word;
      this.a = a;
    }

    /** True when this token's terminal is WHITE_SPACE or COMMENT. */
    boolean isSpace() {
      if (a.equals("WHITE_SPACE"))
        return true;
      return a.equals("COMMENT");
    }
  }
  
  /**
   * Lexicon subclass whose constructor registers the five terminals used
   * by SCWS.tokenize: WHITE_SPACE, COMMENT, IDENTIFIER, STRING_LITERAL and
   * the single-character catch-all OTHER1. No other terminals are
   * registered here, so anything not covered by the first four (digits,
   * operators, quote characters outside a well-formed string literal, ...)
   * can only be matched by OTHER1.
   *
   * NOTE(review): how Lexicon chooses between terminals that match the
   * same input (longest match? registration order?) is not visible in
   * this file - confirm before reordering the put() calls.
   */
  static class Lex extends Lexicon {

	Lex() {

    /*
		* TERMINAL - all letters uppercase
		*/
		// Passed as the upper bound of Repetition below; presumably Lexicon
		// treats -1 as "unbounded" - TODO confirm against class Lexicon.
		int INFINITY = -1;

		/**
		* 19.3 Terminals from section 3.6: White Space: [[:space:]]
		*/
		// One or more POSIX space characters form a single WHITE_SPACE token.
		put("WHITE_SPACE", new Repetition(PosixClass.space(), 1, INFINITY));

		/**
		* 19.3 Terminals from section 3.7: Comment
		*/
		// COMMENT is the union of three alternatives: traditional comment,
		// end-of-line comment and documentation comment.
		put("COMMENT", new Union(

			//
			// Traditional Comment: /\*[^*]+(\*([^*/][^*]*)?)*\*/
			//
			new Concatenation(
				new Singleton("/*"), new Concatenation(
				new Repetition(new NonMatch("*"), 1, INFINITY), new Concatenation(
				new Repetition(
					new Concatenation(
						new Singleton("*"),
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1)
					), 0, INFINITY
				),
				new Singleton("*/")
			))), new Union(

			/**
			* End Of Line Comment: //[^\n]*\n
			*/
			// NOTE(review): the pattern ends with a mandatory "\n", so a "//"
			// comment on the last line of input without a trailing newline
			// presumably is not recognized as COMMENT - confirm.
			new Concatenation(
				new Singleton("//"), new Concatenation(
				new Repetition(new NonMatch("\n"), 0, INFINITY),
				new Singleton("\n")
			)),

			//
			// Documentation Comment: /\*\*(([^*/][^*]*)?\*)*/
			//
			new Concatenation(
				new Singleton("/**"), new Concatenation(
				new Repetition(
					new Concatenation(
						new Repetition(new Concatenation(
							new NonMatch("*/"),
							new Repetition(new NonMatch("*"), 0, INFINITY)
						), 0, 1),
						new Singleton("*")
					), 0, INFINITY
				),
				new Singleton("/")
			))
		)));

		// Identifier: an alphabetic character, '_' or '$', followed by any
		// number of alphanumeric characters, '_' or '$'. (Assumes Match("_$")
		// matches any one of the listed characters - TODO confirm.)
		put("IDENTIFIER", new Concatenation(
			new Union(
				PosixClass.alpha(),
				new Match("_$")
			),
			new Repetition(
				new Union(
					PosixClass.alnum(),
					new Match("_$")
				), 0, INFINITY
			)
		));

		/**
		* 19.3 Terminals from section 3.10.5: String Literal
		*/
		// A double-quoted string: opening quote, any number of plain
		// characters or escape sequences, closing quote.
		put("STRING_LITERAL", new Concatenation(
			new Singleton("\""), new Concatenation(
			new Repetition(
				new Union(

					/**
					* Single Character: [^\r\n"\\]
					*/
					new NonMatch("\r\n\"\\"),

					/**
					* Escape Sequence: \\([btnfr\"'\\]|[0-3]?[0-7]{1,2})
					*/
					new Concatenation(
						new Singleton("\\"),
						new Union(
							new Match("btnfr\"'\\"),
							new Concatenation(
								new Repetition(new Range('0', '3'), 0, 1),
								new Repetition(new Range('0', '7'), 1, 2)
							)
						)
					)
				), 0, INFINITY
			),
			new Singleton("\"")
		)));

		// Single-character catch-all production so we can parse anything.
		
		put("OTHER1", new NonMatch(" \t\r\n")); // catch any non-whitespace, one character at a time

	}
} // class Lex
} // class SCWS

main {
  // Entry point: reads the input text, tokenizes it with SCWS and writes
  // the cncToLines() rendering of the token list to output/output.txt.
  psvm {
    List<String> cnc = SCWS.tokenize(takeInput(args, null));
    String rendered = cncToLines(cnc);
    saveTextFile("output/output.txt", rendered);
  }
  
  // Picks the input text: with at least one command-line argument, the
  // snippet named by the first argument is loaded; otherwise
  // input/input.txt is read, with def handed on to loadTextFile
  // (presumably as the value to use when the file is missing - confirm).
  static String takeInput(String[] args, String def) tex {
    if (args.length == 0)
      return loadTextFile("input/input.txt", def);
    return loadSnippet(args[0]);
  }
}
  

Author comment

Began life as a copy of #1000323

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, qbtsjoyahagl, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000324
Snippet name: Spaces, comments, words, strings (Tokenizer, embeddable, developing 2)
Eternal ID of this version: #1000324/1
Text MD5: d221241170f48eaa45dd14b213ea5964
Author: stefan
Category: javax
Type: JavaX (input.txt to output.txt)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-07-06 18:47:25
Source code size: 4646 bytes / 183 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 654 / 511
Referenced in: [show references]