Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

82
LINES

< > BotCompany Repo | #1000688 // javaTok function - Java + JavaX tokenizer, but parses just about anything

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (2703L/17K).

// TODO: extended multi-line strings

// Statistics updated by javaTok(String):
//   javaTok_n        - incremented once at the start of every call
//   javaTok_elements - increased by the size of each returned token list
static int javaTok_n, javaTok_elements;
// NOTE(review): not read or written anywhere in this snippet; presumably an
// option flag consumed by related code - TODO confirm.
static bool javaTok_opt;

// Tokenizes s into a list that strictly alternates between two kinds of
// entries:
//   even indices - "whitespace" tokens: spaces, tabs, line breaks and
//                  comments (may be the empty string)
//   odd indices  - "code" tokens: quoted text, identifiers, digit runs,
//                  bracket-quoted sections or a single other character
// The returned list always has an odd number of entries, i.e. it starts and
// ends with a whitespace token. A null s is treated like an empty string and
// yields a list holding one empty token.
//
// Side effects: increments javaTok_n and adds the size of the result to
// javaTok_elements.
//
// Relies on helpers defined outside this snippet: regionMatches (assumed to
// test whether the given pattern occurs in s at the given index) and
// javaTok_substringN / javaTok_substringC (assumed to return the text of s
// between the two indices) - TODO confirm against their definitions.
static List<String> javaTok(String s) {
  ++javaTok_n;
  ArrayList<String> tok = new ArrayList<String>();
  int l = s == null ? 0 : s.length();
  
  int i = 0;
  while (i < l) {
    int j = i;
    char c, d; // c = current character, d = the one after it ('\0' at end of input)
    
    // scan for whitespace (including comments)
    while (j < l) {
      c = s.charAt(j);
      d = j+1 >= l ? '\0' : s.charAt(j+1);
      if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
        ++j;
      else if (c == '/' && d == '*') {
        // Block comment. Start searching for the terminator only after the
        // two-character opener; otherwise the opener's '*' could be reused
        // as the terminator's '*' and "/*/" would count as a whole comment.
        j += 2;
        while (j < l && !regionMatches(s, j, "*/")) ++j;
        j = Math.min(j+2, l); // unterminated comment runs to end of input
      } else if (c == '/' && d == '/') {
        // Line comment: runs up to (not including) the next \r or \n
        do ++j; while (j < l && "\r\n".indexOf(s.charAt(j)) < 0);
      } else
        break;
    }
    
    tok.add(javaTok_substringN(s, i, j));
    i = j;
    if (i >= l) break;
    c = s.charAt(i);
    d = i+1 >= l ? '\0' : s.charAt(i+1);
    
    // scan for non-whitespace
    
    // Special JavaX syntax: 'identifier
    // (only when the third character is neither ' nor \, so that ordinary
    // character literals like 'a' still take the quote branch below)
    if (c == '\'' && Character.isJavaIdentifierStart(d) && i+2 < l && "'\\".indexOf(s.charAt(i+2)) < 0) {
      j += 2;
      while (j < l && Character.isJavaIdentifierPart(s.charAt(j)))
        ++j;
    } else if (c == '\'' || c == '"') {
      // Quoted text, ended by the matching quote character
      char opener = c;
      ++j;
      while (j < l) {
        int c2 = s.charAt(j);
        if (c2 == opener || c2 == '\n' && opener == '\'') { // allow multi-line strings, but not for '
          ++j; // the closing quote (or the line break) is part of the token
          break;
        } else if (c2 == '\\' && j+1 < l)
          j += 2; // skip the backslash together with the escaped character
        else
          ++j;
      }
    } else if (Character.isJavaIdentifierStart(c))
      do ++j; while (j < l && (Character.isJavaIdentifierPart(s.charAt(j)) || s.charAt(j) == '\'')); // for stuff like "don't"
    else if (Character.isDigit(c)) {
      do ++j; while (j < l && Character.isDigit(s.charAt(j)));
      if (j < l && s.charAt(j) == 'L') ++j; // Long constants like 1L
    } else if (c == '[' && d == '[') {
      // [[ ... ]] section; an unterminated one runs to end of input
      do ++j; while (j < l && !regionMatches(s, j, "]]"));
      j = Math.min(j+2, l);
    } else if (c == '[' && d == '=' && i+2 < l && s.charAt(i+2) == '[') {
      // [=[ ... ]=] section; an unterminated one runs to end of input
      do ++j; while (j+2 < l && !regionMatches(s, j, "]=]"));
      j = Math.min(j+3, l);
    } else
      ++j; // any other character is a token of its own
    
    tok.add(javaTok_substringC(s, i, j));
    i = j;
  }
  
  // Keep the alternation invariant: the list must end with a whitespace token
  if ((tok.size() % 2) == 0) tok.add("");
  javaTok_elements += tok.size();
  return tok;
}

// Overload for an existing token list: concatenates the tokens with join()
// and passes the resulting text, together with the original list, to
// javaTokWithExisting (defined outside this snippet).
static List<String> javaTok(List<String> tok) {
  String text = join(tok);
  return javaTokWithExisting(text, tok);
}

Author comment

Began life as a copy of #1000647

download  show line numbers  debug dex  old transpilations   

Travelled to 18 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mowyntqkapby, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1000688
Snippet name: javaTok function - Java + JavaX tokenizer, but parses just about anything
Eternal ID of this version: #1000688/18
Text MD5: 2c9ef109164cefc9352ebf79f6cb1b2f
Transpilation MD5: 00cf2d77835e7f88dbdf7b44fe6c33c1
Author: stefan
Category:
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-12-11 01:36:23
Source code size: 2755 bytes / 82 lines
Pitched / IR pitched: No / No
Views / Downloads: 1180 / 8559
Version history: 17 change(s)
Referenced in: #1000769 - javaTokPlusPeriod
#1002427 - Accellerating 629 (SPIKE)
#1005592 - javaTok_streaming function - javaTok using streamed output
#1006654 - Standard functions list 2 (LIVE, continuation of #761)
#1011293 - htmlFineTok
#1013521 - javaTokWithExisting
#1013522 - quotelessJavaTok function - Java tokenizer without quoted things. TODO: is it up to date with javaTok?
#1023688 - javaTok_noCommentsNoStrings
#1025691 - javaTok_noMLS - javaTok without multi-line-strings (for unstructure)
#1027377 - javaTokWithUnifiedNumbersAndIdentifiers
#1030392 - javaTok_vstack - javaTok for VStack [OK, only outer loop virtualized]
#1037256 - pineTok function - PineScript tokenizer
#3000382 - Answer for ferdie (>> t = 1, f = 0)
#3000383 - Answer for funkoverflow (>> t=1, f=0 okay)