From 82b96da2151b37443addb3ee472d66682bbf4a80 Mon Sep 17 00:00:00 2001 From: Alexey Andreev Date: Thu, 16 Nov 2017 13:44:03 +0300 Subject: [PATCH] Add StreamTokenizer --- .../classlib/java/io/TStreamTokenizer.java | 482 ++++++++++++++++++ .../classlib/java/io/StreamTokenizerTest.java | 336 ++++++++++++ 2 files changed, 818 insertions(+) create mode 100644 classlib/src/main/java/org/teavm/classlib/java/io/TStreamTokenizer.java create mode 100644 tests/src/test/java/org/teavm/classlib/java/io/StreamTokenizerTest.java diff --git a/classlib/src/main/java/org/teavm/classlib/java/io/TStreamTokenizer.java b/classlib/src/main/java/org/teavm/classlib/java/io/TStreamTokenizer.java new file mode 100644 index 000000000..1f6326357 --- /dev/null +++ b/classlib/src/main/java/org/teavm/classlib/java/io/TStreamTokenizer.java @@ -0,0 +1,482 @@ +/* + * Copyright 2017 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.io; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; + +public class TStreamTokenizer { + public double nval; + public String sval; + public static final int TT_EOF = -1; + public static final int TT_EOL = '\n'; + public static final int TT_NUMBER = -2; + public static final int TT_WORD = -3; + private static final int TT_UNKNOWN = -4; + public int ttype = TT_UNKNOWN; + + /** + * Internal character meanings, 0 implies TOKEN_ORDINARY + */ + private byte[] tokenTypes = new byte[256]; + + private static final byte TOKEN_COMMENT = 1; + + private static final byte TOKEN_QUOTE = 2; + + private static final byte TOKEN_WHITE = 4; + + private static final byte TOKEN_WORD = 8; + + private static final byte TOKEN_DIGIT = 16; + + private int lineNumber = 1; + + private boolean forceLowercase; + + private boolean isEOLSignificant; + + private boolean slashStarComments; + + private boolean slashSlashComments; + + private boolean pushBackToken; + + private boolean lastCr; + + /* One of these will have the stream */ + private InputStream inStream; + + private Reader inReader; + + private int peekChar = -2; + + private TStreamTokenizer() { + /* + * Initialize the default state per specification. All byte values 'A' + * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are + * considered to be alphabetic. + */ + wordChars('A', 'Z'); + wordChars('a', 'z'); + wordChars(160, 255); + /* + * All byte values '\u0000' through '\u0020' are considered to be white + * space. + */ + whitespaceChars(0, 32); + /* + * '/' is a comment character. Single quote '\'' and double quote '"' + * are string quote characters. + */ + commentChar('/'); + quoteChar('"'); + quoteChar('\''); + /* + * Numbers are parsed. + */ + parseNumbers(); + /* + * Ends of lines are treated as white space, not as separate tokens. + * C-style and C++-style comments are not recognized. These are the + * defaults and are not needed in constructor. + */ + } + + @Deprecated + public TStreamTokenizer(InputStream is) { + this(); + if (is == null) { + throw new NullPointerException(); + } + inStream = is; + } + + public TStreamTokenizer(Reader r) { + this(); + if (r == null) { + throw new NullPointerException(); + } + inReader = r; + } + + public void commentChar(int ch) { + if (0 <= ch && ch < tokenTypes.length) { + tokenTypes[ch] = TOKEN_COMMENT; + } + } + + public void eolIsSignificant(boolean flag) { + isEOLSignificant = flag; + } + + public int lineno() { + return lineNumber; + } + + public void lowerCaseMode(boolean flag) { + forceLowercase = flag; + } + + public int nextToken() throws IOException { + if (pushBackToken) { + pushBackToken = false; + if (ttype != TT_UNKNOWN) { + return ttype; + } + } + sval = null; // Always reset sval to null + int currentChar = peekChar == -2 ? read() : peekChar; + + if (lastCr && currentChar == '\n') { + lastCr = false; + currentChar = read(); + } + if (currentChar == -1) { + ttype = TT_EOF; + return ttype; + } + + byte currentType = currentChar > 255 ? TOKEN_WORD : tokenTypes[currentChar]; + while ((currentType & TOKEN_WHITE) != 0) { + /* + * Skip over white space until we hit a new line or a real token + */ + if (currentChar == '\r') { + lineNumber++; + if (isEOLSignificant) { + lastCr = true; + peekChar = -2; + ttype = TT_EOL; + return ttype; + } + currentChar = read(); + if (currentChar == '\n') { + currentChar = read(); + } + } else if (currentChar == '\n') { + lineNumber++; + if (isEOLSignificant) { + peekChar = -2; + ttype = TT_EOL; + return ttype; + } + currentChar = read(); + } else { + // Advance over this white space character and try again. + currentChar = read(); + } + if (currentChar == -1) { + ttype = TT_EOF; + return ttype; + } + currentType = currentChar > 255 ? TOKEN_WORD : tokenTypes[currentChar]; + } + + /* + * Check for digits before checking for words since digits can be + * contained within words. + */ + if ((currentType & TOKEN_DIGIT) != 0) { + StringBuilder digits = new StringBuilder(20); + boolean haveDecimal = false; + boolean checkJustNegative = currentChar == '-'; + while (true) { + if (currentChar == '.') { + haveDecimal = true; + } + digits.append((char) currentChar); + currentChar = read(); + if ((currentChar < '0' || currentChar > '9') + && (haveDecimal || currentChar != '.')) { + break; + } + } + peekChar = currentChar; + if (checkJustNegative && digits.length() == 1) { + // Didn't get any other digits other than '-' + ttype = '-'; + return ttype; + } + try { + nval = Double.valueOf(digits.toString()); + } catch (NumberFormatException e) { + // Unsure what to do, will write test. + nval = 0; + } + ttype = TT_NUMBER; + return ttype; + } + // Check for words + if ((currentType & TOKEN_WORD) != 0) { + StringBuilder word = new StringBuilder(20); + while (true) { + word.append((char) currentChar); + currentChar = read(); + if (currentChar == -1 + || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) { + break; + } + } + peekChar = currentChar; + sval = forceLowercase ? word.toString().toLowerCase() : word + .toString(); + ttype = TT_WORD; + return ttype; + } + // Check for quoted character + if (currentType == TOKEN_QUOTE) { + StringBuilder quoteString = new StringBuilder(); + int peekOne = read(); + while (peekOne >= 0 && peekOne != currentChar && peekOne != '\r' && peekOne != '\n') { + boolean readPeek = true; + if (peekOne == '\\') { + int c1 = read(); + // Check for quoted octal IE: \377 + if (c1 <= '7' && c1 >= '0') { + int digitValue = c1 - '0'; + c1 = read(); + if (c1 > '7' || c1 < '0') { + readPeek = false; + } else { + digitValue = digitValue * 8 + (c1 - '0'); + c1 = read(); + // limit the digit value to a byte + if (digitValue > 31 || c1 > '7' || c1 < '0') { + readPeek = false; + } else { + digitValue = digitValue * 8 + (c1 - '0'); + } + } + if (!readPeek) { + // We've consumed one to many + quoteString.append((char) digitValue); + peekOne = c1; + } else { + peekOne = digitValue; + } + } else { + switch (c1) { + case 'a': + peekOne = 0x7; + break; + case 'b': + peekOne = 0x8; + break; + case 'f': + peekOne = 0xc; + break; + case 'n': + peekOne = 0xA; + break; + case 'r': + peekOne = 0xD; + break; + case 't': + peekOne = 0x9; + break; + case 'v': + peekOne = 0xB; + break; + default: + peekOne = c1; + } + } + } + if (readPeek) { + quoteString.append((char) peekOne); + peekOne = read(); + } + } + if (peekOne == currentChar) { + peekOne = read(); + } + peekChar = peekOne; + ttype = currentChar; + sval = quoteString.toString(); + return ttype; + } + // Do comments, both "//" and "/*stuff*/" + if (currentChar == '/' && (slashSlashComments || slashStarComments)) { + currentChar = read(); + if (currentChar == '*' && slashStarComments) { + int peekOne = read(); + while (true) { + currentChar = peekOne; + peekOne = read(); + if (currentChar == -1) { + peekChar = -1; + ttype = TT_EOF; + return ttype; + } + if (currentChar == '\r') { + if (peekOne == '\n') { + peekOne = read(); + } + lineNumber++; + } else if (currentChar == '\n') { + lineNumber++; + } else if (currentChar == '*' && peekOne == '/') { + peekChar = read(); + return nextToken(); + } + } + } else if (currentChar == '/' && slashSlashComments) { + // Skip to EOF or new line then return the next token + do { + currentChar = read(); + } while (currentChar >= 0 && currentChar != '\r' && currentChar != '\n'); + peekChar = currentChar; + return nextToken(); + } else if (currentType != TOKEN_COMMENT) { + // Was just a slash by itself + peekChar = currentChar; + ttype = '/'; + return ttype; + } + } + // Check for comment character + if (currentType == TOKEN_COMMENT) { + // Skip to EOF or new line then return the next token + do { + currentChar = read(); + } while (currentChar >= 0 && currentChar != '\r' && currentChar != '\n'); + peekChar = currentChar; + return nextToken(); + } + + peekChar = read(); + ttype = currentChar; + return ttype; + } + + public void ordinaryChar(int ch) { + if (0 <= ch && ch < tokenTypes.length) { + tokenTypes[ch] = 0; + } + } + + public void ordinaryChars(int low, int hi) { + if (low < 0) { + low = 0; + } + if (hi > tokenTypes.length) { + hi = tokenTypes.length - 1; + } + for (int i = low; i <= hi; i++) { + tokenTypes[i] = 0; + } + } + + public void parseNumbers() { + for (int i = '0'; i <= '9'; i++) { + tokenTypes[i] |= TOKEN_DIGIT; + } + tokenTypes['.'] |= TOKEN_DIGIT; + tokenTypes['-'] |= TOKEN_DIGIT; + } + + public void pushBack() { + pushBackToken = true; + } + + public void quoteChar(int ch) { + if (0 <= ch && ch < tokenTypes.length) { + tokenTypes[ch] = TOKEN_QUOTE; + } + } + + private int read() throws IOException { + // Call the read for the appropriate stream + if (inStream == null) { + return inReader.read(); + } + return inStream.read(); + } + + public void resetSyntax() { + for (int i = 0; i < 256; i++) { + tokenTypes[i] = 0; + } + } + + public void slashSlashComments(boolean flag) { + slashSlashComments = flag; + } + + public void slashStarComments(boolean flag) { + slashStarComments = flag; + } + + @Override + public String toString() { + // Values determined through experimentation + StringBuilder result = new StringBuilder(); + result.append("Token["); + switch (ttype) { + case TT_EOF: + result.append("EOF"); + break; + case TT_EOL: + result.append("EOL"); + break; + case TT_NUMBER: + result.append("n="); + result.append(nval); + break; + case TT_WORD: + result.append(sval); + break; + default: + if (ttype == TT_UNKNOWN || tokenTypes[ttype] == TOKEN_QUOTE) { + result.append(sval); + } else { + result.append('\''); + result.append((char) ttype); + result.append('\''); + } + } + result.append("], line "); + result.append(lineNumber); + return result.toString(); + } + + public void whitespaceChars(int low, int hi) { + if (low < 0) { + low = 0; + } + if (hi > tokenTypes.length) { + hi = tokenTypes.length - 1; + } + for (int i = low; i <= hi; i++) { + tokenTypes[i] = TOKEN_WHITE; + } + } + + public void wordChars(int low, int hi) { + if (low < 0) { + low = 0; + } + if (hi > tokenTypes.length) { + hi = tokenTypes.length - 1; + } + for (int i = low; i <= hi; i++) { + tokenTypes[i] |= TOKEN_WORD; + } + } +} diff --git a/tests/src/test/java/org/teavm/classlib/java/io/StreamTokenizerTest.java b/tests/src/test/java/org/teavm/classlib/java/io/StreamTokenizerTest.java new file mode 100644 index 000000000..7719aa01b --- /dev/null +++ b/tests/src/test/java/org/teavm/classlib/java/io/StreamTokenizerTest.java @@ -0,0 +1,336 @@ +/* + * Copyright 2017 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.io; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.io.StreamTokenizer; +import java.io.StringReader; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.teavm.junit.TeaVMTestRunner; + +@RunWith(TeaVMTestRunner.class) +public class StreamTokenizerTest { + private StreamTokenizer st; + + @Test + @SuppressWarnings("deprecation") + public void constructorLjava_io_InputStream() throws IOException { + st = new StreamTokenizer(new StringReader("/comments\n d 8 'h'")); + + assertEquals("the next token returned should be the letter d", StreamTokenizer.TT_WORD, st.nextToken()); + assertEquals("the next token returned should be the letter d", "d", st.sval); + + assertEquals("the next token returned should be the digit 8", StreamTokenizer.TT_NUMBER, st.nextToken()); + assertEquals("the next token returned should be the digit 8", 8.0, st.nval, 0.0001); + + assertEquals("the next token returned should be the quote character", 39, st.nextToken()); + assertEquals("the next token returned should be the quote character", "h", st.sval); + } + + @Test + public void constructorLjava_io_Reader() throws IOException { + setTest("/testing\n d 8 'h' "); + assertEquals("the next token returned should be the letter d skipping the comments", + StreamTokenizer.TT_WORD, st.nextToken()); + assertEquals("the next token returned should be the letter d", "d", st.sval); + + assertEquals("the next token returned should be the digit 8", StreamTokenizer.TT_NUMBER, st.nextToken()); + assertEquals("the next token returned should be the digit 8", 8.0, st.nval, 0.001); + + assertEquals("the next token returned should be the quote character", 39, st.nextToken()); + assertEquals("the next token returned should be the quote character", "h", st.sval); + } + + @Test + public void commentCharI() throws IOException { + setTest("*comment \n / 8 'h' "); + st.ordinaryChar('/'); + st.commentChar('*'); + assertEquals("nextToken() did not return the character / skiping the comments starting with *", + 47, st.nextToken()); + assertTrue("the next token returned should be the digit 8", + st.nextToken() == StreamTokenizer.TT_NUMBER && st.nval == 8.0); + assertTrue("the next token returned should be the quote character", + st.nextToken() == 39 && st.sval.equals("h")); + } + + @Test + public void eolIsSignificantZ() throws IOException { + setTest("d 8\n"); + // by default end of line characters are not significant + assertTrue("nextToken did not return d", st.nextToken() == StreamTokenizer.TT_WORD && st.sval.equals("d")); + assertTrue("nextToken did not return 8", st.nextToken() == StreamTokenizer.TT_NUMBER && st.nval == 8.0); + assertTrue("nextToken should be the end of file", st.nextToken() == StreamTokenizer.TT_EOF); + setTest("d\n"); + st.eolIsSignificant(true); + // end of line characters are significant + assertTrue("nextToken did not return d", st.nextToken() == StreamTokenizer.TT_WORD && st.sval.equals("d")); + assertTrue("nextToken is the end of line", st.nextToken() == StreamTokenizer.TT_EOL); + } + + @Test + public void lineno() throws IOException { + setTest("d\n 8\n"); + assertEquals("the lineno should be 1", 1, st.lineno()); + st.nextToken(); + st.nextToken(); + assertEquals("the lineno should be 2", 2, st.lineno()); + st.nextToken(); + assertEquals("the next line no should be 3", 3, st.lineno()); + } + + @Test + public void lowerCaseModeZ() throws Exception { + // SM. + setTest("HELLOWORLD"); + st.lowerCaseMode(true); + + st.nextToken(); + assertEquals("sval not converted to lowercase.", "helloworld", st.sval); + } + + @Test + @SuppressWarnings("deprecation") + public void nextToken() throws IOException { + // SM. + setTest("\r\n/* fje fje 43.4 f \r\n f g */ 456.459 \r\nHello / \t\r\n \r\n \n \r \257 Hi \'Hello World\'"); + st.ordinaryChar('/'); + st.slashStarComments(true); + st.nextToken(); + assertTrue("Wrong Token type1: " + (char) st.ttype, st.ttype == StreamTokenizer.TT_NUMBER); + st.nextToken(); + assertTrue("Wrong Token type2: " + st.ttype, st.ttype == StreamTokenizer.TT_WORD); + st.nextToken(); + assertTrue("Wrong Token type3: " + st.ttype, st.ttype == '/'); + st.nextToken(); + assertTrue("Wrong Token type4: " + st.ttype, st.ttype == StreamTokenizer.TT_WORD); + st.nextToken(); + assertTrue("Wrong Token type5: " + st.ttype, st.ttype == StreamTokenizer.TT_WORD); + st.nextToken(); + assertTrue("Wrong Token type6: " + st.ttype, st.ttype == '\''); + assertTrue("Wrong Token type7: " + st.ttype, st.sval.equals("Hello World")); + st.nextToken(); + assertTrue("Wrong Token type8: " + st.ttype, st.ttype == -1); + + StreamTokenizer s = new StreamTokenizer(new StringReader("hello\n\n\n")); + s.eolIsSignificant(true); + assertTrue("Wrong token 1,1", s.nextToken() == StreamTokenizer.TT_WORD && s.sval.equals("hello")); + assertTrue("Wrong token 1,2", s.nextToken() == '\n'); + assertTrue("Wrong token 1,3", s.nextToken() == '\n'); + assertTrue("Wrong token 1,4", s.nextToken() == '\n'); + assertTrue("Wrong token 1,5", s.nextToken() == StreamTokenizer.TT_EOF); + StreamTokenizer tokenizer = new StreamTokenizer(new StringReader("\n \r\n#")); + tokenizer.ordinaryChar('\n'); // make \n ordinary + tokenizer.eolIsSignificant(true); + assertTrue("Wrong token 2,1", tokenizer.nextToken() == '\n'); + assertTrue("Wrong token 2,2", tokenizer.nextToken() == '\n'); + assertEquals("Wrong token 2,3", '#', tokenizer.nextToken()); + } + + @Test + public void ordinaryCharI() throws IOException { + // SM. + setTest("Ffjein 893"); + st.ordinaryChar('F'); + st.nextToken(); + assertTrue("OrdinaryChar failed." + (char) st.ttype, st.ttype == 'F'); + } + + @Test + public void ordinaryCharsII() throws IOException { + setTest("azbc iof z 893"); + st.ordinaryChars('a', 'z'); + assertEquals("OrdinaryChars failed.", 'a', st.nextToken()); + assertEquals("OrdinaryChars failed.", 'z', st.nextToken()); + } + + @Test + public void parseNumbers() throws IOException { + // SM + setTest("9.9 678"); + assertTrue("Base behavior failed.", st.nextToken() == StreamTokenizer.TT_NUMBER); + st.ordinaryChars('0', '9'); + assertEquals("setOrdinary failed.", '6', st.nextToken()); + st.parseNumbers(); + assertTrue("parseNumbers failed.", st.nextToken() == StreamTokenizer.TT_NUMBER); + } + + @Test + public void pushBack() throws IOException { + // SM. + setTest("Hello 897"); + st.nextToken(); + st.pushBack(); + assertTrue("PushBack failed.", st.nextToken() == StreamTokenizer.TT_WORD); + } + + @Test + public void quoteCharI() throws IOException { + // SM + setTest("