Add StreamTokenizer

This commit is contained in:
Alexey Andreev 2017-11-16 13:44:03 +03:00
parent 7f152c0137
commit 82b96da215
2 changed files with 818 additions and 0 deletions

View File

@ -0,0 +1,482 @@
/*
* Copyright 2017 Alexey Andreev.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.teavm.classlib.java.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
 * Splits the characters of an {@link InputStream} or {@link Reader} into tokens:
 * numbers, words, quoted strings, ends of line and single ordinary characters.
 *
 * <p>How each character in the range 0..255 is treated is held in a per-character
 * table that callers adjust through {@link #wordChars}, {@link #whitespaceChars},
 * {@link #ordinaryChars}, {@link #commentChar}, {@link #quoteChar} and
 * {@link #parseNumbers}. Characters above 255 are always treated as word characters.
 *
 * <p>After each {@link #nextToken()} call the token type is available in
 * {@link #ttype} and its payload, if any, in {@link #sval} or {@link #nval}.
 */
public class TStreamTokenizer {
    /** Value of the current token when {@link #ttype} is {@link #TT_NUMBER}. */
    public double nval;

    /**
     * Text of the current token when it is a word or a quoted string. It is reset to
     * {@code null} at the start of every scan.
     */
    public String sval;

    /** Token type reported once the input is exhausted. */
    public static final int TT_EOF = -1;

    /** Token type reported for a line end when line ends are significant. */
    public static final int TT_EOL = '\n';

    /** Token type reported for a number; the value is in {@link #nval}. */
    public static final int TT_NUMBER = -2;

    /** Token type reported for a word; the text is in {@link #sval}. */
    public static final int TT_WORD = -3;

    /** Initial value of {@link #ttype}, before any token has been read. */
    private static final int TT_UNKNOWN = -4;

    /**
     * Type of the current token: one of the {@code TT_*} constants, the quote character
     * of a quoted string, or the character itself for an ordinary character.
     */
    public int ttype = TT_UNKNOWN;

    /**
     * Internal character meanings, 0 implies TOKEN_ORDINARY
     */
    private byte[] tokenTypes = new byte[256];

    private static final byte TOKEN_COMMENT = 1;
    private static final byte TOKEN_QUOTE = 2;
    private static final byte TOKEN_WHITE = 4;
    private static final byte TOKEN_WORD = 8;
    private static final byte TOKEN_DIGIT = 16;

    /** Value of {@link #peekChar} meaning "nothing buffered, read from the source". */
    private static final int NEED_CHAR = -2;

    private int lineNumber = 1;
    private boolean forceLowercase;
    private boolean isEOLSignificant;
    private boolean slashStarComments;
    private boolean slashSlashComments;
    private boolean pushBackToken;

    /** Set when a significant EOL was produced for a CR, so a directly following LF is dropped. */
    private boolean lastCr;

    /* One of these will have the stream */
    private InputStream inStream;
    private Reader inReader;

    /** One character of look-ahead, or {@link #NEED_CHAR} when nothing is buffered. */
    private int peekChar = NEED_CHAR;

    /**
     * Installs the default syntax table shared by both public constructors.
     */
    private TStreamTokenizer() {
        /*
         * Initialize the default state per specification. All byte values 'A'
         * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are
         * considered to be alphabetic.
         */
        wordChars('A', 'Z');
        wordChars('a', 'z');
        wordChars(160, 255);
        /*
         * All byte values '\u0000' through '\u0020' are considered to be white
         * space.
         */
        whitespaceChars(0, 32);
        /*
         * '/' is a comment character. Single quote '\'' and double quote '"'
         * are string quote characters.
         */
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        /*
         * Numbers are parsed.
         */
        parseNumbers();
        /*
         * Ends of lines are treated as white space, not as separate tokens.
         * C-style and C++-style comments are not recognized. These are the
         * defaults and are not needed in constructor.
         */
    }

    /**
     * Creates a tokenizer that reads raw bytes from {@code is}.
     *
     * @param is the byte source
     * @throws NullPointerException if {@code is} is {@code null}
     * @deprecated use {@link #TStreamTokenizer(Reader)} instead
     */
    @Deprecated
    public TStreamTokenizer(InputStream is) {
        this();
        if (is == null) {
            throw new NullPointerException();
        }
        inStream = is;
    }

    /**
     * Creates a tokenizer that reads characters from {@code r}.
     *
     * @param r the character source
     * @throws NullPointerException if {@code r} is {@code null}
     */
    public TStreamTokenizer(Reader r) {
        this();
        if (r == null) {
            throw new NullPointerException();
        }
        inReader = r;
    }

    /**
     * Makes {@code ch} start a comment that runs to the end of the line, replacing any
     * previous meaning of the character. Values outside 0..255 are ignored.
     *
     * @param ch the comment character
     */
    public void commentChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_COMMENT;
        }
    }

    /**
     * Chooses whether line ends are reported as {@link #TT_EOL} tokens or skipped as
     * white space.
     *
     * @param flag {@code true} to report line ends as tokens
     */
    public void eolIsSignificant(boolean flag) {
        isEOLSignificant = flag;
    }

    /**
     * Returns the current line number, starting at 1.
     *
     * @return the current line number
     */
    public int lineno() {
        return lineNumber;
    }

    /**
     * Chooses whether word tokens are converted to lower case before being stored in
     * {@link #sval}.
     *
     * @param flag {@code true} to lower-case word tokens
     */
    public void lowerCaseMode(boolean flag) {
        forceLowercase = flag;
    }

    /**
     * Scans the next token and stores its type in {@link #ttype}.
     *
     * <p>If {@link #pushBack()} was called since the last scan and a token has already
     * been read, that token is returned again without touching the source.
     *
     * @return the new value of {@link #ttype}
     * @throws IOException if reading from the underlying source fails
     */
    public int nextToken() throws IOException {
        if (pushBackToken) {
            pushBackToken = false;
            if (ttype != TT_UNKNOWN) {
                return ttype;
            }
        }
        sval = null; // Always reset sval to null

        int currentChar = peekChar == NEED_CHAR ? read() : peekChar;
        if (lastCr) {
            // The pending CR only pairs with the character read right after it. Clear the
            // flag unconditionally so a later, unrelated LF is not swallowed.
            lastCr = false;
            if (currentChar == '\n') {
                currentChar = read();
            }
        }
        if (currentChar == -1) {
            ttype = TT_EOF;
            return ttype;
        }

        byte currentType = typeOf(currentChar);
        while ((currentType & TOKEN_WHITE) != 0) {
            /*
             * Skip over white space until we hit a new line or a real token
             */
            if (currentChar == '\r') {
                lineNumber++;
                if (isEOLSignificant) {
                    lastCr = true;
                    peekChar = NEED_CHAR;
                    ttype = TT_EOL;
                    return ttype;
                }
                currentChar = read();
                if (currentChar == '\n') {
                    currentChar = read();
                }
            } else if (currentChar == '\n') {
                lineNumber++;
                if (isEOLSignificant) {
                    peekChar = NEED_CHAR;
                    ttype = TT_EOL;
                    return ttype;
                }
                currentChar = read();
            } else {
                // Advance over this white space character and try again.
                currentChar = read();
            }
            if (currentChar == -1) {
                // Remember the EOF. Otherwise a buffered line end would be scanned again on
                // the next call and counted as yet another line.
                peekChar = -1;
                ttype = TT_EOF;
                return ttype;
            }
            currentType = typeOf(currentChar);
        }

        /*
         * Check for digits before checking for words since digits can be
         * contained within words.
         */
        if ((currentType & TOKEN_DIGIT) != 0) {
            return readNumber(currentChar);
        }
        if ((currentType & TOKEN_WORD) != 0) {
            return readWord(currentChar);
        }
        if (currentType == TOKEN_QUOTE) {
            return readQuoted(currentChar);
        }

        // Do comments, both "//" and "/*stuff*/"
        if (currentChar == '/' && (slashSlashComments || slashStarComments)) {
            currentChar = read();
            if (currentChar == '*' && slashStarComments) {
                return skipBlockComment();
            } else if (currentChar == '/' && slashSlashComments) {
                return skipLineComment(read());
            } else if (currentType != TOKEN_COMMENT) {
                // Was just a slash by itself
                peekChar = currentChar;
                ttype = '/';
                return ttype;
            } else {
                // '/' is itself a comment character. The character read above already
                // belongs to the comment and may be the line end that terminates it, so it
                // has to be examined rather than skipped.
                return skipLineComment(currentChar);
            }
        }

        // Check for comment character
        if (currentType == TOKEN_COMMENT) {
            return skipLineComment(read());
        }

        peekChar = read();
        ttype = currentChar;
        return ttype;
    }

    /**
     * Looks up the meaning bits of a character; everything above 255 counts as a word
     * character.
     */
    private byte typeOf(int ch) {
        return ch > 255 ? TOKEN_WORD : tokenTypes[ch];
    }

    /**
     * Reads a number (or a lone '-') whose first character is {@code firstChar}.
     */
    private int readNumber(int firstChar) throws IOException {
        StringBuilder digits = new StringBuilder(20);
        boolean haveDecimal = false;
        boolean checkJustNegative = firstChar == '-';
        int currentChar = firstChar;
        while (true) {
            if (currentChar == '.') {
                haveDecimal = true;
            }
            digits.append((char) currentChar);
            currentChar = read();
            // Continue on digits, and on the first '.' only.
            if ((currentChar < '0' || currentChar > '9')
                    && (haveDecimal || currentChar != '.')) {
                break;
            }
        }
        peekChar = currentChar;
        if (checkJustNegative && digits.length() == 1) {
            // Didn't get any other digits other than '-'
            ttype = '-';
            return ttype;
        }
        try {
            nval = Double.parseDouble(digits.toString());
        } catch (NumberFormatException e) {
            // Sequences such as "." or "-." are still reported as a number, with value 0.
            nval = 0;
        }
        ttype = TT_NUMBER;
        return ttype;
    }

    /**
     * Reads a word whose first character is {@code firstChar}; the word continues over
     * word and digit characters.
     */
    private int readWord(int firstChar) throws IOException {
        StringBuilder word = new StringBuilder(20);
        int currentChar = firstChar;
        while (true) {
            word.append((char) currentChar);
            currentChar = read();
            if (currentChar == -1
                    || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0)) {
                break;
            }
        }
        peekChar = currentChar;
        sval = forceLowercase ? word.toString().toLowerCase() : word.toString();
        ttype = TT_WORD;
        return ttype;
    }

    /**
     * Reads a quoted string opened by {@code quoteChar}. The string ends at the matching
     * quote, at a line end or at end of input; backslash escapes are decoded.
     */
    private int readQuoted(int quoteChar) throws IOException {
        StringBuilder quoteString = new StringBuilder();
        int peekOne = read();
        while (peekOne >= 0 && peekOne != quoteChar && peekOne != '\r' && peekOne != '\n') {
            boolean readPeek = true;
            if (peekOne == '\\') {
                int c1 = read();
                // Check for quoted octal IE: \377
                if (c1 <= '7' && c1 >= '0') {
                    int digitValue = c1 - '0';
                    c1 = read();
                    if (c1 > '7' || c1 < '0') {
                        readPeek = false;
                    } else {
                        digitValue = digitValue * 8 + (c1 - '0');
                        c1 = read();
                        // limit the digit value to a byte
                        if (digitValue > 31 || c1 > '7' || c1 < '0') {
                            readPeek = false;
                        } else {
                            digitValue = digitValue * 8 + (c1 - '0');
                        }
                    }
                    if (!readPeek) {
                        // We've consumed one too many
                        quoteString.append((char) digitValue);
                        peekOne = c1;
                    } else {
                        peekOne = digitValue;
                    }
                } else {
                    switch (c1) {
                        case 'a':
                            peekOne = 0x7;
                            break;
                        case 'b':
                            peekOne = 0x8;
                            break;
                        case 'f':
                            peekOne = 0xc;
                            break;
                        case 'n':
                            peekOne = 0xA;
                            break;
                        case 'r':
                            peekOne = 0xD;
                            break;
                        case 't':
                            peekOne = 0x9;
                            break;
                        case 'v':
                            peekOne = 0xB;
                            break;
                        default:
                            peekOne = c1;
                    }
                }
            }
            if (readPeek) {
                quoteString.append((char) peekOne);
                peekOne = read();
            }
        }
        if (peekOne == quoteChar) {
            peekOne = read();
        }
        peekChar = peekOne;
        ttype = quoteChar;
        sval = quoteString.toString();
        return ttype;
    }

    /**
     * Skips the body of a slash-star comment whose opener has already been consumed,
     * counting the lines it spans, then scans the token that follows it.
     */
    private int skipBlockComment() throws IOException {
        int peekOne = read();
        while (true) {
            int currentChar = peekOne;
            peekOne = read();
            if (currentChar == -1) {
                peekChar = -1;
                ttype = TT_EOF;
                return ttype;
            }
            if (currentChar == '\r') {
                if (peekOne == '\n') {
                    peekOne = read();
                }
                lineNumber++;
            } else if (currentChar == '\n') {
                lineNumber++;
            } else if (currentChar == '*' && peekOne == '/') {
                peekChar = read();
                return nextToken();
            }
        }
    }

    /**
     * Skips to EOF or new line, starting with {@code currentChar} as the first character
     * to examine, then returns the next token. The terminating character is left in the
     * look-ahead so that the line is counted by the regular scan.
     */
    private int skipLineComment(int currentChar) throws IOException {
        while (currentChar >= 0 && currentChar != '\r' && currentChar != '\n') {
            currentChar = read();
        }
        peekChar = currentChar;
        return nextToken();
    }

    /**
     * Makes {@code ch} an ordinary character that is returned as a token of its own.
     * Values outside 0..255 are ignored.
     *
     * @param ch the character to make ordinary
     */
    public void ordinaryChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = 0;
        }
    }

    /**
     * Makes every character in {@code low..hi} (inclusive) ordinary. The range is
     * clipped to 0..255.
     *
     * @param low first character of the range
     * @param hi last character of the range
     */
    public void ordinaryChars(int low, int hi) {
        int last = clampHigh(hi);
        for (int i = Math.max(low, 0); i <= last; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Marks the digits, '.' and '-' as number characters in addition to whatever
     * meaning they already have.
     */
    public void parseNumbers() {
        for (int i = '0'; i <= '9'; i++) {
            tokenTypes[i] |= TOKEN_DIGIT;
        }
        tokenTypes['.'] |= TOKEN_DIGIT;
        tokenTypes['-'] |= TOKEN_DIGIT;
    }

    /**
     * Requests that the next {@link #nextToken()} call return the current token again.
     */
    public void pushBack() {
        pushBackToken = true;
    }

    /**
     * Makes {@code ch} a string quote character, replacing any previous meaning of the
     * character. Values outside 0..255 are ignored.
     *
     * @param ch the quote character
     */
    public void quoteChar(int ch) {
        if (0 <= ch && ch < tokenTypes.length) {
            tokenTypes[ch] = TOKEN_QUOTE;
        }
    }

    /**
     * Reads one character (or byte) from whichever source this tokenizer was built on.
     */
    private int read() throws IOException {
        // Call the read for the appropriate stream
        if (inStream == null) {
            return inReader.read();
        }
        return inStream.read();
    }

    /**
     * Makes every character in 0..255 ordinary.
     */
    public void resetSyntax() {
        for (int i = 0; i < 256; i++) {
            tokenTypes[i] = 0;
        }
    }

    /**
     * Chooses whether "//" starts a comment that runs to the end of the line.
     *
     * @param flag {@code true} to recognize such comments
     */
    public void slashSlashComments(boolean flag) {
        slashSlashComments = flag;
    }

    /**
     * Chooses whether slash-star ... star-slash comments are recognized and skipped.
     *
     * @param flag {@code true} to recognize such comments
     */
    public void slashStarComments(boolean flag) {
        slashStarComments = flag;
    }

    /**
     * Describes the current token and line number, e.g. {@code Token[ABC], line 1}.
     *
     * @return a textual description of the tokenizer state
     */
    @Override
    public String toString() {
        // Values determined through experimentation
        StringBuilder result = new StringBuilder();
        result.append("Token[");
        switch (ttype) {
            case TT_EOF:
                result.append("EOF");
                break;
            case TT_EOL:
                result.append("EOL");
                break;
            case TT_NUMBER:
                result.append("n=");
                result.append(nval);
                break;
            case TT_WORD:
                result.append(sval);
                break;
            default:
                // ttype is a public field, so it may hold a value outside the table.
                boolean inTable = ttype >= 0 && ttype < tokenTypes.length;
                if (ttype == TT_UNKNOWN || (inTable && tokenTypes[ttype] == TOKEN_QUOTE)) {
                    result.append(sval);
                } else {
                    result.append('\'');
                    result.append((char) ttype);
                    result.append('\'');
                }
        }
        result.append("], line ");
        result.append(lineNumber);
        return result.toString();
    }

    /**
     * Makes every character in {@code low..hi} (inclusive) white space, replacing any
     * previous meaning. The range is clipped to 0..255.
     *
     * @param low first character of the range
     * @param hi last character of the range
     */
    public void whitespaceChars(int low, int hi) {
        int last = clampHigh(hi);
        for (int i = Math.max(low, 0); i <= last; i++) {
            tokenTypes[i] = TOKEN_WHITE;
        }
    }

    /**
     * Marks every character in {@code low..hi} (inclusive) as a word character in
     * addition to whatever meaning it already has. The range is clipped to 0..255.
     *
     * @param low first character of the range
     * @param hi last character of the range
     */
    public void wordChars(int low, int hi) {
        int last = clampHigh(hi);
        for (int i = Math.max(low, 0); i <= last; i++) {
            tokenTypes[i] |= TOKEN_WORD;
        }
    }

    /**
     * Clips the upper bound of a character range to the last index of the type table.
     */
    private int clampHigh(int hi) {
        return hi >= tokenTypes.length ? tokenTypes.length - 1 : hi;
    }
}

View File

@ -0,0 +1,336 @@
/*
* Copyright 2017 Alexey Andreev.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.teavm.classlib.java.io;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.teavm.junit.TeaVMTestRunner;
/**
 * Tests for {@link StreamTokenizer}, executed through {@link TeaVMTestRunner}.
 *
 * <p>Most tests share the {@link #st} field, which is (re)created by
 * {@link #setTest(String)} from the text to tokenize.
 */
@RunWith(TeaVMTestRunner.class)
public class StreamTokenizerTest {
    /** Tokenizer under test; replaced by every {@link #setTest(String)} call. */
    private StreamTokenizer st;

    /**
     * Checks the deprecated byte-stream constructor with the default syntax: the leading
     * comment is skipped, then a word, a number and a quoted string follow.
     */
    @Test
    @SuppressWarnings("deprecation")
    public void constructorLjava_io_InputStream() throws IOException {
        // Feed real bytes so that the InputStream constructor is the one being exercised.
        st = new StreamTokenizer(new ByteArrayInputStream(asciiBytes("/comments\n d 8 'h'")));
        assertEquals("the next token returned should be the letter d", StreamTokenizer.TT_WORD, st.nextToken());
        assertEquals("the next token returned should be the letter d", "d", st.sval);
        assertEquals("the next token returned should be the digit 8", StreamTokenizer.TT_NUMBER, st.nextToken());
        assertEquals("the next token returned should be the digit 8", 8.0, st.nval, 0.0001);
        assertEquals("the next token returned should be the quote character", 39, st.nextToken());
        assertEquals("the next token returned should be the quote character", "h", st.sval);
    }

    /**
     * Checks the Reader constructor with the default syntax.
     */
    @Test
    public void constructorLjava_io_Reader() throws IOException {
        setTest("/testing\n d 8 'h' ");
        assertEquals("the next token returned should be the letter d skipping the comments",
                StreamTokenizer.TT_WORD, st.nextToken());
        assertEquals("the next token returned should be the letter d", "d", st.sval);
        assertEquals("the next token returned should be the digit 8", StreamTokenizer.TT_NUMBER, st.nextToken());
        assertEquals("the next token returned should be the digit 8", 8.0, st.nval, 0.001);
        assertEquals("the next token returned should be the quote character", 39, st.nextToken());
        assertEquals("the next token returned should be the quote character", "h", st.sval);
    }

    /**
     * Makes '*' the comment character and '/' ordinary, then expects the '*' line to be
     * skipped and '/' to come back as a token.
     */
    @Test
    public void commentCharI() throws IOException {
        setTest("*comment \n / 8 'h' ");
        st.ordinaryChar('/');
        st.commentChar('*');
        assertEquals("nextToken() did not return the character / skiping the comments starting with *",
                47, st.nextToken());
        assertTrue("the next token returned should be the digit 8",
                st.nextToken() == StreamTokenizer.TT_NUMBER && st.nval == 8.0);
        assertTrue("the next token returned should be the quote character",
                st.nextToken() == 39 && st.sval.equals("h"));
    }

    /**
     * Checks that line ends are skipped by default and reported as TT_EOL once
     * eolIsSignificant(true) has been called.
     */
    @Test
    public void eolIsSignificantZ() throws IOException {
        setTest("d 8\n");
        // by default end of line characters are not significant
        assertTrue("nextToken did not return d", st.nextToken() == StreamTokenizer.TT_WORD && st.sval.equals("d"));
        assertTrue("nextToken did not return 8", st.nextToken() == StreamTokenizer.TT_NUMBER && st.nval == 8.0);
        assertTrue("nextToken should be the end of file", st.nextToken() == StreamTokenizer.TT_EOF);
        setTest("d\n");
        st.eolIsSignificant(true);
        // end of line characters are significant
        assertTrue("nextToken did not return d", st.nextToken() == StreamTokenizer.TT_WORD && st.sval.equals("d"));
        assertTrue("nextToken is the end of line", st.nextToken() == StreamTokenizer.TT_EOL);
    }

    /**
     * Checks that lineno() advances as line ends are consumed.
     */
    @Test
    public void lineno() throws IOException {
        setTest("d\n 8\n");
        assertEquals("the lineno should be 1", 1, st.lineno());
        st.nextToken();
        st.nextToken();
        assertEquals("the lineno should be 2", 2, st.lineno());
        st.nextToken();
        assertEquals("the next line no should be 3", 3, st.lineno());
    }

    /**
     * Checks that lowerCaseMode(true) lower-cases word tokens.
     */
    @Test
    public void lowerCaseModeZ() throws Exception {
        // SM.
        setTest("HELLOWORLD");
        st.lowerCaseMode(true);
        st.nextToken();
        assertEquals("sval not converted to lowercase.", "helloworld", st.sval);
    }

    /**
     * Walks a mixed input (block comment, number, words, ordinary '/', quoted string),
     * then checks significant line ends, including the case where '\n' is made ordinary.
     */
    @Test
    public void nextToken() throws IOException {
        // SM.
        setTest("\r\n/* fje fje 43.4 f \r\n f g */ 456.459 \r\nHello / \t\r\n \r\n \n \r \257 Hi \'Hello World\'");
        st.ordinaryChar('/');
        st.slashStarComments(true);
        st.nextToken();
        assertTrue("Wrong Token type1: " + (char) st.ttype, st.ttype == StreamTokenizer.TT_NUMBER);
        st.nextToken();
        assertTrue("Wrong Token type2: " + st.ttype, st.ttype == StreamTokenizer.TT_WORD);
        st.nextToken();
        assertTrue("Wrong Token type3: " + st.ttype, st.ttype == '/');
        st.nextToken();
        assertTrue("Wrong Token type4: " + st.ttype, st.ttype == StreamTokenizer.TT_WORD);
        st.nextToken();
        assertTrue("Wrong Token type5: " + st.ttype, st.ttype == StreamTokenizer.TT_WORD);
        st.nextToken();
        assertTrue("Wrong Token type6: " + st.ttype, st.ttype == '\'');
        assertTrue("Wrong Token type7: " + st.ttype, st.sval.equals("Hello World"));
        st.nextToken();
        assertTrue("Wrong Token type8: " + st.ttype, st.ttype == -1);

        StreamTokenizer s = new StreamTokenizer(new StringReader("hello\n\n\n"));
        s.eolIsSignificant(true);
        assertTrue("Wrong token 1,1", s.nextToken() == StreamTokenizer.TT_WORD && s.sval.equals("hello"));
        assertTrue("Wrong token 1,2", s.nextToken() == '\n');
        assertTrue("Wrong token 1,3", s.nextToken() == '\n');
        assertTrue("Wrong token 1,4", s.nextToken() == '\n');
        assertTrue("Wrong token 1,5", s.nextToken() == StreamTokenizer.TT_EOF);

        StreamTokenizer tokenizer = new StreamTokenizer(new StringReader("\n \r\n#"));
        tokenizer.ordinaryChar('\n'); // make \n ordinary
        tokenizer.eolIsSignificant(true);
        assertTrue("Wrong token 2,1", tokenizer.nextToken() == '\n');
        assertTrue("Wrong token 2,2", tokenizer.nextToken() == '\n');
        assertEquals("Wrong token 2,3", '#', tokenizer.nextToken());
    }

    /**
     * Checks that a letter made ordinary is returned as a single-character token.
     */
    @Test
    public void ordinaryCharI() throws IOException {
        // SM.
        setTest("Ffjein 893");
        st.ordinaryChar('F');
        st.nextToken();
        assertTrue("OrdinaryChar failed." + (char) st.ttype, st.ttype == 'F');
    }

    /**
     * Checks that a whole range made ordinary yields single-character tokens.
     */
    @Test
    public void ordinaryCharsII() throws IOException {
        setTest("azbc iof z 893");
        st.ordinaryChars('a', 'z');
        assertEquals("OrdinaryChars failed.", 'a', st.nextToken());
        assertEquals("OrdinaryChars failed.", 'z', st.nextToken());
    }

    /**
     * Checks that number parsing can be switched off with ordinaryChars and back on
     * with parseNumbers.
     */
    @Test
    public void parseNumbers() throws IOException {
        // SM
        setTest("9.9 678");
        assertTrue("Base behavior failed.", st.nextToken() == StreamTokenizer.TT_NUMBER);
        st.ordinaryChars('0', '9');
        assertEquals("setOrdinary failed.", '6', st.nextToken());
        st.parseNumbers();
        assertTrue("parseNumbers failed.", st.nextToken() == StreamTokenizer.TT_NUMBER);
    }

    /**
     * Checks that pushBack() makes the next nextToken() return the same token again.
     */
    @Test
    public void pushBack() throws IOException {
        // SM.
        setTest("Hello 897");
        st.nextToken();
        st.pushBack();
        assertTrue("PushBack failed.", st.nextToken() == StreamTokenizer.TT_WORD);
    }

    /**
     * Checks custom quote characters, including a letter turned into a quote.
     */
    @Test
    public void quoteCharI() throws IOException {
        // SM
        setTest("<Hello World< HelloWorldH");
        st.quoteChar('<');
        assertEquals("QuoteChar failed.", '<', st.nextToken());
        assertEquals("QuoteChar failed.", "Hello World", st.sval);
        st.quoteChar('H');
        st.nextToken();
        assertEquals("QuoteChar failed for word.", "elloWorld", st.sval);
    }

    /**
     * Checks that after resetSyntax() every character, including blanks, digits and
     * quotes, is returned as an ordinary single-character token.
     */
    @Test
    public void resetSyntax() throws IOException {
        // SM
        setTest("H 9\' ello World");
        st.resetSyntax();
        // assertEquals reports the token actually returned; the former messages were built
        // from st.ttype before nextToken() ran and therefore showed the previous token.
        assertEquals("resetSyntax failed1.", 'H', st.nextToken());
        assertEquals("resetSyntax failed1.", ' ', st.nextToken());
        assertEquals("resetSyntax failed2.", '9', st.nextToken());
        assertEquals("resetSyntax failed3.", '\'', st.nextToken());
    }

    /**
     * Checks "//" comments while '/' itself is ordinary: the first line is skipped and
     * the single '/' on the second line is returned.
     */
    @Test
    public void slashSlashCommentsZ() throws IOException {
        // SM.
        setTest("// foo \r\n /fiji \r\n -456");
        st.ordinaryChar('/');
        st.slashSlashComments(true);
        assertEquals("Test failed.", '/', st.nextToken());
        assertTrue("Test failed.", st.nextToken() == StreamTokenizer.TT_WORD);
    }

    /**
     * With "//" comments enabled, everything after "//" is dropped.
     */
    @Test
    public void slashSlashComments_withSSOpen() throws IOException {
        Reader reader = new StringReader("t // t t t");
        StreamTokenizer st = new StreamTokenizer(reader);
        st.slashSlashComments(true);
        assertEquals(StreamTokenizer.TT_WORD, st.nextToken());
        assertEquals(StreamTokenizer.TT_EOF, st.nextToken());
    }

    /**
     * With "//" comments enabled and '/' ordinary, an input consisting only of a
     * comment yields EOF.
     */
    @Test
    public void slashSlashComments_withSSOpen_NoComment() throws IOException {
        Reader reader = new StringReader("// t");
        StreamTokenizer st = new StreamTokenizer(reader);
        st.slashSlashComments(true);
        st.ordinaryChar('/');
        assertEquals(StreamTokenizer.TT_EOF, st.nextToken());
    }

    /**
     * With "//" comments disabled and '/' ordinary, both slashes are returned as tokens.
     */
    @Test
    public void slashSlashComments_withSSClosed() throws IOException {
        Reader reader = new StringReader("// t");
        StreamTokenizer st = new StreamTokenizer(reader);
        st.slashSlashComments(false);
        st.ordinaryChar('/');
        assertEquals('/', st.nextToken());
        assertEquals('/', st.nextToken());
        assertEquals(StreamTokenizer.TT_WORD, st.nextToken());
    }

    /**
     * Checks that a multi-line block comment is skipped when such comments are enabled.
     */
    @Test
    public void slashStarCommentsZ() throws IOException {
        setTest("/* foo \r\n /fiji \r\n*/ -456");
        st.ordinaryChar('/');
        st.slashStarComments(true);
        assertTrue("Test failed.", st.nextToken() == StreamTokenizer.TT_NUMBER);
    }

    /**
     * With block comments enabled, the words on both sides of the comment are returned.
     */
    @Test
    public void slashStarComments_withSTOpen() throws IOException {
        Reader reader = new StringReader("t /* t */ t");
        StreamTokenizer st = new StreamTokenizer(reader);
        st.slashStarComments(true);
        assertEquals(StreamTokenizer.TT_WORD, st.nextToken());
        assertEquals(StreamTokenizer.TT_WORD, st.nextToken());
        assertEquals(StreamTokenizer.TT_EOF, st.nextToken());
    }

    /**
     * With block comments disabled, the default '/' comment character swallows the rest
     * of the line.
     */
    @Test
    public void slashStarComments_withSTClosed() throws IOException {
        Reader reader = new StringReader("t /* t */ t");
        StreamTokenizer st = new StreamTokenizer(reader);
        st.slashStarComments(false);
        assertEquals(StreamTokenizer.TT_WORD, st.nextToken());
        assertEquals(StreamTokenizer.TT_EOF, st.nextToken());
    }

    /**
     * Checks the textual form of a word token and of a lone '-' read from a byte stream.
     */
    @SuppressWarnings("deprecation")
    @Test
    public void test_toString() throws IOException {
        setTest("ABC Hello World");
        st.nextToken();
        assertTrue("toString failed." + st.toString(), st.toString().equals("Token[ABC], line 1"));
        // Regression test for HARMONY-4070
        byte[] data = new byte[] { (byte) '-' };
        StreamTokenizer tokenizer = new StreamTokenizer(new ByteArrayInputStream(data));
        tokenizer.nextToken();
        String result = tokenizer.toString();
        assertEquals("Token['-'], line 1", result);
    }

    /**
     * Checks that letters turned into white space are skipped, so the first token is the
     * number.
     */
    @Test
    public void whitespaceCharsII() throws IOException {
        setTest("azbc iof z 893");
        st.whitespaceChars('a', 'z');
        assertTrue("WhitespaceChars failed.", st.nextToken() == StreamTokenizer.TT_NUMBER);
    }

    /**
     * Checks wordChars for digits, for the blank and for the quote character.
     */
    @Test
    public void wordCharsII() throws IOException {
        setTest("A893 -9B87");
        st.wordChars('0', '9');
        assertTrue("WordChar failed1.",
                st.nextToken() == StreamTokenizer.TT_WORD);
        assertEquals("WordChar failed2.", "A893", st.sval);
        assertTrue("WordChar failed3.",
                st.nextToken() == StreamTokenizer.TT_NUMBER);
        st.nextToken();
        assertEquals("WordChar failed4.", "B87", st.sval);
        setTest(" Hello World");
        st.wordChars(' ', ' ');
        st.nextToken();
        assertEquals("WordChars failed for whitespace.", "Hello World", st.sval);
        setTest(" Hello World\r\n \'Hello World\' Hello\' World");
        st.wordChars(' ', ' ');
        st.wordChars('\'', '\'');
        st.nextToken();
        assertTrue("WordChars failed for whitespace: " + st.sval, st.sval.equals("Hello World"));
        st.nextToken();
        assertTrue("WordChars failed for quote1: " + st.sval, st.sval.equals("\'Hello World\' Hello\' World"));
    }

    /**
     * Points {@link #st} at a fresh tokenizer over {@code s}.
     */
    private void setTest(String s) {
        st = new StreamTokenizer(new StringReader(s));
    }

    /**
     * Converts text to one byte per character by narrowing each char, which is exact
     * for the ASCII-only inputs used here and needs no charset support.
     */
    private static byte[] asciiBytes(String s) {
        byte[] bytes = new byte[s.length()];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = (byte) s.charAt(i);
        }
        return bytes;
    }
}