From aa2451c3e367fc220c9b5a68d617ac53e473eccb Mon Sep 17 00:00:00 2001 From: konsoletyper Date: Sun, 22 Mar 2015 18:15:48 +0300 Subject: [PATCH] Replacing old TeaVM-based charsets with NIO charsets --- .../classlib/impl/charset/UTF16Helper.java | 61 ----------- .../classlib/impl/charset/UTF8Charset.java | 14 +-- .../classlib/java/io/TInputStreamReader.java | 76 ++++++------- .../java/lang/TAbstractStringBuilder.java | 7 +- .../teavm/classlib/java/lang/TCharacter.java | 41 ++++--- .../org/teavm/classlib/java/lang/TString.java | 100 ++++++++---------- .../java/nio/charset/TCharsetDecoder.java | 16 ++- .../java/nio/charset/impl/TUTF8Decoder.java | 33 +++++- .../java/nio/charset/impl/TUTF8Encoder.java | 9 +- .../classlib/java/nio/charset/UTF8Test.java | 74 ++++++++++++- 10 files changed, 234 insertions(+), 197 deletions(-) delete mode 100644 teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF16Helper.java diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF16Helper.java b/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF16Helper.java deleted file mode 100644 index 12fc86a3a..000000000 --- a/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF16Helper.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2013 Alexey Andreev. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.teavm.classlib.impl.charset; - -/** - * - * @author Alexey Andreev - */ -public class UTF16Helper { - public static final int SURROGATE_NEUTRAL_BIT_MASK = 0xF800; - public static final int SURROGATE_BITS = 0xD800; - public static final int SURROGATE_BIT_MASK = 0xFC00; - public static final int SURROGATE_BIT_INV_MASK = 0x03FF; - public static final int HIGH_SURROGATE_BITS = 0xD800; - public static final int LOW_SURROGATE_BITS = 0xDC00; - public static final int MEANINGFUL_SURROGATE_BITS = 10; - public static final int SUPPLEMENTARY_PLANE = 0x10000; - - public static char highSurrogate(int codePoint) { - codePoint -= SUPPLEMENTARY_PLANE; - return (char)(HIGH_SURROGATE_BITS | (codePoint >> MEANINGFUL_SURROGATE_BITS) & SURROGATE_BIT_INV_MASK); - } - - public static char lowSurrogate(int codePoint) { - return (char)(LOW_SURROGATE_BITS | codePoint & SURROGATE_BIT_INV_MASK); - } - - public static boolean isHighSurrogate(char c) { - return (c & SURROGATE_BIT_MASK) == HIGH_SURROGATE_BITS; - } - - public static boolean isLowSurrogate(char c) { - return (c & SURROGATE_BIT_MASK) == LOW_SURROGATE_BITS; - } - - public static boolean isSurrogatePair(char a, char b) { - return isHighSurrogate(a) && isLowSurrogate(b); - } - - public static int buildCodePoint(char a, char b) { - return (((a & SURROGATE_BIT_INV_MASK) << MEANINGFUL_SURROGATE_BITS) | (b & SURROGATE_BIT_INV_MASK)) + - SUPPLEMENTARY_PLANE; - } - - public static boolean isSurrogate(char c) { - return (c & SURROGATE_NEUTRAL_BIT_MASK) == SURROGATE_BITS; - } -} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF8Charset.java b/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF8Charset.java index 3c2aeb11a..1eddfa602 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF8Charset.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/impl/charset/UTF8Charset.java @@ -29,17 +29,17 @@ public class UTF8Charset extends Charset { } else if (ch < 0x400) { dest.put((byte)(0xC0 | (ch >> 6))); dest.put((byte)(0x80 | (ch & 0x3F))); - } else if (!UTF16Helper.isSurrogate(ch)) { + } else if (!Character.isSurrogate(ch)) { dest.put((byte)(0xE0 | (ch >> 12))); dest.put((byte)(0x80 | ((ch >> 6) & 0x3F))); dest.put((byte)(0x80 | (ch & 0x3F))); - } else if (UTF16Helper.isHighSurrogate(ch)) { + } else if (Character.isHighSurrogate(ch)) { char low = source.get(); - if (!UTF16Helper.isLowSurrogate(low)) { + if (!Character.isLowSurrogate(low)) { source.back(1); dest.put((byte)'?'); } else { - int codePoint = UTF16Helper.buildCodePoint(ch, low); + int codePoint = Character.toCodePoint(ch, low); dest.put((byte)(0xF0 | (codePoint >> 18))); dest.put((byte)(0x80 | ((codePoint >> 12) & 0x3F))); dest.put((byte)(0x80 | ((codePoint >> 6) & 0x3F))); @@ -72,7 +72,7 @@ public class UTF8Charset extends Charset { byte b2 = source.get(); byte b3 = source.get(); char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F)); - dest.put(!UTF16Helper.isHighSurrogate(c) ? c : '?'); + dest.put(!Character.isHighSurrogate(c) ? c : '?'); } else if ((b & 0xF8) == 0xF0) { if (source.available() < 3) { source.skip(source.available()); @@ -83,8 +83,8 @@ public class UTF8Charset extends Charset { byte b3 = source.get(); byte b4 = source.get(); int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F); - dest.put(UTF16Helper.highSurrogate(code)); - dest.put(UTF16Helper.lowSurrogate(code)); + dest.put(Character.highSurrogate(code)); + dest.put(Character.lowSurrogate(code)); } } } diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/io/TInputStreamReader.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/io/TInputStreamReader.java index 891fb473b..08b6054e1 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/io/TInputStreamReader.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/io/TInputStreamReader.java @@ -15,11 +15,13 @@ */ package org.teavm.classlib.java.io; -import org.teavm.classlib.impl.charset.ByteBuffer; -import org.teavm.classlib.impl.charset.CharBuffer; -import org.teavm.classlib.impl.charset.Charset; -import org.teavm.classlib.impl.charset.UTF8Charset; import org.teavm.classlib.java.lang.TString; +import org.teavm.classlib.java.nio.TByteBuffer; +import org.teavm.classlib.java.nio.TCharBuffer; +import org.teavm.classlib.java.nio.charset.TCharset; +import org.teavm.classlib.java.nio.charset.TCharsetDecoder; +import org.teavm.classlib.java.nio.charset.TCodingErrorAction; +import org.teavm.classlib.java.nio.charset.impl.TUTF8Charset; /** * @@ -27,30 +29,30 @@ import org.teavm.classlib.java.lang.TString; */ public class TInputStreamReader extends TReader { private TInputStream stream; - private Charset charset; + private TCharset charset; private TString charsetName; private byte[] inData = new byte[8192]; - private ByteBuffer inBuffer = new ByteBuffer(inData); + private TByteBuffer inBuffer = TByteBuffer.wrap(inData); private char[] outData = new char[1024]; - private CharBuffer outBuffer = new CharBuffer(outData); + private TCharBuffer outBuffer = TCharBuffer.wrap(outData); private boolean streamEof; private boolean eof; public TInputStreamReader(TInputStream in, TString charsetName) { - this(in, Charset.get(charsetName.toString())); + this(in, TCharset.forName(charsetName.toString())); this.charsetName = charsetName; } public TInputStreamReader(TInputStream in) { - this(in, new UTF8Charset()); + this(in, new TUTF8Charset()); charsetName = TString.wrap("UTF-8"); } - private TInputStreamReader(TInputStream in, Charset charset) { + public TInputStreamReader(TInputStream in, TCharset charset) { this.stream = in; this.charset = charset; - outBuffer.skip(outBuffer.available()); - inBuffer.skip(inBuffer.available()); + outBuffer.position(outBuffer.limit()); + inBuffer.position(inBuffer.limit()); } public TString getEncoding() { @@ -64,10 +66,10 @@ public class TInputStreamReader extends TReader { @Override public int read() throws TIOException { - if (eof && outBuffer.end()) { + if (eof && !outBuffer.hasRemaining()) { return -1; } - if (!outBuffer.end()) { + if (outBuffer.hasRemaining()) { return outBuffer.get(); } return fillBuffer() ? outBuffer.get() : -1; @@ -75,37 +77,40 @@ public class TInputStreamReader extends TReader { @Override public int read(char[] cbuf, int off, int len) throws TIOException { - if (eof && outBuffer.end()) { + if (eof && !outBuffer.hasRemaining()) { return -1; } - CharBuffer wrapBuffer = new CharBuffer(cbuf, off, off + len); - while (!wrapBuffer.end()) { - wrapBuffer.put(outBuffer); - if (outBuffer.end() && !fillBuffer()) { + int bytesRead = 0; + while (len > 0) { + int sz = Math.min(len, outBuffer.remaining()); + outBuffer.get(cbuf, off + bytesRead, sz); + len -= sz; + bytesRead += sz; + if (!outBuffer.hasRemaining() && !fillBuffer()) { break; } } - return wrapBuffer.position() - off; + return bytesRead; } private boolean fillBuffer() throws TIOException { if (eof) { return false; } - CharBuffer newBuffer = new CharBuffer(outData); - newBuffer.put(outBuffer); + outBuffer.compact(); + TCharsetDecoder decoder = charset.newDecoder() + .onMalformedInput(TCodingErrorAction.REPLACE) + .onUnmappableCharacter(TCodingErrorAction.IGNORE); while (true) { - if (inBuffer.end() && !fillReadBuffer()) { + if (!inBuffer.hasRemaining() && !fillReadBuffer()) { eof = true; break; } - int oldAvail = newBuffer.available(); - charset.decode(inBuffer, newBuffer); - if (oldAvail == newBuffer.available()) { + if (decoder.decode(inBuffer, outBuffer, eof).isOverflow()) { break; } } - outBuffer = new CharBuffer(outData, 0, newBuffer.position()); + outBuffer.flip(); return true; } @@ -113,30 +118,25 @@ public class TInputStreamReader extends TReader { if (streamEof) { return false; } - int off = 0; - while (!inBuffer.end()) { - inData[off] = inBuffer.get(); - } - inBuffer.rewind(0); - while (off < inData.length) { - int bytesRead = stream.read(inData, off, inData.length - off); + inBuffer.compact(); + while (inBuffer.hasRemaining()) { + int bytesRead = stream.read(inBuffer.array(), inBuffer.position(), inBuffer.remaining()); if (bytesRead == -1) { streamEof = true; - inBuffer = new ByteBuffer(inData, 0, inBuffer.position()); break; } else { - off += bytesRead; + inBuffer.position(inBuffer.position() + bytesRead); if (bytesRead == 0) { break; } } } - inBuffer = new ByteBuffer(inData, 0, off); + inBuffer.flip(); return true; } @Override public boolean ready() throws TIOException { - return !outBuffer.end() || inBuffer.end(); + return outBuffer.hasRemaining() || inBuffer.hasRemaining(); } } diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TAbstractStringBuilder.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TAbstractStringBuilder.java index ee3d6e04c..fdd03076b 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TAbstractStringBuilder.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TAbstractStringBuilder.java @@ -15,7 +15,6 @@ */ package org.teavm.classlib.java.lang; -import org.teavm.classlib.impl.charset.UTF16Helper; import org.teavm.classlib.java.io.TSerializable; import org.teavm.classlib.java.util.TArrays; @@ -553,12 +552,12 @@ class TAbstractStringBuilder extends TObject implements TSerializable, TCharSequ } protected TAbstractStringBuilder appendCodePoint(int codePoint) { - if (codePoint < UTF16Helper.SUPPLEMENTARY_PLANE) { + if (codePoint < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) { return append((char)codePoint); } ensureCapacity(length + 2); - buffer[length++] = UTF16Helper.highSurrogate(codePoint); - buffer[length++] = UTF16Helper.lowSurrogate(codePoint); + buffer[length++] = TCharacter.highSurrogate(codePoint); + buffer[length++] = TCharacter.lowSurrogate(codePoint); return this; } diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java index 82309eed4..925f01c25 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java @@ -15,7 +15,6 @@ */ package org.teavm.classlib.java.lang; -import org.teavm.classlib.impl.charset.UTF16Helper; import org.teavm.classlib.impl.unicode.UnicodeHelper; import org.teavm.platform.Platform; import org.teavm.platform.metadata.MetadataProvider; @@ -96,6 +95,13 @@ public class TCharacter extends TObject implements TComparable { private static UnicodeHelper.Range[] classMapping; private char value; private static TCharacter[] characterCache = new TCharacter[128]; + private static final int SURROGATE_NEUTRAL_BIT_MASK = 0xF800; + private static final int SURROGATE_BITS = 0xD800; + private static final int SURROGATE_BIT_MASK = 0xFC00; + private static final int SURROGATE_BIT_INV_MASK = 0x03FF; + private static final int HIGH_SURROGATE_BITS = 0xD800; + private static final int LOW_SURROGATE_BITS = 0xDC00; + private static final int MEANINGFUL_SURROGATE_BITS = 10; public TCharacter(char value) { this.value = value; @@ -152,11 +158,11 @@ public class TCharacter extends TObject implements TComparable { } public static boolean isHighSurrogate(char ch) { - return UTF16Helper.isHighSurrogate(ch); + return (ch & SURROGATE_BIT_MASK) == HIGH_SURROGATE_BITS; } public static boolean isLowSurrogate(char ch) { - return UTF16Helper.isLowSurrogate(ch); + return (ch & SURROGATE_BIT_MASK) == LOW_SURROGATE_BITS; } public static boolean isSurrogate(char ch) { @@ -172,7 +178,8 @@ public class TCharacter extends TObject implements TComparable { } public static int toCodePoint(char high, char low) { - return UTF16Helper.buildCodePoint(high, low); + return (((high & SURROGATE_BIT_INV_MASK) << MEANINGFUL_SURROGATE_BITS) | (low & SURROGATE_BIT_INV_MASK)) + + MIN_SUPPLEMENTARY_CODE_POINT; } public static int codePointAt(TCharSequence seq, int index) { @@ -216,11 +223,12 @@ public class TCharacter extends TObject implements TComparable { } public static char highSurrogate(int codePoint) { - return UTF16Helper.highSurrogate(codePoint); + codePoint -= MIN_SUPPLEMENTARY_CODE_POINT; + return (char)(HIGH_SURROGATE_BITS | (codePoint >> MEANINGFUL_SURROGATE_BITS) & SURROGATE_BIT_INV_MASK); } public static char lowSurrogate(int codePoint) { - return UTF16Helper.lowSurrogate(codePoint); + return (char)(LOW_SURROGATE_BITS | codePoint & SURROGATE_BIT_INV_MASK); } public static char toLowerCase(char ch) { @@ -309,9 +317,9 @@ public class TCharacter extends TObject implements TComparable { private static native StringResource obtainClasses(); public static int toChars(int codePoint, char[] dst, int dstIndex) { - if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) { - dst[dstIndex] = UTF16Helper.highSurrogate(codePoint); - dst[dstIndex + 1] = UTF16Helper.lowSurrogate(codePoint); + if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) { + dst[dstIndex] = highSurrogate(codePoint); + dst[dstIndex + 1] = lowSurrogate(codePoint); return 2; } else { dst[dstIndex] = (char)codePoint; @@ -320,8 +328,8 @@ public class TCharacter extends TObject implements TComparable { } public static char[] toChars(int codePoint) { - if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) { - return new char[] { UTF16Helper.highSurrogate(codePoint), UTF16Helper.lowSurrogate(codePoint) }; + if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) { + return new char[] { highSurrogate(codePoint), lowSurrogate(codePoint) }; } else { return new char[] { (char)codePoint }; } @@ -331,7 +339,7 @@ public class TCharacter extends TObject implements TComparable { int count = endIndex - beginIndex; --endIndex; for (int i = beginIndex; i < endIndex; ++i) { - if (UTF16Helper.isHighSurrogate(seq.charAt(i)) && UTF16Helper.isLowSurrogate(seq.charAt(i + 1))) { + if (isHighSurrogate(seq.charAt(i)) && isLowSurrogate(seq.charAt(i + 1))) { --count; ++i; } @@ -343,7 +351,7 @@ public class TCharacter extends TObject implements TComparable { int r = count; --count; for (int i = 0; i < count; ++i) { - if (UTF16Helper.isHighSurrogate(a[offset]) && UTF16Helper.isLowSurrogate(a[offset + i + 1])) { + if (isHighSurrogate(a[offset]) && isLowSurrogate(a[offset + i + 1])) { --r; ++i; } @@ -353,8 +361,8 @@ public class TCharacter extends TObject implements TComparable { public static int offsetByCodePoints(TCharSequence seq, int index, int codePointOffset) { for (int i = 0; i < codePointOffset; ++i) { - if (index < seq.length() - 1 && UTF16Helper.isHighSurrogate(seq.charAt(index)) && - UTF16Helper.isLowSurrogate(seq.charAt(index + 1))) { + if (index < seq.length() - 1 && isHighSurrogate(seq.charAt(index)) && + isLowSurrogate(seq.charAt(index + 1))) { index += 2; } else { index++; @@ -365,8 +373,7 @@ public class TCharacter extends TObject implements TComparable { public static int offsetByCodePoints(char[] a, int start, int count, int index, int codePointOffset) { for (int i = 0; i < codePointOffset; ++i) { - if (index < count - 1 && UTF16Helper.isHighSurrogate(a[index + start]) && - UTF16Helper.isLowSurrogate(a[index + start + 1])) { + if (index < count - 1 && isHighSurrogate(a[index + start]) && isLowSurrogate(a[index + start + 1])) { index += 2; } else { index++; diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java index dd2daa1a1..cec922c01 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java @@ -15,9 +15,12 @@ */ package org.teavm.classlib.java.lang; -import org.teavm.classlib.impl.charset.*; import org.teavm.classlib.java.io.TSerializable; import org.teavm.classlib.java.io.TUnsupportedEncodingException; +import org.teavm.classlib.java.nio.TByteBuffer; +import org.teavm.classlib.java.nio.TCharBuffer; +import org.teavm.classlib.java.nio.charset.TCharset; +import org.teavm.classlib.java.nio.charset.impl.TUTF8Charset; import org.teavm.classlib.java.util.TArrays; import org.teavm.classlib.java.util.TComparator; import org.teavm.classlib.java.util.THashMap; @@ -61,15 +64,15 @@ public class TString extends TObject implements TSerializable, TComparable= UTF16Helper.SUPPLEMENTARY_PLANE) { - characters[charCount++] = UTF16Helper.highSurrogate(codePoint); - characters[charCount++] = UTF16Helper.lowSurrogate(codePoint); + if (codePoint >= TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) { + characters[charCount++] = TCharacter.highSurrogate(codePoint); + characters[charCount++] = TCharacter.lowSurrogate(codePoint); } else { characters[charCount++] = (char)codePoint; } @@ -97,19 +104,14 @@ public class TString extends TObject implements TSerializable, TComparable= 0; --i) { if (characters[i] == bmpChar) { @@ -317,8 +319,8 @@ public class TString extends TObject implements TSerializable, TComparable= 1; --i) { if (characters[i] == lo && characters[i - 1] == hi) { return i - 1; @@ -550,34 +552,22 @@ public class TString extends TObject implements TSerializable, TComparable result.length) { - result = TArrays.copyOf(result, result.length * 2); - } - for (int i = 0; i < dest.position(); ++i) { - result[resultLength++] = destArray[i]; - } - dest.rewind(0); + public byte[] getBytes(TCharset charset) { + TByteBuffer buffer = charset.encode(TCharBuffer.wrap(characters)); + if (buffer.hasArray() && buffer.position() == 0 && buffer.limit() == buffer.capacity()) { + return buffer.array(); + } else { + byte[] result = new byte[buffer.remaining()]; + buffer.get(result); + return result; } - return TArrays.copyOf(result, resultLength); } @Override @@ -601,11 +591,11 @@ public class TString extends TObject implements TSerializable, TComparable replacement.length()) { + in.position(in.position() + in.remaining()); + if (malformedAction == TCodingErrorAction.REPLACE) { + out.put(replacement); + } + } else { + return TCoderResult.OVERFLOW; + } + } } return result; } else if (result.isMalformed()) { diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Decoder.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Decoder.java index a9c8e4659..758f1682b 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Decoder.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Decoder.java @@ -15,7 +15,6 @@ */ package org.teavm.classlib.java.nio.charset.impl; -import org.teavm.classlib.impl.charset.UTF16Helper; import org.teavm.classlib.java.nio.charset.TCharset; import org.teavm.classlib.java.nio.charset.TCoderResult; @@ -44,7 +43,13 @@ public class TUTF8Decoder extends TBufferedDecoder { } break; } - outArray[outPos++] = (char)(((b & 0x1F) << 6) | (inArray[inPos++] & 0x3F)); + byte b2 = inArray[inPos++]; + if (!checkMidByte(b2)) { + inPos -= 2; + result = TCoderResult.malformedForLength(1); + break; + } + outArray[outPos++] = (char)(((b & 0x1F) << 6) | (b2 & 0x3F)); } else if ((b & 0xF0) == 0xE0) { if (inPos + 2 > inSize) { --inPos; @@ -55,6 +60,11 @@ public class TUTF8Decoder extends TBufferedDecoder { } byte b2 = inArray[inPos++]; byte b3 = inArray[inPos++]; + if (!checkMidByte(b2) || !checkMidByte(b3)) { + inPos -= 3; + result = TCoderResult.malformedForLength(1); + break; + } char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F)); if (Character.isSurrogate(c)) { inPos -= 3; @@ -72,7 +82,7 @@ public class TUTF8Decoder extends TBufferedDecoder { } if (outPos + 2 > outSize) { --inPos; - if (!controller.hasMoreOutput()) { + if (!controller.hasMoreOutput(2)) { result = TCoderResult.OVERFLOW; } break; @@ -80,9 +90,18 @@ public class TUTF8Decoder extends TBufferedDecoder { byte b2 = inArray[inPos++]; byte b3 = inArray[inPos++]; byte b4 = inArray[inPos++]; + if (!checkMidByte(b2) || !checkMidByte(b3) || !checkMidByte(b4)) { + inPos -= 3; + result = TCoderResult.malformedForLength(1); + break; + } int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F); - outArray[outPos++] = UTF16Helper.highSurrogate(code); - outArray[outPos++] = UTF16Helper.lowSurrogate(code); + outArray[outPos++] = Character.highSurrogate(code); + outArray[outPos++] = Character.lowSurrogate(code); + } else { + --inPos; + result = TCoderResult.malformedForLength(1); + break; } } @@ -90,4 +109,8 @@ public class TUTF8Decoder extends TBufferedDecoder { controller.setOutPosition(outPos); return result; } + + private boolean checkMidByte(byte b) { + return (b & 0xC0) == 0x80; + } } diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Encoder.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Encoder.java index 0da2118dd..e5dc774c7 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Encoder.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/nio/charset/impl/TUTF8Encoder.java @@ -15,7 +15,6 @@ */ package org.teavm.classlib.java.nio.charset.impl; -import org.teavm.classlib.impl.charset.UTF16Helper; import org.teavm.classlib.java.nio.charset.TCharset; import org.teavm.classlib.java.nio.charset.TCoderResult; @@ -57,7 +56,7 @@ public class TUTF8Encoder extends TBufferedEncoder { outArray[outPos++] = (byte)(0xE0 | (ch >> 12)); outArray[outPos++] = (byte)(0x80 | ((ch >> 6) & 0x3F)); outArray[outPos++] = (byte)(0x80 | (ch & 0x3F)); - } else if (UTF16Helper.isHighSurrogate(ch)) { + } else if (Character.isHighSurrogate(ch)) { if (inPos >= inSize) { if (!controller.hasMoreInput()) { result = TCoderResult.UNDERFLOW; @@ -65,9 +64,9 @@ public class TUTF8Encoder extends TBufferedEncoder { break; } char low = inArray[inPos++]; - if (!UTF16Helper.isLowSurrogate(low)) { + if (!Character.isLowSurrogate(low)) { inPos -= 2; - result = TCoderResult.malformedForLength(2); + result = TCoderResult.malformedForLength(1); break; } if (outPos + 4 > outSize) { @@ -77,7 +76,7 @@ public class TUTF8Encoder extends TBufferedEncoder { } break; } - int codePoint = UTF16Helper.buildCodePoint(ch, low); + int codePoint = Character.toCodePoint(ch, low); outArray[outPos++] = (byte)(0xF0 | (codePoint >> 18)); outArray[outPos++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F)); outArray[outPos++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F)); diff --git a/teavm-tests/src/test/java/org/teavm/classlib/java/nio/charset/UTF8Test.java b/teavm-tests/src/test/java/org/teavm/classlib/java/nio/charset/UTF8Test.java index 5017a335b..77b950c95 100644 --- a/teavm-tests/src/test/java/org/teavm/classlib/java/nio/charset/UTF8Test.java +++ b/teavm-tests/src/test/java/org/teavm/classlib/java/nio/charset/UTF8Test.java @@ -1,7 +1,7 @@ package org.teavm.classlib.java.nio.charset; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; +import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; @@ -46,6 +46,76 @@ public class UTF8Test { runDecode(100, 600); } + @Test + public void replaceMalformedSurrogatePair() { + Charset charset = Charset.forName("UTF-8"); + ByteBuffer buffer = charset.encode("a\uD800\uD800b"); + byte[] result = new byte[buffer.remaining()]; + buffer.get(result); + assertArrayEquals(new byte[] { 97, 63, 63, 98 }, result); + } + + @Test + public void encodeSurrogate() { + Charset charset = Charset.forName("UTF-8"); + ByteBuffer buffer = charset.encode("a\uD800\uDC00b"); + byte[] result = new byte[buffer.remaining()]; + buffer.get(result); + assertArrayEquals(new byte[] { 97, -16, -112, -128, -128, 98 }, result); + } + + @Test + public void replaceMalformedFirstByte() { + Charset charset = Charset.forName("UTF-8"); + CharBuffer buffer = charset.decode(ByteBuffer.wrap(new byte[] { 97, (byte)0xFF, 98 })); + char[] result = new char[buffer.remaining()]; + buffer.get(result); + assertEquals("a\uFFFDb", new String(result)); + } + + @Test + public void replaceMalformedMidByte() { + Charset charset = Charset.forName("UTF-8"); + CharBuffer buffer = charset.decode(ByteBuffer.wrap(new byte[] { 97, (byte)0xC0, 98, 98 })); + char[] result = new char[buffer.remaining()]; + buffer.get(result); + assertEquals("a\uFFFDbb", new String(result)); + } + + @Test + public void replaceDecodedSurrogate() { + Charset charset = Charset.forName("UTF-8"); + CharBuffer buffer = charset.decode(ByteBuffer.wrap(new byte[] { 97, (byte)0xED, (byte)0xA0, (byte)0x80, 98 })); + char[] result = new char[buffer.remaining()]; + buffer.get(result); + assertEquals("a\uFFFDb", new String(result)); + } + + @Test + public void replaceDecodedSurrogatePair() { + Charset charset = Charset.forName("UTF-8"); + CharBuffer buffer = charset.decode(ByteBuffer.wrap(new byte[] { 97, (byte)0xED, (byte)0xA0, (byte)0x80, + (byte)0xED, (byte)0xBF, (byte)0xBF, 98 })); + char[] result = new char[buffer.remaining()]; + buffer.get(result); + assertEquals("a\uFFFD\uFFFDb", new String(result)); + } + + @Test + public void decodeLongUTF8ByteArray() throws UnsupportedEncodingException { + byte[] bytes = new byte[16384]; + for (int i = 0; i < bytes.length;) { + bytes[i++] = -16; + bytes[i++] = -66; + bytes[i++] = -78; + bytes[i++] = -69; + } + Charset charset = Charset.forName("UTF-8"); + CharBuffer buffer = charset.decode(ByteBuffer.wrap(bytes)); + assertEquals('\uD8BB', buffer.get(8190)); + assertEquals('\uDCBB', buffer.get(8191)); + } + private void runEncode(int inSize, int outSize) { char[] input = text.toCharArray(); byte[] output = new byte[16384];