Replacing old TeaVM-based charsets with NIO charsets

This commit is contained in:
konsoletyper 2015-03-22 18:15:48 +03:00
parent cc5225a2a6
commit aa2451c3e3
10 changed files with 234 additions and 197 deletions

View File

@ -1,61 +0,0 @@
/*
* Copyright 2013 Alexey Andreev.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.teavm.classlib.impl.charset;
/**
 * Constants and static helpers for splitting code points into UTF-16
 * surrogate pairs and for recognising and recombining such pairs.
 *
 * @author Alexey Andreev
 */
public class UTF16Helper {
    /** Mask keeping the five top bits, which are the same for every surrogate code unit. */
    public static final int SURROGATE_NEUTRAL_BIT_MASK = 0xF800;
    /** Value of the bits selected by {@link #SURROGATE_NEUTRAL_BIT_MASK} for any surrogate. */
    public static final int SURROGATE_BITS = 0xD800;
    /** Mask keeping the six top bits, which tell high surrogates from low ones. */
    public static final int SURROGATE_BIT_MASK = 0xFC00;
    /** Mask keeping the ten payload bits of a surrogate code unit. */
    public static final int SURROGATE_BIT_INV_MASK = 0x03FF;
    /** Value of the bits selected by {@link #SURROGATE_BIT_MASK} for a high surrogate. */
    public static final int HIGH_SURROGATE_BITS = 0xD800;
    /** Value of the bits selected by {@link #SURROGATE_BIT_MASK} for a low surrogate. */
    public static final int LOW_SURROGATE_BITS = 0xDC00;
    /** Number of payload bits carried by each surrogate code unit. */
    public static final int MEANINGFUL_SURROGATE_BITS = 10;
    /** First code point that needs a surrogate pair. */
    public static final int SUPPLEMENTARY_PLANE = 0x10000;

    /**
     * Computes the leading (high) surrogate for a code point.
     *
     * @param codePoint code point to split; the supplementary-plane base is subtracted first
     * @return the high surrogate carrying the upper ten payload bits
     */
    public static char highSurrogate(int codePoint) {
        int offset = codePoint - SUPPLEMENTARY_PLANE;
        int payload = (offset >> MEANINGFUL_SURROGATE_BITS) & SURROGATE_BIT_INV_MASK;
        return (char) (HIGH_SURROGATE_BITS | payload);
    }

    /**
     * Computes the trailing (low) surrogate for a code point.
     *
     * @param codePoint code point to split
     * @return the low surrogate carrying the lower ten bits of {@code codePoint}
     */
    public static char lowSurrogate(int codePoint) {
        int payload = codePoint & SURROGATE_BIT_INV_MASK;
        return (char) (LOW_SURROGATE_BITS | payload);
    }

    /**
     * Tells whether a code unit lies in the high-surrogate range.
     *
     * @param c code unit to test
     * @return {@code true} if {@code c} is between 0xD800 and 0xDBFF inclusive
     */
    public static boolean isHighSurrogate(char c) {
        return c >= HIGH_SURROGATE_BITS && c < LOW_SURROGATE_BITS;
    }

    /**
     * Tells whether a code unit lies in the low-surrogate range.
     *
     * @param c code unit to test
     * @return {@code true} if {@code c} is between 0xDC00 and 0xDFFF inclusive
     */
    public static boolean isLowSurrogate(char c) {
        return c >= LOW_SURROGATE_BITS && c <= (LOW_SURROGATE_BITS | SURROGATE_BIT_INV_MASK);
    }

    /**
     * Tells whether two consecutive code units form a well-ordered surrogate pair.
     *
     * @param a candidate high surrogate
     * @param b candidate low surrogate
     * @return {@code true} if {@code a} is a high surrogate and {@code b} is a low surrogate
     */
    public static boolean isSurrogatePair(char a, char b) {
        if (!isHighSurrogate(a)) {
            return false;
        }
        return isLowSurrogate(b);
    }

    /**
     * Recombines two code units into a code point. The arguments are not validated;
     * only their ten payload bits are used.
     *
     * @param a high surrogate
     * @param b low surrogate
     * @return the code point assembled from both payloads plus the supplementary-plane base
     */
    public static int buildCodePoint(char a, char b) {
        int upper = (a & SURROGATE_BIT_INV_MASK) << MEANINGFUL_SURROGATE_BITS;
        int lower = b & SURROGATE_BIT_INV_MASK;
        return (upper | lower) + SUPPLEMENTARY_PLANE;
    }

    /**
     * Tells whether a code unit is a surrogate of either kind.
     *
     * @param c code unit to test
     * @return {@code true} if {@code c} is between 0xD800 and 0xDFFF inclusive
     */
    public static boolean isSurrogate(char c) {
        return c >= SURROGATE_BITS && c <= (LOW_SURROGATE_BITS | SURROGATE_BIT_INV_MASK);
    }
}

View File

@ -29,17 +29,17 @@ public class UTF8Charset extends Charset {
} else if (ch < 0x400) {
dest.put((byte)(0xC0 | (ch >> 6)));
dest.put((byte)(0x80 | (ch & 0x3F)));
} else if (!UTF16Helper.isSurrogate(ch)) {
} else if (!Character.isSurrogate(ch)) {
dest.put((byte)(0xE0 | (ch >> 12)));
dest.put((byte)(0x80 | ((ch >> 6) & 0x3F)));
dest.put((byte)(0x80 | (ch & 0x3F)));
} else if (UTF16Helper.isHighSurrogate(ch)) {
} else if (Character.isHighSurrogate(ch)) {
char low = source.get();
if (!UTF16Helper.isLowSurrogate(low)) {
if (!Character.isLowSurrogate(low)) {
source.back(1);
dest.put((byte)'?');
} else {
int codePoint = UTF16Helper.buildCodePoint(ch, low);
int codePoint = Character.toCodePoint(ch, low);
dest.put((byte)(0xF0 | (codePoint >> 18)));
dest.put((byte)(0x80 | ((codePoint >> 12) & 0x3F)));
dest.put((byte)(0x80 | ((codePoint >> 6) & 0x3F)));
@ -72,7 +72,7 @@ public class UTF8Charset extends Charset {
byte b2 = source.get();
byte b3 = source.get();
char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F));
dest.put(!UTF16Helper.isHighSurrogate(c) ? c : '?');
dest.put(!Character.isHighSurrogate(c) ? c : '?');
} else if ((b & 0xF8) == 0xF0) {
if (source.available() < 3) {
source.skip(source.available());
@ -83,8 +83,8 @@ public class UTF8Charset extends Charset {
byte b3 = source.get();
byte b4 = source.get();
int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F);
dest.put(UTF16Helper.highSurrogate(code));
dest.put(UTF16Helper.lowSurrogate(code));
dest.put(Character.highSurrogate(code));
dest.put(Character.lowSurrogate(code));
}
}
}

View File

@ -15,11 +15,13 @@
*/
package org.teavm.classlib.java.io;
import org.teavm.classlib.impl.charset.ByteBuffer;
import org.teavm.classlib.impl.charset.CharBuffer;
import org.teavm.classlib.impl.charset.Charset;
import org.teavm.classlib.impl.charset.UTF8Charset;
import org.teavm.classlib.java.lang.TString;
import org.teavm.classlib.java.nio.TByteBuffer;
import org.teavm.classlib.java.nio.TCharBuffer;
import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.TCharsetDecoder;
import org.teavm.classlib.java.nio.charset.TCodingErrorAction;
import org.teavm.classlib.java.nio.charset.impl.TUTF8Charset;
/**
*
@ -27,30 +29,30 @@ import org.teavm.classlib.java.lang.TString;
*/
public class TInputStreamReader extends TReader {
private TInputStream stream;
private Charset charset;
private TCharset charset;
private TString charsetName;
private byte[] inData = new byte[8192];
private ByteBuffer inBuffer = new ByteBuffer(inData);
private TByteBuffer inBuffer = TByteBuffer.wrap(inData);
private char[] outData = new char[1024];
private CharBuffer outBuffer = new CharBuffer(outData);
private TCharBuffer outBuffer = TCharBuffer.wrap(outData);
private boolean streamEof;
private boolean eof;
public TInputStreamReader(TInputStream in, TString charsetName) {
this(in, Charset.get(charsetName.toString()));
this(in, TCharset.forName(charsetName.toString()));
this.charsetName = charsetName;
}
public TInputStreamReader(TInputStream in) {
this(in, new UTF8Charset());
this(in, new TUTF8Charset());
charsetName = TString.wrap("UTF-8");
}
private TInputStreamReader(TInputStream in, Charset charset) {
public TInputStreamReader(TInputStream in, TCharset charset) {
this.stream = in;
this.charset = charset;
outBuffer.skip(outBuffer.available());
inBuffer.skip(inBuffer.available());
outBuffer.position(outBuffer.limit());
inBuffer.position(inBuffer.limit());
}
public TString getEncoding() {
@ -64,10 +66,10 @@ public class TInputStreamReader extends TReader {
@Override
public int read() throws TIOException {
if (eof && outBuffer.end()) {
if (eof && !outBuffer.hasRemaining()) {
return -1;
}
if (!outBuffer.end()) {
if (outBuffer.hasRemaining()) {
return outBuffer.get();
}
return fillBuffer() ? outBuffer.get() : -1;
@ -75,37 +77,40 @@ public class TInputStreamReader extends TReader {
@Override
public int read(char[] cbuf, int off, int len) throws TIOException {
if (eof && outBuffer.end()) {
if (eof && !outBuffer.hasRemaining()) {
return -1;
}
CharBuffer wrapBuffer = new CharBuffer(cbuf, off, off + len);
while (!wrapBuffer.end()) {
wrapBuffer.put(outBuffer);
if (outBuffer.end() && !fillBuffer()) {
int bytesRead = 0;
while (len > 0) {
int sz = Math.min(len, outBuffer.remaining());
outBuffer.get(cbuf, off + bytesRead, sz);
len -= sz;
bytesRead += sz;
if (!outBuffer.hasRemaining() && !fillBuffer()) {
break;
}
}
return wrapBuffer.position() - off;
return bytesRead;
}
private boolean fillBuffer() throws TIOException {
if (eof) {
return false;
}
CharBuffer newBuffer = new CharBuffer(outData);
newBuffer.put(outBuffer);
outBuffer.compact();
TCharsetDecoder decoder = charset.newDecoder()
.onMalformedInput(TCodingErrorAction.REPLACE)
.onUnmappableCharacter(TCodingErrorAction.IGNORE);
while (true) {
if (inBuffer.end() && !fillReadBuffer()) {
if (!inBuffer.hasRemaining() && !fillReadBuffer()) {
eof = true;
break;
}
int oldAvail = newBuffer.available();
charset.decode(inBuffer, newBuffer);
if (oldAvail == newBuffer.available()) {
if (decoder.decode(inBuffer, outBuffer, eof).isOverflow()) {
break;
}
}
outBuffer = new CharBuffer(outData, 0, newBuffer.position());
outBuffer.flip();
return true;
}
@ -113,30 +118,25 @@ public class TInputStreamReader extends TReader {
if (streamEof) {
return false;
}
int off = 0;
while (!inBuffer.end()) {
inData[off] = inBuffer.get();
}
inBuffer.rewind(0);
while (off < inData.length) {
int bytesRead = stream.read(inData, off, inData.length - off);
inBuffer.compact();
while (inBuffer.hasRemaining()) {
int bytesRead = stream.read(inBuffer.array(), inBuffer.position(), inBuffer.remaining());
if (bytesRead == -1) {
streamEof = true;
inBuffer = new ByteBuffer(inData, 0, inBuffer.position());
break;
} else {
off += bytesRead;
inBuffer.position(inBuffer.position() + bytesRead);
if (bytesRead == 0) {
break;
}
}
}
inBuffer = new ByteBuffer(inData, 0, off);
inBuffer.flip();
return true;
}
@Override
public boolean ready() throws TIOException {
return !outBuffer.end() || inBuffer.end();
return outBuffer.hasRemaining() || inBuffer.hasRemaining();
}
}

View File

@ -15,7 +15,6 @@
*/
package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.java.io.TSerializable;
import org.teavm.classlib.java.util.TArrays;
@ -553,12 +552,12 @@ class TAbstractStringBuilder extends TObject implements TSerializable, TCharSequ
}
protected TAbstractStringBuilder appendCodePoint(int codePoint) {
if (codePoint < UTF16Helper.SUPPLEMENTARY_PLANE) {
if (codePoint < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
return append((char)codePoint);
}
ensureCapacity(length + 2);
buffer[length++] = UTF16Helper.highSurrogate(codePoint);
buffer[length++] = UTF16Helper.lowSurrogate(codePoint);
buffer[length++] = TCharacter.highSurrogate(codePoint);
buffer[length++] = TCharacter.lowSurrogate(codePoint);
return this;
}

View File

@ -15,7 +15,6 @@
*/
package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.impl.unicode.UnicodeHelper;
import org.teavm.platform.Platform;
import org.teavm.platform.metadata.MetadataProvider;
@ -96,6 +95,13 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
private static UnicodeHelper.Range[] classMapping;
private char value;
private static TCharacter[] characterCache = new TCharacter[128];
private static final int SURROGATE_NEUTRAL_BIT_MASK = 0xF800;
private static final int SURROGATE_BITS = 0xD800;
private static final int SURROGATE_BIT_MASK = 0xFC00;
private static final int SURROGATE_BIT_INV_MASK = 0x03FF;
private static final int HIGH_SURROGATE_BITS = 0xD800;
private static final int LOW_SURROGATE_BITS = 0xDC00;
private static final int MEANINGFUL_SURROGATE_BITS = 10;
public TCharacter(char value) {
this.value = value;
@ -152,11 +158,11 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
}
public static boolean isHighSurrogate(char ch) {
return UTF16Helper.isHighSurrogate(ch);
return (ch & SURROGATE_BIT_MASK) == HIGH_SURROGATE_BITS;
}
public static boolean isLowSurrogate(char ch) {
return UTF16Helper.isLowSurrogate(ch);
return (ch & SURROGATE_BIT_MASK) == LOW_SURROGATE_BITS;
}
public static boolean isSurrogate(char ch) {
@ -172,7 +178,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
}
public static int toCodePoint(char high, char low) {
return UTF16Helper.buildCodePoint(high, low);
return (((high & SURROGATE_BIT_INV_MASK) << MEANINGFUL_SURROGATE_BITS) | (low & SURROGATE_BIT_INV_MASK)) +
MIN_SUPPLEMENTARY_CODE_POINT;
}
public static int codePointAt(TCharSequence seq, int index) {
@ -216,11 +223,12 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
}
public static char highSurrogate(int codePoint) {
return UTF16Helper.highSurrogate(codePoint);
codePoint -= MIN_SUPPLEMENTARY_CODE_POINT;
return (char)(HIGH_SURROGATE_BITS | (codePoint >> MEANINGFUL_SURROGATE_BITS) & SURROGATE_BIT_INV_MASK);
}
public static char lowSurrogate(int codePoint) {
return UTF16Helper.lowSurrogate(codePoint);
return (char)(LOW_SURROGATE_BITS | codePoint & SURROGATE_BIT_INV_MASK);
}
public static char toLowerCase(char ch) {
@ -309,9 +317,9 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
private static native StringResource obtainClasses();
public static int toChars(int codePoint, char[] dst, int dstIndex) {
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
dst[dstIndex + 1] = UTF16Helper.lowSurrogate(codePoint);
if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) {
dst[dstIndex] = highSurrogate(codePoint);
dst[dstIndex + 1] = lowSurrogate(codePoint);
return 2;
} else {
dst[dstIndex] = (char)codePoint;
@ -320,8 +328,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
}
public static char[] toChars(int codePoint) {
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
return new char[] { UTF16Helper.highSurrogate(codePoint), UTF16Helper.lowSurrogate(codePoint) };
if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) {
return new char[] { highSurrogate(codePoint), lowSurrogate(codePoint) };
} else {
return new char[] { (char)codePoint };
}
@ -331,7 +339,7 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
int count = endIndex - beginIndex;
--endIndex;
for (int i = beginIndex; i < endIndex; ++i) {
if (UTF16Helper.isHighSurrogate(seq.charAt(i)) && UTF16Helper.isLowSurrogate(seq.charAt(i + 1))) {
if (isHighSurrogate(seq.charAt(i)) && isLowSurrogate(seq.charAt(i + 1))) {
--count;
++i;
}
@ -343,7 +351,7 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
int r = count;
--count;
for (int i = 0; i < count; ++i) {
if (UTF16Helper.isHighSurrogate(a[offset]) && UTF16Helper.isLowSurrogate(a[offset + i + 1])) {
if (isHighSurrogate(a[offset]) && isLowSurrogate(a[offset + i + 1])) {
--r;
++i;
}
@ -353,8 +361,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
public static int offsetByCodePoints(TCharSequence seq, int index, int codePointOffset) {
for (int i = 0; i < codePointOffset; ++i) {
if (index < seq.length() - 1 && UTF16Helper.isHighSurrogate(seq.charAt(index)) &&
UTF16Helper.isLowSurrogate(seq.charAt(index + 1))) {
if (index < seq.length() - 1 && isHighSurrogate(seq.charAt(index)) &&
isLowSurrogate(seq.charAt(index + 1))) {
index += 2;
} else {
index++;
@ -365,8 +373,7 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
public static int offsetByCodePoints(char[] a, int start, int count, int index, int codePointOffset) {
for (int i = 0; i < codePointOffset; ++i) {
if (index < count - 1 && UTF16Helper.isHighSurrogate(a[index + start]) &&
UTF16Helper.isLowSurrogate(a[index + start + 1])) {
if (index < count - 1 && isHighSurrogate(a[index + start]) && isLowSurrogate(a[index + start + 1])) {
index += 2;
} else {
index++;

View File

@ -15,9 +15,12 @@
*/
package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.charset.*;
import org.teavm.classlib.java.io.TSerializable;
import org.teavm.classlib.java.io.TUnsupportedEncodingException;
import org.teavm.classlib.java.nio.TByteBuffer;
import org.teavm.classlib.java.nio.TCharBuffer;
import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.impl.TUTF8Charset;
import org.teavm.classlib.java.util.TArrays;
import org.teavm.classlib.java.util.TComparator;
import org.teavm.classlib.java.util.THashMap;
@ -61,15 +64,15 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
public TString(byte[] bytes, int offset, int length, TString charsetName) throws TUnsupportedEncodingException {
Charset charset = Charset.get(charsetName.toString());
if (charset == null) {
throw new TUnsupportedEncodingException(TString.wrap("Unknown encoding:" + charsetName));
}
this(bytes, offset, length, TCharset.forName(charsetName.toString()));
}
public TString(byte[] bytes, int offset, int length, TCharset charset) {
initWithBytes(bytes, offset, length, charset);
}
public TString(byte[] bytes, int offset, int length) {
initWithBytes(bytes, offset, length, new UTF8Charset());
initWithBytes(bytes, offset, length, new TUTF8Charset());
}
public TString(byte[] bytes) {
@ -80,14 +83,18 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
this(bytes, 0, bytes.length, charsetName);
}
/**
 * Creates a string from an entire byte array using the given charset.
 * Delegates to the (bytes, offset, length, charset) constructor with
 * offset 0 and length {@code bytes.length}.
 *
 * @param bytes encoded bytes; the whole array is used
 * @param charset charset passed on to the delegate constructor
 */
public TString(byte[] bytes, TCharset charset) {
this(bytes, 0, bytes.length, charset);
}
public TString(int[] codePoints, int offset, int count) {
characters = new char[count * 2];
int charCount = 0;
for (int i = 0; i < count; ++i) {
int codePoint = codePoints[offset++];
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
characters[charCount++] = UTF16Helper.highSurrogate(codePoint);
characters[charCount++] = UTF16Helper.lowSurrogate(codePoint);
if (codePoint >= TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
characters[charCount++] = TCharacter.highSurrogate(codePoint);
characters[charCount++] = TCharacter.lowSurrogate(codePoint);
} else {
characters[charCount++] = (char)codePoint;
}
@ -97,19 +104,14 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
}
private void initWithBytes(byte[] bytes, int offset, int length, Charset charset) {
TStringBuilder sb = new TStringBuilder(bytes.length * 2);
this.characters = new char[sb.length()];
ByteBuffer source = new ByteBuffer(bytes, offset, offset + length);
char[] destChars = new char[TMath.max(8, TMath.min(length * 2, 1024))];
CharBuffer dest = new CharBuffer(destChars, 0, destChars.length);
while (!source.end()) {
charset.decode(source, dest);
sb.append(destChars, 0, dest.position());
dest.rewind(0);
private void initWithBytes(byte[] bytes, int offset, int length, TCharset charset) {
TCharBuffer buffer = charset.decode(TByteBuffer.wrap(bytes, offset, length));
if (buffer.hasArray() && buffer.position() == 0 && buffer.limit() == buffer.capacity()) {
characters = buffer.array();
} else {
characters = new char[buffer.remaining()];
buffer.get(characters);
}
characters = new char[sb.length()];
sb.getChars(0, sb.length(), characters, 0);
}
public TString(TStringBuilder sb) {
@ -283,7 +285,7 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
public int indexOf(int ch, int fromIndex) {
if (ch < UTF16Helper.SUPPLEMENTARY_PLANE) {
if (ch < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
char bmpChar = (char)ch;
for (int i = fromIndex; i < characters.length; ++i) {
if (characters[i] == bmpChar) {
@ -292,8 +294,8 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
return -1;
} else {
char hi = UTF16Helper.highSurrogate(ch);
char lo = UTF16Helper.lowSurrogate(ch);
char hi = TCharacter.highSurrogate(ch);
char lo = TCharacter.lowSurrogate(ch);
for (int i = fromIndex; i < characters.length - 1; ++i) {
if (characters[i] == hi && characters[i + 1] == lo) {
return i;
@ -308,7 +310,7 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
public int lastIndexOf(int ch, int fromIndex) {
if (ch < UTF16Helper.SUPPLEMENTARY_PLANE) {
if (ch < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
char bmpChar = (char)ch;
for (int i = fromIndex; i >= 0; --i) {
if (characters[i] == bmpChar) {
@ -317,8 +319,8 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
return -1;
} else {
char hi = UTF16Helper.highSurrogate(ch);
char lo = UTF16Helper.lowSurrogate(ch);
char hi = TCharacter.highSurrogate(ch);
char lo = TCharacter.lowSurrogate(ch);
for (int i = fromIndex; i >= 1; --i) {
if (characters[i] == lo && characters[i - 1] == hi) {
return i - 1;
@ -550,34 +552,22 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
}
public byte[] getBytes(TString charsetName) throws TUnsupportedEncodingException {
Charset charset = Charset.get(charsetName.toString());
if (charset == null) {
throw new TUnsupportedEncodingException(TString.wrap("Unsupported encoding: " + charsetName));
}
return getBytes(charset);
return getBytes(TCharset.forName(charsetName.toString()));
}
public byte[] getBytes() {
return getBytes(new UTF8Charset());
return getBytes(new TUTF8Charset());
}
private byte[] getBytes(Charset charset) {
byte[] result = new byte[length() * 2];
int resultLength = 0;
byte[] destArray = new byte[TMath.max(16, TMath.min(length() * 2, 4096))];
ByteBuffer dest = new ByteBuffer(destArray);
CharBuffer src = new CharBuffer(characters);
while (!src.end()) {
charset.encode(src, dest);
if (resultLength + dest.position() > result.length) {
result = TArrays.copyOf(result, result.length * 2);
}
for (int i = 0; i < dest.position(); ++i) {
result[resultLength++] = destArray[i];
}
dest.rewind(0);
public byte[] getBytes(TCharset charset) {
TByteBuffer buffer = charset.encode(TCharBuffer.wrap(characters));
if (buffer.hasArray() && buffer.position() == 0 && buffer.limit() == buffer.capacity()) {
return buffer.array();
} else {
byte[] result = new byte[buffer.remaining()];
buffer.get(result);
return result;
}
return TArrays.copyOf(result, resultLength);
}
@Override
@ -601,11 +591,11 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
int[] codePoints = new int[characters.length];
int codePointCount = 0;
for (int i = 0; i < characters.length; ++i) {
if (i == characters.length - 1 || !UTF16Helper.isHighSurrogate(characters[i]) ||
!UTF16Helper.isLowSurrogate(characters[i + 1])) {
if (i == characters.length - 1 || !TCharacter.isHighSurrogate(characters[i]) ||
!TCharacter.isLowSurrogate(characters[i + 1])) {
codePoints[codePointCount++] = TCharacter.toLowerCase(characters[i]);
} else {
codePoints[codePointCount++] = TCharacter.toLowerCase(UTF16Helper.buildCodePoint(
codePoints[codePointCount++] = TCharacter.toLowerCase(TCharacter.toCodePoint(
characters[i], characters[i + 1]));
++i;
}
@ -620,11 +610,11 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
int[] codePoints = new int[characters.length];
int codePointCount = 0;
for (int i = 0; i < characters.length; ++i) {
if (i == characters.length - 1 || !UTF16Helper.isHighSurrogate(characters[i]) ||
!UTF16Helper.isLowSurrogate(characters[i + 1])) {
if (i == characters.length - 1 || !TCharacter.isHighSurrogate(characters[i]) ||
!TCharacter.isLowSurrogate(characters[i + 1])) {
codePoints[codePointCount++] = TCharacter.toUpperCase(characters[i]);
} else {
codePoints[codePointCount++] = TCharacter.toUpperCase(UTF16Helper.buildCodePoint(
codePoints[codePointCount++] = TCharacter.toUpperCase(TCharacter.toCodePoint(
characters[i], characters[i + 1]));
++i;
}

View File

@ -31,7 +31,7 @@ public abstract class TCharsetDecoder {
private TCharset charset;
private float averageCharsPerByte;
private float maxCharsPerByte;
private String replacement = "?";
private String replacement = "\uFFFD";
private TCodingErrorAction malformedAction = TCodingErrorAction.REPORT;
private TCodingErrorAction unmappableAction = TCodingErrorAction.REPORT;
private int state;
@ -126,8 +126,18 @@ public abstract class TCharsetDecoder {
return result;
} else if (result.isUnderflow()) {
if (endOfInput && in.hasRemaining()) {
state = END;
return TCoderResult.malformedForLength(in.remaining());
if (malformedAction == TCodingErrorAction.REPORT) {
return TCoderResult.malformedForLength(in.remaining());
} else {
if (out.remaining() > replacement.length()) {
in.position(in.position() + in.remaining());
if (malformedAction == TCodingErrorAction.REPLACE) {
out.put(replacement);
}
} else {
return TCoderResult.OVERFLOW;
}
}
}
return result;
} else if (result.isMalformed()) {

View File

@ -15,7 +15,6 @@
*/
package org.teavm.classlib.java.nio.charset.impl;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.TCoderResult;
@ -44,7 +43,13 @@ public class TUTF8Decoder extends TBufferedDecoder {
}
break;
}
outArray[outPos++] = (char)(((b & 0x1F) << 6) | (inArray[inPos++] & 0x3F));
byte b2 = inArray[inPos++];
if (!checkMidByte(b2)) {
inPos -= 2;
result = TCoderResult.malformedForLength(1);
break;
}
outArray[outPos++] = (char)(((b & 0x1F) << 6) | (b2 & 0x3F));
} else if ((b & 0xF0) == 0xE0) {
if (inPos + 2 > inSize) {
--inPos;
@ -55,6 +60,11 @@ public class TUTF8Decoder extends TBufferedDecoder {
}
byte b2 = inArray[inPos++];
byte b3 = inArray[inPos++];
if (!checkMidByte(b2) || !checkMidByte(b3)) {
inPos -= 3;
result = TCoderResult.malformedForLength(1);
break;
}
char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F));
if (Character.isSurrogate(c)) {
inPos -= 3;
@ -72,7 +82,7 @@ public class TUTF8Decoder extends TBufferedDecoder {
}
if (outPos + 2 > outSize) {
--inPos;
if (!controller.hasMoreOutput()) {
if (!controller.hasMoreOutput(2)) {
result = TCoderResult.OVERFLOW;
}
break;
@ -80,9 +90,18 @@ public class TUTF8Decoder extends TBufferedDecoder {
byte b2 = inArray[inPos++];
byte b3 = inArray[inPos++];
byte b4 = inArray[inPos++];
if (!checkMidByte(b2) || !checkMidByte(b3) || !checkMidByte(b4)) {
inPos -= 3;
result = TCoderResult.malformedForLength(1);
break;
}
int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F);
outArray[outPos++] = UTF16Helper.highSurrogate(code);
outArray[outPos++] = UTF16Helper.lowSurrogate(code);
outArray[outPos++] = Character.highSurrogate(code);
outArray[outPos++] = Character.lowSurrogate(code);
} else {
--inPos;
result = TCoderResult.malformedForLength(1);
break;
}
}
@ -90,4 +109,8 @@ public class TUTF8Decoder extends TBufferedDecoder {
controller.setOutPosition(outPos);
return result;
}
/**
 * Tests whether a byte has the form 10xxxxxx, i.e. its two most
 * significant bits are 1 and 0.
 *
 * @param b byte to inspect
 * @return {@code true} if the top two bits of {@code b} are {@code 10}
 */
private boolean checkMidByte(byte b) {
    int topTwoBits = b & 0xC0;
    return topTwoBits == 0x80;
}
}

View File

@ -15,7 +15,6 @@
*/
package org.teavm.classlib.java.nio.charset.impl;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.TCoderResult;
@ -57,7 +56,7 @@ public class TUTF8Encoder extends TBufferedEncoder {
outArray[outPos++] = (byte)(0xE0 | (ch >> 12));
outArray[outPos++] = (byte)(0x80 | ((ch >> 6) & 0x3F));
outArray[outPos++] = (byte)(0x80 | (ch & 0x3F));
} else if (UTF16Helper.isHighSurrogate(ch)) {
} else if (Character.isHighSurrogate(ch)) {
if (inPos >= inSize) {
if (!controller.hasMoreInput()) {
result = TCoderResult.UNDERFLOW;
@ -65,9 +64,9 @@ public class TUTF8Encoder extends TBufferedEncoder {
break;
}
char low = inArray[inPos++];
if (!UTF16Helper.isLowSurrogate(low)) {
if (!Character.isLowSurrogate(low)) {
inPos -= 2;
result = TCoderResult.malformedForLength(2);
result = TCoderResult.malformedForLength(1);
break;
}
if (outPos + 4 > outSize) {
@ -77,7 +76,7 @@ public class TUTF8Encoder extends TBufferedEncoder {
}
break;
}
int codePoint = UTF16Helper.buildCodePoint(ch, low);
int codePoint = Character.toCodePoint(ch, low);
outArray[outPos++] = (byte)(0xF0 | (codePoint >> 18));
outArray[outPos++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
outArray[outPos++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));

View File

@ -1,7 +1,7 @@
package org.teavm.classlib.java.nio.charset;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.*;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
@ -46,6 +46,76 @@ public class UTF8Test {
runDecode(100, 600);
}
@Test
public void replaceMalformedSurrogatePair() {
    // Two high surrogates in a row cannot form a pair; the expected output
    // has one '?' (63) for each of them, between 'a' (97) and 'b' (98).
    byte[] expected = { 97, 63, 63, 98 };
    ByteBuffer encoded = Charset.forName("UTF-8").encode("a\uD800\uD800b");
    byte[] actual = new byte[encoded.remaining()];
    encoded.get(actual);
    assertArrayEquals(expected, actual);
}
@Test
public void encodeSurrogate() {
    // A well-formed surrogate pair is expected to become a single
    // four-byte sequence between 'a' (97) and 'b' (98).
    byte[] expected = { 97, -16, -112, -128, -128, 98 };
    ByteBuffer encoded = Charset.forName("UTF-8").encode("a\uD800\uDC00b");
    byte[] actual = new byte[encoded.remaining()];
    encoded.get(actual);
    assertArrayEquals(expected, actual);
}
@Test
public void replaceMalformedFirstByte() {
    // 0xFF matches no lead-byte pattern; the expected text has one
    // replacement character in its place.
    byte[] input = { 97, (byte)0xFF, 98 };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));
    char[] chars = new char[decoded.remaining()];
    decoded.get(chars);
    assertEquals("a\uFFFDb", new String(chars));
}
@Test
public void replaceMalformedMidByte() {
    // 0xC0 announces a two-byte sequence but is followed by plain 'b' (98).
    // The expected text replaces only the lead byte and keeps both 'b's.
    byte[] input = { 97, (byte)0xC0, 98, 98 };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));
    char[] chars = new char[decoded.remaining()];
    decoded.get(chars);
    assertEquals("a\uFFFDbb", new String(chars));
}
@Test
public void replaceDecodedSurrogate() {
    // ED A0 80 is a three-byte form whose payload is the surrogate U+D800;
    // the expected text has a single replacement character for it.
    byte[] input = { 97, (byte)0xED, (byte)0xA0, (byte)0x80, 98 };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));
    char[] chars = new char[decoded.remaining()];
    decoded.get(chars);
    assertEquals("a\uFFFDb", new String(chars));
}
@Test
public void replaceDecodedSurrogatePair() {
    // Two three-byte forms whose payloads are the surrogates U+D800 and
    // U+DFFF; the expected text has one replacement character for each.
    byte[] input = {
        97,
        (byte)0xED, (byte)0xA0, (byte)0x80,
        (byte)0xED, (byte)0xBF, (byte)0xBF,
        98
    };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));
    char[] chars = new char[decoded.remaining()];
    decoded.get(chars);
    assertEquals("a\uFFFD\uFFFDb", new String(chars));
}
@Test
public void decodeLongUTF8ByteArray() throws UnsupportedEncodingException {
    // Fill 16384 bytes with the same four-byte sequence (F0 BE B2 BB),
    // then check the last two decoded chars.
    byte[] bytes = new byte[16384];
    for (int base = 0; base < bytes.length; base += 4) {
        bytes[base] = -16;
        bytes[base + 1] = -66;
        bytes[base + 2] = -78;
        bytes[base + 3] = -69;
    }
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(bytes));
    assertEquals('\uD8BB', decoded.get(8190));
    assertEquals('\uDCBB', decoded.get(8191));
}
private void runEncode(int inSize, int outSize) {
char[] input = text.toCharArray();
byte[] output = new byte[16384];