Replacing old TeaVM-based charsets with NIO charsets

This commit is contained in:
konsoletyper 2015-03-22 18:15:48 +03:00
parent cc5225a2a6
commit aa2451c3e3
10 changed files with 234 additions and 197 deletions

View File

@ -1,61 +0,0 @@
/*
* Copyright 2013 Alexey Andreev.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.teavm.classlib.impl.charset;
/**
 * Static helpers for UTF-16 surrogate arithmetic: splitting a supplementary
 * code point into its high/low surrogate chars, joining a pair back into a
 * code point, and classifying individual chars.
 *
 * @author Alexey Andreev
 */
public class UTF16Helper {
    /** Mask selecting the five top bits shared by every surrogate char. */
    public static final int SURROGATE_NEUTRAL_BIT_MASK = 0xF800;
    /** Value of the masked top five bits for any surrogate char. */
    public static final int SURROGATE_BITS = 0xD800;
    /** Mask selecting the six top bits that tell high and low surrogates apart. */
    public static final int SURROGATE_BIT_MASK = 0xFC00;
    /** Mask selecting the ten payload bits carried by a single surrogate. */
    public static final int SURROGATE_BIT_INV_MASK = 0x03FF;
    /** Prefix bits of a high (leading) surrogate. */
    public static final int HIGH_SURROGATE_BITS = 0xD800;
    /** Prefix bits of a low (trailing) surrogate. */
    public static final int LOW_SURROGATE_BITS = 0xDC00;
    /** Number of payload bits carried by each surrogate. */
    public static final int MEANINGFUL_SURROGATE_BITS = 10;
    /** First code point that needs a surrogate pair. */
    public static final int SUPPLEMENTARY_PLANE = 0x10000;

    /**
     * Computes the leading surrogate for a code point.
     *
     * @param codePoint code point to split; the arithmetic is applied as-is,
     *        no range validation is performed
     * @return the high surrogate char
     */
    public static char highSurrogate(int codePoint) {
        int offset = codePoint - SUPPLEMENTARY_PLANE;
        int payload = (offset >> MEANINGFUL_SURROGATE_BITS) & SURROGATE_BIT_INV_MASK;
        return (char) (HIGH_SURROGATE_BITS | payload);
    }

    /**
     * Computes the trailing surrogate for a code point.
     *
     * @param codePoint code point to split; no range validation is performed
     * @return the low surrogate char
     */
    public static char lowSurrogate(int codePoint) {
        // The plane offset (0x10000) has no bits in the low ten positions,
        // so it does not need to be subtracted before masking.
        int payload = codePoint & SURROGATE_BIT_INV_MASK;
        return (char) (LOW_SURROGATE_BITS | payload);
    }

    /**
     * @param c char to classify
     * @return {@code true} when {@code c} lies in the high surrogate range
     */
    public static boolean isHighSurrogate(char c) {
        // High surrogates occupy [HIGH_SURROGATE_BITS, LOW_SURROGATE_BITS).
        return c >= HIGH_SURROGATE_BITS && c < LOW_SURROGATE_BITS;
    }

    /**
     * @param c char to classify
     * @return {@code true} when {@code c} lies in the low surrogate range
     */
    public static boolean isLowSurrogate(char c) {
        int lastLowSurrogate = LOW_SURROGATE_BITS | SURROGATE_BIT_INV_MASK;
        return c >= LOW_SURROGATE_BITS && c <= lastLowSurrogate;
    }

    /**
     * @param a candidate leading char
     * @param b candidate trailing char
     * @return {@code true} when {@code a} is a high surrogate and {@code b} is a low one
     */
    public static boolean isSurrogatePair(char a, char b) {
        if (!isHighSurrogate(a)) {
            return false;
        }
        return isLowSurrogate(b);
    }

    /**
     * Joins two surrogate chars into a code point. Only the ten payload bits
     * of each argument are used; the arguments are not validated.
     *
     * @param a high surrogate
     * @param b low surrogate
     * @return the combined code point
     */
    public static int buildCodePoint(char a, char b) {
        int upperPayload = (a & SURROGATE_BIT_INV_MASK) << MEANINGFUL_SURROGATE_BITS;
        int lowerPayload = b & SURROGATE_BIT_INV_MASK;
        return (upperPayload | lowerPayload) + SUPPLEMENTARY_PLANE;
    }

    /**
     * @param c char to classify
     * @return {@code true} when {@code c} is either a high or a low surrogate
     */
    public static boolean isSurrogate(char c) {
        return isHighSurrogate(c) || isLowSurrogate(c);
    }
}

View File

@ -29,17 +29,17 @@ public class UTF8Charset extends Charset {
} else if (ch < 0x400) { } else if (ch < 0x400) {
dest.put((byte)(0xC0 | (ch >> 6))); dest.put((byte)(0xC0 | (ch >> 6)));
dest.put((byte)(0x80 | (ch & 0x3F))); dest.put((byte)(0x80 | (ch & 0x3F)));
} else if (!UTF16Helper.isSurrogate(ch)) { } else if (!Character.isSurrogate(ch)) {
dest.put((byte)(0xE0 | (ch >> 12))); dest.put((byte)(0xE0 | (ch >> 12)));
dest.put((byte)(0x80 | ((ch >> 6) & 0x3F))); dest.put((byte)(0x80 | ((ch >> 6) & 0x3F)));
dest.put((byte)(0x80 | (ch & 0x3F))); dest.put((byte)(0x80 | (ch & 0x3F)));
} else if (UTF16Helper.isHighSurrogate(ch)) { } else if (Character.isHighSurrogate(ch)) {
char low = source.get(); char low = source.get();
if (!UTF16Helper.isLowSurrogate(low)) { if (!Character.isLowSurrogate(low)) {
source.back(1); source.back(1);
dest.put((byte)'?'); dest.put((byte)'?');
} else { } else {
int codePoint = UTF16Helper.buildCodePoint(ch, low); int codePoint = Character.toCodePoint(ch, low);
dest.put((byte)(0xF0 | (codePoint >> 18))); dest.put((byte)(0xF0 | (codePoint >> 18)));
dest.put((byte)(0x80 | ((codePoint >> 12) & 0x3F))); dest.put((byte)(0x80 | ((codePoint >> 12) & 0x3F)));
dest.put((byte)(0x80 | ((codePoint >> 6) & 0x3F))); dest.put((byte)(0x80 | ((codePoint >> 6) & 0x3F)));
@ -72,7 +72,7 @@ public class UTF8Charset extends Charset {
byte b2 = source.get(); byte b2 = source.get();
byte b3 = source.get(); byte b3 = source.get();
char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F)); char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F));
dest.put(!UTF16Helper.isHighSurrogate(c) ? c : '?'); dest.put(!Character.isHighSurrogate(c) ? c : '?');
} else if ((b & 0xF8) == 0xF0) { } else if ((b & 0xF8) == 0xF0) {
if (source.available() < 3) { if (source.available() < 3) {
source.skip(source.available()); source.skip(source.available());
@ -83,8 +83,8 @@ public class UTF8Charset extends Charset {
byte b3 = source.get(); byte b3 = source.get();
byte b4 = source.get(); byte b4 = source.get();
int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F); int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F);
dest.put(UTF16Helper.highSurrogate(code)); dest.put(Character.highSurrogate(code));
dest.put(UTF16Helper.lowSurrogate(code)); dest.put(Character.lowSurrogate(code));
} }
} }
} }

View File

@ -15,11 +15,13 @@
*/ */
package org.teavm.classlib.java.io; package org.teavm.classlib.java.io;
import org.teavm.classlib.impl.charset.ByteBuffer;
import org.teavm.classlib.impl.charset.CharBuffer;
import org.teavm.classlib.impl.charset.Charset;
import org.teavm.classlib.impl.charset.UTF8Charset;
import org.teavm.classlib.java.lang.TString; import org.teavm.classlib.java.lang.TString;
import org.teavm.classlib.java.nio.TByteBuffer;
import org.teavm.classlib.java.nio.TCharBuffer;
import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.TCharsetDecoder;
import org.teavm.classlib.java.nio.charset.TCodingErrorAction;
import org.teavm.classlib.java.nio.charset.impl.TUTF8Charset;
/** /**
* *
@ -27,30 +29,30 @@ import org.teavm.classlib.java.lang.TString;
*/ */
public class TInputStreamReader extends TReader { public class TInputStreamReader extends TReader {
private TInputStream stream; private TInputStream stream;
private Charset charset; private TCharset charset;
private TString charsetName; private TString charsetName;
private byte[] inData = new byte[8192]; private byte[] inData = new byte[8192];
private ByteBuffer inBuffer = new ByteBuffer(inData); private TByteBuffer inBuffer = TByteBuffer.wrap(inData);
private char[] outData = new char[1024]; private char[] outData = new char[1024];
private CharBuffer outBuffer = new CharBuffer(outData); private TCharBuffer outBuffer = TCharBuffer.wrap(outData);
private boolean streamEof; private boolean streamEof;
private boolean eof; private boolean eof;
public TInputStreamReader(TInputStream in, TString charsetName) { public TInputStreamReader(TInputStream in, TString charsetName) {
this(in, Charset.get(charsetName.toString())); this(in, TCharset.forName(charsetName.toString()));
this.charsetName = charsetName; this.charsetName = charsetName;
} }
public TInputStreamReader(TInputStream in) { public TInputStreamReader(TInputStream in) {
this(in, new UTF8Charset()); this(in, new TUTF8Charset());
charsetName = TString.wrap("UTF-8"); charsetName = TString.wrap("UTF-8");
} }
private TInputStreamReader(TInputStream in, Charset charset) { public TInputStreamReader(TInputStream in, TCharset charset) {
this.stream = in; this.stream = in;
this.charset = charset; this.charset = charset;
outBuffer.skip(outBuffer.available()); outBuffer.position(outBuffer.limit());
inBuffer.skip(inBuffer.available()); inBuffer.position(inBuffer.limit());
} }
public TString getEncoding() { public TString getEncoding() {
@ -64,10 +66,10 @@ public class TInputStreamReader extends TReader {
@Override @Override
public int read() throws TIOException { public int read() throws TIOException {
if (eof && outBuffer.end()) { if (eof && !outBuffer.hasRemaining()) {
return -1; return -1;
} }
if (!outBuffer.end()) { if (outBuffer.hasRemaining()) {
return outBuffer.get(); return outBuffer.get();
} }
return fillBuffer() ? outBuffer.get() : -1; return fillBuffer() ? outBuffer.get() : -1;
@ -75,37 +77,40 @@ public class TInputStreamReader extends TReader {
@Override @Override
public int read(char[] cbuf, int off, int len) throws TIOException { public int read(char[] cbuf, int off, int len) throws TIOException {
if (eof && outBuffer.end()) { if (eof && !outBuffer.hasRemaining()) {
return -1; return -1;
} }
CharBuffer wrapBuffer = new CharBuffer(cbuf, off, off + len); int bytesRead = 0;
while (!wrapBuffer.end()) { while (len > 0) {
wrapBuffer.put(outBuffer); int sz = Math.min(len, outBuffer.remaining());
if (outBuffer.end() && !fillBuffer()) { outBuffer.get(cbuf, off + bytesRead, sz);
len -= sz;
bytesRead += sz;
if (!outBuffer.hasRemaining() && !fillBuffer()) {
break; break;
} }
} }
return wrapBuffer.position() - off; return bytesRead;
} }
private boolean fillBuffer() throws TIOException { private boolean fillBuffer() throws TIOException {
if (eof) { if (eof) {
return false; return false;
} }
CharBuffer newBuffer = new CharBuffer(outData); outBuffer.compact();
newBuffer.put(outBuffer); TCharsetDecoder decoder = charset.newDecoder()
.onMalformedInput(TCodingErrorAction.REPLACE)
.onUnmappableCharacter(TCodingErrorAction.IGNORE);
while (true) { while (true) {
if (inBuffer.end() && !fillReadBuffer()) { if (!inBuffer.hasRemaining() && !fillReadBuffer()) {
eof = true; eof = true;
break; break;
} }
int oldAvail = newBuffer.available(); if (decoder.decode(inBuffer, outBuffer, eof).isOverflow()) {
charset.decode(inBuffer, newBuffer);
if (oldAvail == newBuffer.available()) {
break; break;
} }
} }
outBuffer = new CharBuffer(outData, 0, newBuffer.position()); outBuffer.flip();
return true; return true;
} }
@ -113,30 +118,25 @@ public class TInputStreamReader extends TReader {
if (streamEof) { if (streamEof) {
return false; return false;
} }
int off = 0; inBuffer.compact();
while (!inBuffer.end()) { while (inBuffer.hasRemaining()) {
inData[off] = inBuffer.get(); int bytesRead = stream.read(inBuffer.array(), inBuffer.position(), inBuffer.remaining());
}
inBuffer.rewind(0);
while (off < inData.length) {
int bytesRead = stream.read(inData, off, inData.length - off);
if (bytesRead == -1) { if (bytesRead == -1) {
streamEof = true; streamEof = true;
inBuffer = new ByteBuffer(inData, 0, inBuffer.position());
break; break;
} else { } else {
off += bytesRead; inBuffer.position(inBuffer.position() + bytesRead);
if (bytesRead == 0) { if (bytesRead == 0) {
break; break;
} }
} }
} }
inBuffer = new ByteBuffer(inData, 0, off); inBuffer.flip();
return true; return true;
} }
@Override @Override
public boolean ready() throws TIOException { public boolean ready() throws TIOException {
return !outBuffer.end() || inBuffer.end(); return outBuffer.hasRemaining() || inBuffer.hasRemaining();
} }
} }

View File

@ -15,7 +15,6 @@
*/ */
package org.teavm.classlib.java.lang; package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.java.io.TSerializable; import org.teavm.classlib.java.io.TSerializable;
import org.teavm.classlib.java.util.TArrays; import org.teavm.classlib.java.util.TArrays;
@ -553,12 +552,12 @@ class TAbstractStringBuilder extends TObject implements TSerializable, TCharSequ
} }
protected TAbstractStringBuilder appendCodePoint(int codePoint) { protected TAbstractStringBuilder appendCodePoint(int codePoint) {
if (codePoint < UTF16Helper.SUPPLEMENTARY_PLANE) { if (codePoint < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
return append((char)codePoint); return append((char)codePoint);
} }
ensureCapacity(length + 2); ensureCapacity(length + 2);
buffer[length++] = UTF16Helper.highSurrogate(codePoint); buffer[length++] = TCharacter.highSurrogate(codePoint);
buffer[length++] = UTF16Helper.lowSurrogate(codePoint); buffer[length++] = TCharacter.lowSurrogate(codePoint);
return this; return this;
} }

View File

@ -15,7 +15,6 @@
*/ */
package org.teavm.classlib.java.lang; package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.impl.unicode.UnicodeHelper; import org.teavm.classlib.impl.unicode.UnicodeHelper;
import org.teavm.platform.Platform; import org.teavm.platform.Platform;
import org.teavm.platform.metadata.MetadataProvider; import org.teavm.platform.metadata.MetadataProvider;
@ -96,6 +95,13 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
private static UnicodeHelper.Range[] classMapping; private static UnicodeHelper.Range[] classMapping;
private char value; private char value;
private static TCharacter[] characterCache = new TCharacter[128]; private static TCharacter[] characterCache = new TCharacter[128];
private static final int SURROGATE_NEUTRAL_BIT_MASK = 0xF800;
private static final int SURROGATE_BITS = 0xD800;
private static final int SURROGATE_BIT_MASK = 0xFC00;
private static final int SURROGATE_BIT_INV_MASK = 0x03FF;
private static final int HIGH_SURROGATE_BITS = 0xD800;
private static final int LOW_SURROGATE_BITS = 0xDC00;
private static final int MEANINGFUL_SURROGATE_BITS = 10;
public TCharacter(char value) { public TCharacter(char value) {
this.value = value; this.value = value;
@ -152,11 +158,11 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
} }
public static boolean isHighSurrogate(char ch) { public static boolean isHighSurrogate(char ch) {
return UTF16Helper.isHighSurrogate(ch); return (ch & SURROGATE_BIT_MASK) == HIGH_SURROGATE_BITS;
} }
public static boolean isLowSurrogate(char ch) { public static boolean isLowSurrogate(char ch) {
return UTF16Helper.isLowSurrogate(ch); return (ch & SURROGATE_BIT_MASK) == LOW_SURROGATE_BITS;
} }
public static boolean isSurrogate(char ch) { public static boolean isSurrogate(char ch) {
@ -172,7 +178,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
} }
public static int toCodePoint(char high, char low) { public static int toCodePoint(char high, char low) {
return UTF16Helper.buildCodePoint(high, low); return (((high & SURROGATE_BIT_INV_MASK) << MEANINGFUL_SURROGATE_BITS) | (low & SURROGATE_BIT_INV_MASK)) +
MIN_SUPPLEMENTARY_CODE_POINT;
} }
public static int codePointAt(TCharSequence seq, int index) { public static int codePointAt(TCharSequence seq, int index) {
@ -216,11 +223,12 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
} }
public static char highSurrogate(int codePoint) { public static char highSurrogate(int codePoint) {
return UTF16Helper.highSurrogate(codePoint); codePoint -= MIN_SUPPLEMENTARY_CODE_POINT;
return (char)(HIGH_SURROGATE_BITS | (codePoint >> MEANINGFUL_SURROGATE_BITS) & SURROGATE_BIT_INV_MASK);
} }
public static char lowSurrogate(int codePoint) { public static char lowSurrogate(int codePoint) {
return UTF16Helper.lowSurrogate(codePoint); return (char)(LOW_SURROGATE_BITS | codePoint & SURROGATE_BIT_INV_MASK);
} }
public static char toLowerCase(char ch) { public static char toLowerCase(char ch) {
@ -309,9 +317,9 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
private static native StringResource obtainClasses(); private static native StringResource obtainClasses();
public static int toChars(int codePoint, char[] dst, int dstIndex) { public static int toChars(int codePoint, char[] dst, int dstIndex) {
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) { if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) {
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint); dst[dstIndex] = highSurrogate(codePoint);
dst[dstIndex + 1] = UTF16Helper.lowSurrogate(codePoint); dst[dstIndex + 1] = lowSurrogate(codePoint);
return 2; return 2;
} else { } else {
dst[dstIndex] = (char)codePoint; dst[dstIndex] = (char)codePoint;
@ -320,8 +328,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
} }
public static char[] toChars(int codePoint) { public static char[] toChars(int codePoint) {
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) { if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) {
return new char[] { UTF16Helper.highSurrogate(codePoint), UTF16Helper.lowSurrogate(codePoint) }; return new char[] { highSurrogate(codePoint), lowSurrogate(codePoint) };
} else { } else {
return new char[] { (char)codePoint }; return new char[] { (char)codePoint };
} }
@ -331,7 +339,7 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
int count = endIndex - beginIndex; int count = endIndex - beginIndex;
--endIndex; --endIndex;
for (int i = beginIndex; i < endIndex; ++i) { for (int i = beginIndex; i < endIndex; ++i) {
if (UTF16Helper.isHighSurrogate(seq.charAt(i)) && UTF16Helper.isLowSurrogate(seq.charAt(i + 1))) { if (isHighSurrogate(seq.charAt(i)) && isLowSurrogate(seq.charAt(i + 1))) {
--count; --count;
++i; ++i;
} }
@ -343,7 +351,7 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
int r = count; int r = count;
--count; --count;
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
if (UTF16Helper.isHighSurrogate(a[offset]) && UTF16Helper.isLowSurrogate(a[offset + i + 1])) { if (isHighSurrogate(a[offset]) && isLowSurrogate(a[offset + i + 1])) {
--r; --r;
++i; ++i;
} }
@ -353,8 +361,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
public static int offsetByCodePoints(TCharSequence seq, int index, int codePointOffset) { public static int offsetByCodePoints(TCharSequence seq, int index, int codePointOffset) {
for (int i = 0; i < codePointOffset; ++i) { for (int i = 0; i < codePointOffset; ++i) {
if (index < seq.length() - 1 && UTF16Helper.isHighSurrogate(seq.charAt(index)) && if (index < seq.length() - 1 && isHighSurrogate(seq.charAt(index)) &&
UTF16Helper.isLowSurrogate(seq.charAt(index + 1))) { isLowSurrogate(seq.charAt(index + 1))) {
index += 2; index += 2;
} else { } else {
index++; index++;
@ -365,8 +373,7 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
public static int offsetByCodePoints(char[] a, int start, int count, int index, int codePointOffset) { public static int offsetByCodePoints(char[] a, int start, int count, int index, int codePointOffset) {
for (int i = 0; i < codePointOffset; ++i) { for (int i = 0; i < codePointOffset; ++i) {
if (index < count - 1 && UTF16Helper.isHighSurrogate(a[index + start]) && if (index < count - 1 && isHighSurrogate(a[index + start]) && isLowSurrogate(a[index + start + 1])) {
UTF16Helper.isLowSurrogate(a[index + start + 1])) {
index += 2; index += 2;
} else { } else {
index++; index++;

View File

@ -15,9 +15,12 @@
*/ */
package org.teavm.classlib.java.lang; package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.charset.*;
import org.teavm.classlib.java.io.TSerializable; import org.teavm.classlib.java.io.TSerializable;
import org.teavm.classlib.java.io.TUnsupportedEncodingException; import org.teavm.classlib.java.io.TUnsupportedEncodingException;
import org.teavm.classlib.java.nio.TByteBuffer;
import org.teavm.classlib.java.nio.TCharBuffer;
import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.impl.TUTF8Charset;
import org.teavm.classlib.java.util.TArrays; import org.teavm.classlib.java.util.TArrays;
import org.teavm.classlib.java.util.TComparator; import org.teavm.classlib.java.util.TComparator;
import org.teavm.classlib.java.util.THashMap; import org.teavm.classlib.java.util.THashMap;
@ -61,15 +64,15 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
public TString(byte[] bytes, int offset, int length, TString charsetName) throws TUnsupportedEncodingException { public TString(byte[] bytes, int offset, int length, TString charsetName) throws TUnsupportedEncodingException {
Charset charset = Charset.get(charsetName.toString()); this(bytes, offset, length, TCharset.forName(charsetName.toString()));
if (charset == null) {
throw new TUnsupportedEncodingException(TString.wrap("Unknown encoding:" + charsetName));
} }
public TString(byte[] bytes, int offset, int length, TCharset charset) {
initWithBytes(bytes, offset, length, charset); initWithBytes(bytes, offset, length, charset);
} }
public TString(byte[] bytes, int offset, int length) { public TString(byte[] bytes, int offset, int length) {
initWithBytes(bytes, offset, length, new UTF8Charset()); initWithBytes(bytes, offset, length, new TUTF8Charset());
} }
public TString(byte[] bytes) { public TString(byte[] bytes) {
@ -80,14 +83,18 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
this(bytes, 0, bytes.length, charsetName); this(bytes, 0, bytes.length, charsetName);
} }
public TString(byte[] bytes, TCharset charset) {
this(bytes, 0, bytes.length, charset);
}
public TString(int[] codePoints, int offset, int count) { public TString(int[] codePoints, int offset, int count) {
characters = new char[count * 2]; characters = new char[count * 2];
int charCount = 0; int charCount = 0;
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
int codePoint = codePoints[offset++]; int codePoint = codePoints[offset++];
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) { if (codePoint >= TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
characters[charCount++] = UTF16Helper.highSurrogate(codePoint); characters[charCount++] = TCharacter.highSurrogate(codePoint);
characters[charCount++] = UTF16Helper.lowSurrogate(codePoint); characters[charCount++] = TCharacter.lowSurrogate(codePoint);
} else { } else {
characters[charCount++] = (char)codePoint; characters[charCount++] = (char)codePoint;
} }
@ -97,19 +104,14 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
} }
private void initWithBytes(byte[] bytes, int offset, int length, Charset charset) { private void initWithBytes(byte[] bytes, int offset, int length, TCharset charset) {
TStringBuilder sb = new TStringBuilder(bytes.length * 2); TCharBuffer buffer = charset.decode(TByteBuffer.wrap(bytes, offset, length));
this.characters = new char[sb.length()]; if (buffer.hasArray() && buffer.position() == 0 && buffer.limit() == buffer.capacity()) {
ByteBuffer source = new ByteBuffer(bytes, offset, offset + length); characters = buffer.array();
char[] destChars = new char[TMath.max(8, TMath.min(length * 2, 1024))]; } else {
CharBuffer dest = new CharBuffer(destChars, 0, destChars.length); characters = new char[buffer.remaining()];
while (!source.end()) { buffer.get(characters);
charset.decode(source, dest);
sb.append(destChars, 0, dest.position());
dest.rewind(0);
} }
characters = new char[sb.length()];
sb.getChars(0, sb.length(), characters, 0);
} }
public TString(TStringBuilder sb) { public TString(TStringBuilder sb) {
@ -283,7 +285,7 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
public int indexOf(int ch, int fromIndex) { public int indexOf(int ch, int fromIndex) {
if (ch < UTF16Helper.SUPPLEMENTARY_PLANE) { if (ch < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
char bmpChar = (char)ch; char bmpChar = (char)ch;
for (int i = fromIndex; i < characters.length; ++i) { for (int i = fromIndex; i < characters.length; ++i) {
if (characters[i] == bmpChar) { if (characters[i] == bmpChar) {
@ -292,8 +294,8 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
return -1; return -1;
} else { } else {
char hi = UTF16Helper.highSurrogate(ch); char hi = TCharacter.highSurrogate(ch);
char lo = UTF16Helper.lowSurrogate(ch); char lo = TCharacter.lowSurrogate(ch);
for (int i = fromIndex; i < characters.length - 1; ++i) { for (int i = fromIndex; i < characters.length - 1; ++i) {
if (characters[i] == hi && characters[i + 1] == lo) { if (characters[i] == hi && characters[i + 1] == lo) {
return i; return i;
@ -308,7 +310,7 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
public int lastIndexOf(int ch, int fromIndex) { public int lastIndexOf(int ch, int fromIndex) {
if (ch < UTF16Helper.SUPPLEMENTARY_PLANE) { if (ch < TCharacter.MIN_SUPPLEMENTARY_CODE_POINT) {
char bmpChar = (char)ch; char bmpChar = (char)ch;
for (int i = fromIndex; i >= 0; --i) { for (int i = fromIndex; i >= 0; --i) {
if (characters[i] == bmpChar) { if (characters[i] == bmpChar) {
@ -317,8 +319,8 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
return -1; return -1;
} else { } else {
char hi = UTF16Helper.highSurrogate(ch); char hi = TCharacter.highSurrogate(ch);
char lo = UTF16Helper.lowSurrogate(ch); char lo = TCharacter.lowSurrogate(ch);
for (int i = fromIndex; i >= 1; --i) { for (int i = fromIndex; i >= 1; --i) {
if (characters[i] == lo && characters[i - 1] == hi) { if (characters[i] == lo && characters[i - 1] == hi) {
return i - 1; return i - 1;
@ -550,34 +552,22 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
} }
public byte[] getBytes(TString charsetName) throws TUnsupportedEncodingException { public byte[] getBytes(TString charsetName) throws TUnsupportedEncodingException {
Charset charset = Charset.get(charsetName.toString()); return getBytes(TCharset.forName(charsetName.toString()));
if (charset == null) {
throw new TUnsupportedEncodingException(TString.wrap("Unsupported encoding: " + charsetName));
}
return getBytes(charset);
} }
public byte[] getBytes() { public byte[] getBytes() {
return getBytes(new UTF8Charset()); return getBytes(new TUTF8Charset());
} }
private byte[] getBytes(Charset charset) { public byte[] getBytes(TCharset charset) {
byte[] result = new byte[length() * 2]; TByteBuffer buffer = charset.encode(TCharBuffer.wrap(characters));
int resultLength = 0; if (buffer.hasArray() && buffer.position() == 0 && buffer.limit() == buffer.capacity()) {
byte[] destArray = new byte[TMath.max(16, TMath.min(length() * 2, 4096))]; return buffer.array();
ByteBuffer dest = new ByteBuffer(destArray); } else {
CharBuffer src = new CharBuffer(characters); byte[] result = new byte[buffer.remaining()];
while (!src.end()) { buffer.get(result);
charset.encode(src, dest); return result;
if (resultLength + dest.position() > result.length) {
result = TArrays.copyOf(result, result.length * 2);
} }
for (int i = 0; i < dest.position(); ++i) {
result[resultLength++] = destArray[i];
}
dest.rewind(0);
}
return TArrays.copyOf(result, resultLength);
} }
@Override @Override
@ -601,11 +591,11 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
int[] codePoints = new int[characters.length]; int[] codePoints = new int[characters.length];
int codePointCount = 0; int codePointCount = 0;
for (int i = 0; i < characters.length; ++i) { for (int i = 0; i < characters.length; ++i) {
if (i == characters.length - 1 || !UTF16Helper.isHighSurrogate(characters[i]) || if (i == characters.length - 1 || !TCharacter.isHighSurrogate(characters[i]) ||
!UTF16Helper.isLowSurrogate(characters[i + 1])) { !TCharacter.isLowSurrogate(characters[i + 1])) {
codePoints[codePointCount++] = TCharacter.toLowerCase(characters[i]); codePoints[codePointCount++] = TCharacter.toLowerCase(characters[i]);
} else { } else {
codePoints[codePointCount++] = TCharacter.toLowerCase(UTF16Helper.buildCodePoint( codePoints[codePointCount++] = TCharacter.toLowerCase(TCharacter.toCodePoint(
characters[i], characters[i + 1])); characters[i], characters[i + 1]));
++i; ++i;
} }
@ -620,11 +610,11 @@ public class TString extends TObject implements TSerializable, TComparable<TStri
int[] codePoints = new int[characters.length]; int[] codePoints = new int[characters.length];
int codePointCount = 0; int codePointCount = 0;
for (int i = 0; i < characters.length; ++i) { for (int i = 0; i < characters.length; ++i) {
if (i == characters.length - 1 || !UTF16Helper.isHighSurrogate(characters[i]) || if (i == characters.length - 1 || !TCharacter.isHighSurrogate(characters[i]) ||
!UTF16Helper.isLowSurrogate(characters[i + 1])) { !TCharacter.isLowSurrogate(characters[i + 1])) {
codePoints[codePointCount++] = TCharacter.toUpperCase(characters[i]); codePoints[codePointCount++] = TCharacter.toUpperCase(characters[i]);
} else { } else {
codePoints[codePointCount++] = TCharacter.toUpperCase(UTF16Helper.buildCodePoint( codePoints[codePointCount++] = TCharacter.toUpperCase(TCharacter.toCodePoint(
characters[i], characters[i + 1])); characters[i], characters[i + 1]));
++i; ++i;
} }

View File

@ -31,7 +31,7 @@ public abstract class TCharsetDecoder {
private TCharset charset; private TCharset charset;
private float averageCharsPerByte; private float averageCharsPerByte;
private float maxCharsPerByte; private float maxCharsPerByte;
private String replacement = "?"; private String replacement = "\uFFFD";
private TCodingErrorAction malformedAction = TCodingErrorAction.REPORT; private TCodingErrorAction malformedAction = TCodingErrorAction.REPORT;
private TCodingErrorAction unmappableAction = TCodingErrorAction.REPORT; private TCodingErrorAction unmappableAction = TCodingErrorAction.REPORT;
private int state; private int state;
@ -126,8 +126,18 @@ public abstract class TCharsetDecoder {
return result; return result;
} else if (result.isUnderflow()) { } else if (result.isUnderflow()) {
if (endOfInput && in.hasRemaining()) { if (endOfInput && in.hasRemaining()) {
state = END; if (malformedAction == TCodingErrorAction.REPORT) {
return TCoderResult.malformedForLength(in.remaining()); return TCoderResult.malformedForLength(in.remaining());
} else {
if (out.remaining() > replacement.length()) {
in.position(in.position() + in.remaining());
if (malformedAction == TCodingErrorAction.REPLACE) {
out.put(replacement);
}
} else {
return TCoderResult.OVERFLOW;
}
}
} }
return result; return result;
} else if (result.isMalformed()) { } else if (result.isMalformed()) {

View File

@ -15,7 +15,6 @@
*/ */
package org.teavm.classlib.java.nio.charset.impl; package org.teavm.classlib.java.nio.charset.impl;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.java.nio.charset.TCharset; import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.TCoderResult; import org.teavm.classlib.java.nio.charset.TCoderResult;
@ -44,7 +43,13 @@ public class TUTF8Decoder extends TBufferedDecoder {
} }
break; break;
} }
outArray[outPos++] = (char)(((b & 0x1F) << 6) | (inArray[inPos++] & 0x3F)); byte b2 = inArray[inPos++];
if (!checkMidByte(b2)) {
inPos -= 2;
result = TCoderResult.malformedForLength(1);
break;
}
outArray[outPos++] = (char)(((b & 0x1F) << 6) | (b2 & 0x3F));
} else if ((b & 0xF0) == 0xE0) { } else if ((b & 0xF0) == 0xE0) {
if (inPos + 2 > inSize) { if (inPos + 2 > inSize) {
--inPos; --inPos;
@ -55,6 +60,11 @@ public class TUTF8Decoder extends TBufferedDecoder {
} }
byte b2 = inArray[inPos++]; byte b2 = inArray[inPos++];
byte b3 = inArray[inPos++]; byte b3 = inArray[inPos++];
if (!checkMidByte(b2) || !checkMidByte(b3)) {
inPos -= 3;
result = TCoderResult.malformedForLength(1);
break;
}
char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F)); char c = (char)(((b & 0x0F) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3F));
if (Character.isSurrogate(c)) { if (Character.isSurrogate(c)) {
inPos -= 3; inPos -= 3;
@ -72,7 +82,7 @@ public class TUTF8Decoder extends TBufferedDecoder {
} }
if (outPos + 2 > outSize) { if (outPos + 2 > outSize) {
--inPos; --inPos;
if (!controller.hasMoreOutput()) { if (!controller.hasMoreOutput(2)) {
result = TCoderResult.OVERFLOW; result = TCoderResult.OVERFLOW;
} }
break; break;
@ -80,9 +90,18 @@ public class TUTF8Decoder extends TBufferedDecoder {
byte b2 = inArray[inPos++]; byte b2 = inArray[inPos++];
byte b3 = inArray[inPos++]; byte b3 = inArray[inPos++];
byte b4 = inArray[inPos++]; byte b4 = inArray[inPos++];
if (!checkMidByte(b2) || !checkMidByte(b3) || !checkMidByte(b4)) {
inPos -= 3;
result = TCoderResult.malformedForLength(1);
break;
}
int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F); int code = ((b & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F);
outArray[outPos++] = UTF16Helper.highSurrogate(code); outArray[outPos++] = Character.highSurrogate(code);
outArray[outPos++] = UTF16Helper.lowSurrogate(code); outArray[outPos++] = Character.lowSurrogate(code);
} else {
--inPos;
result = TCoderResult.malformedForLength(1);
break;
} }
} }
@ -90,4 +109,8 @@ public class TUTF8Decoder extends TBufferedDecoder {
controller.setOutPosition(outPos); controller.setOutPosition(outPos);
return result; return result;
} }
/**
 * Tells whether the given byte has the bit layout of a UTF-8 continuation
 * byte, i.e. its two most significant bits are {@code 10}.
 *
 * @param b the byte to examine
 * @return {@code true} if {@code b} matches the pattern {@code 10xxxxxx}
 */
private boolean checkMidByte(byte b) {
    // Keep only the two top bits and compare them against the 10xxxxxx marker.
    int topTwoBits = b & 0xC0;
    return topTwoBits == 0x80;
}
} }

View File

@ -15,7 +15,6 @@
*/ */
package org.teavm.classlib.java.nio.charset.impl; package org.teavm.classlib.java.nio.charset.impl;
import org.teavm.classlib.impl.charset.UTF16Helper;
import org.teavm.classlib.java.nio.charset.TCharset; import org.teavm.classlib.java.nio.charset.TCharset;
import org.teavm.classlib.java.nio.charset.TCoderResult; import org.teavm.classlib.java.nio.charset.TCoderResult;
@ -57,7 +56,7 @@ public class TUTF8Encoder extends TBufferedEncoder {
outArray[outPos++] = (byte)(0xE0 | (ch >> 12)); outArray[outPos++] = (byte)(0xE0 | (ch >> 12));
outArray[outPos++] = (byte)(0x80 | ((ch >> 6) & 0x3F)); outArray[outPos++] = (byte)(0x80 | ((ch >> 6) & 0x3F));
outArray[outPos++] = (byte)(0x80 | (ch & 0x3F)); outArray[outPos++] = (byte)(0x80 | (ch & 0x3F));
} else if (UTF16Helper.isHighSurrogate(ch)) { } else if (Character.isHighSurrogate(ch)) {
if (inPos >= inSize) { if (inPos >= inSize) {
if (!controller.hasMoreInput()) { if (!controller.hasMoreInput()) {
result = TCoderResult.UNDERFLOW; result = TCoderResult.UNDERFLOW;
@ -65,9 +64,9 @@ public class TUTF8Encoder extends TBufferedEncoder {
break; break;
} }
char low = inArray[inPos++]; char low = inArray[inPos++];
if (!UTF16Helper.isLowSurrogate(low)) { if (!Character.isLowSurrogate(low)) {
inPos -= 2; inPos -= 2;
result = TCoderResult.malformedForLength(2); result = TCoderResult.malformedForLength(1);
break; break;
} }
if (outPos + 4 > outSize) { if (outPos + 4 > outSize) {
@ -77,7 +76,7 @@ public class TUTF8Encoder extends TBufferedEncoder {
} }
break; break;
} }
int codePoint = UTF16Helper.buildCodePoint(ch, low); int codePoint = Character.toCodePoint(ch, low);
outArray[outPos++] = (byte)(0xF0 | (codePoint >> 18)); outArray[outPos++] = (byte)(0xF0 | (codePoint >> 18));
outArray[outPos++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F)); outArray[outPos++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
outArray[outPos++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F)); outArray[outPos++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));

View File

@ -1,7 +1,7 @@
package org.teavm.classlib.java.nio.charset; package org.teavm.classlib.java.nio.charset;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.*;
import static org.junit.Assert.assertTrue; import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.CharBuffer; import java.nio.CharBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
@ -46,6 +46,76 @@ public class UTF8Test {
runDecode(100, 600); runDecode(100, 600);
} }
/**
 * Encodes a string containing two consecutive high surrogates (which do not
 * form a valid pair) and expects each of them to come out as byte 63 ('?').
 */
@Test
public void replaceMalformedSurrogatePair() {
    ByteBuffer encoded = Charset.forName("UTF-8").encode("a\uD800\uD800b");

    // Drain everything that remains in the buffer into a plain array.
    byte[] actual = new byte[encoded.remaining()];
    encoded.get(actual);

    byte[] expected = { 97, 63, 63, 98 };
    assertArrayEquals(expected, actual);
}
/**
 * Encodes a string containing a well-formed surrogate pair (U+10000) and
 * expects it to be emitted as the four-byte sequence F0 90 80 80.
 */
@Test
public void encodeSurrogate() {
    ByteBuffer encoded = Charset.forName("UTF-8").encode("a\uD800\uDC00b");

    // Drain everything that remains in the buffer into a plain array.
    byte[] actual = new byte[encoded.remaining()];
    encoded.get(actual);

    byte[] expected = { 97, -16, -112, -128, -128, 98 };
    assertArrayEquals(expected, actual);
}
/**
 * Decodes input whose second byte (0xFF) is not a valid UTF-8 lead byte and
 * expects that byte to be turned into a single U+FFFD.
 */
@Test
public void replaceMalformedFirstByte() {
    byte[] input = { 97, (byte)0xFF, 98 };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));

    // Drain everything that remains in the buffer into a plain array.
    char[] actual = new char[decoded.remaining()];
    decoded.get(actual);

    assertEquals("a\uFFFDb", new String(actual));
}
/**
 * Decodes input where a two-byte lead (0xC0) is followed by an ASCII byte
 * instead of a continuation byte. Expects one U+FFFD for the lead byte and
 * the following ASCII bytes to be decoded normally.
 */
@Test
public void replaceMalformedMidByte() {
    byte[] input = { 97, (byte)0xC0, 98, 98 };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));

    // Drain everything that remains in the buffer into a plain array.
    char[] actual = new char[decoded.remaining()];
    decoded.get(actual);

    assertEquals("a\uFFFDbb", new String(actual));
}
/**
 * Decodes input containing ED A0 80, the three-byte form of the lone
 * surrogate U+D800, and expects it to be replaced by a single U+FFFD.
 */
@Test
public void replaceDecodedSurrogate() {
    byte[] input = { 97, (byte)0xED, (byte)0xA0, (byte)0x80, 98 };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));

    // Drain everything that remains in the buffer into a plain array.
    char[] actual = new char[decoded.remaining()];
    decoded.get(actual);

    assertEquals("a\uFFFDb", new String(actual));
}
/**
 * Decodes input containing ED A0 80 followed by ED BF BF, i.e. the surrogates
 * U+D800 and U+DFFF each encoded as its own three-byte sequence. Expects one
 * U+FFFD per sequence rather than a combined supplementary character.
 */
@Test
public void replaceDecodedSurrogatePair() {
    byte[] input = {
        97,
        (byte)0xED, (byte)0xA0, (byte)0x80,
        (byte)0xED, (byte)0xBF, (byte)0xBF,
        98
    };
    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));

    // Drain everything that remains in the buffer into a plain array.
    char[] actual = new char[decoded.remaining()];
    decoded.get(actual);

    assertEquals("a\uFFFD\uFFFDb", new String(actual));
}
/**
 * Decodes a 16384-byte array made of the four-byte sequence F0 BE B2 BB
 * repeated back to back, then checks that the last two decoded chars are the
 * surrogate pair D8BB / DCBB.
 */
@Test
public void decodeLongUTF8ByteArray() throws UnsupportedEncodingException {
    byte[] pattern = { -16, -66, -78, -69 };
    byte[] input = new byte[16384];
    // The array length is a multiple of four, so the pattern tiles it exactly.
    for (int index = 0; index < input.length; ++index) {
        input[index] = pattern[index % pattern.length];
    }

    CharBuffer decoded = Charset.forName("UTF-8").decode(ByteBuffer.wrap(input));

    // 4096 four-byte sequences yield 8192 chars; inspect the final pair.
    assertEquals('\uD8BB', decoded.get(8190));
    assertEquals('\uDCBB', decoded.get(8191));
}
private void runEncode(int inSize, int outSize) { private void runEncode(int inSize, int outSize) {
char[] input = text.toCharArray(); char[] input = text.toCharArray();
byte[] output = new byte[16384]; byte[] output = new byte[16384];