207 lines
6.5 KiB
Java
207 lines
6.5 KiB
Java
|
/*
|
||
|
* Copyright (C) 2013 The Guava Authors
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
||
|
* in compliance with the License. You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software distributed under the License
|
||
|
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||
|
* or implied. See the License for the specific language governing permissions and limitations under
|
||
|
* the License.
|
||
|
*/
|
||
|
|
||
|
package com.google.common.base;
|
||
|
|
||
|
import static com.google.common.base.Preconditions.checkPositionIndexes;
|
||
|
|
||
|
import com.google.common.annotations.Beta;
|
||
|
import com.google.common.annotations.GwtCompatible;
|
||
|
|
||
|
/**
|
||
|
* Low-level, high-performance utility methods related to the
|
||
|
* {@linkplain Charsets#UTF_8 UTF-8} character encoding. UTF-8 is defined in
|
||
|
* section D92 of
|
||
|
* <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode
|
||
|
* Standard Core Specification, Chapter 3</a>.
|
||
|
*
|
||
|
* <p>
|
||
|
* The variant of UTF-8 implemented by this class is the restricted definition
|
||
|
* of UTF-8 introduced in Unicode 3.1. One implication of this is that it
|
||
|
* rejects
|
||
|
* <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest
|
||
|
* form"</a> byte sequences, even though the JDK decoder may accept them.
|
||
|
*
|
||
|
* @author Martin Buchholz
|
||
|
* @author Clément Roux
|
||
|
* @since 16.0
|
||
|
*/
|
||
|
@Beta
|
||
|
@GwtCompatible
|
||
|
public final class Utf8 {
|
||
|
/**
|
||
|
* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}.
|
||
|
* For a string, this method is equivalent to
|
||
|
* {@code string.getBytes(UTF_8).length}, but is more efficient in both time and
|
||
|
* space.
|
||
|
*
|
||
|
* @throws IllegalArgumentException if {@code sequence} contains ill-formed
|
||
|
* UTF-16 (unpaired surrogates)
|
||
|
*/
|
||
|
public static int encodedLength(CharSequence sequence) {
|
||
|
// Warning to maintainers: this implementation is highly optimized.
|
||
|
int utf16Length = sequence.length();
|
||
|
int utf8Length = utf16Length;
|
||
|
int i = 0;
|
||
|
|
||
|
// This loop optimizes for pure ASCII.
|
||
|
while (i < utf16Length && sequence.charAt(i) < 0x80) {
|
||
|
i++;
|
||
|
}
|
||
|
|
||
|
// This loop optimizes for chars less than 0x800.
|
||
|
for (; i < utf16Length; i++) {
|
||
|
char c = sequence.charAt(i);
|
||
|
if (c < 0x800) {
|
||
|
utf8Length += ((0x7f - c) >>> 31); // branch free!
|
||
|
} else {
|
||
|
utf8Length += encodedLengthGeneral(sequence, i);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (utf8Length < utf16Length) {
|
||
|
// Necessary and sufficient condition for overflow because of maximum 3x
|
||
|
// expansion
|
||
|
throw new IllegalArgumentException("UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
|
||
|
}
|
||
|
return utf8Length;
|
||
|
}
|
||
|
|
||
|
private static int encodedLengthGeneral(CharSequence sequence, int start) {
|
||
|
int utf16Length = sequence.length();
|
||
|
int utf8Length = 0;
|
||
|
for (int i = start; i < utf16Length; i++) {
|
||
|
char c = sequence.charAt(i);
|
||
|
if (c < 0x800) {
|
||
|
utf8Length += (0x7f - c) >>> 31; // branch free!
|
||
|
} else {
|
||
|
utf8Length += 2;
|
||
|
// jdk7+: if (Character.isSurrogate(c)) {
|
||
|
if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
|
||
|
// Check that we have a well-formed surrogate pair.
|
||
|
int cp = Character.codePointAt(sequence, i);
|
||
|
if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
|
||
|
throw new IllegalArgumentException("Unpaired surrogate at index " + i);
|
||
|
}
|
||
|
i++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return utf8Length;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte
|
||
|
* sequence according to Unicode 6.0. Note that this is a stronger criterion
|
||
|
* than simply whether the bytes can be decoded. For example, some versions of
|
||
|
* the JDK decoder will accept "non-shortest form" byte sequences, but encoding
|
||
|
* never reproduces these. Such byte sequences are <i>not</i> considered
|
||
|
* well-formed.
|
||
|
*
|
||
|
* <p>
|
||
|
* This method returns {@code true} if and only if
|
||
|
* {@code Arrays.equals(bytes, new
|
||
|
* String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both
|
||
|
* time and space.
|
||
|
*/
|
||
|
public static boolean isWellFormed(byte[] bytes) {
|
||
|
return isWellFormed(bytes, 0, bytes.length);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns whether the given byte array slice is a well-formed UTF-8 byte
|
||
|
* sequence, as defined by {@link #isWellFormed(byte[])}. Note that this can be
|
||
|
* false even when {@code
|
||
|
* isWellFormed(bytes)} is true.
|
||
|
*
|
||
|
* @param bytes the input buffer
|
||
|
* @param off the offset in the buffer of the first byte to read
|
||
|
* @param len the number of bytes to read from the buffer
|
||
|
*/
|
||
|
public static boolean isWellFormed(byte[] bytes, int off, int len) {
|
||
|
int end = off + len;
|
||
|
checkPositionIndexes(off, end, bytes.length);
|
||
|
// Look for the first non-ASCII character.
|
||
|
for (int i = off; i < end; i++) {
|
||
|
if (bytes[i] < 0) {
|
||
|
return isWellFormedSlowPath(bytes, i, end);
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
|
||
|
int index = off;
|
||
|
while (true) {
|
||
|
int byte1;
|
||
|
|
||
|
// Optimize for interior runs of ASCII bytes.
|
||
|
do {
|
||
|
if (index >= end) {
|
||
|
return true;
|
||
|
}
|
||
|
} while ((byte1 = bytes[index++]) >= 0);
|
||
|
|
||
|
if (byte1 < (byte) 0xE0) {
|
||
|
// Two-byte form.
|
||
|
if (index == end) {
|
||
|
return false;
|
||
|
}
|
||
|
// Simultaneously check for illegal trailing-byte in leading position
|
||
|
// and overlong 2-byte form.
|
||
|
if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
|
||
|
return false;
|
||
|
}
|
||
|
} else if (byte1 < (byte) 0xF0) {
|
||
|
// Three-byte form.
|
||
|
if (index + 1 >= end) {
|
||
|
return false;
|
||
|
}
|
||
|
int byte2 = bytes[index++];
|
||
|
if (byte2 > (byte) 0xBF
|
||
|
// Overlong? 5 most significant bits must not all be zero.
|
||
|
|| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
|
||
|
// Check for illegal surrogate codepoints.
|
||
|
|| (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
|
||
|
// Third byte trailing-byte test.
|
||
|
|| bytes[index++] > (byte) 0xBF) {
|
||
|
return false;
|
||
|
}
|
||
|
} else {
|
||
|
// Four-byte form.
|
||
|
if (index + 2 >= end) {
|
||
|
return false;
|
||
|
}
|
||
|
int byte2 = bytes[index++];
|
||
|
if (byte2 > (byte) 0xBF
|
||
|
// Check that 1 <= plane <= 16. Tricky optimized form of:
|
||
|
// if (byte1 > (byte) 0xF4
|
||
|
// || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
|
||
|
// || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
|
||
|
|| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
|
||
|
// Third byte trailing-byte test
|
||
|
|| bytes[index++] > (byte) 0xBF
|
||
|
// Fourth byte trailing-byte test
|
||
|
|| bytes[index++] > (byte) 0xBF) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private Utf8() {
|
||
|
}
|
||
|
}
|