eaglercraft-1.8/sources/main/java/com/google/common/base/Utf8.java

/*
 * Copyright (C) 2013 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkPositionIndexes;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;

/**
 * Low-level, high-performance utility methods related to the
 * {@linkplain Charsets#UTF_8 UTF-8} character encoding. UTF-8 is defined in
 * section D92 of
 * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode
 * Standard Core Specification, Chapter 3</a>.
 *
 * <p>
 * The variant of UTF-8 implemented by this class is the restricted definition
 * of UTF-8 introduced in Unicode 3.1. One implication of this is that it
 * rejects
 * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest
 * form"</a> byte sequences, even though the JDK decoder may accept them.
 *
 * @author Martin Buchholz
 * @author Clément Roux
 * @since 16.0
 */
@Beta
@GwtCompatible
public final class Utf8 {
	/** Static-only utility holder; the private constructor prevents instantiation. */
	private Utf8() {
	}

	/**
	 * Computes how many bytes {@code sequence} would occupy when encoded as UTF-8,
	 * without actually producing the encoded bytes. For a string free of unpaired
	 * surrogates the result is the same as {@code string.getBytes(UTF_8).length}.
	 *
	 * @param sequence the UTF-16 text to measure
	 * @return the length in bytes of the UTF-8 encoding of {@code sequence}
	 * @throws IllegalArgumentException if {@code sequence} contains an unpaired
	 *                                  surrogate, or if the encoded length does not
	 *                                  fit in an {@code int}
	 */
	public static int encodedLength(CharSequence sequence) {
		// Maintainer note: the staged loops below are a deliberate optimization.
		final int charCount = sequence.length();
		int pos = 0;

		// Stage 1: a leading run of ASCII costs exactly one byte per char, which is
		// already accounted for by starting the byte total at the char count.
		while (pos < charCount && sequence.charAt(pos) < 0x80) {
			pos++;
		}

		// Stage 2: chars below 0x800 add at most one extra byte each. The first char
		// at or above 0x800 hands the remainder of the input to the general routine.
		int byteCount = charCount;
		while (pos < charCount) {
			final char ch = sequence.charAt(pos);
			if (ch >= 0x800) {
				byteCount += encodedLengthGeneral(sequence, pos);
				break;
			}
			// (0x7f - ch) is negative exactly when ch > 0x7f, so the unsigned shift
			// yields 1 for two-byte chars and 0 for ASCII, with no branch.
			byteCount += (0x7f - ch) >>> 31;
			pos++;
		}

		// Every char expands to at most three bytes, so an int total that wrapped
		// around is always smaller than the char count, and a total that did not wrap
		// never is. The comparison is therefore an exact overflow test.
		if (byteCount < charCount) {
			throw new IllegalArgumentException("UTF-8 length does not fit in int: " + (byteCount + (1L << 32)));
		}
		return byteCount;
	}

	/**
	 * Counts the bytes that the chars from {@code start} to the end of
	 * {@code sequence} need <i>in addition to</i> one byte per char. Handles every
	 * kind of char, including surrogate pairs.
	 *
	 * @param sequence the UTF-16 text being measured
	 * @param start    index of the first char to examine
	 * @return the number of extra bytes beyond one per char
	 * @throws IllegalArgumentException if a surrogate is not part of a well-formed
	 *                                  pair
	 */
	private static int encodedLengthGeneral(CharSequence sequence, int start) {
		final int limit = sequence.length();
		int extraBytes = 0;
		int pos = start;
		while (pos < limit) {
			final char ch = sequence.charAt(pos);
			if (ch < 0x800) {
				// 0 extra bytes for ASCII, 1 for the two-byte range (branch-free).
				extraBytes += (0x7f - ch) >>> 31;
			} else {
				// Three-byte char: two extra bytes. A surrogate pair is two chars and
				// four bytes, which is also two extra bytes in total.
				extraBytes += 2;
				// On jdk7+ this range test could be Character.isSurrogate(ch).
				final boolean surrogate = ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE;
				if (surrogate) {
					// codePointAt only yields a supplementary code point when a high
					// surrogate is immediately followed by a low surrogate.
					if (Character.codePointAt(sequence, pos) < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
						throw new IllegalArgumentException("Unpaired surrogate at index " + pos);
					}
					// Step over the low surrogate that completes the pair.
					pos++;
				}
			}
			pos++;
		}
		return extraBytes;
	}

	/**
	 * Tells whether the whole of {@code bytes} is a <i>well-formed</i> UTF-8 byte
	 * sequence according to Unicode 6.0. This is stricter than asking whether the
	 * bytes can be decoded: "non-shortest form" sequences, which some JDK decoder
	 * versions accept but which encoding never produces, are rejected.
	 *
	 * <p>
	 * The result is {@code true} exactly when
	 * {@code Arrays.equals(bytes, new String(bytes, UTF_8).getBytes(UTF_8))} would
	 * be, but no intermediate string or array is created.
	 *
	 * @param bytes the bytes to validate
	 * @return {@code true} if {@code bytes} is well-formed UTF-8
	 */
	public static boolean isWellFormed(byte[] bytes) {
		return isWellFormed(bytes, 0, bytes.length);
	}

	/**
	 * Tells whether a slice of {@code bytes} is a well-formed UTF-8 byte sequence
	 * in the sense of {@link #isWellFormed(byte[])}. A slice may be ill-formed even
	 * though the full array is well-formed, for instance when a slice boundary
	 * falls inside a multi-byte sequence.
	 *
	 * <p>
	 * The slice bounds are handed to {@code checkPositionIndexes} for validation
	 * before any byte is read.
	 *
	 * @param bytes the input buffer
	 * @param off   the offset in the buffer of the first byte to read
	 * @param len   the number of bytes to read from the buffer
	 * @return {@code true} if the slice is well-formed UTF-8
	 */
	public static boolean isWellFormed(byte[] bytes, int off, int len) {
		final int limit = off + len;
		checkPositionIndexes(off, limit, bytes.length);

		// Fast path: advance over leading ASCII (non-negative bytes).
		int pos = off;
		while (pos < limit && bytes[pos] >= 0) {
			pos++;
		}

		// Stopping short of the limit means a non-ASCII byte was found; validate the
		// rest of the slice with the full state check.
		if (pos < limit) {
			return isWellFormedSlowPath(bytes, pos, limit);
		}
		return true;
	}

	/**
	 * Validates {@code bytes[off, end)} as UTF-8, handling multi-byte sequences.
	 * All comparisons operate on sign-extended byte values, so every byte in the
	 * range 0x80..0xFF is negative and the {@code (byte)} casts on the constants
	 * keep both sides in the same signed domain.
	 *
	 * @param bytes the input buffer
	 * @param off   index of the first byte to check
	 * @param end   index one past the last byte to check
	 * @return {@code true} if the range is well-formed UTF-8
	 */
	private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
		int cursor = off;
		while (cursor < end) {
			final int lead = bytes[cursor++];
			if (lead >= 0) {
				// Interior ASCII byte: nothing further to verify.
				continue;
			}

			if (lead < (byte) 0xE0) {
				// Two-byte form: one trailing byte must follow.
				if (cursor == end) {
					return false;
				}
				// Leads below 0xC2 are either trailing bytes in leading position
				// (0x80..0xBF) or overlong two-byte encodings (0xC0, 0xC1).
				if (lead < (byte) 0xC2) {
					return false;
				}
				if (bytes[cursor++] > (byte) 0xBF) {
					return false;
				}
			} else if (lead < (byte) 0xF0) {
				// Three-byte form: two trailing bytes must follow.
				if (cursor + 1 >= end) {
					return false;
				}
				final int second = bytes[cursor++];
				if (second > (byte) 0xBF) {
					return false;
				}
				// Overlong: the 5 most significant code point bits must not all be zero.
				if (lead == (byte) 0xE0 && second < (byte) 0xA0) {
					return false;
				}
				// Surrogate code points are not allowed in UTF-8.
				if (lead == (byte) 0xED && second >= (byte) 0xA0) {
					return false;
				}
				// Third byte must be a trailing byte.
				if (bytes[cursor++] > (byte) 0xBF) {
					return false;
				}
			} else {
				// Four-byte form: three trailing bytes must follow.
				if (cursor + 2 >= end) {
					return false;
				}
				final int second = bytes[cursor++];
				if (second > (byte) 0xBF) {
					return false;
				}
				// Require 1 <= plane <= 16. Compact equivalent of:
				//   lead > 0xF4
				//   || lead == 0xF0 && second < 0x90
				//   || lead == 0xF4 && second > 0x8F
				if ((((lead << 28) + (second - (byte) 0x90)) >> 30) != 0) {
					return false;
				}
				// Third byte must be a trailing byte.
				if (bytes[cursor++] > (byte) 0xBF) {
					return false;
				}
				// Fourth byte must be a trailing byte.
				if (bytes[cursor++] > (byte) 0xBF) {
					return false;
				}
			}
		}
		return true;
	}
}
Update #0 - First Release 2022-12-25 01:12:28 -08:00			`/*`
			`* Copyright (C) 2013 The Guava Authors`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except`
			`* in compliance with the License. You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software distributed under the License`
			`* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express`
			`* or implied. See the License for the specific language governing permissions and limitations under`
			`* the License.`
			`*/`

			`package com.google.common.base;`

			`import static com.google.common.base.Preconditions.checkPositionIndexes;`

			`import com.google.common.annotations.Beta;`
			`import com.google.common.annotations.GwtCompatible;`

			`/**`
			`* Low-level, high-performance utility methods related to the`
			`* {@linkplain Charsets#UTF_8 UTF-8} character encoding. UTF-8 is defined in`
			`* section D92 of`
			`* <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode`
			`* Standard Core Specification, Chapter 3</a>.`
			`*`
			`* <p>`
			`* The variant of UTF-8 implemented by this class is the restricted definition`
			`* of UTF-8 introduced in Unicode 3.1. One implication of this is that it`
			`* rejects`
			`* <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest`
			`* form"</a> byte sequences, even though the JDK decoder may accept them.`
			`*`
			`* @author Martin Buchholz`
			`* @author Clément Roux`
			`* @since 16.0`
			`*/`
			`@Beta`
			`@GwtCompatible`
			`public final class Utf8 {`
			`/**`
			`* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}.`
			`* For a string, this method is equivalent to`
			`* {@code string.getBytes(UTF_8).length}, but is more efficient in both time and`
			`* space.`
			`*`
			`* @throws IllegalArgumentException if {@code sequence} contains ill-formed`
			`* UTF-16 (unpaired surrogates)`
			`*/`
			`public static int encodedLength(CharSequence sequence) {`
			`// Warning to maintainers: this implementation is highly optimized.`
			`int utf16Length = sequence.length();`
			`int utf8Length = utf16Length;`
			`int i = 0;`

			`// This loop optimizes for pure ASCII.`
			`while (i < utf16Length && sequence.charAt(i) < 0x80) {`
			`i++;`
			`}`

			`// This loop optimizes for chars less than 0x800.`
			`for (; i < utf16Length; i++) {`
			`char c = sequence.charAt(i);`
			`if (c < 0x800) {`
			`utf8Length += ((0x7f - c) >>> 31); // branch free!`
			`} else {`
			`utf8Length += encodedLengthGeneral(sequence, i);`
			`break;`
			`}`
			`}`

			`if (utf8Length < utf16Length) {`
			`// Necessary and sufficient condition for overflow because of maximum 3x`
			`// expansion`
			`throw new IllegalArgumentException("UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));`
			`}`
			`return utf8Length;`
			`}`

			`private static int encodedLengthGeneral(CharSequence sequence, int start) {`
			`int utf16Length = sequence.length();`
			`int utf8Length = 0;`
			`for (int i = start; i < utf16Length; i++) {`
			`char c = sequence.charAt(i);`
			`if (c < 0x800) {`
			`utf8Length += (0x7f - c) >>> 31; // branch free!`
			`} else {`
			`utf8Length += 2;`
			`// jdk7+: if (Character.isSurrogate(c)) {`
			`if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {`
			`// Check that we have a well-formed surrogate pair.`
			`int cp = Character.codePointAt(sequence, i);`
			`if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {`
			`throw new IllegalArgumentException("Unpaired surrogate at index " + i);`
			`}`
			`i++;`
			`}`
			`}`
			`}`
			`return utf8Length;`
			`}`

			`/**`
			`* Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte`
			`* sequence according to Unicode 6.0. Note that this is a stronger criterion`
			`* than simply whether the bytes can be decoded. For example, some versions of`
			`* the JDK decoder will accept "non-shortest form" byte sequences, but encoding`
			`* never reproduces these. Such byte sequences are <i>not</i> considered`
			`* well-formed.`
			`*`
			`* <p>`
			`* This method returns {@code true} if and only if`
			`* {@code Arrays.equals(bytes, new`
			`* String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both`
			`* time and space.`
			`*/`
			`public static boolean isWellFormed(byte[] bytes) {`
			`return isWellFormed(bytes, 0, bytes.length);`
			`}`

			`/**`
			`* Returns whether the given byte array slice is a well-formed UTF-8 byte`
			`* sequence, as defined by {@link #isWellFormed(byte[])}. Note that this can be`
			`* false even when {@code`
			`* isWellFormed(bytes)} is true.`
			`*`
			`* @param bytes the input buffer`
			`* @param off the offset in the buffer of the first byte to read`
			`* @param len the number of bytes to read from the buffer`
			`*/`
			`public static boolean isWellFormed(byte[] bytes, int off, int len) {`
			`int end = off + len;`
			`checkPositionIndexes(off, end, bytes.length);`
			`// Look for the first non-ASCII character.`
			`for (int i = off; i < end; i++) {`
			`if (bytes[i] < 0) {`
			`return isWellFormedSlowPath(bytes, i, end);`
			`}`
			`}`
			`return true;`
			`}`

			`private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {`
			`int index = off;`
			`while (true) {`
			`int byte1;`

			`// Optimize for interior runs of ASCII bytes.`
			`do {`
			`if (index >= end) {`
			`return true;`
			`}`
			`} while ((byte1 = bytes[index++]) >= 0);`

			`if (byte1 < (byte) 0xE0) {`
			`// Two-byte form.`
			`if (index == end) {`
			`return false;`
			`}`
			`// Simultaneously check for illegal trailing-byte in leading position`
			`// and overlong 2-byte form.`
			`if (byte1 < (byte) 0xC2 \|\| bytes[index++] > (byte) 0xBF) {`
			`return false;`
			`}`
			`} else if (byte1 < (byte) 0xF0) {`
			`// Three-byte form.`
			`if (index + 1 >= end) {`
			`return false;`
			`}`
			`int byte2 = bytes[index++];`
			`if (byte2 > (byte) 0xBF`
			`// Overlong? 5 most significant bits must not all be zero.`
			`\|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)`
			`// Check for illegal surrogate codepoints.`
			`\|\| (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)`
			`// Third byte trailing-byte test.`
			`\|\| bytes[index++] > (byte) 0xBF) {`
			`return false;`
			`}`
			`} else {`
			`// Four-byte form.`
			`if (index + 2 >= end) {`
			`return false;`
			`}`
			`int byte2 = bytes[index++];`
			`if (byte2 > (byte) 0xBF`
			`// Check that 1 <= plane <= 16. Tricky optimized form of:`
			`// if (byte1 > (byte) 0xF4`
			`// \|\| byte1 == (byte) 0xF0 && byte2 < (byte) 0x90`
			`// \|\| byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)`
			`\|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0`
			`// Third byte trailing-byte test`
			`\|\| bytes[index++] > (byte) 0xBF`
			`// Fourth byte trailing-byte test`
			`\|\| bytes[index++] > (byte) 0xBF) {`
			`return false;`
			`}`
			`}`
			`}`
			`}`

			`private Utf8() {`
			`}`
			`}`