mirror of
https://github.com/Eaglercraft-TeaVM-Fork/eagler-teavm.git
synced 2025-01-08 16:04:10 -08:00
Adds unicode class recognition
This commit is contained in:
parent
d5601b263d
commit
0e93eec575
|
@ -15,6 +15,8 @@
|
||||||
*/
|
*/
|
||||||
package org.teavm.classlib.impl.unicode;
|
package org.teavm.classlib.impl.unicode;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @author Alexey Andreev
|
* @author Alexey Andreev
|
||||||
|
@ -56,4 +58,61 @@ public class UnicodeHelper {
|
||||||
}
|
}
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static char encodeByte(byte b) {
|
||||||
|
if (b < '\"' - ' ') {
|
||||||
|
return (char)(b + ' ');
|
||||||
|
} else if (b < '\\' - ' ') {
|
||||||
|
return (char)(b + ' ' + 1);
|
||||||
|
} else {
|
||||||
|
return (char)(b + ' ' + 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte decodeByte(char c) {
|
||||||
|
if (c > '\\') {
|
||||||
|
return (byte)(c - ' ' - 2);
|
||||||
|
} else if (c > '"') {
|
||||||
|
return (byte)(c - ' ' - 1);
|
||||||
|
} else {
|
||||||
|
return (byte)(c - ' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String compressRle(byte[] bytes) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < bytes.length; ++i) {
|
||||||
|
byte b = bytes[i];
|
||||||
|
if (i < bytes.length - 1 && b == bytes[i + 1]) {
|
||||||
|
int count = 0;
|
||||||
|
while (bytes[i++] == b && count < 80) {
|
||||||
|
++count;
|
||||||
|
}
|
||||||
|
sb.append(UnicodeHelper.encodeByte((byte)(b + 32)));
|
||||||
|
sb.append(UnicodeHelper.encodeByte((byte)count));
|
||||||
|
--i;
|
||||||
|
} else {
|
||||||
|
sb.append(UnicodeHelper.encodeByte(bytes[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte[] extractRle(String encoded) {
|
||||||
|
byte[] data = new byte[65536 * 4];
|
||||||
|
int index = 0;
|
||||||
|
for (int i = 0; i < encoded.length(); ++i) {
|
||||||
|
byte b = decodeByte(encoded.charAt(i));
|
||||||
|
if (b > 32) {
|
||||||
|
b -= 32;
|
||||||
|
byte count = decodeByte(encoded.charAt(++i));
|
||||||
|
while (count-- > 0) {
|
||||||
|
data[index++] = b;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
data[index++] = b;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Arrays.copyOf(data, index);
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -19,7 +19,9 @@ import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.concurrent.CountDownLatch;
|
import java.util.concurrent.CountDownLatch;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import org.teavm.common.IntegerArray;
|
import org.teavm.common.IntegerArray;
|
||||||
|
@ -32,9 +34,45 @@ public class UnicodeSupport {
|
||||||
private static AtomicBoolean filled = new AtomicBoolean();
|
private static AtomicBoolean filled = new AtomicBoolean();
|
||||||
private static volatile CountDownLatch latch = new CountDownLatch(1);
|
private static volatile CountDownLatch latch = new CountDownLatch(1);
|
||||||
private static int[] digitValues;
|
private static int[] digitValues;
|
||||||
|
private static byte[] classes;
|
||||||
|
private static Map<String, Byte> classMap = new HashMap<>();
|
||||||
|
|
||||||
|
static {
|
||||||
|
classMap.put("Cn", Character.UNASSIGNED);
|
||||||
|
classMap.put("Lu", Character.UPPERCASE_LETTER);
|
||||||
|
classMap.put("Ll", Character.LOWERCASE_LETTER);
|
||||||
|
classMap.put("Lt", Character.TITLECASE_LETTER);
|
||||||
|
classMap.put("Lm", Character.MODIFIER_LETTER);
|
||||||
|
classMap.put("Lo", Character.OTHER_LETTER);
|
||||||
|
classMap.put("Mn", Character.NON_SPACING_MARK);
|
||||||
|
classMap.put("Me", Character.ENCLOSING_MARK);
|
||||||
|
classMap.put("Mc", Character.COMBINING_SPACING_MARK);
|
||||||
|
classMap.put("Nd", Character.DECIMAL_DIGIT_NUMBER);
|
||||||
|
classMap.put("Nl", Character.LETTER_NUMBER);
|
||||||
|
classMap.put("No", Character.OTHER_NUMBER);
|
||||||
|
classMap.put("Zs", Character.SPACE_SEPARATOR);
|
||||||
|
classMap.put("Zl", Character.LINE_SEPARATOR);
|
||||||
|
classMap.put("Zp", Character.PARAGRAPH_SEPARATOR);
|
||||||
|
classMap.put("Cc", Character.CONTROL);
|
||||||
|
classMap.put("Cf", Character.FORMAT);
|
||||||
|
classMap.put("Co", Character.PRIVATE_USE);
|
||||||
|
classMap.put("Cs", Character.SURROGATE);
|
||||||
|
classMap.put("Pd", Character.DASH_PUNCTUATION);
|
||||||
|
classMap.put("Ps", Character.START_PUNCTUATION);
|
||||||
|
classMap.put("Pe", Character.END_PUNCTUATION);
|
||||||
|
classMap.put("Pc", Character.CONNECTOR_PUNCTUATION);
|
||||||
|
classMap.put("Po", Character.OTHER_PUNCTUATION);
|
||||||
|
classMap.put("Sm", Character.MATH_SYMBOL);
|
||||||
|
classMap.put("Sc", Character.CURRENCY_SYMBOL);
|
||||||
|
classMap.put("Sk", Character.MODIFIER_SYMBOL);
|
||||||
|
classMap.put("So", Character.OTHER_SYMBOL);
|
||||||
|
classMap.put("Pi", Character.INITIAL_QUOTE_PUNCTUATION);
|
||||||
|
classMap.put("Pf", Character.FINAL_QUOTE_PUNCTUATION);
|
||||||
|
}
|
||||||
|
|
||||||
private static void parseUnicodeData() {
|
private static void parseUnicodeData() {
|
||||||
IntegerArray digitValues = new IntegerArray(4096);
|
IntegerArray digitValues = new IntegerArray(4096);
|
||||||
|
IntegerArray classes = new IntegerArray(65536);
|
||||||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
|
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
|
||||||
.getResourceAsStream("UnicodeData.txt")))) {
|
.getResourceAsStream("UnicodeData.txt")))) {
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -47,11 +85,16 @@ public class UnicodeSupport {
|
||||||
}
|
}
|
||||||
String[] fields = splitLine(line);
|
String[] fields = splitLine(line);
|
||||||
int charCode = parseHex(fields[0]);
|
int charCode = parseHex(fields[0]);
|
||||||
|
while (classes.size() < charCode) {
|
||||||
|
classes.add(0);
|
||||||
|
}
|
||||||
if (!fields[6].isEmpty()) {
|
if (!fields[6].isEmpty()) {
|
||||||
int digit = Integer.parseInt(fields[6]);
|
int digit = Integer.parseInt(fields[6]);
|
||||||
digitValues.add(charCode);
|
digitValues.add(charCode);
|
||||||
digitValues.add(digit);
|
digitValues.add(digit);
|
||||||
}
|
}
|
||||||
|
Byte charClass = classMap.get(fields[2]);
|
||||||
|
classes.add(charClass != null ? charClass.intValue() : 0);
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException("Error reading unicode data", e);
|
throw new RuntimeException("Error reading unicode data", e);
|
||||||
|
@ -74,6 +117,10 @@ public class UnicodeSupport {
|
||||||
letterDigitValues.add(i - '\uFF41' + 10);
|
letterDigitValues.add(i - '\uFF41' + 10);
|
||||||
}
|
}
|
||||||
UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll());
|
UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll());
|
||||||
|
UnicodeSupport.classes = new byte[classes.size()];
|
||||||
|
for (int i = 0; i < classes.size(); ++i) {
|
||||||
|
UnicodeSupport.classes[i] = (byte)classes.get(i);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String[] splitLine(String line) {
|
private static String[] splitLine(String line) {
|
||||||
|
@ -150,4 +197,8 @@ public class UnicodeSupport {
|
||||||
return digitValues;
|
return digitValues;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static byte[] getClasses() {
|
||||||
|
ensureUnicodeData();
|
||||||
|
return classes;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,6 +45,9 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
|
||||||
case "obtainDigitMapping":
|
case "obtainDigitMapping":
|
||||||
generateObtainDigitMapping(writer);
|
generateObtainDigitMapping(writer);
|
||||||
break;
|
break;
|
||||||
|
case "obtainClasses":
|
||||||
|
generateObtainClasses(writer);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,7 +55,8 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
|
||||||
public void methodAchieved(DependencyChecker checker, MethodDependency method) {
|
public void methodAchieved(DependencyChecker checker, MethodDependency method) {
|
||||||
switch (method.getReference().getName()) {
|
switch (method.getReference().getName()) {
|
||||||
case "obtainDigitMapping":
|
case "obtainDigitMapping":
|
||||||
achieveObtainDigitMapping(method);
|
case "obtainClasses":
|
||||||
|
method.getResult().propagate("java.lang.String");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -68,11 +72,26 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void generateObtainDigitMapping(SourceWriter writer) throws IOException {
|
private void generateObtainDigitMapping(SourceWriter writer) throws IOException {
|
||||||
writer.append("return $rt_str(\"").append(UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues()))
|
String str = UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues());
|
||||||
.append("\");").softNewLine();
|
writer.append("return $rt_str(");
|
||||||
|
splitString(writer, str);
|
||||||
|
writer.append(");").softNewLine();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void achieveObtainDigitMapping(MethodDependency method) {
|
private void generateObtainClasses(SourceWriter writer) throws IOException {
|
||||||
method.getResult().propagate("java.lang.String");
|
String str = UnicodeHelper.compressRle(UnicodeSupport.getClasses());
|
||||||
|
writer.append("return $rt_str(");
|
||||||
|
splitString(writer, str);
|
||||||
|
writer.append(");").softNewLine();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void splitString(SourceWriter writer, String str) throws IOException {
|
||||||
|
for (int i = 0; i < str.length(); i += 512) {
|
||||||
|
if (i > 0) {
|
||||||
|
writer.ws().append("+").newLine();
|
||||||
|
}
|
||||||
|
int j = Math.min(i + 512, str.length());
|
||||||
|
writer.append("\"").append(str.substring(i, j)).append("\"");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -93,6 +93,7 @@ public class TCharacter extends TObject {
|
||||||
public static final int SIZE = 16;
|
public static final int SIZE = 16;
|
||||||
static final int ERROR = 0xFFFFFFFF;
|
static final int ERROR = 0xFFFFFFFF;
|
||||||
private static int[] digitMapping;
|
private static int[] digitMapping;
|
||||||
|
private static byte[] classMapping;
|
||||||
private char value;
|
private char value;
|
||||||
private static TCharacter[] characterCache = new TCharacter[128];
|
private static TCharacter[] characterCache = new TCharacter[128];
|
||||||
|
|
||||||
|
@ -197,11 +198,11 @@ public class TCharacter extends TObject {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int codePointBefore(TCharSequence seq, int index) {
|
public static int codePointBefore(TCharSequence seq, int index) {
|
||||||
if (index == 0 || !isLowSurrogate(seq.charAt(index)) || !isHighSurrogate(seq.charAt(index - 1))) {
|
if (index == 1 || !UTF16Helper.isLowSurrogate(seq.charAt(index - 2)) ||
|
||||||
return seq.charAt(index);
|
!UTF16Helper.isHighSurrogate(seq.charAt(index - 2))) {
|
||||||
} else {
|
return seq.charAt(index - 1);
|
||||||
return toCodePoint(seq.charAt(index - 1), seq.charAt(index));
|
|
||||||
}
|
}
|
||||||
|
return UTF16Helper.buildCodePoint(seq.charAt(index - 2), seq.charAt(index - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int codePointBefore(char[] a, int index) {
|
public static int codePointBefore(char[] a, int index) {
|
||||||
|
@ -209,10 +210,10 @@ public class TCharacter extends TObject {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int codePointBefore(char[] a, int index, int start) {
|
public static int codePointBefore(char[] a, int index, int start) {
|
||||||
if (index <= start || !isLowSurrogate(a[index]) || !isHighSurrogate(a[index - 1])) {
|
if (index <= start + 1 || !isLowSurrogate(a[index - 1]) || !isHighSurrogate(a[index - 2])) {
|
||||||
return a[index];
|
return a[index];
|
||||||
} else {
|
} else {
|
||||||
return toCodePoint(a[index - 1], a[index]);
|
return toCodePoint(a[index - 2], a[index - 1]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -286,6 +287,17 @@ public class TCharacter extends TObject {
|
||||||
@PluggableDependency(CharacterNativeGenerator.class)
|
@PluggableDependency(CharacterNativeGenerator.class)
|
||||||
private static native String obtainDigitMapping();
|
private static native String obtainDigitMapping();
|
||||||
|
|
||||||
|
private static byte[] getClasses() {
|
||||||
|
if (classMapping == null) {
|
||||||
|
classMapping = UnicodeHelper.extractRle(obtainClasses());
|
||||||
|
}
|
||||||
|
return classMapping;
|
||||||
|
}
|
||||||
|
|
||||||
|
@GeneratedBy(CharacterNativeGenerator.class)
|
||||||
|
@PluggableDependency(CharacterNativeGenerator.class)
|
||||||
|
private static native String obtainClasses();
|
||||||
|
|
||||||
public static int toChars(int codePoint, char[] dst, int dstIndex) {
|
public static int toChars(int codePoint, char[] dst, int dstIndex) {
|
||||||
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
|
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
|
||||||
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
|
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
|
||||||
|
@ -360,4 +372,12 @@ public class TCharacter extends TObject {
|
||||||
public static boolean isISOControl(int codePoint) {
|
public static boolean isISOControl(int codePoint) {
|
||||||
return codePoint >= 0 && codePoint <= 0x1F || codePoint >= 0x7F && codePoint <= 0x9F;
|
return codePoint >= 0 && codePoint <= 0x1F || codePoint >= 0x7F && codePoint <= 0x9F;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int getType(char c) {
|
||||||
|
return getType((int)c);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int getType(int codePoint) {
|
||||||
|
return getClasses()[codePoint];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,4 +30,14 @@ public class CharacterTest {
|
||||||
assertEquals(6, Character.digit('\u096C', 10));
|
assertEquals(6, Character.digit('\u096C', 10));
|
||||||
assertEquals(15, Character.digit('F', 16));
|
assertEquals(15, Character.digit('F', 16));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void classesRecognized() {
|
||||||
|
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType('2'));
|
||||||
|
assertEquals(Character.UPPERCASE_LETTER, Character.getType('Q'));
|
||||||
|
assertEquals(Character.LOWERCASE_LETTER, Character.getType('w'));
|
||||||
|
assertEquals(Character.MATH_SYMBOL, Character.getType(0x21F7));
|
||||||
|
assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
|
||||||
|
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user