mirror of
https://github.com/Eaglercraft-TeaVM-Fork/eagler-teavm.git
synced 2025-01-08 07:54:11 -08:00
Adds unicode class recognition
This commit is contained in:
parent
d5601b263d
commit
0e93eec575
|
@ -15,6 +15,8 @@
|
|||
*/
|
||||
package org.teavm.classlib.impl.unicode;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Alexey Andreev
|
||||
|
@ -56,4 +58,61 @@ public class UnicodeHelper {
|
|||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
public static char encodeByte(byte b) {
|
||||
if (b < '\"' - ' ') {
|
||||
return (char)(b + ' ');
|
||||
} else if (b < '\\' - ' ') {
|
||||
return (char)(b + ' ' + 1);
|
||||
} else {
|
||||
return (char)(b + ' ' + 2);
|
||||
}
|
||||
}
|
||||
|
||||
public static byte decodeByte(char c) {
|
||||
if (c > '\\') {
|
||||
return (byte)(c - ' ' - 2);
|
||||
} else if (c > '"') {
|
||||
return (byte)(c - ' ' - 1);
|
||||
} else {
|
||||
return (byte)(c - ' ');
|
||||
}
|
||||
}
|
||||
|
||||
public static String compressRle(byte[] bytes) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < bytes.length; ++i) {
|
||||
byte b = bytes[i];
|
||||
if (i < bytes.length - 1 && b == bytes[i + 1]) {
|
||||
int count = 0;
|
||||
while (bytes[i++] == b && count < 80) {
|
||||
++count;
|
||||
}
|
||||
sb.append(UnicodeHelper.encodeByte((byte)(b + 32)));
|
||||
sb.append(UnicodeHelper.encodeByte((byte)count));
|
||||
--i;
|
||||
} else {
|
||||
sb.append(UnicodeHelper.encodeByte(bytes[i]));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static byte[] extractRle(String encoded) {
|
||||
byte[] data = new byte[65536 * 4];
|
||||
int index = 0;
|
||||
for (int i = 0; i < encoded.length(); ++i) {
|
||||
byte b = decodeByte(encoded.charAt(i));
|
||||
if (b > 32) {
|
||||
b -= 32;
|
||||
byte count = decodeByte(encoded.charAt(++i));
|
||||
while (count-- > 0) {
|
||||
data[index++] = b;
|
||||
}
|
||||
} else {
|
||||
data[index++] = b;
|
||||
}
|
||||
}
|
||||
return Arrays.copyOf(data, index);
|
||||
}
|
||||
}
|
|
@ -19,7 +19,9 @@ import java.io.BufferedReader;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import org.teavm.common.IntegerArray;
|
||||
|
@ -32,9 +34,45 @@ public class UnicodeSupport {
|
|||
private static AtomicBoolean filled = new AtomicBoolean();
|
||||
private static volatile CountDownLatch latch = new CountDownLatch(1);
|
||||
private static int[] digitValues;
|
||||
private static byte[] classes;
|
||||
private static Map<String, Byte> classMap = new HashMap<>();
|
||||
|
||||
// Maps the two-letter general category abbreviations found in the third
// field of UnicodeData.txt to the java.lang.Character type constants.
static {
    // Letters
    classMap.put("Lu", Character.UPPERCASE_LETTER);
    classMap.put("Ll", Character.LOWERCASE_LETTER);
    classMap.put("Lt", Character.TITLECASE_LETTER);
    classMap.put("Lm", Character.MODIFIER_LETTER);
    classMap.put("Lo", Character.OTHER_LETTER);

    // Marks
    classMap.put("Mn", Character.NON_SPACING_MARK);
    classMap.put("Mc", Character.COMBINING_SPACING_MARK);
    classMap.put("Me", Character.ENCLOSING_MARK);

    // Numbers
    classMap.put("Nd", Character.DECIMAL_DIGIT_NUMBER);
    classMap.put("Nl", Character.LETTER_NUMBER);
    classMap.put("No", Character.OTHER_NUMBER);

    // Punctuation
    classMap.put("Pc", Character.CONNECTOR_PUNCTUATION);
    classMap.put("Pd", Character.DASH_PUNCTUATION);
    classMap.put("Ps", Character.START_PUNCTUATION);
    classMap.put("Pe", Character.END_PUNCTUATION);
    classMap.put("Pi", Character.INITIAL_QUOTE_PUNCTUATION);
    classMap.put("Pf", Character.FINAL_QUOTE_PUNCTUATION);
    classMap.put("Po", Character.OTHER_PUNCTUATION);

    // Symbols
    classMap.put("Sm", Character.MATH_SYMBOL);
    classMap.put("Sc", Character.CURRENCY_SYMBOL);
    classMap.put("Sk", Character.MODIFIER_SYMBOL);
    classMap.put("So", Character.OTHER_SYMBOL);

    // Separators
    classMap.put("Zs", Character.SPACE_SEPARATOR);
    classMap.put("Zl", Character.LINE_SEPARATOR);
    classMap.put("Zp", Character.PARAGRAPH_SEPARATOR);

    // Other
    classMap.put("Cc", Character.CONTROL);
    classMap.put("Cf", Character.FORMAT);
    classMap.put("Cs", Character.SURROGATE);
    classMap.put("Co", Character.PRIVATE_USE);
    classMap.put("Cn", Character.UNASSIGNED);
}
|
||||
|
||||
private static void parseUnicodeData() {
|
||||
IntegerArray digitValues = new IntegerArray(4096);
|
||||
IntegerArray classes = new IntegerArray(65536);
|
||||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
|
||||
.getResourceAsStream("UnicodeData.txt")))) {
|
||||
while (true) {
|
||||
|
@ -47,11 +85,16 @@ public class UnicodeSupport {
|
|||
}
|
||||
String[] fields = splitLine(line);
|
||||
int charCode = parseHex(fields[0]);
|
||||
while (classes.size() < charCode) {
|
||||
classes.add(0);
|
||||
}
|
||||
if (!fields[6].isEmpty()) {
|
||||
int digit = Integer.parseInt(fields[6]);
|
||||
digitValues.add(charCode);
|
||||
digitValues.add(digit);
|
||||
}
|
||||
Byte charClass = classMap.get(fields[2]);
|
||||
classes.add(charClass != null ? charClass.intValue() : 0);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Error reading unicode data", e);
|
||||
|
@ -74,6 +117,10 @@ public class UnicodeSupport {
|
|||
letterDigitValues.add(i - '\uFF41' + 10);
|
||||
}
|
||||
UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll());
|
||||
UnicodeSupport.classes = new byte[classes.size()];
|
||||
for (int i = 0; i < classes.size(); ++i) {
|
||||
UnicodeSupport.classes[i] = (byte)classes.get(i);
|
||||
}
|
||||
}
|
||||
|
||||
private static String[] splitLine(String line) {
|
||||
|
@ -150,4 +197,8 @@ public class UnicodeSupport {
|
|||
return digitValues;
|
||||
}
|
||||
|
||||
public static byte[] getClasses() {
|
||||
ensureUnicodeData();
|
||||
return classes;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,6 +45,9 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
|
|||
case "obtainDigitMapping":
|
||||
generateObtainDigitMapping(writer);
|
||||
break;
|
||||
case "obtainClasses":
|
||||
generateObtainClasses(writer);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -52,7 +55,8 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
|
|||
public void methodAchieved(DependencyChecker checker, MethodDependency method) {
|
||||
switch (method.getReference().getName()) {
|
||||
case "obtainDigitMapping":
|
||||
achieveObtainDigitMapping(method);
|
||||
case "obtainClasses":
|
||||
method.getResult().propagate("java.lang.String");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -68,11 +72,26 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
|
|||
}
|
||||
|
||||
private void generateObtainDigitMapping(SourceWriter writer) throws IOException {
|
||||
writer.append("return $rt_str(\"").append(UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues()))
|
||||
.append("\");").softNewLine();
|
||||
String str = UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues());
|
||||
writer.append("return $rt_str(");
|
||||
splitString(writer, str);
|
||||
writer.append(");").softNewLine();
|
||||
}
|
||||
|
||||
private void achieveObtainDigitMapping(MethodDependency method) {
|
||||
method.getResult().propagate("java.lang.String");
|
||||
private void generateObtainClasses(SourceWriter writer) throws IOException {
|
||||
String str = UnicodeHelper.compressRle(UnicodeSupport.getClasses());
|
||||
writer.append("return $rt_str(");
|
||||
splitString(writer, str);
|
||||
writer.append(");").softNewLine();
|
||||
}
|
||||
|
||||
private void splitString(SourceWriter writer, String str) throws IOException {
|
||||
for (int i = 0; i < str.length(); i += 512) {
|
||||
if (i > 0) {
|
||||
writer.ws().append("+").newLine();
|
||||
}
|
||||
int j = Math.min(i + 512, str.length());
|
||||
writer.append("\"").append(str.substring(i, j)).append("\"");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -93,6 +93,7 @@ public class TCharacter extends TObject {
|
|||
public static final int SIZE = 16;
|
||||
static final int ERROR = 0xFFFFFFFF;
|
||||
private static int[] digitMapping;
|
||||
private static byte[] classMapping;
|
||||
private char value;
|
||||
private static TCharacter[] characterCache = new TCharacter[128];
|
||||
|
||||
|
@ -197,11 +198,11 @@ public class TCharacter extends TObject {
|
|||
}
|
||||
|
||||
public static int codePointBefore(TCharSequence seq, int index) {
|
||||
if (index == 0 || !isLowSurrogate(seq.charAt(index)) || !isHighSurrogate(seq.charAt(index - 1))) {
|
||||
return seq.charAt(index);
|
||||
} else {
|
||||
return toCodePoint(seq.charAt(index - 1), seq.charAt(index));
|
||||
if (index == 1 || !UTF16Helper.isLowSurrogate(seq.charAt(index - 2)) ||
|
||||
!UTF16Helper.isHighSurrogate(seq.charAt(index - 2))) {
|
||||
return seq.charAt(index - 1);
|
||||
}
|
||||
return UTF16Helper.buildCodePoint(seq.charAt(index - 2), seq.charAt(index - 1));
|
||||
}
|
||||
|
||||
public static int codePointBefore(char[] a, int index) {
|
||||
|
@ -209,10 +210,10 @@ public class TCharacter extends TObject {
|
|||
}
|
||||
|
||||
public static int codePointBefore(char[] a, int index, int start) {
|
||||
if (index <= start || !isLowSurrogate(a[index]) || !isHighSurrogate(a[index - 1])) {
|
||||
if (index <= start + 1 || !isLowSurrogate(a[index - 1]) || !isHighSurrogate(a[index - 2])) {
|
||||
return a[index];
|
||||
} else {
|
||||
return toCodePoint(a[index - 1], a[index]);
|
||||
return toCodePoint(a[index - 2], a[index - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -286,6 +287,17 @@ public class TCharacter extends TObject {
|
|||
@PluggableDependency(CharacterNativeGenerator.class)
|
||||
private static native String obtainDigitMapping();
|
||||
|
||||
private static byte[] getClasses() {
|
||||
if (classMapping == null) {
|
||||
classMapping = UnicodeHelper.extractRle(obtainClasses());
|
||||
}
|
||||
return classMapping;
|
||||
}
|
||||
|
||||
/**
 * Native method whose body is produced by {@code CharacterNativeGenerator}.
 * The generated code returns the character class table as an RLE-compressed
 * string; {@code getClasses()} decodes it with {@code UnicodeHelper.extractRle}.
 */
@GeneratedBy(CharacterNativeGenerator.class)
|
||||
@PluggableDependency(CharacterNativeGenerator.class)
|
||||
private static native String obtainClasses();
|
||||
|
||||
public static int toChars(int codePoint, char[] dst, int dstIndex) {
|
||||
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
|
||||
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
|
||||
|
@ -360,4 +372,12 @@ public class TCharacter extends TObject {
|
|||
public static boolean isISOControl(int codePoint) {
|
||||
return codePoint >= 0 && codePoint <= 0x1F || codePoint >= 0x7F && codePoint <= 0x9F;
|
||||
}
|
||||
|
||||
public static int getType(char c) {
|
||||
return getType((int)c);
|
||||
}
|
||||
|
||||
public static int getType(int codePoint) {
|
||||
return getClasses()[codePoint];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,4 +30,14 @@ public class CharacterTest {
|
|||
assertEquals(6, Character.digit('\u096C', 10));
|
||||
assertEquals(15, Character.digit('F', 16));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void classesRecognized() {
|
||||
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType('2'));
|
||||
assertEquals(Character.UPPERCASE_LETTER, Character.getType('Q'));
|
||||
assertEquals(Character.LOWERCASE_LETTER, Character.getType('w'));
|
||||
assertEquals(Character.MATH_SYMBOL, Character.getType(0x21F7));
|
||||
assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
|
||||
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user