Adds unicode class recognition

This commit is contained in:
konsoletyper 2014-03-02 23:47:56 +04:00
parent d5601b263d
commit 0e93eec575
5 changed files with 170 additions and 11 deletions

View File

@ -15,6 +15,8 @@
*/
package org.teavm.classlib.impl.unicode;
import java.util.Arrays;
/**
*
* @author Alexey Andreev
@ -56,4 +58,61 @@ public class UnicodeHelper {
}
return data;
}
/**
 * Maps a small non-negative value onto a printable character.
 *
 * <p>The value is shifted up by {@code ' '}; the double quote and the backslash are
 * skipped so that the produced character never needs escaping inside a
 * double-quoted string literal. {@link #decodeByte(char)} reverses the mapping.
 *
 * @param b value to encode; expected to be non-negative and small enough that the
 *          shifted result is still a printable character
 * @return the character representing {@code b}
 */
public static char encodeByte(byte b) {
    if (b < '\"' - ' ') {
        // Below the double quote: plain shift.
        return (char)(b + ' ');
    } else if (b < '\\' - ' ' - 1) {
        // Between the double quote and the backslash: one character has been skipped.
        // The extra "- 1" accounts for that skip; without it value 59 would land
        // exactly on the backslash.
        return (char)(b + ' ' + 1);
    } else {
        // At or past the backslash: both reserved characters have been skipped.
        return (char)(b + ' ' + 2);
    }
}
/**
 * Converts a character back to the small value it stands for.
 *
 * <p>The character is shifted down by {@code ' '}, minus one extra position for each
 * of the two reserved characters (double quote, backslash) that lie below it.
 *
 * @param c character to decode
 * @return the decoded value
 */
public static byte decodeByte(char c) {
    int reservedBelow = 0;
    if (c > '"') {
        ++reservedBelow;
    }
    if (c > '\\') {
        ++reservedBelow;
    }
    return (byte)(c - ' ' - reservedBelow);
}
/**
 * Run-length encodes a byte array into a string of printable characters.
 *
 * <p>A run of two or more equal bytes is written as two characters: the encoded
 * {@code value + 32} followed by the encoded run length (at most 80 per pair; longer
 * runs are split). Any other byte is written as a single encoded character. Because
 * the run marker is {@code value + 32}, the format can only be decoded unambiguously
 * when every input value is in the range 0..31.
 *
 * @param bytes values to compress
 * @return the encoded string; empty for an empty array
 */
public static String compressRle(byte[] bytes) {
    StringBuilder sb = new StringBuilder();
    int i = 0;
    while (i < bytes.length) {
        byte b = bytes[i];
        if (i < bytes.length - 1 && b == bytes[i + 1]) {
            // Measure the run without moving i, stopping at the end of the array
            // and at the per-pair length limit.
            int count = 0;
            while (count < 80 && i + count < bytes.length && bytes[i + count] == b) {
                ++count;
            }
            sb.append(UnicodeHelper.encodeByte((byte)(b + 32)));
            sb.append(UnicodeHelper.encodeByte((byte)count));
            // Resume exactly at the first byte that is not part of this pair.
            i += count;
        } else {
            sb.append(UnicodeHelper.encodeByte(b));
            ++i;
        }
    }
    return sb.toString();
}
/**
 * Decodes a string produced by {@code compressRle} back into the byte array.
 *
 * <p>Each character is decoded with {@link #decodeByte(char)}. A decoded value of 32
 * or more is a run marker: the run's value is the marker minus 32 and the next
 * character holds the run length. Any smaller value is a literal byte.
 *
 * @param encoded run-length encoded data
 * @return the decoded bytes, sized exactly to the decoded length
 */
public static byte[] extractRle(String encoded) {
    // Same starting capacity as before; the buffer now grows when the data is longer.
    byte[] data = new byte[65536 * 4];
    int index = 0;
    for (int i = 0; i < encoded.length(); ++i) {
        byte b = decodeByte(encoded.charAt(i));
        // ">=" rather than ">": a run of zeros is marked with exactly 32.
        if (b >= 32) {
            b -= 32;
            int count = decodeByte(encoded.charAt(++i));
            if (index + count > data.length) {
                int doubled = data.length * 2;
                int needed = index + count;
                data = Arrays.copyOf(data, doubled > needed ? doubled : needed);
            }
            while (count-- > 0) {
                data[index++] = b;
            }
        } else {
            if (index == data.length) {
                data = Arrays.copyOf(data, data.length * 2);
            }
            data[index++] = b;
        }
    }
    return Arrays.copyOf(data, index);
}
}

View File

@ -19,7 +19,9 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import org.teavm.common.IntegerArray;
@ -32,9 +34,45 @@ public class UnicodeSupport {
private static AtomicBoolean filled = new AtomicBoolean();
private static volatile CountDownLatch latch = new CountDownLatch(1);
private static int[] digitValues;
private static byte[] classes;
// Two-letter general category abbreviation -> java.lang.Character type constant.
private static Map<String, Byte> classMap = new HashMap<>();

static {
    // Letters
    classMap.put("Lu", Character.UPPERCASE_LETTER);
    classMap.put("Ll", Character.LOWERCASE_LETTER);
    classMap.put("Lt", Character.TITLECASE_LETTER);
    classMap.put("Lm", Character.MODIFIER_LETTER);
    classMap.put("Lo", Character.OTHER_LETTER);

    // Marks
    classMap.put("Mn", Character.NON_SPACING_MARK);
    classMap.put("Me", Character.ENCLOSING_MARK);
    classMap.put("Mc", Character.COMBINING_SPACING_MARK);

    // Numbers
    classMap.put("Nd", Character.DECIMAL_DIGIT_NUMBER);
    classMap.put("Nl", Character.LETTER_NUMBER);
    classMap.put("No", Character.OTHER_NUMBER);

    // Separators
    classMap.put("Zs", Character.SPACE_SEPARATOR);
    classMap.put("Zl", Character.LINE_SEPARATOR);
    classMap.put("Zp", Character.PARAGRAPH_SEPARATOR);

    // Other
    classMap.put("Cn", Character.UNASSIGNED);
    classMap.put("Cc", Character.CONTROL);
    classMap.put("Cf", Character.FORMAT);
    classMap.put("Co", Character.PRIVATE_USE);
    classMap.put("Cs", Character.SURROGATE);

    // Punctuation
    classMap.put("Pd", Character.DASH_PUNCTUATION);
    classMap.put("Ps", Character.START_PUNCTUATION);
    classMap.put("Pe", Character.END_PUNCTUATION);
    classMap.put("Pc", Character.CONNECTOR_PUNCTUATION);
    classMap.put("Po", Character.OTHER_PUNCTUATION);
    classMap.put("Pi", Character.INITIAL_QUOTE_PUNCTUATION);
    classMap.put("Pf", Character.FINAL_QUOTE_PUNCTUATION);

    // Symbols
    classMap.put("Sm", Character.MATH_SYMBOL);
    classMap.put("Sc", Character.CURRENCY_SYMBOL);
    classMap.put("Sk", Character.MODIFIER_SYMBOL);
    classMap.put("So", Character.OTHER_SYMBOL);
}
private static void parseUnicodeData() {
IntegerArray digitValues = new IntegerArray(4096);
IntegerArray classes = new IntegerArray(65536);
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
.getResourceAsStream("UnicodeData.txt")))) {
while (true) {
@ -47,11 +85,16 @@ public class UnicodeSupport {
}
String[] fields = splitLine(line);
int charCode = parseHex(fields[0]);
while (classes.size() < charCode) {
classes.add(0);
}
if (!fields[6].isEmpty()) {
int digit = Integer.parseInt(fields[6]);
digitValues.add(charCode);
digitValues.add(digit);
}
Byte charClass = classMap.get(fields[2]);
classes.add(charClass != null ? charClass.intValue() : 0);
}
} catch (IOException e) {
throw new RuntimeException("Error reading unicode data", e);
@ -74,6 +117,10 @@ public class UnicodeSupport {
letterDigitValues.add(i - '\uFF41' + 10);
}
UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll());
UnicodeSupport.classes = new byte[classes.size()];
for (int i = 0; i < classes.size(); ++i) {
UnicodeSupport.classes[i] = (byte)classes.get(i);
}
}
private static String[] splitLine(String line) {
@ -150,4 +197,8 @@ public class UnicodeSupport {
return digitValues;
}
/**
 * Returns the per-code-point class table, after {@code ensureUnicodeData()} has run.
 *
 * @return the shared {@code classes} array itself (not a copy)
 */
public static byte[] getClasses() {
    ensureUnicodeData();
    final byte[] table = classes;
    return table;
}
}

View File

@ -45,6 +45,9 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
case "obtainDigitMapping":
generateObtainDigitMapping(writer);
break;
case "obtainClasses":
generateObtainClasses(writer);
break;
}
}
@ -52,7 +55,8 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
// Dependency hook for the native methods generated by this class: dispatches on the
// method name and, for the names listed, propagates "java.lang.String" to the result
// (presumably declaring that the method can return a String — confirm against the
// dependency API).
// NOTE(review): this span comes from a diff view with the +/- markers lost, so removed
// and added lines appear side by side; confirm against the actual file before editing.
public void methodAchieved(DependencyChecker checker, MethodDependency method) {
switch (method.getReference().getName()) {
case "obtainDigitMapping":
// NOTE(review): looks like the pre-change line; as written there is no break after
// it, so this case falls through into "obtainClasses" — confirm.
achieveObtainDigitMapping(method);
case "obtainClasses":
method.getResult().propagate("java.lang.String");
break;
}
}
@ -68,11 +72,26 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
}
// Writes the body of obtainDigitMapping: a return statement passing the encoded digit
// table (UnicodeHelper.encodeIntByte over UnicodeSupport.getDigitValues()) to $rt_str.
// NOTE(review): this span comes from a diff view with the +/- markers lost. The first
// statement (single literal) looks like the pre-change body and the remaining
// statements (literal emitted through splitString) like its replacement — confirm
// against the actual file before editing.
private void generateObtainDigitMapping(SourceWriter writer) throws IOException {
writer.append("return $rt_str(\"").append(UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues()))
.append("\");").softNewLine();
String str = UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues());
writer.append("return $rt_str(");
// splitString emits the text as several concatenated quoted pieces.
splitString(writer, str);
writer.append(");").softNewLine();
}
private void achieveObtainDigitMapping(MethodDependency method) {
method.getResult().propagate("java.lang.String");
/**
 * Writes the body of {@code obtainClasses}: a return statement that passes the
 * run-length encoded class table to {@code $rt_str}.
 *
 * @param writer destination for the generated code
 * @throws IOException if the writer fails
 */
private void generateObtainClasses(SourceWriter writer) throws IOException {
    // Compress first so nothing is written if the table cannot be produced.
    final String compressedClasses = UnicodeHelper.compressRle(UnicodeSupport.getClasses());

    writer.append("return $rt_str(");
    splitString(writer, compressedClasses);
    writer.append(");").softNewLine();
}
/**
 * Writes {@code str} as a sequence of double-quoted pieces of at most 512 characters,
 * joined by {@code +} with a line break after each operator. Nothing is written for
 * an empty string.
 *
 * @param writer destination for the generated code
 * @param str text to emit; written verbatim between the quotes, without escaping
 * @throws IOException if the writer fails
 */
private void splitString(SourceWriter writer, String str) throws IOException {
    final int length = str.length();
    int start = 0;
    while (start < length) {
        if (start != 0) {
            // Every piece after the first is concatenated to the previous one.
            writer.ws().append("+").newLine();
        }
        int end = Math.min(start + 512, length);
        String piece = str.substring(start, end);
        writer.append("\"").append(piece).append("\"");
        start += 512;
    }
}
}

View File

@ -93,6 +93,7 @@ public class TCharacter extends TObject {
public static final int SIZE = 16;
static final int ERROR = 0xFFFFFFFF;
private static int[] digitMapping;
private static byte[] classMapping;
private char value;
private static TCharacter[] characterCache = new TCharacter[128];
@ -197,11 +198,11 @@ public class TCharacter extends TObject {
}
public static int codePointBefore(TCharSequence seq, int index) {
if (index == 0 || !isLowSurrogate(seq.charAt(index)) || !isHighSurrogate(seq.charAt(index - 1))) {
return seq.charAt(index);
} else {
return toCodePoint(seq.charAt(index - 1), seq.charAt(index));
if (index == 1 || !UTF16Helper.isLowSurrogate(seq.charAt(index - 2)) ||
!UTF16Helper.isHighSurrogate(seq.charAt(index - 2))) {
return seq.charAt(index - 1);
}
return UTF16Helper.buildCodePoint(seq.charAt(index - 2), seq.charAt(index - 1));
}
public static int codePointBefore(char[] a, int index) {
@ -209,10 +210,10 @@ public class TCharacter extends TObject {
}
public static int codePointBefore(char[] a, int index, int start) {
if (index <= start || !isLowSurrogate(a[index]) || !isHighSurrogate(a[index - 1])) {
if (index <= start + 1 || !isLowSurrogate(a[index - 1]) || !isHighSurrogate(a[index - 2])) {
return a[index];
} else {
return toCodePoint(a[index - 1], a[index]);
return toCodePoint(a[index - 2], a[index - 1]);
}
}
@ -286,6 +287,17 @@ public class TCharacter extends TObject {
@PluggableDependency(CharacterNativeGenerator.class)
private static native String obtainDigitMapping();
/**
 * Returns the class table, decoding it from {@code obtainClasses()} on first use and
 * caching the result in {@code classMapping}.
 */
private static byte[] getClasses() {
    if (classMapping != null) {
        return classMapping;
    }
    classMapping = UnicodeHelper.extractRle(obtainClasses());
    return classMapping;
}
// Native method with no Java body; the annotations name CharacterNativeGenerator as
// its generator and dependency plugin. The returned string is decoded by getClasses()
// with UnicodeHelper.extractRle.
@GeneratedBy(CharacterNativeGenerator.class)
@PluggableDependency(CharacterNativeGenerator.class)
private static native String obtainClasses();
public static int toChars(int codePoint, char[] dst, int dstIndex) {
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
@ -360,4 +372,12 @@ public class TCharacter extends TObject {
/**
 * Tells whether the code point lies in one of the two control ranges
 * 0x00..0x1F or 0x7F..0x9F.
 *
 * @param codePoint code point to test
 * @return {@code true} if it falls in either range
 */
public static boolean isISOControl(int codePoint) {
    if (codePoint >= 0 && codePoint <= 0x1F) {
        return true;
    }
    return codePoint >= 0x7F && codePoint <= 0x9F;
}
/**
 * Returns the class of a single char by delegating to the {@code int} overload.
 *
 * @param c char to classify
 * @return the class value stored for {@code c}
 */
public static int getType(char c) {
    int codePoint = c;
    return getType(codePoint);
}
/**
 * Returns the class of a code point as stored in the class table.
 *
 * @param codePoint code point to classify
 * @return the class value for {@code codePoint}, or 0 (the value of
 *         {@code java.lang.Character.UNASSIGNED}) when it lies outside the table
 */
public static int getType(int codePoint) {
    byte[] classes = getClasses();
    // The table ends at its last data entry, so negative values and trailing code
    // points have no slot; report them as unassigned instead of throwing.
    if (codePoint < 0 || codePoint >= classes.length) {
        return 0;
    }
    return classes[codePoint];
}
}

View File

@ -30,4 +30,14 @@ public class CharacterTest {
assertEquals(6, Character.digit('\u096C', 10));
assertEquals(15, Character.digit('F', 16));
}
// Checks Character.getType against the expected java.lang.Character class constants
// for a handful of sample characters.
@Test
public void classesRecognized() {
// ASCII samples passed as char literals: a digit, an upper-case and a lower-case letter.
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType('2'));
assertEquals(Character.UPPERCASE_LETTER, Character.getType('Q'));
assertEquals(Character.LOWERCASE_LETTER, Character.getType('w'));
// Code points passed as int, still below 0x10000.
assertEquals(Character.MATH_SYMBOL, Character.getType(0x21F7));
assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
// Code point above 0xFFFF, so the table must extend past the 16-bit range.
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
}
}