Adds unicode class recognition

This commit is contained in:
konsoletyper 2014-03-02 23:47:56 +04:00
parent d5601b263d
commit 0e93eec575
5 changed files with 170 additions and 11 deletions

View File

@ -15,6 +15,8 @@
*/ */
package org.teavm.classlib.impl.unicode; package org.teavm.classlib.impl.unicode;
import java.util.Arrays;
/** /**
* *
* @author Alexey Andreev * @author Alexey Andreev
@ -56,4 +58,61 @@ public class UnicodeHelper {
} }
return data; return data;
} }
/**
 * Encodes a small non-negative value as a single printable ASCII character
 * that can be placed verbatim inside a double-quoted string literal.
 *
 * Values are mapped onto consecutive characters starting at the space
 * character, skipping the double quote and the backslash, since either of
 * them would terminate or escape the surrounding literal.
 *
 * @param b value to encode; expected to be in the range 0..92 so that the
 *        result stays within printable ASCII
 * @return the character representing {@code b}; never a double quote or a backslash
 */
public static char encodeByte(byte b) {
    if (b < '\"' - ' ') {
        // Below the double quote: plain offset from the space character.
        return (char)(b + ' ');
    } else if (b < '\\' - ' ' - 1) {
        // Between the double quote and the backslash: one character was skipped.
        // The extra "- 1" accounts for that skip, so that the value which would
        // otherwise land exactly on the backslash falls into the last branch.
        return (char)(b + ' ' + 1);
    } else {
        // Past the backslash: two characters were skipped.
        return (char)(b + ' ' + 2);
    }
}
/**
 * Converts a character written by the byte encoder back to its numeric value.
 *
 * The value is the distance from the space character, reduced by one for
 * every reserved character (double quote, backslash) that lies below
 * {@code c}.
 *
 * @param c encoded character
 * @return the decoded value
 */
public static byte decodeByte(char c) {
    int offset = ' ';
    if (c > '"') {
        ++offset;
    }
    if (c > '\\') {
        ++offset;
    }
    return (byte)(c - offset);
}
/**
 * Compresses a byte array into a printable string using run-length encoding.
 *
 * A value that is not followed by an equal value is written as a single
 * encoded character. A run of two or more equal values is written as two
 * encoded characters: the value plus 32 (the run marker), then the run
 * length. Runs longer than 80 are split into several chunks.
 *
 * @param bytes values to compress; every value is expected to be in the
 *        range 0..31 so that a literal can be told apart from a run marker
 * @return the compressed representation
 */
public static String compressRle(byte[] bytes) {
    StringBuilder sb = new StringBuilder();
    int i = 0;
    while (i < bytes.length) {
        byte b = bytes[i];
        if (i < bytes.length - 1 && b == bytes[i + 1]) {
            int count = 0;
            // Advance only over elements that really belong to the run, and never
            // past the end of the array, so that the element following the run
            // is still processed on the next iteration.
            while (i < bytes.length && bytes[i] == b && count < 80) {
                ++count;
                ++i;
            }
            sb.append(UnicodeHelper.encodeByte((byte)(b + 32)));
            sb.append(UnicodeHelper.encodeByte((byte)count));
        } else {
            sb.append(UnicodeHelper.encodeByte(b));
            ++i;
        }
    }
    return sb.toString();
}
/**
 * Expands a run-length encoded string back into a byte array.
 *
 * Each decoded character below 32 is a literal value. A decoded character of
 * 32 or more is a run marker: the value is the marker minus 32 and the next
 * character holds the number of repetitions.
 *
 * @param encoded the compressed string
 * @return the expanded values
 * @throws IndexOutOfBoundsException if the string ends right after a run marker
 */
public static byte[] extractRle(String encoded) {
    byte[] data = new byte[65536 * 4];
    int index = 0;
    for (int i = 0; i < encoded.length(); ++i) {
        byte b = decodeByte(encoded.charAt(i));
        // The marker for a run of zeros is exactly 32, so the comparison must be inclusive.
        if (b >= 32) {
            b -= 32;
            byte count = decodeByte(encoded.charAt(++i));
            while (count-- > 0) {
                if (index == data.length) {
                    // The initial size is only a guess; grow instead of overflowing.
                    data = Arrays.copyOf(data, data.length * 2);
                }
                data[index++] = b;
            }
        } else {
            if (index == data.length) {
                data = Arrays.copyOf(data, data.length * 2);
            }
            data[index++] = b;
        }
    }
    return Arrays.copyOf(data, index);
}
} }

View File

@ -19,7 +19,9 @@ import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import org.teavm.common.IntegerArray; import org.teavm.common.IntegerArray;
@ -32,9 +34,45 @@ public class UnicodeSupport {
private static AtomicBoolean filled = new AtomicBoolean(); private static AtomicBoolean filled = new AtomicBoolean();
private static volatile CountDownLatch latch = new CountDownLatch(1); private static volatile CountDownLatch latch = new CountDownLatch(1);
private static int[] digitValues; private static int[] digitValues;
private static byte[] classes;
private static Map<String, Byte> classMap = new HashMap<>();
static {
classMap.put("Cn", Character.UNASSIGNED);
classMap.put("Lu", Character.UPPERCASE_LETTER);
classMap.put("Ll", Character.LOWERCASE_LETTER);
classMap.put("Lt", Character.TITLECASE_LETTER);
classMap.put("Lm", Character.MODIFIER_LETTER);
classMap.put("Lo", Character.OTHER_LETTER);
classMap.put("Mn", Character.NON_SPACING_MARK);
classMap.put("Me", Character.ENCLOSING_MARK);
classMap.put("Mc", Character.COMBINING_SPACING_MARK);
classMap.put("Nd", Character.DECIMAL_DIGIT_NUMBER);
classMap.put("Nl", Character.LETTER_NUMBER);
classMap.put("No", Character.OTHER_NUMBER);
classMap.put("Zs", Character.SPACE_SEPARATOR);
classMap.put("Zl", Character.LINE_SEPARATOR);
classMap.put("Zp", Character.PARAGRAPH_SEPARATOR);
classMap.put("Cc", Character.CONTROL);
classMap.put("Cf", Character.FORMAT);
classMap.put("Co", Character.PRIVATE_USE);
classMap.put("Cs", Character.SURROGATE);
classMap.put("Pd", Character.DASH_PUNCTUATION);
classMap.put("Ps", Character.START_PUNCTUATION);
classMap.put("Pe", Character.END_PUNCTUATION);
classMap.put("Pc", Character.CONNECTOR_PUNCTUATION);
classMap.put("Po", Character.OTHER_PUNCTUATION);
classMap.put("Sm", Character.MATH_SYMBOL);
classMap.put("Sc", Character.CURRENCY_SYMBOL);
classMap.put("Sk", Character.MODIFIER_SYMBOL);
classMap.put("So", Character.OTHER_SYMBOL);
classMap.put("Pi", Character.INITIAL_QUOTE_PUNCTUATION);
classMap.put("Pf", Character.FINAL_QUOTE_PUNCTUATION);
}
private static void parseUnicodeData() { private static void parseUnicodeData() {
IntegerArray digitValues = new IntegerArray(4096); IntegerArray digitValues = new IntegerArray(4096);
IntegerArray classes = new IntegerArray(65536);
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
.getResourceAsStream("UnicodeData.txt")))) { .getResourceAsStream("UnicodeData.txt")))) {
while (true) { while (true) {
@ -47,11 +85,16 @@ public class UnicodeSupport {
} }
String[] fields = splitLine(line); String[] fields = splitLine(line);
int charCode = parseHex(fields[0]); int charCode = parseHex(fields[0]);
while (classes.size() < charCode) {
classes.add(0);
}
if (!fields[6].isEmpty()) { if (!fields[6].isEmpty()) {
int digit = Integer.parseInt(fields[6]); int digit = Integer.parseInt(fields[6]);
digitValues.add(charCode); digitValues.add(charCode);
digitValues.add(digit); digitValues.add(digit);
} }
Byte charClass = classMap.get(fields[2]);
classes.add(charClass != null ? charClass.intValue() : 0);
} }
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("Error reading unicode data", e); throw new RuntimeException("Error reading unicode data", e);
@ -74,6 +117,10 @@ public class UnicodeSupport {
letterDigitValues.add(i - '\uFF41' + 10); letterDigitValues.add(i - '\uFF41' + 10);
} }
UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll()); UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll());
UnicodeSupport.classes = new byte[classes.size()];
for (int i = 0; i < classes.size(); ++i) {
UnicodeSupport.classes[i] = (byte)classes.get(i);
}
} }
private static String[] splitLine(String line) { private static String[] splitLine(String line) {
@ -150,4 +197,8 @@ public class UnicodeSupport {
return digitValues; return digitValues;
} }
/**
 * Returns the table of character classes, indexed by code point.
 *
 * Calls ensureUnicodeData() first (presumably to make sure the Unicode data
 * has been parsed - confirm against its definition). The internal array
 * itself is returned, not a copy, so callers must not modify it.
 */
public static byte[] getClasses() {
ensureUnicodeData();
return classes;
}
} }

View File

@ -45,6 +45,9 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
case "obtainDigitMapping": case "obtainDigitMapping":
generateObtainDigitMapping(writer); generateObtainDigitMapping(writer);
break; break;
case "obtainClasses":
generateObtainClasses(writer);
break;
} }
} }
@ -52,7 +55,8 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
public void methodAchieved(DependencyChecker checker, MethodDependency method) { public void methodAchieved(DependencyChecker checker, MethodDependency method) {
switch (method.getReference().getName()) { switch (method.getReference().getName()) {
case "obtainDigitMapping": case "obtainDigitMapping":
achieveObtainDigitMapping(method); case "obtainClasses":
method.getResult().propagate("java.lang.String");
break; break;
} }
} }
@ -68,11 +72,26 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
} }
private void generateObtainDigitMapping(SourceWriter writer) throws IOException { private void generateObtainDigitMapping(SourceWriter writer) throws IOException {
writer.append("return $rt_str(\"").append(UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues())) String str = UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues());
.append("\");").softNewLine(); writer.append("return $rt_str(");
splitString(writer, str);
writer.append(");").softNewLine();
} }
private void achieveObtainDigitMapping(MethodDependency method) { private void generateObtainClasses(SourceWriter writer) throws IOException {
method.getResult().propagate("java.lang.String"); String str = UnicodeHelper.compressRle(UnicodeSupport.getClasses());
writer.append("return $rt_str(");
splitString(writer, str);
writer.append(");").softNewLine();
}
private void splitString(SourceWriter writer, String str) throws IOException {
for (int i = 0; i < str.length(); i += 512) {
if (i > 0) {
writer.ws().append("+").newLine();
}
int j = Math.min(i + 512, str.length());
writer.append("\"").append(str.substring(i, j)).append("\"");
}
} }
} }

View File

@ -93,6 +93,7 @@ public class TCharacter extends TObject {
public static final int SIZE = 16; public static final int SIZE = 16;
static final int ERROR = 0xFFFFFFFF; static final int ERROR = 0xFFFFFFFF;
private static int[] digitMapping; private static int[] digitMapping;
private static byte[] classMapping;
private char value; private char value;
private static TCharacter[] characterCache = new TCharacter[128]; private static TCharacter[] characterCache = new TCharacter[128];
@ -197,11 +198,11 @@ public class TCharacter extends TObject {
} }
public static int codePointBefore(TCharSequence seq, int index) { public static int codePointBefore(TCharSequence seq, int index) {
if (index == 0 || !isLowSurrogate(seq.charAt(index)) || !isHighSurrogate(seq.charAt(index - 1))) { if (index == 1 || !UTF16Helper.isLowSurrogate(seq.charAt(index - 2)) ||
return seq.charAt(index); !UTF16Helper.isHighSurrogate(seq.charAt(index - 2))) {
} else { return seq.charAt(index - 1);
return toCodePoint(seq.charAt(index - 1), seq.charAt(index));
} }
return UTF16Helper.buildCodePoint(seq.charAt(index - 2), seq.charAt(index - 1));
} }
public static int codePointBefore(char[] a, int index) { public static int codePointBefore(char[] a, int index) {
@ -209,10 +210,10 @@ public class TCharacter extends TObject {
} }
public static int codePointBefore(char[] a, int index, int start) { public static int codePointBefore(char[] a, int index, int start) {
if (index <= start || !isLowSurrogate(a[index]) || !isHighSurrogate(a[index - 1])) { if (index <= start + 1 || !isLowSurrogate(a[index - 1]) || !isHighSurrogate(a[index - 2])) {
return a[index]; return a[index];
} else { } else {
return toCodePoint(a[index - 1], a[index]); return toCodePoint(a[index - 2], a[index - 1]);
} }
} }
@ -286,6 +287,17 @@ public class TCharacter extends TObject {
@PluggableDependency(CharacterNativeGenerator.class) @PluggableDependency(CharacterNativeGenerator.class)
private static native String obtainDigitMapping(); private static native String obtainDigitMapping();
/**
 * Returns the character class table, expanding it from its run-length
 * encoded form on first use and caching the result in classMapping.
 */
private static byte[] getClasses() {
    if (classMapping != null) {
        return classMapping;
    }
    classMapping = UnicodeHelper.extractRle(obtainClasses());
    return classMapping;
}
/**
 * Returns the run-length encoded character class table as a string.
 * The body is not written in Java; it is produced by CharacterNativeGenerator.
 */
@GeneratedBy(CharacterNativeGenerator.class)
@PluggableDependency(CharacterNativeGenerator.class)
private static native String obtainClasses();
public static int toChars(int codePoint, char[] dst, int dstIndex) { public static int toChars(int codePoint, char[] dst, int dstIndex) {
if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) { if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
dst[dstIndex] = UTF16Helper.highSurrogate(codePoint); dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
@ -360,4 +372,12 @@ public class TCharacter extends TObject {
public static boolean isISOControl(int codePoint) { public static boolean isISOControl(int codePoint) {
return codePoint >= 0 && codePoint <= 0x1F || codePoint >= 0x7F && codePoint <= 0x9F; return codePoint >= 0 && codePoint <= 0x1F || codePoint >= 0x7F && codePoint <= 0x9F;
} }
/**
 * Returns the general category of a UTF-16 code unit by delegating to the
 * code point overload.
 *
 * @param c character to classify
 * @return the category value for {@code c}
 */
public static int getType(char c) {
    int codePoint = c;
    return getType(codePoint);
}
/**
 * Returns the general category of a code point, looked up in the character
 * class table.
 *
 * @param codePoint code point to classify
 * @return the category value stored for {@code codePoint}, or 0 (the value
 *         used for unassigned characters) when the code point is negative
 *         or lies beyond the end of the table
 */
public static int getType(int codePoint) {
    byte[] classes = getClasses();
    if (codePoint < 0 || codePoint >= classes.length) {
        // Not covered by the table: report as unassigned instead of failing
        // with an array index error.
        return 0;
    }
    return classes[codePoint];
}
} }

View File

@ -30,4 +30,14 @@ public class CharacterTest {
assertEquals(6, Character.digit('\u096C', 10)); assertEquals(6, Character.digit('\u096C', 10));
assertEquals(15, Character.digit('F', 16)); assertEquals(15, Character.digit('F', 16));
} }
/**
 * Checks Character.getType for a few ASCII characters, two other BMP code
 * points and one supplementary code point (0x1D7D9).
 */
@Test
public void classesRecognized() {
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType('2'));
assertEquals(Character.UPPERCASE_LETTER, Character.getType('Q'));
assertEquals(Character.LOWERCASE_LETTER, Character.getType('w'));
assertEquals(Character.MATH_SYMBOL, Character.getType(0x21F7));
assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
}
} }