From 0e93eec575e6a163dd4726915e8cd3ab6be56c6f Mon Sep 17 00:00:00 2001
From: konsoletyper
Date: Sun, 2 Mar 2014 23:47:56 +0400
Subject: [PATCH] Adds unicode class recognition

---
 .../classlib/impl/unicode/UnicodeHelper.java  | 77 +++++++++++++++++++
 .../classlib/impl/unicode/UnicodeSupport.java | 51 +++++++++++++
 .../java/lang/CharacterNativeGenerator.java   | 29 +++++--
 .../teavm/classlib/java/lang/TCharacter.java  | 36 +++++++--
 .../classlib/java/lang/CharacterTest.java     | 11 +++
 5 files changed, 192 insertions(+), 12 deletions(-)

diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeHelper.java b/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeHelper.java
index 898777c74..3d487bf3e 100644
--- a/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeHelper.java
+++ b/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeHelper.java
@@ -15,6 +15,8 @@
  */
 package org.teavm.classlib.impl.unicode;
 
+import java.util.Arrays;
+
 /**
  *
  * @author Alexey Andreev
@@ -56,4 +58,79 @@ public class UnicodeHelper {
         }
         return data;
     }
+
+    /**
+     * Encodes a small non-negative value as a single character, counting up from {@code ' '}
+     * and skipping the double quote and the backslash so that the result can be written
+     * between double quotes without escaping.
+     */
+    public static char encodeByte(byte b) {
+        if (b < '\"' - ' ') {
+            return (char)(b + ' ');
+        } else if (b < '\\' - ' ' - 1) {
+            return (char)(b + ' ' + 1);
+        } else {
+            return (char)(b + ' ' + 2);
+        }
+    }
+
+    /**
+     * Reverses {@link #encodeByte(byte)}.
+     */
+    public static byte decodeByte(char c) {
+        if (c > '\\') {
+            return (byte)(c - ' ' - 2);
+        } else if (c > '"') {
+            return (byte)(c - ' ' - 1);
+        } else {
+            return (byte)(c - ' ');
+        }
+    }
+
+    /**
+     * Run-length encodes {@code bytes} into a string of {@link #encodeByte(byte)} characters.
+     * A run of 2 to 80 equal bytes becomes the pair {@code (value + 32, length)}; any other
+     * byte is written as is. Values must stay below 32 to be told apart from run markers.
+     */
+    public static String compressRle(byte[] bytes) {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < bytes.length; ++i) {
+            byte b = bytes[i];
+            if (i < bytes.length - 1 && b == bytes[i + 1]) {
+                int count = 1;
+                // Look ahead without moving i, so the byte that ends the run is not consumed.
+                while (count < 80 && i + count < bytes.length && bytes[i + count] == b) {
+                    ++count;
+                }
+                sb.append(UnicodeHelper.encodeByte((byte)(b + 32)));
+                sb.append(UnicodeHelper.encodeByte((byte)count));
+                i += count - 1;
+            } else {
+                sb.append(UnicodeHelper.encodeByte(bytes[i]));
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Expands a string produced by {@link #compressRle(byte[])} back into the original bytes.
+     */
+    public static byte[] extractRle(String encoded) {
+        byte[] data = new byte[encoded.length() + 16];
+        int index = 0;
+        for (int i = 0; i < encoded.length(); ++i) {
+            byte b = decodeByte(encoded.charAt(i));
+            // Markers are value + 32, so a run of zeros is exactly 32 and must count as a run.
+            boolean run = b >= 32;
+            int count = run ? decodeByte(encoded.charAt(++i)) : 1;
+            if (index + count > data.length) {
+                data = Arrays.copyOf(data, (index + count) * 2);
+            }
+            byte value = run ? (byte)(b - 32) : b;
+            while (count-- > 0) {
+                data[index++] = value;
+            }
+        }
+        return Arrays.copyOf(data, index);
+    }
 }
\ No newline at end of file
diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeSupport.java b/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeSupport.java
index 9a5ea3097..225d6d2ae 100644
--- a/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeSupport.java
+++ b/teavm-classlib/src/main/java/org/teavm/classlib/impl/unicode/UnicodeSupport.java
@@ -19,7 +19,9 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.atomic.AtomicBoolean;
 import org.teavm.common.IntegerArray;
@@ -32,9 +34,45 @@ public class UnicodeSupport {
     private static AtomicBoolean filled = new AtomicBoolean();
     private static volatile CountDownLatch latch = new CountDownLatch(1);
     private static int[] digitValues;
+    private static byte[] classes;
+    private static Map<String, Byte> classMap = new HashMap<>();
+
+    static {
+        classMap.put("Cn", Character.UNASSIGNED);
+        classMap.put("Lu", Character.UPPERCASE_LETTER);
+        classMap.put("Ll", Character.LOWERCASE_LETTER);
+        classMap.put("Lt", Character.TITLECASE_LETTER);
+        classMap.put("Lm", Character.MODIFIER_LETTER);
+        classMap.put("Lo", Character.OTHER_LETTER);
+        classMap.put("Mn", Character.NON_SPACING_MARK);
+        classMap.put("Me", Character.ENCLOSING_MARK);
+        classMap.put("Mc", Character.COMBINING_SPACING_MARK);
+        classMap.put("Nd", Character.DECIMAL_DIGIT_NUMBER);
+        classMap.put("Nl", Character.LETTER_NUMBER);
+        classMap.put("No", Character.OTHER_NUMBER);
+        classMap.put("Zs", Character.SPACE_SEPARATOR);
+        classMap.put("Zl", Character.LINE_SEPARATOR);
+        classMap.put("Zp", Character.PARAGRAPH_SEPARATOR);
+        classMap.put("Cc", Character.CONTROL);
+        classMap.put("Cf", Character.FORMAT);
+        classMap.put("Co", Character.PRIVATE_USE);
+        classMap.put("Cs", Character.SURROGATE);
+        classMap.put("Pd", Character.DASH_PUNCTUATION);
+        classMap.put("Ps", Character.START_PUNCTUATION);
+        classMap.put("Pe", Character.END_PUNCTUATION);
+        classMap.put("Pc", Character.CONNECTOR_PUNCTUATION);
+        classMap.put("Po", Character.OTHER_PUNCTUATION);
+        classMap.put("Sm", Character.MATH_SYMBOL);
+        classMap.put("Sc", Character.CURRENCY_SYMBOL);
+        classMap.put("Sk", Character.MODIFIER_SYMBOL);
+        classMap.put("So", Character.OTHER_SYMBOL);
+        classMap.put("Pi", Character.INITIAL_QUOTE_PUNCTUATION);
+        classMap.put("Pf", Character.FINAL_QUOTE_PUNCTUATION);
+    }
 
     private static void parseUnicodeData() {
         IntegerArray digitValues = new IntegerArray(4096);
+        IntegerArray classes = new IntegerArray(65536);
         try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
                 .getResourceAsStream("UnicodeData.txt")))) {
             while (true) {
@@ -47,11 +85,16 @@ public class UnicodeSupport {
                 }
                 String[] fields = splitLine(line);
                 int charCode = parseHex(fields[0]);
+                Byte charClass = classMap.get(fields[2]);
+                while (classes.size() < charCode) { // gap before a "<..., Last>" row belongs to that range
+                    classes.add(charClass != null && fields[1].endsWith(", Last>") ? charClass : 0);
+                }
                 if (!fields[6].isEmpty()) {
                     int digit = Integer.parseInt(fields[6]);
                     digitValues.add(charCode);
                     digitValues.add(digit);
                 }
+                classes.add(charClass != null ? charClass.intValue() : 0);
             }
         } catch (IOException e) {
             throw new RuntimeException("Error reading unicode data", e);
@@ -74,6 +117,10 @@ public class UnicodeSupport {
             letterDigitValues.add(i - '\uFF41' + 10);
         }
         UnicodeSupport.digitValues = mergePairs(digitValues.getAll(), letterDigitValues.getAll());
+        UnicodeSupport.classes = new byte[classes.size()];
+        for (int i = 0; i < classes.size(); ++i) {
+            UnicodeSupport.classes[i] = (byte)classes.get(i);
+        }
     }
 
     private static String[] splitLine(String line) {
@@ -150,4 +197,8 @@ public class UnicodeSupport {
         return digitValues;
     }
 
+    public static byte[] getClasses() {
+        ensureUnicodeData();
+        return classes;
+    }
 }
diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/CharacterNativeGenerator.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/CharacterNativeGenerator.java
index 149f5822d..e73338710 100644
--- a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/CharacterNativeGenerator.java
+++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/CharacterNativeGenerator.java
@@ -45,6 +45,9 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
             case "obtainDigitMapping":
                 generateObtainDigitMapping(writer);
                 break;
+            case "obtainClasses":
+                generateObtainClasses(writer);
+                break;
         }
     }
 
@@ -52,7 +55,8 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
     public void methodAchieved(DependencyChecker checker, MethodDependency method) {
         switch (method.getReference().getName()) {
             case "obtainDigitMapping":
-                achieveObtainDigitMapping(method);
+            case "obtainClasses":
+                method.getResult().propagate("java.lang.String");
                 break;
         }
     }
@@ -68,11 +72,26 @@ public class CharacterNativeGenerator implements Generator, DependencyPlugin {
     }
 
     private void generateObtainDigitMapping(SourceWriter writer) throws IOException {
-        writer.append("return $rt_str(\"").append(UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues()))
-                .append("\");").softNewLine();
+        String str = UnicodeHelper.encodeIntByte(UnicodeSupport.getDigitValues());
+        writer.append("return $rt_str(");
+        splitString(writer, str);
+        writer.append(");").softNewLine();
     }
 
-    private void achieveObtainDigitMapping(MethodDependency method) {
-        method.getResult().propagate("java.lang.String");
+    private void generateObtainClasses(SourceWriter writer) throws IOException {
+        String str = UnicodeHelper.compressRle(UnicodeSupport.getClasses());
+        writer.append("return $rt_str(");
+        splitString(writer, str);
+        writer.append(");").softNewLine();
+    }
+
+    private void splitString(SourceWriter writer, String str) throws IOException {
+        for (int i = 0; i < str.length(); i += 512) { // one quoted literal per 512 chars, joined by +
+            if (i > 0) {
+                writer.ws().append("+").newLine();
+            }
+            int j = Math.min(i + 512, str.length());
+            writer.append("\"").append(str.substring(i, j)).append("\"");
+        }
     }
 }
diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java
index bfe0e1845..be0b12d67 100644
--- a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java
+++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TCharacter.java
@@ -93,6 +93,7 @@ public class TCharacter extends TObject {
     public static final int SIZE = 16;
     static final int ERROR = 0xFFFFFFFF;
     private static int[] digitMapping;
+    private static byte[] classMapping;
     private char value;
     private static TCharacter[] characterCache = new TCharacter[128];
 
@@ -197,11 +198,11 @@ public class TCharacter extends TObject {
     }
 
     public static int codePointBefore(TCharSequence seq, int index) {
-        if (index == 0 || !isLowSurrogate(seq.charAt(index)) || !isHighSurrogate(seq.charAt(index - 1))) {
-            return seq.charAt(index);
-        } else {
-            return toCodePoint(seq.charAt(index - 1), seq.charAt(index));
+        if (index == 1 || !UTF16Helper.isLowSurrogate(seq.charAt(index - 1)) ||
+                !UTF16Helper.isHighSurrogate(seq.charAt(index - 2))) {
+            return seq.charAt(index - 1);
         }
+        return UTF16Helper.buildCodePoint(seq.charAt(index - 2), seq.charAt(index - 1));
     }
 
     public static int codePointBefore(char[] a, int index) {
@@ -209,10 +210,10 @@ public class TCharacter extends TObject {
     }
 
     public static int codePointBefore(char[] a, int index, int start) {
-        if (index <= start || !isLowSurrogate(a[index]) || !isHighSurrogate(a[index - 1])) {
-            return a[index];
+        if (index <= start + 1 || !isLowSurrogate(a[index - 1]) || !isHighSurrogate(a[index - 2])) {
+            return a[index - 1];
         } else {
-            return toCodePoint(a[index - 1], a[index]);
+            return toCodePoint(a[index - 2], a[index - 1]);
         }
     }
 
@@ -286,6 +287,17 @@ public class TCharacter extends TObject {
     @PluggableDependency(CharacterNativeGenerator.class)
     private static native String obtainDigitMapping();
 
+    private static byte[] getClasses() {
+        if (classMapping == null) {
+            classMapping = UnicodeHelper.extractRle(obtainClasses());
+        }
+        return classMapping;
+    }
+
+    @GeneratedBy(CharacterNativeGenerator.class)
+    @PluggableDependency(CharacterNativeGenerator.class)
+    private static native String obtainClasses();
+
     public static int toChars(int codePoint, char[] dst, int dstIndex) {
         if (codePoint >= UTF16Helper.SUPPLEMENTARY_PLANE) {
             dst[dstIndex] = UTF16Helper.highSurrogate(codePoint);
@@ -360,4 +372,14 @@ public class TCharacter extends TObject {
     public static boolean isISOControl(int codePoint) {
         return codePoint >= 0 && codePoint <= 0x1F || codePoint >= 0x7F && codePoint <= 0x9F;
     }
+
+    public static int getType(char c) {
+        return getType((int)c);
+    }
+
+    public static int getType(int codePoint) {
+        byte[] classes = getClasses();
+        // Anything outside the table has no UnicodeData entry; report 0 (UNASSIGNED) like the JDK.
+        return codePoint >= 0 && codePoint < classes.length ? classes[codePoint] : 0;
+    }
 }
diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/lang/CharacterTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/lang/CharacterTest.java
index 7598e9f44..9148ec65c 100644
--- a/teavm-classlib/src/test/java/org/teavm/classlib/java/lang/CharacterTest.java
+++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/lang/CharacterTest.java
@@ -30,4 +30,15 @@ public class CharacterTest {
         assertEquals(6, Character.digit('\u096C', 10));
         assertEquals(15, Character.digit('F', 16));
     }
+
+    @Test
+    public void classesRecognized() {
+        assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType('2'));
+        assertEquals(Character.UPPERCASE_LETTER, Character.getType('Q'));
+        assertEquals(Character.LOWERCASE_LETTER, Character.getType('w'));
+        assertEquals(Character.MATH_SYMBOL, Character.getType(0x21F7));
+        assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
+        assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
+        assertEquals(Character.OTHER_LETTER, Character.getType(0x4E2D));
+    }
 }