classlib: use built-in unicode table to implement tolower/toupper methods.

This makes it possible to use TeaVM in environments where these functions aren't natively available,
for example, WASI.
This commit is contained in:
Alexey Andreev 2022-11-09 12:08:17 +01:00
parent a49f47dac7
commit 292aa21aef
6 changed files with 166 additions and 45 deletions

View File

@ -237,6 +237,10 @@ public class JCLPlugin implements TeaVMPlugin {
new CharacterMetadataGenerator());
reg.register(new MethodReference(Character.class, "acquireTitleCaseMapping", StringResource.class),
new CharacterMetadataGenerator());
reg.register(new MethodReference(Character.class, "acquireUpperCaseMapping", StringResource.class),
new CharacterMetadataGenerator());
reg.register(new MethodReference(Character.class, "acquireLowerCaseMapping", StringResource.class),
new CharacterMetadataGenerator());
}
@PlatformMarker

View File

@ -87,6 +87,20 @@ public final class UnicodeHelper {
return sb.toString();
}
/**
 * Serializes a flat table of integer pairs into a compact string.
 *
 * <p>The output starts with the number of pairs ({@code data.length / 2}). For every pair, the
 * first element is written as an unsigned difference from the first element of the previous
 * pair (the first pair is written relative to zero), and the second element is written with the
 * signed encoder. A trailing unpaired element, if any, is ignored.
 *
 * <p>NOTE(review): the unsigned difference presumably requires the first elements to be in
 * non-decreasing order; confirm against {@code Base46.encodeUnsigned}.
 *
 * @param data flat array laid out as {@code [key0, value0, key1, value1, ...]}
 * @return the encoded text
 */
public static String encodeCaseMapping(int[] data) {
    int pairCount = data.length / 2;
    StringBuilder out = new StringBuilder();
    Base46.encodeUnsigned(out, pairCount);

    int previousKey = 0;
    for (int pair = 0, offset = 0; pair < pairCount; pair++, offset += 2) {
        int key = data[offset];
        Base46.encodeUnsigned(out, key - previousKey);
        previousKey = key;
        Base46.encode(out, data[offset + 1]);
    }
    return out.toString();
}
public static int[] decodeIntDiff(String text) {
CharFlow flow = new CharFlow(text.toCharArray());
int sz = Base46.decodeUnsigned(flow);
@ -99,6 +113,19 @@ public final class UnicodeHelper {
return data;
}
/**
 * Reads a flat table of integer pairs from its textual form.
 *
 * <p>The text is expected to start with the pair count. Each pair then consists of an unsigned
 * increment, which is accumulated to produce the first element, followed by a signed value that
 * becomes the second element.
 *
 * @param text encoded table
 * @return flat array laid out as {@code [key0, value0, key1, value1, ...]}
 */
public static int[] decodeCaseMapping(String text) {
    CharFlow input = new CharFlow(text.toCharArray());
    int pairCount = Base46.decodeUnsigned(input);
    int[] table = new int[pairCount * 2];

    int key = 0;
    for (int pair = 0, offset = 0; pair < pairCount; pair++, offset += 2) {
        // Keys are stored as increments, so keep a running total.
        key += Base46.decodeUnsigned(input);
        table[offset] = key;
        table[offset + 1] = Base46.decode(input);
    }
    return table;
}
public static char encodeByte(byte b) {
if (b < '\"' - ' ') {
return (char) (b + ' ');

View File

@ -32,6 +32,8 @@ public final class UnicodeSupport {
private static int[] digitValues;
private static byte[] classes;
private static int[] titleCaseMapping;
private static int[] upperCaseMapping;
private static int[] lowerCaseMapping;
private static Map<String, Byte> classMap = new HashMap<>();
static {
@ -74,6 +76,8 @@ public final class UnicodeSupport {
IntegerArray digitValues = new IntegerArray(4096);
IntegerArray classes = new IntegerArray(65536);
IntegerArray titleCaseMapping = new IntegerArray(256);
IntegerArray upperCaseMapping = new IntegerArray(256);
IntegerArray lowerCaseMapping = new IntegerArray(256);
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
.getResourceAsStream("UnicodeData.txt")))) {
while (true) {
@ -97,13 +101,15 @@ public final class UnicodeSupport {
Byte charClass = classMap.get(fields[2]);
classes.add(charClass != null ? charClass.intValue() : 0);
if (!fields[14].isEmpty()) {
int titleCaseCode = parseHex(fields[14]);
if (fields[12].isEmpty() || parseHex(fields[12]) != titleCaseCode) {
titleCaseMapping.add(charCode);
titleCaseMapping.add(titleCaseCode);
}
int upperCaseCode = !fields[12].isEmpty() ? parseHex(fields[12]) : charCode;
encodeCaseMapping(upperCaseMapping, charCode, upperCaseCode);
int lowerCaseCode = !fields[13].isEmpty() ? parseHex(fields[13]) : charCode;
encodeCaseMapping(lowerCaseMapping, charCode, lowerCaseCode);
int titleCaseCode = !fields[14].isEmpty() ? parseHex(fields[14]) : charCode;
if (titleCaseCode == upperCaseCode) {
titleCaseCode = charCode;
}
encodeCaseMapping(titleCaseMapping, charCode, titleCaseCode);
}
} catch (IOException e) {
throw new RuntimeException("Error reading unicode data", e);
@ -131,6 +137,16 @@ public final class UnicodeSupport {
UnicodeSupport.classes[i] = (byte) classes.get(i);
}
UnicodeSupport.titleCaseMapping = titleCaseMapping.getAll();
UnicodeSupport.upperCaseMapping = upperCaseMapping.getAll();
UnicodeSupport.lowerCaseMapping = lowerCaseMapping.getAll();
}
/**
 * Appends a {@code (codePoint, offset)} pair to a case-mapping table unless the offset equals
 * the one stored by the most recent pair.
 *
 * <p>The offset is {@code mappedCodePoint - codePoint}. Skipping repeated offsets means each
 * stored pair marks the first code point of a run that shares one offset.
 *
 * @param array table being built; receives zero or two new elements
 * @param codePoint source code point
 * @param mappedCodePoint code point that {@code codePoint} maps to
 */
private static void encodeCaseMapping(IntegerArray array, int codePoint, int mappedCodePoint) {
    int delta = mappedCodePoint - codePoint;
    int length = array.size();
    boolean continuesRun = length != 0 && array.get(length - 1) == delta;
    if (continuesRun) {
        return;
    }
    array.add(codePoint);
    array.add(delta);
}
private static String[] splitLine(String line) {
@ -216,4 +232,14 @@ public final class UnicodeSupport {
ensureUnicodeData();
return titleCaseMapping;
}
/**
 * Returns the upper-case mapping table, calling {@code ensureUnicodeData()} first.
 *
 * <p>The array held in the static field is returned directly, not a copy.
 *
 * @return the upper-case mapping table
 */
public static int[] getUpperCaseMapping() {
    ensureUnicodeData();
    return UnicodeSupport.upperCaseMapping;
}
/**
 * Returns the lower-case mapping table, calling {@code ensureUnicodeData()} first.
 *
 * <p>The array held in the static field is returned directly, not a copy.
 *
 * @return the lower-case mapping table
 */
public static int[] getLowerCaseMapping() {
    ensureUnicodeData();
    return UnicodeSupport.lowerCaseMapping;
}
}

View File

@ -32,7 +32,11 @@ public class CharacterMetadataGenerator implements MetadataGenerator {
case "obtainClasses":
return generateObtainClasses(context);
case "acquireTitleCaseMapping":
return generateObtainTitleCaseMapping(context);
return generateAcquireTitleCaseMapping(context);
case "acquireUpperCaseMapping":
return generateAcquireUpperCaseMapping(context);
case "acquireLowerCaseMapping":
return generateAcquireLowerCaseMapping(context);
default:
return null;
}
@ -50,9 +54,21 @@ public class CharacterMetadataGenerator implements MetadataGenerator {
return res;
}
private Resource generateObtainTitleCaseMapping(MetadataGeneratorContext context) {
private Resource generateAcquireTitleCaseMapping(MetadataGeneratorContext context) {
StringResource res = context.createResource(StringResource.class);
res.setValue(UnicodeHelper.encodeIntDiff(UnicodeSupport.getTitleCaseMapping()));
res.setValue(UnicodeHelper.encodeCaseMapping(UnicodeSupport.getTitleCaseMapping()));
return res;
}
/**
 * Builds the string resource that carries the encoded upper-case mapping table.
 *
 * @param context metadata generator context used to create the resource
 * @return a {@link StringResource} whose value is the encoded table
 */
private Resource generateAcquireUpperCaseMapping(MetadataGeneratorContext context) {
    StringResource resource = context.createResource(StringResource.class);
    int[] table = UnicodeSupport.getUpperCaseMapping();
    String encoded = UnicodeHelper.encodeCaseMapping(table);
    resource.setValue(encoded);
    return resource;
}
/**
 * Builds the string resource that carries the encoded lower-case mapping table.
 *
 * @param context metadata generator context used to create the resource
 * @return a {@link StringResource} whose value is the encoded table
 */
private Resource generateAcquireLowerCaseMapping(MetadataGeneratorContext context) {
    StringResource resource = context.createResource(StringResource.class);
    int[] table = UnicodeSupport.getLowerCaseMapping();
    String encoded = UnicodeHelper.encodeCaseMapping(table);
    resource.setValue(encoded);
    return resource;
}
}

View File

@ -16,11 +16,6 @@
package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.unicode.UnicodeHelper;
import org.teavm.interop.DelegateTo;
import org.teavm.interop.Import;
import org.teavm.interop.Unmanaged;
import org.teavm.interop.c.Include;
import org.teavm.platform.Platform;
import org.teavm.platform.metadata.StringResource;
public class TCharacter extends TObject implements TComparable<TCharacter> {
@ -92,6 +87,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
static final int ERROR = 0xFFFFFFFF;
private static int[] digitMapping;
private static int[] titleCaseMapping;
private static int[] upperCaseMapping;
private static int[] lowerCaseMapping;
private static UnicodeHelper.Range[] classMapping;
private char value;
private static TCharacter[] characterCache = new TCharacter[128];
@ -235,54 +232,43 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
return (char) toLowerCase((int) ch);
}
@DelegateTo("toLowerCaseLowLevel")
public static int toLowerCase(int ch) {
return Platform.stringFromCharCode(ch).toLowerCase().charCodeAt(0);
return mapChar(getLowerCaseMapping(), ch);
}
private static int toLowerCaseLowLevel(int codePoint) {
return toLowerCaseSystem(codePoint);
/**
 * Returns the decoded lower-case mapping table, decoding it on first use and caching the result
 * in {@code lowerCaseMapping}.
 *
 * @return the cached lower-case mapping table
 */
private static int[] getLowerCaseMapping() {
    if (lowerCaseMapping != null) {
        return lowerCaseMapping;
    }
    String encoded = acquireLowerCaseMapping().getValue();
    lowerCaseMapping = UnicodeHelper.decodeCaseMapping(encoded);
    return lowerCaseMapping;
}
@Import(module = "teavm", name = "towlower")
@Include("wctype.h")
@Unmanaged
private static native int toLowerCaseSystem(int codePoint);
private static native StringResource acquireLowerCaseMapping();
/**
 * Converts a {@code char} to upper case by delegating to {@link #toUpperCase(int)} and narrowing
 * the result back to {@code char}.
 *
 * @param ch character to convert
 * @return the result of the code point conversion, cast to {@code char}
 */
public static char toUpperCase(char ch) {
    int upper = toUpperCase((int) ch);
    return (char) upper;
}
@DelegateTo("toUpperCaseLowLevel")
public static int toUpperCase(int codePoint) {
return Platform.stringFromCharCode(codePoint).toUpperCase().charCodeAt(0);
return mapChar(getUpperCaseMapping(), codePoint);
}
private static int toUpperCaseLowLevel(int codePoint) {
return toUpperCaseSystem(codePoint);
/**
 * Returns the decoded upper-case mapping table, decoding it on first use and caching the result
 * in {@code upperCaseMapping}.
 *
 * @return the cached upper-case mapping table
 */
private static int[] getUpperCaseMapping() {
    if (upperCaseMapping != null) {
        return upperCaseMapping;
    }
    String encoded = acquireUpperCaseMapping().getValue();
    upperCaseMapping = UnicodeHelper.decodeCaseMapping(encoded);
    return upperCaseMapping;
}
@Import(module = "teavm", name = "towupper")
@Include("wctype.h")
@Unmanaged
private static native int toUpperCaseSystem(int codePoint);
private static native StringResource acquireUpperCaseMapping();
public static int toTitleCase(int codePoint) {
int[] mapping = getTitleCaseMapping();
int l = 0;
int u = (mapping.length / 2) - 1;
while (u >= l) {
int idx = (l + u) / 2;
int val = mapping[idx * 2];
if (codePoint > val) {
l = idx + 1;
} else if (codePoint < val) {
u = idx - 1;
} else {
return mapping[idx * 2 + 1];
}
codePoint = mapChar(getTitleCaseMapping(), codePoint);
if (codePoint == codePoint) {
codePoint = toUpperCase(codePoint);
}
return toUpperCase(codePoint);
return codePoint;
}
public static char toTitleCase(char c) {
@ -291,13 +277,43 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
private static int[] getTitleCaseMapping() {
if (titleCaseMapping == null) {
titleCaseMapping = UnicodeHelper.decodeIntDiff(acquireTitleCaseMapping().getValue());
titleCaseMapping = UnicodeHelper.decodeCaseMapping(acquireTitleCaseMapping().getValue());
}
return titleCaseMapping;
}
private static native StringResource acquireTitleCaseMapping();
/**
 * Applies a case-mapping table to a code point.
 *
 * <p>The table is a flat array of {@code (startCodePoint, offset)} pairs. The pair with the
 * greatest start not exceeding {@code codePoint} is selected and its offset is added.
 *
 * @param table flat mapping table
 * @param codePoint code point to map
 * @return the mapped code point, or {@code codePoint} itself when the table is empty or no pair
 *         covers it
 */
private static int mapChar(int[] table, int codePoint) {
    // An empty table maps nothing; return early so the search never indexes into it.
    if (table.length < 2) {
        return codePoint;
    }
    int index = binarySearchTable(table, codePoint);
    if (index < 0 || index >= table.length / 2) {
        // No covering range: a character without a mapping maps to itself, not to U+0000.
        return codePoint;
    }
    return codePoint + table[index * 2 + 1];
}
/**
 * Finds the pair whose key is the greatest one not exceeding {@code key}.
 *
 * <p>{@code data} is a flat array of pairs; the key of pair {@code i} is {@code data[i * 2]}.
 * Keys must be sorted in ascending order.
 *
 * @param data flat array of {@code (key, value)} pairs
 * @param key value to look up
 * @return the pair index (not the array offset) of the match, or {@code -1} when the table is
 *         empty or every key is greater than {@code key}
 */
private static int binarySearchTable(int[] data, int key) {
    int l = 0;
    int u = data.length / 2 - 1;
    // Invariant: every pair before l has a key below `key`, every pair after u has a key above.
    // An empty table starts with u < l, so the loop is skipped instead of reading data[0].
    while (l <= u) {
        int i = (l + u) >>> 1;
        int e = data[i * 2];
        if (e == key) {
            return i;
        } else if (e > key) {
            u = i - 1;
        } else {
            l = i + 1;
        }
    }
    // l - 1 is the last pair known to be below `key`; it is -1 when there is none.
    return l - 1;
}
public static int digit(char ch, int radix) {
return digit((int) ch, radix);
}

View File

@ -39,4 +39,36 @@ public class CharacterTest {
assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
}
/**
 * Checks {@code Character.toLowerCase(char)} on digits, punctuation, and ASCII, Cyrillic and
 * Latin-1 letters in both cases.
 */
@Test
public void lowerCase() {
    // Each row is {expected, input}.
    char[][] cases = {
        {'1', '1'},
        {'a', 'a'},
        {'b', 'b'},
        {'z', 'z'},
        {'@', '@'},
        {'a', 'A'},
        {'b', 'B'},
        {'z', 'Z'},
        {'щ', 'щ'},
        {'щ', 'Щ'},
        {'ü', 'ü'},
        {'ü', 'Ü'},
    };
    for (char[] row : cases) {
        assertEquals(row[0], Character.toLowerCase(row[1]));
    }
}
/**
 * Checks {@code Character.toUpperCase(char)} on digits, punctuation, and ASCII, Cyrillic and
 * Latin-1 letters in both cases.
 */
@Test
public void upperCase() {
    // Each row is {expected, input}.
    char[][] cases = {
        {'1', '1'},
        {'A', 'a'},
        {'B', 'b'},
        {'Z', 'z'},
        {'@', '@'},
        {'A', 'A'},
        {'B', 'B'},
        {'Z', 'Z'},
        {'Щ', 'щ'},
        {'Щ', 'Щ'},
        {'Ü', 'ü'},
        {'Ü', 'Ü'},
    };
    for (char[] row : cases) {
        assertEquals(row[0], Character.toUpperCase(row[1]));
    }
}
}