classlib: use built-in unicode table to implement tolower/toupper methods.

This makes it possible to use TeaVM in environments where these functions aren't natively available
(for example, WASI).
This commit is contained in:
Alexey Andreev 2022-11-09 12:08:17 +01:00
parent a49f47dac7
commit 292aa21aef
6 changed files with 166 additions and 45 deletions

View File

@ -237,6 +237,10 @@ public class JCLPlugin implements TeaVMPlugin {
new CharacterMetadataGenerator()); new CharacterMetadataGenerator());
reg.register(new MethodReference(Character.class, "acquireTitleCaseMapping", StringResource.class), reg.register(new MethodReference(Character.class, "acquireTitleCaseMapping", StringResource.class),
new CharacterMetadataGenerator()); new CharacterMetadataGenerator());
reg.register(new MethodReference(Character.class, "acquireUpperCaseMapping", StringResource.class),
new CharacterMetadataGenerator());
reg.register(new MethodReference(Character.class, "acquireLowerCaseMapping", StringResource.class),
new CharacterMetadataGenerator());
} }
@PlatformMarker @PlatformMarker

View File

@ -87,6 +87,20 @@ public final class UnicodeHelper {
return sb.toString(); return sb.toString();
} }
/**
 * Serializes a case mapping table into its compact textual form.
 *
 * <p>The input is read as consecutive (key, value) pairs; a trailing unpaired element is ignored.
 * The output starts with the number of pairs, followed by each pair written as the key's
 * difference from the previous key (via {@code Base46.encodeUnsigned}) and the value
 * (via {@code Base46.encode}).
 *
 * @param data flat array of (key, value) pairs; keys are presumably ascending, since their
 *             differences are written with the unsigned encoder
 * @return the encoded table
 */
public static String encodeCaseMapping(int[] data) {
    int pairCount = data.length / 2;
    StringBuilder out = new StringBuilder();
    Base46.encodeUnsigned(out, pairCount);
    int previousKey = 0;
    for (int pos = 0; pos < pairCount * 2; pos += 2) {
        int key = data[pos];
        // Keys are stored as deltas from the preceding key.
        Base46.encodeUnsigned(out, key - previousKey);
        previousKey = key;
        Base46.encode(out, data[pos + 1]);
    }
    return out.toString();
}
public static int[] decodeIntDiff(String text) { public static int[] decodeIntDiff(String text) {
CharFlow flow = new CharFlow(text.toCharArray()); CharFlow flow = new CharFlow(text.toCharArray());
int sz = Base46.decodeUnsigned(flow); int sz = Base46.decodeUnsigned(flow);
@ -99,6 +113,19 @@ public final class UnicodeHelper {
return data; return data;
} }
/**
 * Restores a case mapping table from the text produced by {@code encodeCaseMapping}.
 *
 * <p>Reads the pair count, then for every pair accumulates the stored key delta into an
 * absolute key and reads the value that follows it.
 *
 * @param text encoded table
 * @return flat array of (key, value) pairs, two elements per pair
 */
public static int[] decodeCaseMapping(String text) {
    CharFlow input = new CharFlow(text.toCharArray());
    int pairCount = Base46.decodeUnsigned(input);
    int[] result = new int[pairCount * 2];
    int key = 0;
    int pos = 0;
    while (pos < result.length) {
        // Each stored key is a delta from the previous one; rebuild the absolute key.
        key += Base46.decodeUnsigned(input);
        result[pos++] = key;
        result[pos++] = Base46.decode(input);
    }
    return result;
}
public static char encodeByte(byte b) { public static char encodeByte(byte b) {
if (b < '\"' - ' ') { if (b < '\"' - ' ') {
return (char) (b + ' '); return (char) (b + ' ');

View File

@ -32,6 +32,8 @@ public final class UnicodeSupport {
private static int[] digitValues; private static int[] digitValues;
private static byte[] classes; private static byte[] classes;
private static int[] titleCaseMapping; private static int[] titleCaseMapping;
private static int[] upperCaseMapping;
private static int[] lowerCaseMapping;
private static Map<String, Byte> classMap = new HashMap<>(); private static Map<String, Byte> classMap = new HashMap<>();
static { static {
@ -74,6 +76,8 @@ public final class UnicodeSupport {
IntegerArray digitValues = new IntegerArray(4096); IntegerArray digitValues = new IntegerArray(4096);
IntegerArray classes = new IntegerArray(65536); IntegerArray classes = new IntegerArray(65536);
IntegerArray titleCaseMapping = new IntegerArray(256); IntegerArray titleCaseMapping = new IntegerArray(256);
IntegerArray upperCaseMapping = new IntegerArray(256);
IntegerArray lowerCaseMapping = new IntegerArray(256);
try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class try (BufferedReader reader = new BufferedReader(new InputStreamReader(UnicodeHelper.class
.getResourceAsStream("UnicodeData.txt")))) { .getResourceAsStream("UnicodeData.txt")))) {
while (true) { while (true) {
@ -97,13 +101,15 @@ public final class UnicodeSupport {
Byte charClass = classMap.get(fields[2]); Byte charClass = classMap.get(fields[2]);
classes.add(charClass != null ? charClass.intValue() : 0); classes.add(charClass != null ? charClass.intValue() : 0);
if (!fields[14].isEmpty()) { int upperCaseCode = !fields[12].isEmpty() ? parseHex(fields[12]) : charCode;
int titleCaseCode = parseHex(fields[14]); encodeCaseMapping(upperCaseMapping, charCode, upperCaseCode);
if (fields[12].isEmpty() || parseHex(fields[12]) != titleCaseCode) { int lowerCaseCode = !fields[13].isEmpty() ? parseHex(fields[13]) : charCode;
titleCaseMapping.add(charCode); encodeCaseMapping(lowerCaseMapping, charCode, lowerCaseCode);
titleCaseMapping.add(titleCaseCode); int titleCaseCode = !fields[14].isEmpty() ? parseHex(fields[14]) : charCode;
} if (titleCaseCode == upperCaseCode) {
titleCaseCode = charCode;
} }
encodeCaseMapping(titleCaseMapping, charCode, titleCaseCode);
} }
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("Error reading unicode data", e); throw new RuntimeException("Error reading unicode data", e);
@ -131,6 +137,16 @@ public final class UnicodeSupport {
UnicodeSupport.classes[i] = (byte) classes.get(i); UnicodeSupport.classes[i] = (byte) classes.get(i);
} }
UnicodeSupport.titleCaseMapping = titleCaseMapping.getAll(); UnicodeSupport.titleCaseMapping = titleCaseMapping.getAll();
UnicodeSupport.upperCaseMapping = upperCaseMapping.getAll();
UnicodeSupport.lowerCaseMapping = lowerCaseMapping.getAll();
}
/**
 * Appends a (code point, offset) entry to a case mapping table, where the offset is
 * {@code mappedCodePoint - codePoint}.
 *
 * <p>An entry is written only when the table is empty or the offset differs from the last
 * element stored in the table (the offset of the previous entry), so a run of consecutive
 * calls with the same offset produces a single entry.
 */
private static void encodeCaseMapping(IntegerArray array, int codePoint, int mappedCodePoint) {
int diff = mappedCodePoint - codePoint;
if (array.size() == 0 || diff != array.get(array.size() - 1)) {
array.add(codePoint);
array.add(diff);
}
} }
private static String[] splitLine(String line) { private static String[] splitLine(String line) {
@ -216,4 +232,14 @@ public final class UnicodeSupport {
ensureUnicodeData(); ensureUnicodeData();
return titleCaseMapping; return titleCaseMapping;
} }
/**
 * Returns the upper case mapping table after calling {@code ensureUnicodeData()}.
 * The array held in the static field is returned directly, not a copy.
 */
public static int[] getUpperCaseMapping() {
ensureUnicodeData();
return upperCaseMapping;
}
/**
 * Returns the lower case mapping table after calling {@code ensureUnicodeData()}.
 * The array held in the static field is returned directly, not a copy.
 */
public static int[] getLowerCaseMapping() {
ensureUnicodeData();
return lowerCaseMapping;
}
} }

View File

@ -32,7 +32,11 @@ public class CharacterMetadataGenerator implements MetadataGenerator {
case "obtainClasses": case "obtainClasses":
return generateObtainClasses(context); return generateObtainClasses(context);
case "acquireTitleCaseMapping": case "acquireTitleCaseMapping":
return generateObtainTitleCaseMapping(context); return generateAcquireTitleCaseMapping(context);
case "acquireUpperCaseMapping":
return generateAcquireUpperCaseMapping(context);
case "acquireLowerCaseMapping":
return generateAcquireLowerCaseMapping(context);
default: default:
return null; return null;
} }
@ -50,9 +54,21 @@ public class CharacterMetadataGenerator implements MetadataGenerator {
return res; return res;
} }
private Resource generateObtainTitleCaseMapping(MetadataGeneratorContext context) { private Resource generateAcquireTitleCaseMapping(MetadataGeneratorContext context) {
StringResource res = context.createResource(StringResource.class); StringResource res = context.createResource(StringResource.class);
res.setValue(UnicodeHelper.encodeIntDiff(UnicodeSupport.getTitleCaseMapping())); res.setValue(UnicodeHelper.encodeCaseMapping(UnicodeSupport.getTitleCaseMapping()));
return res;
}
/**
 * Builds the string resource that carries the encoded upper case mapping table.
 *
 * @param context generator context used to create the resource
 * @return a {@code StringResource} whose value is the encoded table
 */
private Resource generateAcquireUpperCaseMapping(MetadataGeneratorContext context) {
    StringResource resource = context.createResource(StringResource.class);
    int[] mapping = UnicodeSupport.getUpperCaseMapping();
    resource.setValue(UnicodeHelper.encodeCaseMapping(mapping));
    return resource;
}
private Resource generateAcquireLowerCaseMapping(MetadataGeneratorContext context) {
StringResource res = context.createResource(StringResource.class);
res.setValue(UnicodeHelper.encodeCaseMapping(UnicodeSupport.getLowerCaseMapping()));
return res; return res;
} }
} }

View File

@ -16,11 +16,6 @@
package org.teavm.classlib.java.lang; package org.teavm.classlib.java.lang;
import org.teavm.classlib.impl.unicode.UnicodeHelper; import org.teavm.classlib.impl.unicode.UnicodeHelper;
import org.teavm.interop.DelegateTo;
import org.teavm.interop.Import;
import org.teavm.interop.Unmanaged;
import org.teavm.interop.c.Include;
import org.teavm.platform.Platform;
import org.teavm.platform.metadata.StringResource; import org.teavm.platform.metadata.StringResource;
public class TCharacter extends TObject implements TComparable<TCharacter> { public class TCharacter extends TObject implements TComparable<TCharacter> {
@ -92,6 +87,8 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
static final int ERROR = 0xFFFFFFFF; static final int ERROR = 0xFFFFFFFF;
private static int[] digitMapping; private static int[] digitMapping;
private static int[] titleCaseMapping; private static int[] titleCaseMapping;
private static int[] upperCaseMapping;
private static int[] lowerCaseMapping;
private static UnicodeHelper.Range[] classMapping; private static UnicodeHelper.Range[] classMapping;
private char value; private char value;
private static TCharacter[] characterCache = new TCharacter[128]; private static TCharacter[] characterCache = new TCharacter[128];
@ -235,54 +232,43 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
return (char) toLowerCase((int) ch); return (char) toLowerCase((int) ch);
} }
@DelegateTo("toLowerCaseLowLevel")
public static int toLowerCase(int ch) { public static int toLowerCase(int ch) {
return Platform.stringFromCharCode(ch).toLowerCase().charCodeAt(0); return mapChar(getLowerCaseMapping(), ch);
} }
private static int toLowerCaseLowLevel(int codePoint) { private static int[] getLowerCaseMapping() {
return toLowerCaseSystem(codePoint); if (lowerCaseMapping == null) {
lowerCaseMapping = UnicodeHelper.decodeCaseMapping(acquireLowerCaseMapping().getValue());
}
return lowerCaseMapping;
} }
@Import(module = "teavm", name = "towlower") private static native StringResource acquireLowerCaseMapping();
@Include("wctype.h")
@Unmanaged
private static native int toLowerCaseSystem(int codePoint);
public static char toUpperCase(char ch) { public static char toUpperCase(char ch) {
return (char) toUpperCase((int) ch); return (char) toUpperCase((int) ch);
} }
@DelegateTo("toUpperCaseLowLevel")
public static int toUpperCase(int codePoint) { public static int toUpperCase(int codePoint) {
return Platform.stringFromCharCode(codePoint).toUpperCase().charCodeAt(0); return mapChar(getUpperCaseMapping(), codePoint);
} }
private static int toUpperCaseLowLevel(int codePoint) { private static int[] getUpperCaseMapping() {
return toUpperCaseSystem(codePoint); if (upperCaseMapping == null) {
upperCaseMapping = UnicodeHelper.decodeCaseMapping(acquireUpperCaseMapping().getValue());
}
return upperCaseMapping;
} }
@Import(module = "teavm", name = "towupper") private static native StringResource acquireUpperCaseMapping();
@Include("wctype.h")
@Unmanaged
private static native int toUpperCaseSystem(int codePoint);
public static int toTitleCase(int codePoint) { public static int toTitleCase(int codePoint) {
int[] mapping = getTitleCaseMapping(); codePoint = mapChar(getTitleCaseMapping(), codePoint);
int l = 0; if (codePoint == codePoint) {
int u = (mapping.length / 2) - 1; codePoint = toUpperCase(codePoint);
while (u >= l) {
int idx = (l + u) / 2;
int val = mapping[idx * 2];
if (codePoint > val) {
l = idx + 1;
} else if (codePoint < val) {
u = idx - 1;
} else {
return mapping[idx * 2 + 1];
}
} }
return toUpperCase(codePoint); return codePoint;
} }
public static char toTitleCase(char c) { public static char toTitleCase(char c) {
@ -291,13 +277,43 @@ public class TCharacter extends TObject implements TComparable<TCharacter> {
private static int[] getTitleCaseMapping() { private static int[] getTitleCaseMapping() {
if (titleCaseMapping == null) { if (titleCaseMapping == null) {
titleCaseMapping = UnicodeHelper.decodeIntDiff(acquireTitleCaseMapping().getValue()); titleCaseMapping = UnicodeHelper.decodeCaseMapping(acquireTitleCaseMapping().getValue());
} }
return titleCaseMapping; return titleCaseMapping;
} }
private static native StringResource acquireTitleCaseMapping(); private static native StringResource acquireTitleCaseMapping();
/**
 * Applies a case mapping table to a code point.
 *
 * <p>The table is a flat array of (start code point, offset) pairs ordered by start code
 * point. The offset of the last pair whose start is not greater than {@code codePoint} is
 * added to it.
 *
 * @param table flat array of (start, offset) pairs
 * @param codePoint the code point to map
 * @return the mapped code point, or {@code codePoint} unchanged when no pair precedes it
 *         (empty table, or a code point below the first start)
 */
private static int mapChar(int[] table, int codePoint) {
    int index = binarySearchTable(table, codePoint);
    if (index < 0 || index >= table.length / 2) {
        // No applicable entry: leave the input untouched rather than collapsing it to 0.
        return codePoint;
    }
    return codePoint + table[index * 2 + 1];
}

/**
 * Finds the pair that covers {@code key} in a flat (start, offset) table.
 *
 * @param data flat array of pairs ordered by ascending start
 * @param key the value to look up
 * @return index of the last pair whose start is less than or equal to {@code key},
 *         or -1 when there is none (including an empty table)
 */
private static int binarySearchTable(int[] data, int key) {
    int l = 0;
    int u = data.length / 2 - 1;
    int found = -1;
    // The loop condition also covers the empty table, which previously read data[0].
    while (l <= u) {
        int i = (l + u) >>> 1;
        if (data[i * 2] <= key) {
            found = i;
            l = i + 1;
        } else {
            u = i - 1;
        }
    }
    return found;
}
public static int digit(char ch, int radix) { public static int digit(char ch, int radix) {
return digit((int) ch, radix); return digit((int) ch, radix);
} }

View File

@ -39,4 +39,36 @@ public class CharacterTest {
assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25)); assertEquals(Character.NON_SPACING_MARK, Character.getType(0xFE25));
assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9)); assertEquals(Character.DECIMAL_DIGIT_NUMBER, Character.getType(0x1D7D9));
} }
@Test
public void lowerCase() {
    // Each row is {expected, input}; rows are checked in order.
    char[][] cases = {
        { '1', '1' },
        { 'a', 'a' },
        { 'b', 'b' },
        { 'z', 'z' },
        { '@', '@' },
        { 'a', 'A' },
        { 'b', 'B' },
        { 'z', 'Z' },
        { 'щ', 'щ' },
        { 'щ', 'Щ' },
        { 'ü', 'ü' },
        { 'ü', 'Ü' },
    };
    for (char[] row : cases) {
        assertEquals(row[0], Character.toLowerCase(row[1]));
    }
}
@Test
public void upperCase() {
    // Each row is {expected, input}; rows are checked in order.
    char[][] cases = {
        { '1', '1' },
        { 'A', 'a' },
        { 'B', 'b' },
        { 'Z', 'z' },
        { '@', '@' },
        { 'A', 'A' },
        { 'B', 'B' },
        { 'Z', 'Z' },
        { 'Щ', 'щ' },
        { 'Щ', 'Щ' },
        { 'Ü', 'ü' },
        { 'Ü', 'Ü' },
    };
    for (char[] row : cases) {
        assertEquals(row[0], Character.toUpperCase(row[1]));
    }
}
} }