diff --git a/teavm-classlib/pom.xml b/teavm-classlib/pom.xml index 6cc562036..2a4edf57f 100644 --- a/teavm-classlib/pom.xml +++ b/teavm-classlib/pom.xml @@ -118,6 +118,7 @@ java.util java.util.logging java.util.concurrent + java.util.regex -output ${project.build.directory}/jcl-report diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java index e8dcf653f..a11ed0585 100644 --- a/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/lang/TString.java @@ -22,6 +22,7 @@ import org.teavm.classlib.java.util.TArrays; import org.teavm.classlib.java.util.TComparator; import org.teavm.classlib.java.util.THashMap; import org.teavm.classlib.java.util.TMap; +import org.teavm.classlib.java.util.regex.TPattern; import org.teavm.dependency.PluggableDependency; import org.teavm.javascript.ni.InjectedBy; import org.teavm.javascript.ni.Rename; @@ -161,6 +162,18 @@ public class TString extends TObject implements TSerializable, TComparable length() || ooffset + len > other.length()) { + return false; + } + for (int i = 0; i < len; ++i) { + char a = charAt(toffset++); + char b = other.charAt(ooffset++); + if (ignoreCase) { + a = TCharacter.toLowerCase(a); + b = TCharacter.toLowerCase(b); + } + if (a != b) { + return false; + } + } + return true; + } + public boolean regionMatches(int toffset, TString other, int ooffset, int len) { if (toffset < 0 || ooffset < 0 || toffset + len > length() || ooffset + len > other.length()) { return false; @@ -613,4 +644,24 @@ public class TString extends TObject implements TSerializable, TComparablenull if + * this character class does not have character representation; + * + * @return bitset + */ + protected BitSet getBits() { + return null; + } + + protected BitSet getLowHighSurrogates() { + return lowHighSurrogates; + } + + public boolean hasLowHighSurrogates() { + return 
altSurrogates ? lowHighSurrogates.nextClearBit(0) < SURROGATE_CARDINALITY : lowHighSurrogates + .nextSetBit(0) < SURROGATE_CARDINALITY; + } + + public boolean mayContainSupplCodepoints() { + return mayContainSupplCodepoints; + } + + @Override + public int getType() { + return SpecialToken.TOK_CHARCLASS; + } + + public AbstractCharClass getInstance() { + return this; + } + + public AbstractCharClass getSurrogates() { + + if (charClassWithSurrogates == null) { + final BitSet lHS = getLowHighSurrogates(); + + charClassWithSurrogates = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + int index = ch - Character.MIN_SURROGATE; + + return ((index >= 0) && (index < AbstractCharClass.SURROGATE_CARDINALITY)) ? this.altSurrogates ^ + lHS.get(index) : false; + } + }; + charClassWithSurrogates.setNegative(this.altSurrogates); + } + + return charClassWithSurrogates; + } + + public AbstractCharClass getWithoutSurrogates() { + if (charClassWithoutSurrogates == null) { + final BitSet lHS = getLowHighSurrogates(); + final AbstractCharClass thisClass = this; + + charClassWithoutSurrogates = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + int index = ch - Character.MIN_SURROGATE; + + boolean containslHS = ((index >= 0) && (index < AbstractCharClass.SURROGATE_CARDINALITY)) ? this.altSurrogates ^ + lHS.get(index) + : false; + + return thisClass.contains(ch) && !containslHS; + } + }; + charClassWithoutSurrogates.setNegative(isNegative()); + charClassWithoutSurrogates.mayContainSupplCodepoints = mayContainSupplCodepoints; + } + + return charClassWithoutSurrogates; + } + + public boolean hasUCI() { + return false; + } + + /** + * Sets this CharClass to negative form, i.e. if they will add some + * characters and after that set this class to negative it will accept all + * the characters except previously set ones. + * + * Although this method will not alternate all the already set characters, + * just overall meaning of the class. 
+ * + * @see #contains(int) + * @see #intersect(CharClass) + * @see #union(CharClass) + */ + public AbstractCharClass setNegative(boolean value) { + if (alt ^ value) { + alt = !alt; + altSurrogates = !altSurrogates; + } + if (!mayContainSupplCodepoints) { + mayContainSupplCodepoints = true; + } + return this; + } + + public boolean isNegative() { + return alt; + } + + // ----------------------------------------------------------------- + // Static methods and predefined classes + // ----------------------------------------------------------------- + + public static boolean intersects(int ch1, int ch2) { + return ch1 == ch2; + } + + public static boolean intersects(AbstractCharClass cc, int ch) { + return cc.contains(ch); + } + + public static boolean intersects(AbstractCharClass cc1, AbstractCharClass cc2) { + if (cc1.getBits() == null || cc2.getBits() == null) + return true; + return cc1.getBits().intersects(cc2.getBits()); + } + + public static AbstractCharClass getPredefinedClass(String name, boolean negative) { + return ((LazyCharClass)charClasses.getObject(name)).getValue(negative); + } + + abstract static class LazyCharClass { + AbstractCharClass posValue = null; + + AbstractCharClass negValue = null; + + public AbstractCharClass getValue(boolean negative) { + if (!negative && posValue == null) { + posValue = computeValue(); + } else if (negative && negValue == null) { + negValue = computeValue().setNegative(true); + } + if (!negative) + return posValue; + return negValue; + } + + protected abstract AbstractCharClass computeValue(); + } + + static class LazyDigit extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add('0', '9'); + } + } + + static class LazyNonDigit extends LazyDigit { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = super.computeValue().setNegative(true); + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazySpace 
extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + /* 9-13 - \t\n\x0B\f\r; 32 - ' ' */ + return new CharClass().add(9, 13).add(32); + } + } + + static class LazyNonSpace extends LazySpace { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = super.computeValue().setNegative(true); + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyWord extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add('a', 'z').add('A', 'Z').add('0', '9').add('_'); + } + } + + static class LazyNonWord extends LazyWord { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = super.computeValue().setNegative(true); + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyLower extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add('a', 'z'); + } + } + + static class LazyUpper extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add('A', 'Z'); + } + } + + static class LazyASCII extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add(0x00, 0x7F); + } + } + + static class LazyAlpha extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add('a', 'z').add('A', 'Z'); + } + } + + static class LazyAlnum extends LazyAlpha { + @Override + protected AbstractCharClass computeValue() { + return ((CharClass)super.computeValue()).add('0', '9'); + } + } + + static class LazyPunct extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + /* Punctuation !"#$%&'()*+,-./:;<=>?@ [\]^_` {|}~ */ + return new CharClass().add(0x21, 0x40).add(0x5B, 0x60).add(0x7B, 0x7E); + } + } + + static class LazyGraph extends LazyAlnum { + @Override + protected 
AbstractCharClass computeValue() { + /* plus punctuation */ + return ((CharClass)super.computeValue()).add(0x21, 0x40).add(0x5B, 0x60).add(0x7B, 0x7E); + } + } + + static class LazyPrint extends LazyGraph { + @Override + protected AbstractCharClass computeValue() { + return ((CharClass)super.computeValue()).add(0x20); + } + } + + static class LazyBlank extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add(' ').add('\t'); + } + } + + static class LazyCntrl extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add(0x00, 0x1F).add(0x7F); + } + } + + static class LazyXDigit extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new CharClass().add('0', '9').add('a', 'f').add('A', 'F'); + } + } + + static class LazyRange extends LazyCharClass { + int start, end; + + public LazyRange(int start, int end) { + this.start = start; + this.end = end; + } + + @Override + public AbstractCharClass computeValue() { + AbstractCharClass chCl = new CharClass().add(start, end); + return chCl; + } + } + + static class LazySpecialsBlock extends LazyCharClass { + @Override + public AbstractCharClass computeValue() { + return new CharClass().add(0xFEFF, 0xFEFF).add(0xFFF0, 0xFFFD); + } + } + + static class LazyCategoryScope extends LazyCharClass { + int category; + + boolean mayContainSupplCodepoints; + + boolean containsAllSurrogates; + + public LazyCategoryScope(int cat, boolean mayContainSupplCodepoints) { + this.mayContainSupplCodepoints = mayContainSupplCodepoints; + this.category = cat; + } + + public LazyCategoryScope(int cat, boolean mayContainSupplCodepoints, boolean containsAllSurrogates) { + this.containsAllSurrogates = containsAllSurrogates; + this.mayContainSupplCodepoints = mayContainSupplCodepoints; + this.category = cat; + } + + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new 
UnicodeCategoryScope(category); + if (containsAllSurrogates) { + chCl.lowHighSurrogates.set(0, SURROGATE_CARDINALITY); + } + + chCl.mayContainSupplCodepoints = mayContainSupplCodepoints; + return chCl; + } + } + + static class LazyCategory extends LazyCharClass { + int category; + + boolean mayContainSupplCodepoints; + + boolean containsAllSurrogates; + + public LazyCategory(int cat, boolean mayContainSupplCodepoints) { + this.mayContainSupplCodepoints = mayContainSupplCodepoints; + this.category = cat; + } + + public LazyCategory(int cat, boolean mayContainSupplCodepoints, boolean containsAllSurrogates) { + this.containsAllSurrogates = containsAllSurrogates; + this.mayContainSupplCodepoints = mayContainSupplCodepoints; + this.category = cat; + } + + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new UnicodeCategory(category); + if (containsAllSurrogates) { + chCl.lowHighSurrogates.set(0, SURROGATE_CARDINALITY); + } + chCl.mayContainSupplCodepoints = mayContainSupplCodepoints; + ; + return chCl; + } + } + + static class LazyJavaLowerCase extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isLowerCase(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaUpperCase extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isUpperCase(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaWhitespace extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isWhitespace(ch); + } + }; + } + } + + static class 
LazyJavaMirrored extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new AbstractCharClass() { + @Override + public boolean contains(int ch) { + // TODO implement this method and uncomment + // return Character.isMirrored(ch); + return false; + } + }; + } + } + + static class LazyJavaDefined extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isDefined(ch); + } + }; + chCl.lowHighSurrogates.set(0, SURROGATE_CARDINALITY); + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaDigit extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isDigit(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaIdentifierIgnorable extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isIdentifierIgnorable(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaISOControl extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isISOControl(ch); + } + }; + } + } + + static class LazyJavaJavaIdentifierPart extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isJavaIdentifierPart(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class 
LazyJavaJavaIdentifierStart extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isJavaIdentifierStart(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaLetter extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isLetter(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaLetterOrDigit extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isLetterOrDigit(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaSpaceChar extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isSpaceChar(ch); + } + }; + } + } + + static class LazyJavaTitleCase extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + return new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isTitleCase(ch); + } + }; + } + } + + static class LazyJavaUnicodeIdentifierPart extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isUnicodeIdentifierPart(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + static class LazyJavaUnicodeIdentifierStart extends LazyCharClass { + @Override + protected AbstractCharClass computeValue() { + 
AbstractCharClass chCl = new AbstractCharClass() { + @Override + public boolean contains(int ch) { + return Character.isUnicodeIdentifierStart(ch); + } + }; + + chCl.mayContainSupplCodepoints = true; + return chCl; + } + } + + /** + * character classes generated from http://www.unicode.org/reports/tr18/ + * http://www.unicode.org/Public/4.1.0/ucd/Blocks.txt + */ + static final class PredefinedCharacterClasses { + static LazyCharClass space = new LazySpace(); + + static LazyCharClass digit = new LazyDigit(); + + static final Object[][] contents = { + { "Lower", new LazyLower() }, //$NON-NLS-1$ + { "Upper", new LazyUpper() }, //$NON-NLS-1$ + { "ASCII", new LazyASCII() }, //$NON-NLS-1$ + { "Alpha", new LazyAlpha() }, //$NON-NLS-1$ + { "Digit", digit }, //$NON-NLS-1$ + { "Alnum", new LazyAlnum() }, //$NON-NLS-1$ + { "Punct", new LazyPunct() }, //$NON-NLS-1$ + { "Graph", new LazyGraph() }, //$NON-NLS-1$ + { "Print", new LazyPrint() }, //$NON-NLS-1$ + { "Blank", new LazyBlank() }, //$NON-NLS-1$ + { "Cntrl", new LazyCntrl() }, //$NON-NLS-1$ + { "XDigit", new LazyXDigit() }, //$NON-NLS-1$ + { "javaLowerCase", new LazyJavaLowerCase() }, //$NON-NLS-1$ + { "javaUpperCase", new LazyJavaUpperCase() }, //$NON-NLS-1$ + { "javaWhitespace", new LazyJavaWhitespace() }, //$NON-NLS-1$ + { "javaMirrored", new LazyJavaMirrored() }, //$NON-NLS-1$ + { "javaDefined", new LazyJavaDefined() }, //$NON-NLS-1$ + { "javaDigit", new LazyJavaDigit() }, //$NON-NLS-1$ + { "javaIdentifierIgnorable", new LazyJavaIdentifierIgnorable() }, //$NON-NLS-1$ + { "javaISOControl", new LazyJavaISOControl() }, //$NON-NLS-1$ + { "javaJavaIdentifierPart", new LazyJavaJavaIdentifierPart() }, //$NON-NLS-1$ + { "javaJavaIdentifierStart", new LazyJavaJavaIdentifierStart() }, //$NON-NLS-1$ + { "javaLetter", new LazyJavaLetter() }, //$NON-NLS-1$ + { "javaLetterOrDigit", new LazyJavaLetterOrDigit() }, //$NON-NLS-1$ + { "javaSpaceChar", new LazyJavaSpaceChar() }, //$NON-NLS-1$ + { "javaTitleCase", new LazyJavaTitleCase() 
}, //$NON-NLS-1$ + { "javaUnicodeIdentifierPart", new LazyJavaUnicodeIdentifierPart() }, //$NON-NLS-1$ + { "javaUnicodeIdentifierStart", new LazyJavaUnicodeIdentifierStart() }, //$NON-NLS-1$ + { "Space", space }, //$NON-NLS-1$ + { "w", new LazyWord() }, //$NON-NLS-1$ + { "W", new LazyNonWord() }, //$NON-NLS-1$ + { "s", space }, //$NON-NLS-1$ + { "S", new LazyNonSpace() }, //$NON-NLS-1$ + { "d", digit }, //$NON-NLS-1$ + { "D", new LazyNonDigit() }, //$NON-NLS-1$ + { "BasicLatin", new LazyRange(0x0000, 0x007F) }, //$NON-NLS-1$ + { "Latin-1Supplement", new LazyRange(0x0080, 0x00FF) }, //$NON-NLS-1$ + { "LatinExtended-A", new LazyRange(0x0100, 0x017F) }, //$NON-NLS-1$ + { "LatinExtended-B", new LazyRange(0x0180, 0x024F) }, //$NON-NLS-1$ + { "IPAExtensions", new LazyRange(0x0250, 0x02AF) }, //$NON-NLS-1$ + { "SpacingModifierLetters", new LazyRange(0x02B0, 0x02FF) }, //$NON-NLS-1$ + { "CombiningDiacriticalMarks", new LazyRange(0x0300, 0x036F) }, //$NON-NLS-1$ + { "Greek", new LazyRange(0x0370, 0x03FF) }, //$NON-NLS-1$ + { "Cyrillic", new LazyRange(0x0400, 0x04FF) }, //$NON-NLS-1$ + { "CyrillicSupplement", new LazyRange(0x0500, 0x052F) }, //$NON-NLS-1$ + { "Armenian", new LazyRange(0x0530, 0x058F) }, //$NON-NLS-1$ + { "Hebrew", new LazyRange(0x0590, 0x05FF) }, //$NON-NLS-1$ + { "Arabic", new LazyRange(0x0600, 0x06FF) }, //$NON-NLS-1$ + { "Syriac", new LazyRange(0x0700, 0x074F) }, //$NON-NLS-1$ + { "ArabicSupplement", new LazyRange(0x0750, 0x077F) }, //$NON-NLS-1$ + { "Thaana", new LazyRange(0x0780, 0x07BF) }, //$NON-NLS-1$ + { "Devanagari", new LazyRange(0x0900, 0x097F) }, //$NON-NLS-1$ + { "Bengali", new LazyRange(0x0980, 0x09FF) }, //$NON-NLS-1$ + { "Gurmukhi", new LazyRange(0x0A00, 0x0A7F) }, //$NON-NLS-1$ + { "Gujarati", new LazyRange(0x0A80, 0x0AFF) }, //$NON-NLS-1$ + { "Oriya", new LazyRange(0x0B00, 0x0B7F) }, //$NON-NLS-1$ + { "Tamil", new LazyRange(0x0B80, 0x0BFF) }, //$NON-NLS-1$ + { "Telugu", new LazyRange(0x0C00, 0x0C7F) }, //$NON-NLS-1$ + { "Kannada", new 
LazyRange(0x0C80, 0x0CFF) }, //$NON-NLS-1$ + { "Malayalam", new LazyRange(0x0D00, 0x0D7F) }, //$NON-NLS-1$ + { "Sinhala", new LazyRange(0x0D80, 0x0DFF) }, //$NON-NLS-1$ + { "Thai", new LazyRange(0x0E00, 0x0E7F) }, //$NON-NLS-1$ + { "Lao", new LazyRange(0x0E80, 0x0EFF) }, //$NON-NLS-1$ + { "Tibetan", new LazyRange(0x0F00, 0x0FFF) }, //$NON-NLS-1$ + { "Myanmar", new LazyRange(0x1000, 0x109F) }, //$NON-NLS-1$ + { "Georgian", new LazyRange(0x10A0, 0x10FF) }, //$NON-NLS-1$ + { "HangulJamo", new LazyRange(0x1100, 0x11FF) }, //$NON-NLS-1$ + { "Ethiopic", new LazyRange(0x1200, 0x137F) }, //$NON-NLS-1$ + { "EthiopicSupplement", new LazyRange(0x1380, 0x139F) }, //$NON-NLS-1$ + { "Cherokee", new LazyRange(0x13A0, 0x13FF) }, //$NON-NLS-1$ + { "UnifiedCanadianAboriginalSyllabics", //$NON-NLS-1$ + new LazyRange(0x1400, 0x167F) }, + { "Ogham", new LazyRange(0x1680, 0x169F) }, //$NON-NLS-1$ + { "Runic", new LazyRange(0x16A0, 0x16FF) }, //$NON-NLS-1$ + { "Tagalog", new LazyRange(0x1700, 0x171F) }, //$NON-NLS-1$ + { "Hanunoo", new LazyRange(0x1720, 0x173F) }, //$NON-NLS-1$ + { "Buhid", new LazyRange(0x1740, 0x175F) }, //$NON-NLS-1$ + { "Tagbanwa", new LazyRange(0x1760, 0x177F) }, //$NON-NLS-1$ + { "Khmer", new LazyRange(0x1780, 0x17FF) }, //$NON-NLS-1$ + { "Mongolian", new LazyRange(0x1800, 0x18AF) }, //$NON-NLS-1$ + { "Limbu", new LazyRange(0x1900, 0x194F) }, //$NON-NLS-1$ + { "TaiLe", new LazyRange(0x1950, 0x197F) }, //$NON-NLS-1$ + { "NewTaiLue", new LazyRange(0x1980, 0x19DF) }, //$NON-NLS-1$ + { "KhmerSymbols", new LazyRange(0x19E0, 0x19FF) }, //$NON-NLS-1$ + { "Buginese", new LazyRange(0x1A00, 0x1A1F) }, //$NON-NLS-1$ + { "PhoneticExtensions", new LazyRange(0x1D00, 0x1D7F) }, //$NON-NLS-1$ + { "PhoneticExtensionsSupplement", new LazyRange(0x1D80, 0x1DBF) }, //$NON-NLS-1$ + { "CombiningDiacriticalMarksSupplement", //$NON-NLS-1$ + new LazyRange(0x1DC0, 0x1DFF) }, + { "LatinExtendedAdditional", new LazyRange(0x1E00, 0x1EFF) }, //$NON-NLS-1$ + { "GreekExtended", new 
LazyRange(0x1F00, 0x1FFF) }, //$NON-NLS-1$ + { "GeneralPunctuation", new LazyRange(0x2000, 0x206F) }, //$NON-NLS-1$ + { "SuperscriptsandSubscripts", new LazyRange(0x2070, 0x209F) }, //$NON-NLS-1$ + { "CurrencySymbols", new LazyRange(0x20A0, 0x20CF) }, //$NON-NLS-1$ + { "CombiningMarksforSymbols", new LazyRange(0x20D0, 0x20FF) }, //$NON-NLS-1$ + { "LetterlikeSymbols", new LazyRange(0x2100, 0x214F) }, //$NON-NLS-1$ + { "NumberForms", new LazyRange(0x2150, 0x218F) }, //$NON-NLS-1$ + { "Arrows", new LazyRange(0x2190, 0x21FF) }, //$NON-NLS-1$ + { "MathematicalOperators", new LazyRange(0x2200, 0x22FF) }, //$NON-NLS-1$ + { "MiscellaneousTechnical", new LazyRange(0x2300, 0x23FF) }, //$NON-NLS-1$ + { "ControlPictures", new LazyRange(0x2400, 0x243F) }, //$NON-NLS-1$ + { "OpticalCharacterRecognition", new LazyRange(0x2440, 0x245F) }, //$NON-NLS-1$ + { "EnclosedAlphanumerics", new LazyRange(0x2460, 0x24FF) }, //$NON-NLS-1$ + { "BoxDrawing", new LazyRange(0x2500, 0x257F) }, //$NON-NLS-1$ + { "BlockElements", new LazyRange(0x2580, 0x259F) }, //$NON-NLS-1$ + { "GeometricShapes", new LazyRange(0x25A0, 0x25FF) }, //$NON-NLS-1$ + { "MiscellaneousSymbols", new LazyRange(0x2600, 0x26FF) }, //$NON-NLS-1$ + { "Dingbats", new LazyRange(0x2700, 0x27BF) }, //$NON-NLS-1$ + { "MiscellaneousMathematicalSymbols-A", //$NON-NLS-1$ + new LazyRange(0x27C0, 0x27EF) }, + { "SupplementalArrows-A", new LazyRange(0x27F0, 0x27FF) }, //$NON-NLS-1$ + { "BraillePatterns", new LazyRange(0x2800, 0x28FF) }, //$NON-NLS-1$ + { "SupplementalArrows-B", new LazyRange(0x2900, 0x297F) }, //$NON-NLS-1$ + { "MiscellaneousMathematicalSymbols-B", //$NON-NLS-1$ + new LazyRange(0x2980, 0x29FF) }, + { "SupplementalMathematicalOperators", //$NON-NLS-1$ + new LazyRange(0x2A00, 0x2AFF) }, + { "MiscellaneousSymbolsandArrows", //$NON-NLS-1$ + new LazyRange(0x2B00, 0x2BFF) }, + { "Glagolitic", new LazyRange(0x2C00, 0x2C5F) }, //$NON-NLS-1$ + { "Coptic", new LazyRange(0x2C80, 0x2CFF) }, //$NON-NLS-1$ + { "GeorgianSupplement", new 
LazyRange(0x2D00, 0x2D2F) }, //$NON-NLS-1$ + { "Tifinagh", new LazyRange(0x2D30, 0x2D7F) }, //$NON-NLS-1$ + { "EthiopicExtended", new LazyRange(0x2D80, 0x2DDF) }, //$NON-NLS-1$ + { "SupplementalPunctuation", new LazyRange(0x2E00, 0x2E7F) }, //$NON-NLS-1$ + { "CJKRadicalsSupplement", new LazyRange(0x2E80, 0x2EFF) }, //$NON-NLS-1$ + { "KangxiRadicals", new LazyRange(0x2F00, 0x2FDF) }, //$NON-NLS-1$ + { "IdeographicDescriptionCharacters", //$NON-NLS-1$ + new LazyRange(0x2FF0, 0x2FFF) }, + { "CJKSymbolsandPunctuation", new LazyRange(0x3000, 0x303F) }, //$NON-NLS-1$ + { "Hiragana", new LazyRange(0x3040, 0x309F) }, //$NON-NLS-1$ + { "Katakana", new LazyRange(0x30A0, 0x30FF) }, //$NON-NLS-1$ + { "Bopomofo", new LazyRange(0x3100, 0x312F) }, //$NON-NLS-1$ + { "HangulCompatibilityJamo", new LazyRange(0x3130, 0x318F) }, //$NON-NLS-1$ + { "Kanbun", new LazyRange(0x3190, 0x319F) }, //$NON-NLS-1$ + { "BopomofoExtended", new LazyRange(0x31A0, 0x31BF) }, //$NON-NLS-1$ + { "CJKStrokes", new LazyRange(0x31C0, 0x31EF) }, //$NON-NLS-1$ + { "KatakanaPhoneticExtensions", new LazyRange(0x31F0, 0x31FF) }, //$NON-NLS-1$ + { "EnclosedCJKLettersandMonths", new LazyRange(0x3200, 0x32FF) }, //$NON-NLS-1$ + { "CJKCompatibility", new LazyRange(0x3300, 0x33FF) }, //$NON-NLS-1$ + { "CJKUnifiedIdeographsExtensionA", //$NON-NLS-1$ + new LazyRange(0x3400, 0x4DB5) }, + { "YijingHexagramSymbols", new LazyRange(0x4DC0, 0x4DFF) }, //$NON-NLS-1$ + { "CJKUnifiedIdeographs", new LazyRange(0x4E00, 0x9FFF) }, //$NON-NLS-1$ + { "YiSyllables", new LazyRange(0xA000, 0xA48F) }, //$NON-NLS-1$ + { "YiRadicals", new LazyRange(0xA490, 0xA4CF) }, //$NON-NLS-1$ + { "ModifierToneLetters", new LazyRange(0xA700, 0xA71F) }, //$NON-NLS-1$ + { "SylotiNagri", new LazyRange(0xA800, 0xA82F) }, //$NON-NLS-1$ + { "HangulSyllables", new LazyRange(0xAC00, 0xD7A3) }, //$NON-NLS-1$ + { "HighSurrogates", new LazyRange(0xD800, 0xDB7F) }, //$NON-NLS-1$ + { "HighPrivateUseSurrogates", new LazyRange(0xDB80, 0xDBFF) }, //$NON-NLS-1$ + { 
"LowSurrogates", new LazyRange(0xDC00, 0xDFFF) }, //$NON-NLS-1$ + { "PrivateUseArea", new LazyRange(0xE000, 0xF8FF) }, //$NON-NLS-1$ + { "CJKCompatibilityIdeographs", new LazyRange(0xF900, 0xFAFF) }, //$NON-NLS-1$ + { "AlphabeticPresentationForms", new LazyRange(0xFB00, 0xFB4F) }, //$NON-NLS-1$ + { "ArabicPresentationForms-A", new LazyRange(0xFB50, 0xFDFF) }, //$NON-NLS-1$ + { "VariationSelectors", new LazyRange(0xFE00, 0xFE0F) }, //$NON-NLS-1$ + { "VerticalForms", new LazyRange(0xFE10, 0xFE1F) }, //$NON-NLS-1$ + { "CombiningHalfMarks", new LazyRange(0xFE20, 0xFE2F) }, //$NON-NLS-1$ + { "CJKCompatibilityForms", new LazyRange(0xFE30, 0xFE4F) }, //$NON-NLS-1$ + { "SmallFormVariants", new LazyRange(0xFE50, 0xFE6F) }, //$NON-NLS-1$ + { "ArabicPresentationForms-B", new LazyRange(0xFE70, 0xFEFF) }, //$NON-NLS-1$ + { "HalfwidthandFullwidthForms", new LazyRange(0xFF00, 0xFFEF) }, //$NON-NLS-1$ + { "all", new LazyRange(0x00, 0x10FFFF) }, //$NON-NLS-1$ + { "Specials", new LazySpecialsBlock() }, //$NON-NLS-1$ + { "Cn", new LazyCategory(Character.UNASSIGNED, true) }, + { "IsL", new LazyCategoryScope(0x3E, true) }, + { "Lu", new LazyCategory(Character.UPPERCASE_LETTER, true) }, + { "Ll", new LazyCategory(Character.LOWERCASE_LETTER, true) }, + { "Lt", new LazyCategory(Character.TITLECASE_LETTER, false) }, + { "Lm", new LazyCategory(Character.MODIFIER_LETTER, false) }, + { "Lo", new LazyCategory(Character.OTHER_LETTER, true) }, + { "IsM", new LazyCategoryScope(0x1C0, true) }, + { "Mn", new LazyCategory(Character.NON_SPACING_MARK, true) }, + { "Me", new LazyCategory(Character.ENCLOSING_MARK, false) }, + { "Mc", new LazyCategory(Character.COMBINING_SPACING_MARK, true) }, + { "N", new LazyCategoryScope(0xE00, true) }, + { "Nd", new LazyCategory(Character.DECIMAL_DIGIT_NUMBER, true) }, + { "Nl", new LazyCategory(Character.LETTER_NUMBER, true) }, + { "No", new LazyCategory(Character.OTHER_NUMBER, true) }, + { "IsZ", new LazyCategoryScope(0x7000, false) }, + { "Zs", new 
LazyCategory(Character.SPACE_SEPARATOR, false) }, + { "Zl", new LazyCategory(Character.LINE_SEPARATOR, false) }, + { "Zp", new LazyCategory(Character.PARAGRAPH_SEPARATOR, false) }, + { "IsC", new LazyCategoryScope(0xF0000, true, true) }, + { "Cc", new LazyCategory(Character.CONTROL, false) }, + { "Cf", new LazyCategory(Character.FORMAT, true) }, + { "Co", new LazyCategory(Character.PRIVATE_USE, true) }, + { "Cs", new LazyCategory(Character.SURROGATE, false, true) }, + { + "IsP", + new LazyCategoryScope((1 << Character.DASH_PUNCTUATION) | (1 << Character.START_PUNCTUATION) | + (1 << Character.END_PUNCTUATION) | (1 << Character.CONNECTOR_PUNCTUATION) | + (1 << Character.OTHER_PUNCTUATION) | (1 << Character.INITIAL_QUOTE_PUNCTUATION) | + (1 << Character.FINAL_QUOTE_PUNCTUATION), true) }, + { "Pd", new LazyCategory(Character.DASH_PUNCTUATION, false) }, + { "Ps", new LazyCategory(Character.START_PUNCTUATION, false) }, + { "Pe", new LazyCategory(Character.END_PUNCTUATION, false) }, + { "Pc", new LazyCategory(Character.CONNECTOR_PUNCTUATION, false) }, + { "Po", new LazyCategory(Character.OTHER_PUNCTUATION, true) }, + { "IsS", new LazyCategoryScope(0x7E000000, true) }, + { "Sm", new LazyCategory(Character.MATH_SYMBOL, true) }, + { "Sc", new LazyCategory(Character.CURRENCY_SYMBOL, false) }, + { "Sk", new LazyCategory(Character.MODIFIER_SYMBOL, false) }, + { "So", new LazyCategory(Character.OTHER_SYMBOL, true) }, + { "Pi", new LazyCategory(Character.INITIAL_QUOTE_PUNCTUATION, false) }, + { "Pf", new LazyCategory(Character.FINAL_QUOTE_PUNCTUATION, false) } }; + + public Object getObject(String name) { + for (int i = 0; i < contents.length; ++i) { + Object[] row = contents[i]; + if (name.equals(row[0])) { + return row[1]; + } + } + return null; + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AbstractLineTerminator.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AbstractLineTerminator.java new file mode 100644 index 
000000000..02770355f --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AbstractLineTerminator.java @@ -0,0 +1,85 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Line terminator factory + * + * @author Nikolay A. 
Kuznetsov + */ +abstract class AbstractLineTerminator { + static AbstractLineTerminator unixLT = null; + + static AbstractLineTerminator unicodeLT = null; + + public abstract boolean isLineTerminator(int ch); + + public abstract boolean isAfterLineTerminator(int ch1, int ch2); + + public static AbstractLineTerminator getInstance(int flag) { + if ((flag & TPattern.UNIX_LINES) != 0) { + if (unixLT != null) + return unixLT; + unixLT = new AbstractLineTerminator() { + @Override + public boolean isLineTerminator(int ch) { + return ch == '\n'; + } + + @Override + public boolean isAfterLineTerminator(int ch, int ch2) { + return ch == '\n'; + } + }; + return unixLT; + } else { + if (unicodeLT != null) + return unicodeLT; + unicodeLT = new AbstractLineTerminator() { + @Override + public boolean isLineTerminator(int ch) { + return (ch == '\n' || ch == '\r' || ch == '\u0085' || (ch | 1) == '\u2029'); + } + + @Override + public boolean isAfterLineTerminator(int ch, int ch2) { + return (ch == '\n' || ch == '\u0085' || (ch | 1) == '\u2029') || (ch == '\r' && ch2 != '\n'); + } + }; + return unicodeLT; + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AbstractSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AbstractSet.java new file mode 100644 index 000000000..3c32e1fff --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AbstractSet.java @@ -0,0 +1,272 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Basic class for nodes, representing given regular expression. Note: All the + * classes representing nodes has set prefix; + * + * @author Nikolay A. Kuznetsov + */ +abstract class AbstractSet { + + public static final int TYPE_LEAF = 1 << 0; + + public static final int TYPE_FSET = 1 << 1; + + public static final int TYPE_QUANT = 1 << 3; + + public static final int TYPE_DOTSET = 0x80000000 | '.'; + + /** + * Next node to visit + */ + protected AbstractSet next; + + /** + * Counter for debugging purposes, represent unique node index; + */ + static int counter = 1; + + protected boolean isSecondPassVisited = false; + + protected String index = new Integer(AbstractSet.counter++).toString(); + + private int type = 0; + + public AbstractSet() { + } + + public AbstractSet(AbstractSet n) { + next = n; + } + + /** + * Checks if this node matches in given position and recursively call next + * node matches on positive self match. 
Returns positive integer if entire + * match succeed, negative otherwise + * + * @param stringIndex + * - string index to start from; + * @param testString + * - input string + * @param matchResult + * - MatchResult to sore result into + * @return -1 if match fails or n > 0; + */ + public abstract int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult); + + /** + * Attempts to apply pattern starting from this set/stringIndex; returns + * index this search was started from, if value is negative, this means that + * this search didn't succeed, additional information could be obtained via + * matchResult; + * + * Note: this is default implementation for find method, it's based on + * matches, subclasses do not have to override find method unless more + * effective find method exists for a particular node type (sequence, i.e. + * substring, for example). Same applies for find back method. + * + * @param stringIndex + * starting index + * @param testString + * string to search in + * @param matchResult + * result of the match + * @return last searched index + */ + public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int length = matchResult.getRightBound(); + while (stringIndex <= length) { + if (matches(stringIndex, testString, matchResult) >= 0) { + return stringIndex; + } else { + stringIndex++; + } + } + return -1; + } + + /** + * @param stringIndex + * - an index, to finish search back (left limit) + * @param startSearch + * - an index to start search from (right limit) + * @param testString + * - test string; + * @param matchResult + * match result + * @return an index to start back search next time if this search fails(new + * left bound); if this search fails the value is negative; + */ + public int findBack(int stringIndex, int startSearch, CharSequence testString, MatchResultImpl matchResult) { + while (startSearch >= stringIndex) { + if (matches(startSearch, testString, matchResult) >= 0) { + 
return startSearch; + } else { + startSearch--; + } + } + return -1; + } + + /** + * Returns true, if this node has consumed any characters during positive + * match attempt, for example node representing character always consumes + * one character if it matches. If particular node matches empty sting this + * method will return false; + * + * @param matchResult + * @return + */ + public abstract boolean hasConsumed(MatchResultImpl matchResult); + + /** + * Returns name for the particular node type. Used for debugging purposes. + */ + protected abstract String getName(); + + protected void setType(int type) { + this.type = type; + } + + public int getType() { + return this.type; + } + + protected String getQualifiedName() { + return "<" + index + ":" + getName() + ">"; + } + + @Override + public String toString() { + return getQualifiedName(); + } + + /** + * Returns the next. + */ + public AbstractSet getNext() { + return next; + } + + /** + * Sets next abstract set + * + * @param next + * The next to set. + */ + public void setNext(AbstractSet next) { + this.next = next; + } + + /** + * Returns true if the given node intersects with this one, false otherwise. + * This method is being used for quantifiers construction, lets consider the + * following regular expression (a|b)*ccc. + * + * (a|b) does not intersects with "ccc" and thus can be quantified greedily + * (w/o kickbacks), like *+ instead of *. + * + * @param set + * - usually previous node + * + * @return true if the given node intersects with this one + */ + public boolean first(AbstractSet set) { + return true; + } + + /** + * This method is used for replacement backreferenced sets. 
+ * + * @param prev + * - node who references to this node + * @return null if current node need not to be replaced JointSet which is + * replacement of current node otherwise + */ + public JointSet processBackRefReplacement() { + return null; + } + + /** + * This method is used for traversing nodes after the first stage of + * compilation. + */ + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (next != null) { + + if (!next.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = next.processBackRefReplacement(); + + if (set != null) { + next.isSecondPassVisited = true; + next = set; + } + + /* + * End code to do during the pass + */ + next.processSecondPass(); + } else { + + /* + * We reach node through next but it is already traversed. You + * can see this situation for AltGroupQuantifierSet.next when we + * reach this node through AltGroupQuantifierSet.innerset. ... + * .next + */ + + /* + * Add here code to do during the pass + */ + if (next instanceof SingleSet && ((FSet)((JointSet)next).fSet).isBackReferenced) { + next = next.next; + } + + /* + * End code to do during the pass + */ + } + } + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AheadFSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AheadFSet.java new file mode 100644 index 000000000..47ed0ac25 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AheadFSet.java @@ -0,0 +1,57 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * LookAhead FSet, always returns true; + * + * @author Nikolay A. 
Kuznetsov + */ +class AheadFSet extends FSet { + public AheadFSet() { + super(-1); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + return stringIndex; + } + + @Override + protected String getName() { + return "AheadFSet"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AltGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AltGroupQuantifierSet.java new file mode 100644 index 000000000..c27b4193f --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AltGroupQuantifierSet.java @@ -0,0 +1,67 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents "?" quantifier over composite sets. + * + * @author Nikolay A. Kuznetsov + */ +class AltGroupQuantifierSet extends GroupQuantifierSet { + public AltGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + if (!innerSet.hasConsumed(matchResult)) + return next.matches(stringIndex, testString, matchResult); + + int nextIndex = innerSet.matches(stringIndex, testString, matchResult); + + if (nextIndex < 0) { + return next.matches(stringIndex, testString, matchResult); + } else { + return nextIndex; + } + } + + @Override + public void setNext(AbstractSet next) { + super.setNext(next); + innerSet.setNext(next); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AltQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AltQuantifierSet.java new file mode 100644 index 000000000..8c476ec57 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AltQuantifierSet.java @@ -0,0 +1,65 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents "?" quantifier over leaf sets. + * + * @author Nikolay A. Kuznetsov + */ +class AltQuantifierSet extends LeafQuantifierSet { + + public AltQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int shift = 0; + + if ((shift = innerSet.matches(stringIndex, testString, matchResult)) >= 0) { + return shift; + } else { + return next.matches(stringIndex, testString, matchResult); + } + } + + @Override + public void setNext(AbstractSet next) { + super.setNext(next); + innerSet.setNext(next); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AtomicFSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AtomicFSet.java new file mode 100644 index 000000000..9396a9e6a --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AtomicFSet.java @@ -0,0 +1,72 @@ +/* + * Copyright 2014 Alexey Andreev. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * @author Nikolay A. 
Kuznetsov + */ +class AtomicFSet extends FSet { + + int index; + + public AtomicFSet(int groupIndex) { + super(groupIndex); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + int gr = getGroupIndex(); + matchResult.setConsumed(gr, stringIndex - matchResult.getConsumed(gr)); + index = stringIndex; + + return stringIndex; + } + + public int getIndex() { + return index; + } + + @Override + protected String getName() { + return "AtomicFSet"; + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return false; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AtomicJointSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AtomicJointSet.java new file mode 100644 index 000000000..92fa543ba --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/AtomicJointSet.java @@ -0,0 +1,88 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * This class represent atomic group (?>X), once X matches, this match become + * unchangeable till the end of the match. + * + * @author Nikolay A. Kuznetsov + */ +class AtomicJointSet extends NonCapJointSet { + public AtomicJointSet(ArrayList children, FSet fSet) { + super(children, fSet); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int start = matchResult.getConsumed(groupIndex); + matchResult.setConsumed(groupIndex, stringIndex); + + int size = children.size(); + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + int shift = e.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + // AtomicFset always returns true, but saves the index to run + // this next.match() from; + return next.matches(((AtomicFSet)fSet).getIndex(), testString, matchResult); + } + } + + matchResult.setConsumed(groupIndex, start); + return -1; + } + + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public AbstractSet getNext() { + return next; + } + + @Override + protected String getName() { + return "NonCapJointSet"; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BackReferenceSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BackReferenceSet.java new file 
mode 100644 index 000000000..c7cd45de1 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BackReferenceSet.java @@ -0,0 +1,120 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Back reference node, i.e. \1-9; + * + * @author Nikolay A. 
Kuznetsov + */ +class BackReferenceSet extends CIBackReferenceSet { + + public BackReferenceSet(int groupIndex, int consCounter) { + super(groupIndex, consCounter); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + String group = getString(matchResult); + if (group == null || (stringIndex + group.length()) > matchResult.getRightBound()) + return -1; + int shift = testString.toString().startsWith(group, stringIndex) ? group.length() : -1; + + if (shift < 0) { + return -1; + } + matchResult.setConsumed(consCounter, shift); + return next.matches(stringIndex + shift, testString, matchResult); + } + + @Override + public int find(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + String group = getString(matchResult); + int strLength = matchResult.getLeftBound(); + + if (group == null || (strIndex + group.length()) > strLength) + return -1; + + String testStr = testString.toString(); + + while (strIndex <= strLength) { + strIndex = testStr.indexOf(group, strIndex); + + if (strIndex < 0) + return -1; + if (next.matches(strIndex + group.length(), testString, matchResult) >= 0) { + return strIndex; + } + + strIndex++; + } + + return -1; + } + + @Override + public int findBack(int strIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + String group = getString(matchResult); + + if (group == null) + return -1; + + String testStr = testString.toString(); + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(group, lastIndex); + + if (lastIndex < 0 || lastIndex < strIndex) + return -1; + if (next.matches(lastIndex + group.length(), testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + return -1; + } + + @Override + public boolean first(AbstractSet set) { + return true; + } + + @Override + public String getName() { + return "back reference: " + this.groupIndex; //$NON-NLS-1$ + } +} diff --git 
a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BackReferencedSingleSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BackReferencedSingleSet.java new file mode 100644 index 000000000..3578288dd --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BackReferencedSingleSet.java @@ -0,0 +1,121 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay Kuznetsov + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Group node over subexpression w/o alternations. 
This node is used if current + * group is referenced via backreference. + */ + +class BackReferencedSingleSet extends SingleSet { + + /* + * This class is needed only for overwriting find() and findBack() methods + * of SingleSet class, which is being back referenced. The following example + * explains the need for such substitution: Let's consider the pattern + * ".*(.)\\1". Leading .* works as follows: finds line terminator and runs + * findBack from that point. findBack method in its turn (in contrast to + * matches) sets group boundaries on the back trace. Thus at the point we + * try to match back reference(\\1) groups are not yet set. + * + * To fix this problem we replace backreferenced groups with instances of + * this class, which will use matches instead of find; this will affect + * performance, but ensure correctness of the match. + */ + + public BackReferencedSingleSet(AbstractSet child, FSet fSet) { + super(child, fSet); + } + + public BackReferencedSingleSet(SingleSet node) { + super(node.kid, ((FSet)node.fSet)); + } + + @Override + public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int res = 0; + int lastIndex = matchResult.getRightBound(); + int startSearch = stringIndex; + + for (; startSearch <= lastIndex; startSearch++) { + int saveStart = matchResult.getStart(groupIndex); + + matchResult.setStart(groupIndex, startSearch); + res = kid.matches(startSearch, testString, matchResult); + if (res >= 0) { + res = startSearch; + break; + } else { + matchResult.setStart(groupIndex, saveStart); + } + } + + return res; + } + + @Override + public int findBack(int stringIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + int res = 0; + int startSearch = lastIndex; + + for (; startSearch >= stringIndex; startSearch--) { + int saveStart = matchResult.getStart(groupIndex); + + matchResult.setStart(groupIndex, startSearch); + res = kid.matches(startSearch, testString, matchResult); + if (res >= 
0) { + res = startSearch; + break; + } else { + matchResult.setStart(groupIndex, saveStart); + } + } + + return res; + } + + /** + * This method is used for replacement backreferenced sets. + * + * @param prev + * - node who references to this node + */ + @Override + public JointSet processBackRefReplacement() { + return null; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BehindFSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BehindFSet.java new file mode 100644 index 000000000..517571574 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/BehindFSet.java @@ -0,0 +1,62 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * FSet for lookbehind constructs. Checks if string index saved by corresponding + * jointSet in "consumers" equals to current index and return current string + * index, return -1 otherwise. + * + * @author Nikolay A. Kuznetsov + */ +class BehindFSet extends FSet { + public BehindFSet(int groupIndex) { + super(groupIndex); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + int gr = getGroupIndex(); + int rightBound = matchResult.getConsumed(gr); + return (rightBound == stringIndex) ? stringIndex : -1; + } + + @Override + protected String getName() { + return "BehindFSet"; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CIBackReferenceSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CIBackReferenceSet.java new file mode 100644 index 000000000..0e517834d --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CIBackReferenceSet.java @@ -0,0 +1,101 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Case Insensitive back reference node; + * + * @author Nikolay A. Kuznetsov + */ +class CIBackReferenceSet extends JointSet { + protected int referencedGroup; + protected int consCounter; + + public CIBackReferenceSet(int groupIndex, int consCounter) { + this.referencedGroup = groupIndex; + this.consCounter = consCounter; + } + + public int accepts(int strIndex, CharSequence testString) { + throw new TPatternSyntaxException(strIndex + ", " + testString, "", 0); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + String group = getString(matchResult); + + if (group == null || (stringIndex + group.length()) > matchResult.getRightBound()) + return -1; + + for (int i = 0; i < group.length(); i++) { + if (group.charAt(i) != testString.charAt(stringIndex + i) && + TPattern.getSupplement(group.charAt(i)) != testString.charAt(stringIndex + i)) { + return -1; + } + } + matchResult.setConsumed(consCounter, group.length()); + return next.matches(stringIndex + group.length(), testString, matchResult); + } + + @Override + public AbstractSet getNext() { + return this.next; + } + + @Override + public void setNext(AbstractSet next) { 
+ this.next = next; + } + + protected String getString(MatchResultImpl matchResult) { + String res = matchResult.getGroupNoCheck(referencedGroup); + return res; + } + + @Override + public String getName() { + return "CI back reference: " + this.groupIndex; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + int cons; + boolean res = ((cons = matchResult.getConsumed(consCounter)) < 0 || cons > 0); + matchResult.setConsumed(consCounter, -1); + return res; + } + +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CICharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CICharSet.java new file mode 100644 index 000000000..31cdae8bb --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CICharSet.java @@ -0,0 +1,67 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character in case insensitive manner. + * + * @author Nikolay A. Kuznetsov + */ +class CICharSet extends LeafSet { + + private char ch; + + private char supplement; + + public CICharSet(char ch) { + this.ch = ch; + this.supplement = TPattern.getSupplement(ch); + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + return (this.ch == testString.charAt(strIndex) || this.supplement == testString.charAt(strIndex)) ? 1 : -1; + } + + @Override + protected String getName() { + return "CI " + ch; + } + + protected char getChar() { + return ch; + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CIDecomposedCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CIDecomposedCharSet.java new file mode 100644 index 000000000..24204e94c --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CIDecomposedCharSet.java @@ -0,0 +1,50 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents case insensitive + * canonical decomposition of + * Unicode character. Is used when + * CANON_EQ flag of Pattern class + * is specified. 
+ */ +class CIDecomposedCharSet extends DecomposedCharSet{ + + /* + * Just only a stub + */ + public CIDecomposedCharSet(int [] decomp, int decomposedCharLength) { + super(decomp, decomposedCharLength); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CISequenceSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CISequenceSet.java new file mode 100644 index 000000000..1f564b966 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CISequenceSet.java @@ -0,0 +1,68 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * This class represents ASCII case insensitive character sequences. + * + * @author Nikolay A. Kuznetsov + */ +class CISequenceSet extends LeafSet { + private String string = null; + + CISequenceSet(StringBuffer substring) { + this.string = substring.toString(); + this.charCount = substring.length(); + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + for (int i = 0; i < string.length(); i++) { + if (string.charAt(i) != testString.charAt(strIndex + i) && + TPattern.getSupplement(string.charAt(i)) != testString.charAt(strIndex + i)) { + return -1; + } + } + + return string.length(); + + } + + @Override + public String getName() { + return "CI sequence: " + string; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CharClass.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CharClass.java new file mode 100644 index 000000000..5f2578cca --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CharClass.java @@ -0,0 +1,604 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/**
 * User defined character classes ([abef]). See AbstractCharClass documentation
 * for more details.
 *
 * Membership is kept in a {@link BitSet} ({@link #bits}) for as long as every
 * operand can expose its own bits. As soon as an operand cannot (its
 * {@code getBits()} returns {@code null}), the class switches to a chain of
 * anonymous {@code AbstractCharClass} delegates stored in {@link #nonBitSet}
 * and stops exposing its bit set ({@link #hideBits}).
 *
 * @author Nikolay A. Kuznetsov
 */
class CharClass extends AbstractCharClass {
    // Flag indicates if we add supplement upper/lower case
    boolean ci = false;

    // Together with ci, enables the case folding branch of add(int) for
    // characters above 128.
    boolean uci = false;

    // Flag indicates if there are unicode supplements
    boolean hasUCI = false;

    // Set by add(AbstractCharClass) once a class with altSurrogates has been
    // merged in; from then on add(int) clears surrogate bits instead of
    // setting them.
    boolean invertedSurrogates = false;

    // Set by add(AbstractCharClass) once a negative class has been merged
    // into the bit set; from then on add(int)/add(int, int) clear bits
    // instead of setting them.
    boolean inverted = false;

    // When true getBits() returns null; set at the moment nonBitSet is first
    // created.
    boolean hideBits = false;

    // Bit per code point; interpreted together with the inherited alt flag.
    BitSet bits = new BitSet();

    // Delegate used by contains() instead of bits once it is non-null.
    AbstractCharClass nonBitSet = null;

    public CharClass() {
    }

    public CharClass(boolean ci, boolean uci) {
        this.ci = ci;
        this.uci = uci;
    }

    public CharClass(boolean negative, boolean ci, boolean uci) {
        this(ci, uci);
        setNegative(negative);
    }

    /*
     * Adds a single code point.
     *
     * With ci set, an ASCII letter also gets its TPattern.getSupplement()
     * counterpart added; with uci additionally set, a value above 128 raises
     * hasUCI and is replaced by toLowerCase(toUpperCase(ch)) before being
     * stored. Surrogate code units are also recorded in lowHighSurrogates.
     * When inverted / invertedSurrogates is set, the bits are cleared rather
     * than set.
     *
     * We can use this method safely even if nonBitSet != null due to specific
     * of range constructions in regular expressions.
     */
    public CharClass add(int ch) {
        if (ci) {
            if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
                if (!inverted) {
                    bits.set(TPattern.getSupplement((char)ch));
                } else {
                    bits.clear(TPattern.getSupplement((char)ch));
                }
            } else if (uci && ch > 128) {
                hasUCI = true;
                ch = Character.toLowerCase(Character.toUpperCase(ch));
                // return this;
            }
        }

        if (Lexer.isHighSurrogate(ch) || Lexer.isLowSurrogate(ch)) {
            if (!invertedSurrogates) {
                lowHighSurrogates.set(ch - Character.MIN_SURROGATE);
            } else {
                lowHighSurrogates.clear(ch - Character.MIN_SURROGATE);
            }
        }

        if (!inverted) {
            bits.set(ch);
        } else
            bits.clear(ch);

        if (!mayContainSupplCodepoints && Character.isSupplementaryCodePoint(ch)) {
            mayContainSupplCodepoints = true;
        }

        return this;
    }

    /*
     * The difference between add(AbstractCharClass) and
     * union(AbstractCharClass) is that add() is used for constructions like
     * "[^abc\\d]" (this pattern doesn't match "1") while union is used for
     * constructions like "[^abc[\\d]]" (this pattern matches "1").
     *
     * The surrogate bit set and the main bit set are merged separately; when
     * either this class already hides its bits or cc exposes none, the merge
     * is expressed through a new nonBitSet delegate instead.
     */
    public CharClass add(final AbstractCharClass cc) {

        if (!mayContainSupplCodepoints && cc.mayContainSupplCodepoints) {
            mayContainSupplCodepoints = true;
        }

        if (!invertedSurrogates) {

            // A | !B = ! ((A ^ B) & B)
            if (cc.altSurrogates) {
                lowHighSurrogates.xor(cc.getLowHighSurrogates());
                lowHighSurrogates.and(cc.getLowHighSurrogates());
                altSurrogates = !altSurrogates;
                invertedSurrogates = true;

                // A | B
            } else {
                lowHighSurrogates.or(cc.getLowHighSurrogates());
            }
        } else {

            // !A | !B = !(A & B)
            if (cc.altSurrogates) {
                lowHighSurrogates.and(cc.getLowHighSurrogates());

                // !A | B = !(A & !B)
            } else {
                lowHighSurrogates.andNot(cc.getLowHighSurrogates());
            }
        }

        if (!hideBits && cc.getBits() != null) {
            if (!inverted) {

                // A | !B = ! ((A ^ B) & B)
                if (cc.isNegative()) {
                    bits.xor(cc.getBits());
                    bits.and(cc.getBits());
                    alt = !alt;
                    inverted = true;

                    // A | B
                } else {
                    bits.or(cc.getBits());
                }
            } else {

                // !A | !B = !(A & B)
                if (cc.isNegative()) {
                    bits.and(cc.getBits());

                    // !A | B = !(A & !B)
                } else {
                    bits.andNot(cc.getBits());
                }
            }
        } else {
            // Snapshot of alt at the time of the call; the delegates below
            // capture this value, not later changes of the field.
            final boolean curAlt = alt;

            if (nonBitSet == null) {

                if (curAlt && !inverted && bits.isEmpty()) {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return cc.contains(ch);
                        }
                    };
                    // alt = true;
                } else {

                    /*
                     * We keep the value of alt unchanged for constructions like
                     * [^[abc]fgb] by using the formula a ^ b == !a ^ !b.
                     */
                    if (curAlt) {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return !((curAlt ^ bits.get(ch)) || ((curAlt ^ inverted) ^ cc.contains(ch)));
                            }
                        };
                        // alt = true
                    } else {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return (curAlt ^ bits.get(ch)) || ((curAlt ^ inverted) ^ cc.contains(ch));
                            }
                        };
                        // alt = false
                    }
                }

                hideBits = true;
            } else {
                // Chain onto the existing delegate.
                final AbstractCharClass nb = nonBitSet;

                if (curAlt) {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return !(curAlt ^ (nb.contains(ch) || cc.contains(ch)));
                        }
                    };
                    // alt = true
                } else {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return curAlt ^ (nb.contains(ch) || cc.contains(ch));
                        }
                    };
                    // alt = false
                }
            }
        }

        return this;
    }

    /*
     * Adds the inclusive range [st, end]. When ci is off and the range does
     * not touch the surrogate block the bits are set (or cleared, if
     * inverted) in one call; otherwise every code point goes through
     * add(int).
     *
     * Throws IllegalArgumentException when st > end.
     */
    public CharClass add(int st, int end) {
        if (st > end)
            throw new IllegalArgumentException();
        if (!ci

                // no intersection with surrogate characters
                &&
                (end < Character.MIN_SURROGATE || st > Character.MAX_SURROGATE)) {
            if (!inverted) {
                bits.set(st, end + 1);
            } else {
                bits.clear(st, end + 1);
            }
        } else {
            for (int i = st; i < end + 1; i++) {
                add(i);
            }
        }
        return this;
    }

    // OR operation: merges clazz into this class, for both the surrogate bit
    // set and the main bit set (or the nonBitSet delegate chain when bits
    // are not available on either side).
    public void union(final AbstractCharClass clazz) {
        if (!mayContainSupplCodepoints && clazz.mayContainSupplCodepoints) {
            mayContainSupplCodepoints = true;
        }

        if (clazz.hasUCI())
            this.hasUCI = true;

        if (altSurrogates ^ clazz.altSurrogates) {

            // !A | B = !(A & !B)
            if (altSurrogates) {
                lowHighSurrogates.andNot(clazz.getLowHighSurrogates());

                // A | !B = !((A ^ B) & B)
            } else {
                lowHighSurrogates.xor(clazz.getLowHighSurrogates());
                lowHighSurrogates.and(clazz.getLowHighSurrogates());
                altSurrogates = true;
            }

        } else {

            // !A | !B = !(A & B)
            if (altSurrogates) {
                lowHighSurrogates.and(clazz.getLowHighSurrogates());

                // A | B
            } else {
                lowHighSurrogates.or(clazz.getLowHighSurrogates());
            }
        }

        if (!hideBits && clazz.getBits() != null) {
            if (alt ^ clazz.isNegative()) {

                // !A | B = !(A & !B)
                if (alt) {
                    bits.andNot(clazz.getBits());

                    // A | !B = !((A ^ B) & B)
                } else {
                    bits.xor(clazz.getBits());
                    bits.and(clazz.getBits());
                    alt = true;
                }

            } else {

                // !A | !B = !(A & B)
                if (alt) {
                    bits.and(clazz.getBits());

                    // A | B
                } else {
                    bits.or(clazz.getBits());
                }
            }
        } else {
            // Snapshot of alt captured by the delegates created below.
            final boolean curAlt = alt;

            if (nonBitSet == null) {

                if (!inverted && bits.isEmpty()) {
                    if (curAlt) {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return !clazz.contains(ch);
                            }
                        };
                        // alt = true
                    } else {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return clazz.contains(ch);
                            }
                        };
                        // alt = false
                    }
                } else {

                    if (curAlt) {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return !(clazz.contains(ch) || (curAlt ^ bits.get(ch)));
                            }
                        };
                        // alt = true
                    } else {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return clazz.contains(ch) || (curAlt ^ bits.get(ch));
                            }
                        };
                        // alt = false
                    }
                }
                hideBits = true;
            } else {
                // Chain onto the existing delegate.
                final AbstractCharClass nb = nonBitSet;

                if (curAlt) {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return !((curAlt ^ nb.contains(ch)) || clazz.contains(ch));
                        }
                    };
                    // alt = true
                } else {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return (curAlt ^ nb.contains(ch)) || clazz.contains(ch);
                        }
                    };
                    // alt = false
                }
            }
        }
    }

    // AND operation: restricts this class to what clazz also accepts, for
    // both the surrogate bit set and the main bit set (or the nonBitSet
    // delegate chain when bits are not available on either side).
    public void intersection(final AbstractCharClass clazz) {
        if (!mayContainSupplCodepoints && clazz.mayContainSupplCodepoints) {
            mayContainSupplCodepoints = true;
        }

        if (clazz.hasUCI())
            this.hasUCI = true;

        if (altSurrogates ^ clazz.altSurrogates) {

            // !A & B = ((A ^ B) & B)
            if (altSurrogates) {
                lowHighSurrogates.xor(clazz.getLowHighSurrogates());
                lowHighSurrogates.and(clazz.getLowHighSurrogates());
                altSurrogates = false;

                // A & !B
            } else {
                lowHighSurrogates.andNot(clazz.getLowHighSurrogates());
            }
        } else {

            // !A & !B = !(A | B)
            if (altSurrogates) {
                lowHighSurrogates.or(clazz.getLowHighSurrogates());

                // A & B
            } else {
                lowHighSurrogates.and(clazz.getLowHighSurrogates());
            }
        }

        if (!hideBits && clazz.getBits() != null) {

            if (alt ^ clazz.isNegative()) {

                // !A & B = ((A ^ B) & B)
                if (alt) {
                    bits.xor(clazz.getBits());
                    bits.and(clazz.getBits());
                    alt = false;

                    // A & !B
                } else {
                    bits.andNot(clazz.getBits());
                }
            } else {

                // !A & !B = !(A | B)
                if (alt) {
                    bits.or(clazz.getBits());

                    // A & B
                } else {
                    bits.and(clazz.getBits());
                }
            }
        } else {
            // Snapshot of alt captured by the delegates created below.
            final boolean curAlt = alt;

            if (nonBitSet == null) {

                if (!inverted && bits.isEmpty()) {
                    if (curAlt) {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return !clazz.contains(ch);
                            }
                        };
                        // alt = true
                    } else {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return clazz.contains(ch);
                            }
                        };
                        // alt = false
                    }
                } else {

                    if (curAlt) {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return !(clazz.contains(ch) && (curAlt ^ bits.get(ch)));
                            }
                        };
                        // alt = true
                    } else {
                        nonBitSet = new AbstractCharClass() {
                            @Override
                            public boolean contains(int ch) {
                                return clazz.contains(ch) && (curAlt ^ bits.get(ch));
                            }
                        };
                        // alt = false
                    }
                }
                hideBits = true;
            } else {
                // Chain onto the existing delegate.
                final AbstractCharClass nb = nonBitSet;

                if (curAlt) {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return !((curAlt ^ nb.contains(ch)) && clazz.contains(ch));
                        }
                    };
                    // alt = true
                } else {
                    nonBitSet = new AbstractCharClass() {
                        @Override
                        public boolean contains(int ch) {
                            return (curAlt ^ nb.contains(ch)) && clazz.contains(ch);
                        }
                    };
                    // alt = false
                }
            }
        }
    }

    /**
     * Returns {@code true} if character class contains symbol specified,
     * {@code false} otherwise. Note: #setNegative() method changes the
     * meaning of contains method;
     *
     * The answer is {@code alt} XOR-ed with either the bit for {@code ch} or,
     * once a delegate exists, the delegate's answer.
     *
     * @param ch code point to test
     * @return {@code true} if character class contains symbol specified;
     *
     * TODO: currently character class implementation based
     * on BitSet, but this implementation possibly will be turned to
     * combined BitSet(for first 256 symbols) and Black/Red tree for the
     * rest of UTF.
     */
    @Override
    public boolean contains(int ch) {
        if (nonBitSet == null) {
            return this.alt ^ bits.get(ch);
        } else {
            return alt ^ nonBitSet.contains(ch);
        }
    }

    /**
     * Returns the backing bit set, or {@code null} once {@link #hideBits} is
     * set.
     */
    @Override
    protected BitSet getBits() {
        if (hideBits)
            return null;
        return bits;
    }

    @Override
    protected BitSet getLowHighSurrogates() {
        return lowHighSurrogates;
    }

    /**
     * When no delegate exists, returns a new anonymous class that reads the
     * same {@link BitSet} object (not a copy) and carries this class's
     * negative flag; otherwise returns {@code this}.
     */
    @Override
    public AbstractCharClass getInstance() {

        if (nonBitSet == null) {
            final BitSet bs = getBits();

            AbstractCharClass res = new AbstractCharClass() {
                @Override
                public boolean contains(int ch) {
                    return this.alt ^ bs.get(ch);
                }

                @Override
                public String toString() {
                    StringBuilder temp = new StringBuilder();
                    for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
                        temp.append(Character.toChars(i));
                        temp.append('|');
                    }

                    // drop the trailing separator
                    if (temp.length() > 0)
                        temp.deleteCharAt(temp.length() - 1);

                    return temp.toString();
                }

            };
            return res.setNegative(isNegative());
        } else {
            return this;
        }
    }

    // for debugging purposes only: lists the set bits of the bit set,
    // separated by '|'
    @Override
    public String toString() {
        StringBuilder temp = new StringBuilder();
        for (int i = bits.nextSetBit(0); i >= 0; i = bits.nextSetBit(i + 1)) {
            temp.append(Character.toChars(i));
            temp.append('|');
        }

        // drop the trailing separator
        if (temp.length() > 0)
            temp.deleteCharAt(temp.length() - 1);

        return temp.toString();
    }

    @Override
    public boolean hasUCI() {
        return hasUCI;
    }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character. + * + * @author Nikolay A. Kuznetsov + */ +class CharSet extends LeafSet { + + private char ch = 0; + + public CharSet(char ch) { + this.ch = ch; + } + + @Override + public int charCount() { + return 1; + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + return (this.ch == testString.charAt(strIndex)) ? 
1 : -1; + } + + @Override + public int find(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String)testString; + int strLength = matchResult.getRightBound(); + + while (strIndex < strLength) { + strIndex = testStr.indexOf(ch, strIndex); + if (strIndex < 0) + return -1; + if (next.matches(strIndex + 1, testString, matchResult) >= 0) { + return strIndex; + } + strIndex++; + } + + return -1; + } + + return super.find(strIndex, testString, matchResult); + } + + @Override + public int findBack(int strIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String)testString; + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(ch, lastIndex); + if (lastIndex < 0 || lastIndex < strIndex) { + return -1; + } + + if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + + return -1; + } + + return super.findBack(strIndex, lastIndex, testString, matchResult); + } + + @Override + protected String getName() { + return "" + ch; //$NON-NLS-1$ + } + + protected char getChar() { + return ch; + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return ((CharSet)set).getChar() == ch; + } else if (set instanceof RangeSet) { + return ((RangeSet)set).accepts(0, Character.toString(ch)) > 0; + } else if (set instanceof SupplRangeSet) { + return ((SupplRangeSet)set).contains(ch); + } else if (set instanceof SupplCharSet) { + return false; + } + + return true; + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeGroupQuantifierSet.java new file mode 100644 index 000000000..425053726 --- /dev/null +++ 
b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeGroupQuantifierSet.java @@ -0,0 +1,108 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Composite (i.e. {n,m}) quantifier node for groups ("(X){n,m}") + * + * @author Nikolay A. 
Kuznetsov + */ +class CompositeGroupQuantifierSet extends GroupQuantifierSet { + + protected Quantifier quantifier = null; + + int setCounter; + + /** + * Constructs CompositeGroupQuantifierSet + * + * @param quant + * - given composite quantifier + * @param innerSet + * - given group + * @param next + * - next set after the quantifier + */ + public CompositeGroupQuantifierSet(Quantifier quant, AbstractSet innerSet, AbstractSet next, int type, + int setCounter) { + super(innerSet, next, type); + this.quantifier = quant; + this.setCounter = setCounter; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int enterCounter = matchResult.getEnterCounter(setCounter); + + if (!innerSet.hasConsumed(matchResult)) + return next.matches(stringIndex, testString, matchResult); + + // can't go inner set; + if (enterCounter >= quantifier.max()) { + return next.matches(stringIndex, testString, matchResult); + } + + // go inner set; + matchResult.setEnterCounter(setCounter, ++enterCounter); + int nextIndex = innerSet.matches(stringIndex, testString, matchResult); + + if (nextIndex < 0) { + matchResult.setEnterCounter(setCounter, --enterCounter); + if (enterCounter >= quantifier.min()) { + return next.matches(stringIndex, testString, matchResult); + } else { + matchResult.setEnterCounter(setCounter, 0); + return -1; + } + } else { + matchResult.setEnterCounter(setCounter, 0); + return nextIndex; + } + } + + public void reset() { + quantifier.resetCounter(); + } + + @Override + protected String getName() { + return quantifier.toString(); + } + + void setQuantifier(Quantifier quant) { + this.quantifier = quant; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeQuantifierSet.java new file mode 100644 index 000000000..f240b1763 --- /dev/null +++ 
b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeQuantifierSet.java @@ -0,0 +1,104 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Composite (i.e. {n,m}) quantifier node over the leaf nodes ("a{n,m}") + * + * @author Nikolay A. 
Kuznetsov + */ +class CompositeQuantifierSet extends LeafQuantifierSet { + + protected Quantifier quantifier = null; + + public CompositeQuantifierSet(Quantifier quant, LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + this.quantifier = quant; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int min = quantifier.min(); + int max = quantifier.max(); + int i = 0; + + for (; i < min; i++) { + + if (stringIndex + leaf.charCount() > matchResult.getRightBound()) { + matchResult.hitEnd = true; + return -1; + } + + int shift = leaf.accepts(stringIndex, testString); + if (shift < 1) { + return -1; + } + stringIndex += shift; + } + + for (; i < max; i++) { + int shift; + if (stringIndex + leaf.charCount() > matchResult.getRightBound() || + (shift = leaf.accepts(stringIndex, testString)) < 1) { + break; + } + stringIndex += shift; + } + + for (; i >= min; i--) { + int shift = next.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + stringIndex -= leaf.charCount(); + } + return -1; + + } + + public void reset() { + quantifier.resetCounter(); + } + + @Override + protected String getName() { + return quantifier.toString(); + } + + void setQuantifier(Quantifier quant) { + this.quantifier = quant; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeRangeSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeRangeSet.java new file mode 100644 index 000000000..3bb3f0307 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/CompositeRangeSet.java @@ -0,0 +1,182 @@ +/* + * Copyright 2014 Alexey Andreev. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * This class is used to split the range that contains surrogate characters into + * two ranges: the first consisting of these surrogate characters and the second + * consisting of all others characters from the parent range. This class + * represents the parent range split in such a manner. 
+ */ +class CompositeRangeSet extends JointSet { + + // range without surrogates + AbstractSet withoutSurrogates; + + // range containing surrogates only + AbstractSet withSurrogates; + + public CompositeRangeSet(AbstractSet withoutSurrogates, AbstractSet withSurrogates, AbstractSet next) { + this.withoutSurrogates = withoutSurrogates; + this.withSurrogates = withSurrogates; + setNext(next); + } + + public CompositeRangeSet(AbstractSet withoutSurrogates, AbstractSet withSurrogates) { + this.withoutSurrogates = withoutSurrogates; + this.withSurrogates = withSurrogates; + } + + /** + * Returns the next. + */ + @Override + public AbstractSet getNext() { + return this.next; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int shift = withoutSurrogates.matches(stringIndex, testString, matchResult); + + if (shift < 0) { + shift = withSurrogates.matches(stringIndex, testString, matchResult); + } + + if (shift >= 0) { + return shift; + } + return -1; + } + + /** + * Sets next abstract set. + * + * @param next + * The next to set. 
+ */ + @Override + public void setNext(AbstractSet next) { + this.next = next; + withSurrogates.setNext(next); + withoutSurrogates.setNext(next); + } + + public AbstractSet getSurrogates() { + return withSurrogates; + } + + public AbstractSet getWithoutSurrogates() { + return withoutSurrogates; + } + + @Override + protected String getName() { + return "CompositeRangeSet: " + " " + withoutSurrogates + " " + withSurrogates; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } + + @Override + public boolean first(AbstractSet set) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DecomposedCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DecomposedCharSet.java new file mode 100644 index 000000000..54559ebe8 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DecomposedCharSet.java @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents canonical decomposition of Unicode character. Is used when + * CANON_EQ flag of Pattern class is specified. 
+ */ +class DecomposedCharSet extends JointSet { + + /** + * Contains information about number of chars that were read for a codepoint + * last time + */ + private int readCharsForCodePoint = 1; + + /** + * UTF-16 encoding of decomposedChar + */ + private String decomposedCharUTF16 = null; + + /** + * Decomposition of the Unicode codepoint + */ + private int[] decomposedChar; + + /** + * Length of useful part of decomposedChar decomposedCharLength <= + * decomposedChar.length + */ + private int decomposedCharLength; + + public DecomposedCharSet(int[] decomposedChar, int decomposedCharLength) { + this.decomposedChar = decomposedChar; + this.decomposedCharLength = decomposedCharLength; + } + + /** + * Returns the next. + */ + @Override + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * + * @param next + * The next to set. + */ + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + + /* + * All decompositions have length that is less or equal + * Lexer.MAX_DECOMPOSITION_LENGTH + */ + int[] decCurCodePoint; + int[] decCodePoint = new int[Lexer.MAX_DECOMPOSITION_LENGTH]; + int readCodePoints = 0; + int rightBound = matchResult.getRightBound(); + int curChar; + int i = 0; + + if (strIndex >= rightBound) { + return -1; + } + + /* + * We read testString and decompose it gradually to compare with this + * decomposedChar at position strIndex + */ + curChar = codePointAt(strIndex, testString, rightBound); + strIndex += readCharsForCodePoint; + decCurCodePoint = Lexer.getDecomposition(curChar); + if (decCurCodePoint == null) { + decCodePoint[readCodePoints++] = curChar; + } else { + i = decCurCodePoint.length; + System.arraycopy(decCurCodePoint, 0, decCodePoint, 0, i); + readCodePoints += i; + } + + if (strIndex < rightBound) { + curChar = codePointAt(strIndex, testString, rightBound); + + /* + * Read 
testString until we met a decomposed char boundary and + * decompose obtained portion of testString + */ + while ((readCodePoints < Lexer.MAX_DECOMPOSITION_LENGTH) && !Lexer.isDecomposedCharBoundary(curChar)) { + + if (Lexer.hasDecompositionNonNullCanClass(curChar)) { + + /* + * A few codepoints have decompositions and non null + * canonical classes, we have to take them into + * consideration, but general rule is: if canonical class != + * 0 then no decomposition + */ + decCurCodePoint = Lexer.getDecomposition(curChar); + + /* + * Length of such decomposition is 1 or 2. See UnicodeData + * file http://www.unicode.org/Public/4.0-Update + * /UnicodeData-4.0.0.txt + */ + if (decCurCodePoint.length == 2) { + decCodePoint[readCodePoints++] = decCurCodePoint[0]; + decCodePoint[readCodePoints++] = decCurCodePoint[1]; + } else { + decCodePoint[readCodePoints++] = decCurCodePoint[0]; + } + } else { + decCodePoint[readCodePoints++] = curChar; + } + + strIndex += readCharsForCodePoint; + + if (strIndex < rightBound) { + curChar = codePointAt(strIndex, testString, rightBound); + } else { + break; + } + } + } + + /* + * Some optimization since length of decomposed char is <= 3 usually + */ + switch (readCodePoints) { + case 0: + case 1: + case 2: + break; + + case 3: + int i1 = Lexer.getCanonicalClass(decCodePoint[1]); + int i2 = Lexer.getCanonicalClass(decCodePoint[2]); + + if ((i2 != 0) && (i1 > i2)) { + i1 = decCodePoint[1]; + decCodePoint[1] = decCodePoint[2]; + decCodePoint[2] = i1; + } + break; + + default: + decCodePoint = Lexer.getCanonicalOrder(decCodePoint, readCodePoints); + } + + /* + * Compare decomposedChar with decomposed char that was just read from + * testString + */ + if (readCodePoints != decomposedCharLength) { + return -1; + } + + for (i = 0; i < readCodePoints; i++) { + if (decCodePoint[i] != decomposedChar[i]) { + return -1; + } + } + + return next.matches(strIndex, testString, matchResult); + } + + /** + * Return UTF-16 encoding of given Unicode 
codepoint. + * + * @return UTF-16 encoding + */ + private String getDecomposedChar() { + if (decomposedCharUTF16 == null) { + StringBuilder strBuff = new StringBuilder(); + + for (int i = 0; i < decomposedCharLength; i++) { + strBuff.append(Character.toChars(decomposedChar[i])); + } + decomposedCharUTF16 = strBuff.toString(); + } + return decomposedCharUTF16; + } + + @Override + protected String getName() { + return "decomposed char:" + getDecomposedChar(); //$NON-NLS-1$ + } + + /** + * Reads Unicode codepoint from input. + * + * @param strIndex + * - index to read codepoint at + * @param testString + * - input + * @param matchResult + * - auxiliary object + * @return codepoint at given strIndex at testString and + */ + public int codePointAt(int strIndex, CharSequence testString, int rightBound) { + + /* + * We store information about number of codepoints we read at variable + * readCharsForCodePoint. + */ + int curChar; + + readCharsForCodePoint = 1; + if (strIndex < rightBound - 1) { + char high = testString.charAt(strIndex++); + char low = testString.charAt(strIndex); + + if (Character.isSurrogatePair(high, low)) { + char[] curCodePointUTF16 = new char[] { high, low }; + curChar = Character.codePointAt(curCodePointUTF16, 0); + readCharsForCodePoint = 2; + } else { + curChar = high; + } + } else { + curChar = testString.charAt(strIndex); + } + + return curChar; + } + + @Override + public boolean first(AbstractSet set) { + return (set instanceof DecomposedCharSet) ? 
((DecomposedCharSet)set).getDecomposedChar().equals( + getDecomposedChar()) : true; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotAllQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotAllQuantifierSet.java new file mode 100644 index 000000000..e11126a08 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotAllQuantifierSet.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Special node for ".*" construction for any character including line + * terminators. + * + * @author Nikolay A. 
Kuznetsov + */ +class DotAllQuantifierSet extends QuantifierSet { + + public DotAllQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + int strLength = matchResult.getRightBound(); + + if (strLength <= stringIndex) { + return next.matches(stringIndex, testString, matchResult); + } + return next.findBack(stringIndex, strLength, testString, matchResult); + } + + @Override + public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + if (next.findBack(stringIndex, strLength, testString, matchResult) >= 0) { + return stringIndex; + } else { + return -1; + } + } + + @Override + protected String getName() { + return ""; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotAllSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotAllSet.java new file mode 100644 index 000000000..87a135a2f --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotAllSet.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Node accepting any character including line terminators. + * + * @author Nikolay A. Kuznetsov + */ +class DotAllSet extends JointSet { + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + + char high = testString.charAt(stringIndex); + + if (Character.isHighSurrogate(high) && (stringIndex + 2 <= strLength)) { + char low = testString.charAt(stringIndex + 1); + + if (Character.isSurrogatePair(high, low)) { + return next.matches(stringIndex + 2, testString, matchResult); + } + } + return next.matches(stringIndex + 1, testString, matchResult); + } + + @Override + protected String getName() { + return "DotAll"; + } + + @Override + public AbstractSet getNext() { + return this.next; + } + + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public int getType() { + return AbstractSet.TYPE_DOTSET; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotQuantifierSet.java new file mode 100644 index 000000000..09e74e0b6 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotQuantifierSet.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Special node for ".*" construction. The main idea here is to find line + * terminator and try to find the rest of the construction from this point. + * + * @author Nikolay A. Kuznetsov + */ +class DotQuantifierSet extends QuantifierSet { + + AbstractLineTerminator lt; + + public DotQuantifierSet(AbstractSet innerSet, AbstractSet next, int type, AbstractLineTerminator lt) { + super(innerSet, next, type); + this.lt = lt; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + int strLength = matchResult.getRightBound(); + + int startSearch = /* testString.toString().indexOf('\n', stringIndex); */ + findLineTerminator(stringIndex, strLength, testString); + + if (startSearch < 0) { + startSearch = strLength; + } + + if (startSearch <= stringIndex) { + return next.matches(stringIndex, testString, matchResult); + } + return next.findBack(stringIndex, startSearch, testString, matchResult); + } + + @Override + public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + // String testStr = testString.toString(); + int strLength = matchResult.getRightBound(); + // 1. skip line terminators ??? + // // + // we don't skip line terminators here, but return zero match instead + // // + + // 2. 
find first occurrence of the searched pattern + // // + int res = next.find(stringIndex, testString, matchResult); + + // 3. Check if we have other occurrences till the end of line + // (because .* is greedy and we need last one) + // // + if (res >= 0) { + int nextSearch = findLineTerminator(res, strLength, testString); + // testStr.indexOf('\n', res); + if (nextSearch < 0) { + nextSearch = strLength; + } + nextSearch = next.findBack(res, nextSearch, testString, matchResult); + res = (res < nextSearch) ? nextSearch : res; + } else { + return -1; + } + + // 4. find left boundary of this search + // // + int leftBound = (res > 0) ? findBackLineTerminator(stringIndex, res - 1, testString)/* + * testStr + * . + * lastIndexOf + * ( + * '\n' + * , + * res + * - + * 1 + * ) + */ + : (res == 0) ? 0 : -1; + res = (leftBound >= stringIndex) ? ((leftBound < res) ? leftBound + 1 : leftBound) : stringIndex; + + return res; + } + + /* + * All line terminators are from Basic Multilingual Pane + */ + private int findLineTerminator(int from, int to, CharSequence testString) { + for (int i = from; i < to; i++) { + if (lt.isLineTerminator(testString.charAt(i))) { + return i; + } + } + return -1; + } + + private int findBackLineTerminator(int from, int to, CharSequence testString) { + for (int i = to; i >= from; i--) { + if (lt.isLineTerminator(testString.charAt(i))) { + return i; + } + } + return -1; + } + + @Override + protected String getName() { + return ""; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotSet.java new file mode 100644 index 000000000..25086e364 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/DotSet.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Node accepting any character except line terminators; + * + * @author Nikolay A. Kuznetsov + */ +final class DotSet extends JointSet { + + AbstractLineTerminator lt; + + public DotSet(AbstractLineTerminator lt) { + super(); + this.lt = lt; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + char high = testString.charAt(stringIndex); + + if (Character.isHighSurrogate(high) && (stringIndex + 2 <= strLength)) { + char low = testString.charAt(stringIndex + 1); + + if (Character.isSurrogatePair(high, low)) { + return lt.isLineTerminator(Character.toCodePoint(high, low)) ? -1 : next.matches(stringIndex + 2, + testString, matchResult); + } + } + + return lt.isLineTerminator(high) ? 
-1 : next.matches(stringIndex + 1, testString, matchResult); + } + + @Override + protected String getName() { + return "."; //$NON-NLS-1$ + } + + @Override + public AbstractSet getNext() { + return this.next; + } + + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public int getType() { + return AbstractSet.TYPE_DOTSET; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EOISet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EOISet.java new file mode 100644 index 000000000..bb554b1ec --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EOISet.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents end of input '\z', i.e. matches only character after the last one; + * + * @author Nikolay A. 
Kuznetsov + */ +class EOISet extends AbstractSet { + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int rightBound = matchResult.hasTransparentBounds() ? testString.length() : matchResult.getRightBound(); + if (stringIndex < rightBound) + return -1; + + matchResult.hitEnd = true; + matchResult.requireEnd = true; + + return next.matches(stringIndex, testString, matchResult); + } + + /** + * Returns false, enough for quantifiers + */ + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return "EOI"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EOLSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EOLSet.java new file mode 100644 index 000000000..f94414dfd --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EOLSet.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character. + * + * @author Nikolay A. 
Kuznetsov + */ +final class EOLSet extends AbstractSet { + private int consCounter; + + public EOLSet(int counter) { + this.consCounter = counter; + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + int rightBound = matchResult.hasAnchoringBounds() ? matchResult.getRightBound() : testString.length(); + + if (strIndex >= rightBound) { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + + // check final line terminator; + if ((rightBound - strIndex) == 2 && testString.charAt(strIndex) == '\r' && + testString.charAt(strIndex + 1) == '\n') { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + char ch; + + if ((rightBound - strIndex) == 1 && + (((ch = testString.charAt(strIndex)) == '\n' || ch == '\r' || ch == '\u0085' || (ch | 1) == '\u2029'))) { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + int cons; + boolean res = ((cons = matchResult.getConsumed(consCounter)) < 0 || cons > 0); + matchResult.setConsumed(consCounter, -1); + return res; + } + + @Override + protected String getName() { + return ""; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EmptySet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EmptySet.java new file mode 100644 index 000000000..8e092b52c --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/EmptySet.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Valid constant zero character match. + * + * @author Nikolay A. Kuznetsov + */ +class EmptySet extends LeafSet { + public EmptySet(AbstractSet next) { + super(next); + charCount = 0; + } + + @Override + public int accepts(int stringIndex, CharSequence testString) { + return 0; + } + + @Override + public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + int startStr = matchResult.getLeftBound(); + + while (stringIndex <= strLength) { + + // check for supplementary codepoints + if (stringIndex < strLength) { + char low = testString.charAt(stringIndex); + + if (Character.isLowSurrogate(low)) { + + if (stringIndex > startStr) { + char high = testString.charAt(stringIndex - 1); + if (Character.isHighSurrogate(high)) { + stringIndex++; + continue; + } + } + } + } + + if (next.matches(stringIndex, testString, matchResult) >= 0) { + return stringIndex; + } + stringIndex++; + } + + return -1; + } + + @Override + public int findBack(int stringIndex, int startSearch, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + int startStr = matchResult.getLeftBound(); + + while (startSearch >= stringIndex) { + + // check for supplementary codepoints + if (startSearch < 
strLength) { + char low = testString.charAt(startSearch); + + if (Character.isLowSurrogate(low)) { + + if (startSearch > startStr) { + char high = testString.charAt(startSearch - 1); + if (Character.isHighSurrogate(high)) { + startSearch--; + continue; + } + } + } + } + + if (next.matches(startSearch, testString, matchResult) >= 0) { + return startSearch; + } + startSearch--; + } + + return -1; + } + + @Override + protected String getName() { + return ""; + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return false; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/FSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/FSet.java new file mode 100644 index 000000000..0d829a366 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/FSet.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * The node which marks end of the particular group. + * + * @author Nikolay A. 
Kuznetsov + */ +class FSet extends AbstractSet { + + static PossessiveFSet posFSet = new PossessiveFSet(); + + boolean isBackReferenced = false; + + private int groupIndex; + + public FSet(int groupIndex) { + this.groupIndex = groupIndex; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int end = matchResult.getEnd(groupIndex); + matchResult.setEnd(groupIndex, stringIndex); + int shift = next.matches(stringIndex, testString, matchResult); + /* + * if(shift >=0 && matchResult.getEnd(groupIndex) == -1) { + * matchResult.setEnd(groupIndex, stringIndex); } + */ + if (shift < 0) + matchResult.setEnd(groupIndex, end); + return shift; + } + + public int getGroupIndex() { + return groupIndex; + } + + @Override + protected String getName() { + return "fSet"; //$NON-NLS-1$ + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return false; + } + + /** + * Marks the end of the particular group and not take into account possible + * kickbacks(required for atomic groups, for instance) + * + */ + static class PossessiveFSet extends AbstractSet { + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + return stringIndex; + } + + @Override + protected String getName() { + return "posFSet"; //$NON-NLS-1$ + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return false; + } + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/FinalSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/FinalSet.java new file mode 100644 index 000000000..d67ad03d2 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/FinalSet.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Special construction which marks end of pattern. + * + * @author Nikolay A. Kuznetsov + */ +class FinalSet extends FSet { + + public FinalSet() { + super(0); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + if (matchResult.mode() == TMatcher.MODE_FIND || stringIndex == matchResult.getRightBound()) { + matchResult.setValid(); + matchResult.setEnd(0, stringIndex); + return stringIndex; + } + return -1; + } + + @Override + protected String getName() { + return "FinalSet"; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/GroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/GroupQuantifierSet.java new file mode 100644 index 000000000..38c402cba --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/GroupQuantifierSet.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Default quantifier over groups, in fact this type of quantifier is generally + * used for constructions we cant identify number of characters they consume. + * + * @author Nikolay A. Kuznetsov + */ +class GroupQuantifierSet extends QuantifierSet { + + public GroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + if (!innerSet.hasConsumed(matchResult)) + return next.matches(stringIndex, testString, matchResult);// return + // -1; + + int nextIndex = innerSet.matches(stringIndex, testString, matchResult); + + if (nextIndex < 0) { + return next.matches(stringIndex, testString, matchResult); + } else { + return nextIndex; + } + } + + @Override + protected String getName() { + return ""; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/HangulDecomposedCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/HangulDecomposedCharSet.java new file mode 100644 index 000000000..3a41acc93 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/HangulDecomposedCharSet.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents canonical decomposition of Hangul syllable. Is used when CANON_EQ + * flag of Pattern class is specified. + */ +class HangulDecomposedCharSet extends JointSet { + + /** + * Decomposed Hangul syllable. + */ + private char[] decomposedChar; + + /** + * String representing syllable + */ + private String decomposedCharUTF16 = null; + + /** + * Length of useful part of decomposedChar decomposedCharLength <= + * decomposedChar.length + */ + private int decomposedCharLength; + + public HangulDecomposedCharSet(char[] decomposedChar, int decomposedCharLength) { + this.decomposedChar = decomposedChar; + this.decomposedCharLength = decomposedCharLength; + } + + /** + * Returns the next. + */ + @Override + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * + * @param next + * The next to set. + */ + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + /** + * Give string representation of this. + * + * @return - string representation. + */ + private String getDecomposedChar() { + return (decomposedCharUTF16 == null) ? 
(decomposedCharUTF16 = new String(decomposedChar)) : decomposedCharUTF16; + } + + @Override + protected String getName() { + return "decomposed Hangul syllable:" + getDecomposedChar(); //$NON-NLS-1$ + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + + /* + * All decompositions for Hangul syllables have length that is less or + * equal Lexer.MAX_DECOMPOSITION_LENGTH + */ + int rightBound = matchResult.getRightBound(); + int SyllIndex = 0; + int[] decompSyllable = new int[Lexer.MAX_HANGUL_DECOMPOSITION_LENGTH]; + int[] decompCurSymb; + char curSymb; + + /* + * For details about Hangul composition and decomposition see + * http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf + * "3.12 Conjoining Jamo Behavior" + */ + int LIndex = -1; + int VIndex = -1; + int TIndex = -1; + + if (strIndex >= rightBound) { + return -1; + } + curSymb = testString.charAt(strIndex++); + decompCurSymb = Lexer.getHangulDecomposition(curSymb); + + if (decompCurSymb == null) { + + /* + * We deal with ordinary letter or sequence of jamos at strIndex at + * testString. 
+ */ + decompSyllable[SyllIndex++] = curSymb; + LIndex = curSymb - Lexer.LBase; + + if ((LIndex < 0) || (LIndex >= Lexer.LCount)) { + + /* + * Ordinary letter, that doesn't match this + */ + return -1; + } + + if (strIndex < rightBound) { + curSymb = testString.charAt(strIndex); + VIndex = curSymb - Lexer.VBase; + } + + if ((VIndex < 0) || (VIndex >= Lexer.VCount)) { + + /* + * Single L jamo doesn't compose Hangul syllable, so doesn't + * match + */ + return -1; + } + strIndex++; + decompSyllable[SyllIndex++] = curSymb; + + if (strIndex < rightBound) { + curSymb = testString.charAt(strIndex); + TIndex = curSymb - Lexer.TBase; + } + + if ((TIndex < 0) || (TIndex >= Lexer.TCount)) { + + /* + * We deal with LV syllable at testString, so compare it to this + */ + return ((decomposedCharLength == 2) && (decompSyllable[0] == decomposedChar[0]) && (decompSyllable[1] == decomposedChar[1])) ? next + .matches(strIndex, testString, matchResult) : -1; + } + strIndex++; + decompSyllable[SyllIndex++] = curSymb; + + /* + * We deal with LVT syllable at testString, so compare it to this + */ + return ((decomposedCharLength == 3) && (decompSyllable[0] == decomposedChar[0]) && + (decompSyllable[1] == decomposedChar[1]) && (decompSyllable[2] == decomposedChar[2])) ? next + .matches(strIndex, testString, matchResult) : -1; + } else { + + /* + * We deal with Hangul syllable at strIndex at testString. So we + * decomposed it to compare with this. + */ + int i = 0; + + if (decompCurSymb.length != decomposedCharLength) { + return -1; + } + + for (; i < decomposedCharLength; i++) { + if (decompCurSymb[i] != decomposedChar[i]) { + return -1; + } + } + return next.matches(strIndex, testString, matchResult); + } + } + + @Override + public boolean first(AbstractSet set) { + return (set instanceof HangulDecomposedCharSet) ? 
((HangulDecomposedCharSet)set).getDecomposedChar().equals( + getDecomposedChar()) : true; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/HighSurrogateCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/HighSurrogateCharSet.java new file mode 100644 index 000000000..911df1c88 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/HighSurrogateCharSet.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * This class represents high surrogate character. + */ +class HighSurrogateCharSet extends JointSet { + + /* + * Note that we can use high and low surrogate characters that don't combine + * into supplementary code point. See + * http://www.unicode.org/reports/tr18/#Supplementary_Characters + */ + + private char high; + + public HighSurrogateCharSet(char high) { + this.high = high; + } + + /** + * Returns the next. 
+ */ + @Override + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * + * @param next + * The next to set. + */ + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + + char high = testString.charAt(stringIndex); + + if (stringIndex + 1 < strLength) { + char low = testString.charAt(stringIndex + 1); + + /* + * we consider high surrogate followed by low surrogate as a + * codepoint + */ + if (Character.isLowSurrogate(low)) { + return -1; + } + } + + if (this.high == high) { + return next.matches(stringIndex + 1, testString, matchResult); + } + + return -1; + } + + @Override + public int find(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String)testString; + int strLength = matchResult.getRightBound(); + + while (strIndex < strLength) { + + strIndex = testStr.indexOf(high, strIndex); + if (strIndex < 0) + return -1; + + if (strIndex + 1 < strLength) { + + /* + * we consider high surrogate followed by low surrogate as a + * codepoint + */ + if (Character.isLowSurrogate(testStr.charAt(strIndex + 1))) { + strIndex += 2; + continue; + } + } + + if (next.matches(strIndex + 1, testString, matchResult) >= 0) { + return strIndex; + } + strIndex++; + } + + return -1; + } + + return super.find(strIndex, testString, matchResult); + } + + @Override + public int findBack(int strIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String)testString; + int strLength = matchResult.getRightBound(); + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(high, lastIndex); + if (lastIndex < 0 || lastIndex < 
strIndex) { + return -1; + } + + if (lastIndex + 1 < strLength) { + + /* + * we consider high surrogate followed by low surrogate as a + * codepoint + */ + if (Character.isLowSurrogate(testStr.charAt(lastIndex + 1))) { + lastIndex--; + continue; + } + } + + if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + + return -1; + } + + return super.findBack(strIndex, lastIndex, testString, matchResult); + } + + @Override + protected String getName() { + return "" + high; + } + + protected int getChar() { + return high; + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } else if (set instanceof SupplRangeSet) { + return false; + } else if (set instanceof SupplCharSet) { + return false; + } else if (set instanceof LowSurrogateCharSet) { + return false; + } else if (set instanceof HighSurrogateCharSet) { + return ((HighSurrogateCharSet)set).high == this.high; + } + + return true; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/I18n.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/I18n.java new file mode 100644 index 000000000..e1f950959 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/I18n.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
import java.text.MessageFormat;

/**
 * Internationalization stub. All messages of the regex package go through
 * this class, which currently returns them untranslated. It is meant to be
 * replaced by a real internationalization utility later.
 */
class I18n {
    /** Returns {@code message} unchanged. */
    public static String getMessage(String message) {
        return message;
    }

    /**
     * Formats {@code message} as a {@link MessageFormat} pattern with a single
     * argument bound to <code>{0}</code>.
     */
    public static String getFormattedMessage(String message, Object arg1) {
        Object[] arguments = {arg1};
        return MessageFormat.format(message, arguments);
    }

    /**
     * Formats {@code message} as a {@link MessageFormat} pattern with two
     * arguments bound to <code>{0}</code> and <code>{1}</code>.
     */
    public static String getFormattedMessage(String message, Object arg1, Object arg2) {
        Object[] arguments = {arg1, arg2};
        return MessageFormat.format(message, arguments);
    }

}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Hashtable implementation for int arrays. + */ +class IntArrHash { + final int[] table; + + final Object[] values; + + final int mask; + + public IntArrHash(int size) { + int tmpMask = 0; + while (size >= tmpMask) { + tmpMask = (tmpMask << 1) | 1; + } + mask = (tmpMask << 1) | 1; + table = new int[mask + 1]; + values = new Object [mask + 1]; + } + + public void put(int key, int [] value) { + int i = 0; + int hashCode = key & mask; + + for (; ; ) { + if (table[hashCode] == 0 // empty + || table[hashCode] == key) { // rewrite + table[hashCode] = key; + values[hashCode] = value; + return; + } + i++; + i &= mask; + + hashCode += i; + hashCode &= mask; + } + } + + public int [] get(int key) { + int hashCode = key & mask; + int i = 0; + int storedKey; + + for (; ; ) { + storedKey = table[hashCode]; + + if (storedKey == 0) { // empty + return null; + } + + if (storedKey == key) { + return (int []) values[hashCode]; + } + + i++; + i &= mask; + + hashCode += i; + hashCode &= mask; + } + } + } \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/IntHash.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/IntHash.java new file mode 100644 index 000000000..295c04620 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/IntHash.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
/**
 * Open-addressing hash table from {@code int} keys to {@code int} values.
 *
 * The table is sized once in the constructor and never grows. A slot whose
 * stored key is 0 is treated as empty, so the key 0 itself is kept outside
 * the table in dedicated fields. Lookups of absent keys return the
 * {@code size} passed to the constructor.
 */
class IntHash {
    int[] table;

    int[] values;

    int mask;

    int size; // maximum shift

    /** Whether a value has been stored for key 0. */
    private boolean hasZeroKey;

    /** Value stored for key 0, which cannot live in {@link #table}. */
    private int zeroKeyValue;

    /**
     * @param size
     *            expected number of entries; also the value that
     *            {@link #get(int)} returns for absent keys.
     */
    public IntHash(int size) {
        while (size >= mask) {
            mask = (mask << 1) | 1;
        }
        mask = (mask << 1) | 1;
        table = new int[mask + 1];
        values = new int[mask + 1];
        this.size = size;
    }

    /**
     * Associates {@code value} with {@code key}, replacing any earlier value.
     *
     * @throws IllegalStateException
     *             if every slot is occupied by a different key.
     */
    public void put(int key, int value) {
        if (key == 0) {
            // 0 marks an empty slot, so it cannot be stored in the table itself
            hasZeroKey = true;
            zeroKeyValue = value;
            return;
        }

        int i = 0;
        int hashCode = key & mask;

        for (;;) {
            if (table[hashCode] == 0 // empty
                    || table[hashCode] == key) { // rewrite
                table[hashCode] = key;
                values[hashCode] = value;
                return;
            }
            i++;
            i &= mask;
            if (i == 0) {
                // the step wrapped around: every slot has been probed once
                throw new IllegalStateException("IntHash is full");
            }

            hashCode += i;
            hashCode &= mask;
        }
    }

    /**
     * Returns the value stored for {@code key}, or the constructor's
     * {@code size} if there is none.
     */
    public int get(int key) {
        if (key == 0) {
            return hasZeroKey ? zeroKeyValue : size;
        }

        int hashCode = key & mask;
        int i = 0;
        int storedKey;

        for (;;) {
            storedKey = table[hashCode];

            if (storedKey == 0) { // empty
                return size;
            }

            if (storedKey == key) {
                return values[hashCode];
            }

            i++;
            i &= mask;
            if (i == 0) {
                // every slot probed without finding the key or an empty slot
                return size;
            }

            hashCode += i;
            hashCode &= mask;
        }
    }
}
b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/JointSet.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; +import java.util.Iterator; + +/** + * Represents group, which is alternation of other subexpression. One should + * think about "group" in this model as JointSet opening group and corresponding + * FSet closing group. 
+ */ +class JointSet extends AbstractSet { + + protected ArrayList children; + + protected AbstractSet fSet; + + protected int groupIndex; + + protected JointSet() { + } + + public JointSet(ArrayList children, FSet fSet) { + this.children = children; + this.fSet = fSet; + this.groupIndex = fSet.getGroupIndex(); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + if (children == null) { + return -1; + } + int start = matchResult.getStart(groupIndex); + matchResult.setStart(groupIndex, stringIndex); + int size = children.size(); + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + int shift = e.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + } + matchResult.setStart(groupIndex, start); + return -1; + } + + @Override + public void setNext(AbstractSet next) { + fSet.setNext(next); + } + + @Override + public AbstractSet getNext() { + return fSet.getNext(); + } + + @Override + protected String getName() { + return "JointSet"; //$NON-NLS-1$ + } + + public int getGroup() { + return groupIndex; + } + + @Override + public boolean first(AbstractSet set) { + if (children != null) { + for (Iterator i = children.iterator(); i.hasNext();) { + if ((i.next()).first(set)) { + return true; + } + } + } + + return false; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return !(matchResult.getEnd(groupIndex) >= 0 && matchResult.getStart(groupIndex) == matchResult + .getEnd(groupIndex)); + } + + /** + * This method is used for traversing nodes after the first stage of + * compilation. 
+ */ + @Override + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (fSet != null && !fSet.isSecondPassVisited) { + fSet.processSecondPass(); + } + + if (children != null) { + int childrenSize = children.size(); + + for (int i = 0; i < childrenSize; i++) { + AbstractSet child = children.get(i); + JointSet set = child.processBackRefReplacement(); + + if (set != null) { + child.isSecondPassVisited = true; + children.remove(i); + children.add(i, set); + child = set; + } + + if (!child.isSecondPassVisited) { + child.processSecondPass(); + } + } + } + + if (next != null) { + super.processSecondPass(); + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LeafQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LeafQuantifierSet.java new file mode 100644 index 000000000..f68476eea --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LeafQuantifierSet.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * @author Nikolay A. 
Kuznetsov + */ +class LeafQuantifierSet extends QuantifierSet { + + protected LeafSet leaf; + + public LeafQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + this.leaf = innerSet; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int i = 0; + int shift = 0; + + while (stringIndex + leaf.charCount() <= matchResult.getRightBound() && + (shift = leaf.accepts(stringIndex, testString)) > 0) { + stringIndex += shift; + i++; + } + + for (; i >= 0; i--) { + shift = next.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + + stringIndex -= leaf.charCount(); + } + return -1; + } + + @Override + protected String getName() { + return ""; + } + + /** + * Sets an inner set. + * + * @param innerSet + * The innerSet to set. + */ + @Override + public void setInnerSet(AbstractSet innerSet) { + if (!(innerSet instanceof LeafSet)) + throw new RuntimeException(""); + super.setInnerSet(innerSet); + this.leaf = (LeafSet)innerSet; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LeafSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LeafSet.java new file mode 100644 index 000000000..d517990b3 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LeafSet.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Base class for nodes representing leaf tokens of the RE, those who consumes + * fixed number of characters. + * + * @author Nikolay A. Kuznetsov + */ +abstract class LeafSet extends AbstractSet { + + protected int charCount = 1; + + public LeafSet(AbstractSet next) { + super(next); + setType(AbstractSet.TYPE_LEAF); + } + + public LeafSet() { + } + + /** + * Returns "shift", the number of accepted chars commonly internal function, + * but called by quantifiers. + */ + public abstract int accepts(int stringIndex, CharSequence testString); + + /** + * Checks if we can enter this state and pass the control to the next one. + * Return positive value if match succeeds, negative otherwise. + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + if (stringIndex + charCount() > matchResult.getRightBound()) { + matchResult.hitEnd = true; + return -1; + } + + int shift = accepts(stringIndex, testString); + if (shift < 0) { + return -1; + } + + return next.matches(stringIndex + shift, testString, matchResult); + } + + /** + * Returns number of characters this node consumes. + * + * @return number of characters this node consumes. 
+ */ + public int charCount() { + return charCount; + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/Lexer.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/Lexer.java new file mode 100644 index 000000000..2b7d47a6a --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/Lexer.java @@ -0,0 +1,1152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.MissingResourceException; + +/** + * The purpose of this class is to break given pattern into RE tokens; + * + * @author Nikolay A. 
Kuznetsov + */ +class Lexer { + + public static final int CHAR_DOLLAR = 0xe0000000 | '$'; + + public static final int CHAR_RIGHT_PARENTHESIS = 0xe0000000 | ')'; + + public static final int CHAR_LEFT_SQUARE_BRACKET = 0xe0000000 | '['; + + public static final int CHAR_RIGHT_SQUARE_BRACKET = 0xe0000000 | ']'; + + public static final int CHAR_CARET = 0xe0000000 | '^'; + + public static final int CHAR_VERTICAL_BAR = 0xe0000000 | '|'; + + public static final int CHAR_AMPERSAND = 0xe0000000 | '&'; + + public static final int CHAR_HYPHEN = 0xe0000000 | '-'; + + public static final int CHAR_DOT = 0xe0000000 | '.'; + + public static final int QMOD_GREEDY = 0xe0000000; + + public static final int QMOD_RELUCTANT = 0xc0000000; + + public static final int QMOD_POSSESSIVE = 0x80000000; + + public static final int QUANT_STAR = QMOD_GREEDY | '*'; + + public static final int QUANT_STAR_P = QMOD_POSSESSIVE | '*'; + + public static final int QUANT_STAR_R = QMOD_RELUCTANT | '*'; + + public static final int QUANT_PLUS = QMOD_GREEDY | '+'; + + public static final int QUANT_PLUS_P = QMOD_POSSESSIVE | '+'; + + public static final int QUANT_PLUS_R = QMOD_RELUCTANT | '+'; + + public static final int QUANT_ALT = QMOD_GREEDY | '?'; + + public static final int QUANT_ALT_P = QMOD_POSSESSIVE | '?'; + + public static final int QUANT_ALT_R = QMOD_RELUCTANT | '?'; + + public static final int QUANT_COMP = QMOD_GREEDY | '{'; + + public static final int QUANT_COMP_P = QMOD_POSSESSIVE | '{'; + + public static final int QUANT_COMP_R = QMOD_RELUCTANT | '{'; + + public static final int CHAR_LEFT_PARENTHESIS = 0x80000000 | '('; + + public static final int CHAR_NONCAP_GROUP = 0xc0000000 | '('; + + public static final int CHAR_POS_LOOKAHEAD = 0xe0000000 | '('; + + public static final int CHAR_NEG_LOOKAHEAD = 0xf0000000 | '('; + + public static final int CHAR_POS_LOOKBEHIND = 0xf8000000 | '('; + + public static final int CHAR_NEG_LOOKBEHIND = 0xfc000000 | '('; + + public static final int CHAR_ATOMIC_GROUP = 
0xfe000000 | '('; + + public static final int CHAR_FLAGS = 0xff000000 | '('; + + public static final int CHAR_START_OF_INPUT = 0x80000000 | 'A'; + + public static final int CHAR_WORD_BOUND = 0x80000000 | 'b'; + + public static final int CHAR_NONWORD_BOUND = 0x80000000 | 'B'; + + public static final int CHAR_PREVIOUS_MATCH = 0x80000000 | 'G'; + + public static final int CHAR_END_OF_INPUT = 0x80000000 | 'z'; + + public static final int CHAR_END_OF_LINE = 0x80000000 | 'Z'; + + public static final int MODE_PATTERN = 1 << 0; + + public static final int MODE_RANGE = 1 << 1; + + public static final int MODE_ESCAPE = 1 << 2; + + // maximum length of decomposition + static final int MAX_DECOMPOSITION_LENGTH = 4; + + /* + * maximum length of Hangul decomposition note that + * MAX_HANGUL_DECOMPOSITION_LENGTH <= MAX_DECOMPOSITION_LENGTH + */ + static final int MAX_HANGUL_DECOMPOSITION_LENGTH = 3; + + /* + * Following constants are needed for Hangul canonical decomposition. Hangul + * decomposition algorithm and constants are taken according to description + * at http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf + * "3.12 Conjoining Jamo Behavior" + */ + static final int SBase = 0xAC00; + + static final int LBase = 0x1100; + + static final int VBase = 0x1161; + + static final int TBase = 0x11A7; + + static final int SCount = 11172; + + static final int LCount = 19; + + static final int VCount = 21; + + static final int TCount = 28; + + static final int NCount = 588; + + // table that contains canonical decomposition mappings + private static IntArrHash decompTable = null; + + // table that contains canonical combining classes + private static IntHash canonClassesTable = null; + + private static int canonClassesTableSize; + + /* + * Table that contains information about Unicode codepoints with single + * codepoint decomposition + */ + private static IntHash singleDecompTable = null; + + private static int singleDecompTableSize; + + private char[] pattern = null; + + private 
int flags = 0; + + private int mode = 1; + + // when in literal mode, this field will save the previous one + private int saved_mode = 0; + + // previous char read + private int lookBack; + + // current character read + private int ch; + + // next character + private int lookAhead; + + // index of last char in pattern plus one + private int patternFullLength = 0; + + // cur special token + private SpecialToken curST = null; + + // next special token + private SpecialToken lookAheadST = null; + + // cur char being processed + private int index = 0; + + // previous non-whitespace character index; + private int prevNW = 0; + + // cur token start index + private int curToc = 0; + + // look ahead token index + private int lookAheadToc = 0; + + // original string representing pattern + private String orig = null; + + public Lexer(String pattern, int flags) { + orig = pattern; + if ((flags & TPattern.LITERAL) > 0) { + pattern = TPattern.quote(pattern); + } else if ((flags & TPattern.CANON_EQ) > 0) { + pattern = Lexer.normalize(pattern); + } + + this.pattern = new char[pattern.length() + 2]; + System.arraycopy(pattern.toCharArray(), 0, this.pattern, 0, pattern.length()); + this.pattern[this.pattern.length - 1] = 0; + this.pattern[this.pattern.length - 2] = 0; + patternFullLength = this.pattern.length; + this.flags = flags; + // read first two tokens; + movePointer(); + movePointer(); + + } + + /** + * Returns current character w/o reading next one; if there are no more + * characters returns 0; + * + * @return current character; + */ + public int peek() { + return ch; + } + + /** + * Set the Lexer to PATTERN or RANGE mode; Lexer interpret character two + * different ways in parser or range modes. 
+ * + * @param mode + * Lexer.PATTERN or Lexer.RANGE + */ + public void setMode(int mode) { + if (mode > 0 && mode < 3) { + this.mode = mode; + } + + if (mode == Lexer.MODE_PATTERN) { + reread(); + } + } + + /** + * Restores flags for Lexer + * + * @param flags + */ + public void restoreFlags(int flags) { + this.flags = flags; + lookAhead = ch; + lookAheadST = curST; + + // curToc is an index of closing bracket ) + index = curToc + 1; + lookAheadToc = curToc; + movePointer(); + } + + public SpecialToken peekSpecial() { + return curST; + } + + /** + * Returns true, if current token is special, i.e. quantifier, or other + * compound token. + * + * @return - true if current token is special, false otherwise. + */ + public boolean isSpecial() { + return curST != null; + } + + public boolean isQuantifier() { + return isSpecial() && curST.getType() == SpecialToken.TOK_QUANTIFIER; + } + + public boolean isNextSpecial() { + return lookAheadST != null; + } + + /** + * Returns current character and moves string index to the next one; + * + */ + public int next() { + movePointer(); + return lookBack; + } + + /** + * Returns current special token and moves string index to the next one; + */ + public SpecialToken nextSpecial() { + SpecialToken res = curST; + movePointer(); + return res; + } + + /** + * Returns nest symbol read. + */ + public int lookAhead() { + return lookAhead; + } + + /** + * Returns previous character. + */ + public int back() { + return lookBack; + } + + /** + * Normalize given expression. + * + * @param input + * - expression to normalize + * @return normalized expression. + */ + static String normalize(String input) { + return input; + } + + /** + * Rearrange codepoints according to canonical order. + * + * @param inputInts + * - array that contains Unicode codepoints + * @param length + * - index of last Unicode codepoint plus 1 + * + * @return array that contains rearranged codepoints. 
+ */ + static int[] getCanonicalOrder(int[] inputInts, int length) { + int inputLength = (length < inputInts.length) ? length : inputInts.length; + + /* + * Simple bubble-sort algorithm. Note that many codepoints have 0 + * canonical class, so this algorithm works almost lineary in + * overwhelming majority of cases. This is due to specific of Unicode + * combining classes and codepoints. + */ + for (int i = 1; i < inputLength; i++) { + int j = i - 1; + int iCanonicalClass = getCanonicalClass(inputInts[i]); + int ch; + + if (iCanonicalClass == 0) { + continue; + } + + while (j > -1) { + if (getCanonicalClass(inputInts[j]) > iCanonicalClass) { + j = j - 1; + } else { + break; + } + } + + ch = inputInts[i]; + for (int k = i; k > j + 1; k--) { + inputInts[k] = inputInts[k - 1]; + } + inputInts[j + 1] = ch; + } + + return inputInts; + } + + /** + * Reread current character, may be require if previous token changes mode + * to one with different character interpretation. + * + */ + private void reread() { + lookAhead = ch; + lookAheadST = curST; + index = lookAheadToc; + lookAheadToc = curToc; + movePointer(); + } + + /** + * Moves pointer one position right; save current character to lookBack; + * lookAhead to current one and finally read one more to lookAhead; + */ + private void movePointer() { + // swap pointers + lookBack = ch; + ch = lookAhead; + curST = lookAheadST; + curToc = lookAheadToc; + lookAheadToc = index; + boolean reread; + do { + reread = false; + // read next character analyze it and construct token: + // // + + lookAhead = (index < pattern.length) ? nextCodePoint() : 0; + lookAheadST = null; + + if (mode == Lexer.MODE_ESCAPE) { + if (lookAhead == '\\') { + + // need not care about supplementary codepoints here + lookAhead = (index < pattern.length) ? pattern[nextIndex()] : 0; + + switch (lookAhead) { + case 'E': { + mode = saved_mode; + + lookAhead = (index <= pattern.length - 2) ? 
nextCodePoint() : 0; + break; + } + + default: { + lookAhead = '\\'; + index = prevNW; + return; + } + } + } else { + return; + } + } + + if (lookAhead == '\\') { + + lookAhead = (index < pattern.length - 2) ? nextCodePoint() : -1; + switch (lookAhead) { + case -1: + throw new TPatternSyntaxException("", this.toString(), index); + case 'P': + case 'p': { + String cs = parseCharClassName(); + boolean negative = false; + + if (lookAhead == 'P') + negative = true; + try { + lookAheadST = AbstractCharClass.getPredefinedClass(cs, negative); + } catch (MissingResourceException mre) { + throw new TPatternSyntaxException("", this.toString(), index); + } + lookAhead = 0; + break; + } + + case 'w': + case 's': + case 'd': + case 'W': + case 'S': + case 'D': { + lookAheadST = CharClass.getPredefinedClass(new String(pattern, prevNW, 1), false); + lookAhead = 0; + break; + } + + case 'Q': { + saved_mode = mode; + mode = Lexer.MODE_ESCAPE; + reread = true; + break; + } + + case 't': + lookAhead = '\t'; + break; + case 'n': + lookAhead = '\n'; + break; + case 'r': + lookAhead = '\r'; + break; + case 'f': + lookAhead = '\f'; + break; + case 'a': + lookAhead = '\u0007'; + break; + case 'e': + lookAhead = '\u001B'; + break; + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (mode == Lexer.MODE_PATTERN) { + lookAhead = 0x80000000 | lookAhead; + } + break; + } + + case '0': + lookAhead = readOctals(); + break; + case 'x': + lookAhead = readHex(2); + break; + case 'u': + lookAhead = readHex(4); + break; + + case 'b': + lookAhead = CHAR_WORD_BOUND; + break; + case 'B': + lookAhead = CHAR_NONWORD_BOUND; + break; + case 'A': + lookAhead = CHAR_START_OF_INPUT; + break; + case 'G': + lookAhead = CHAR_PREVIOUS_MATCH; + break; + case 'Z': + lookAhead = CHAR_END_OF_LINE; + break; + case 'z': + lookAhead = CHAR_END_OF_INPUT; + break; + case 'c': { + if (index < pattern.length - 2) { + + // need not care about supplementary 
codepoints here + lookAhead = (pattern[nextIndex()] & 0x1f); + break; + } else { + throw new TPatternSyntaxException("", this.toString(), index); + } + } + case 'C': + case 'E': + case 'F': + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'R': + case 'T': + case 'U': + case 'V': + case 'X': + case 'Y': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'o': + case 'q': + case 'y': + throw new TPatternSyntaxException("", this.toString(), index); + + default: + break; + } + } else if (mode == Lexer.MODE_PATTERN) { + switch (lookAhead) { + case '+': + case '*': + case '?': { + char mod = (index < pattern.length) ? pattern[index] : '*'; + switch (mod) { + case '+': { + lookAhead = lookAhead | Lexer.QMOD_POSSESSIVE; + nextIndex(); + break; + } + case '?': { + lookAhead = lookAhead | Lexer.QMOD_RELUCTANT; + nextIndex(); + break; + } + default: { + lookAhead = lookAhead | Lexer.QMOD_GREEDY; + break; + } + } + + break; + } + + case '{': { + lookAheadST = processQuantifier(lookAhead); + break; + } + + case '$': + lookAhead = CHAR_DOLLAR; + break; + case '(': { + if (pattern[index] == '?') { + nextIndex(); + char nonCap = pattern[index]; + boolean behind = false; + do { + if (!behind) { + switch (nonCap) { + case '!': + lookAhead = CHAR_NEG_LOOKAHEAD; + nextIndex(); + break; + case '=': + lookAhead = CHAR_POS_LOOKAHEAD; + nextIndex(); + break; + case '>': + lookAhead = CHAR_ATOMIC_GROUP; + nextIndex(); + break; + case '<': { + nextIndex(); + nonCap = pattern[index]; + behind = true; + break; + } + default: { + lookAhead = readFlags(); + + /* + * We return res = res | 1 << 8 from + * readFlags() if we read + * (?idmsux-idmsux) + */ + if (lookAhead >= 256) { + + // Erase auxiliary bit + lookAhead = (lookAhead & 0xff); + flags = lookAhead; + lookAhead = lookAhead << 16; + lookAhead = CHAR_FLAGS | lookAhead; + } else { + flags = lookAhead; + lookAhead = lookAhead << 16; + lookAhead = 
CHAR_NONCAP_GROUP | lookAhead; + } + break; + } + } + } else { + behind = false; + switch (nonCap) { + case '!': + lookAhead = CHAR_NEG_LOOKBEHIND; + nextIndex(); + break; + case '=': + lookAhead = CHAR_POS_LOOKBEHIND; + nextIndex(); + break; + default: + throw new TPatternSyntaxException("", this.toString(), index); + } + } + } while (behind); + } else { + lookAhead = CHAR_LEFT_PARENTHESIS; + } + break; + } + + case ')': + lookAhead = CHAR_RIGHT_PARENTHESIS; + break; + case '[': { + lookAhead = CHAR_LEFT_SQUARE_BRACKET; + setMode(Lexer.MODE_RANGE); + break; + } + case ']': { + if (mode == Lexer.MODE_RANGE) { + lookAhead = CHAR_RIGHT_SQUARE_BRACKET; + } + break; + } + case '^': + lookAhead = CHAR_CARET; + break; + case '|': + lookAhead = CHAR_VERTICAL_BAR; + break; + case '.': + lookAhead = CHAR_DOT; + break; + default: + break; + } + } else if (mode == Lexer.MODE_RANGE) { + switch (lookAhead) { + case '[': + lookAhead = CHAR_LEFT_SQUARE_BRACKET; + break; + case ']': + lookAhead = CHAR_RIGHT_SQUARE_BRACKET; + break; + case '^': + lookAhead = CHAR_CARET; + break; + case '&': + lookAhead = CHAR_AMPERSAND; + break; + case '-': + lookAhead = CHAR_HYPHEN; + break; + default: + break; + } + } + } while (reread); + } + + /** + * Parse character classes names and verifies correction of the syntax; + */ + private String parseCharClassName() { + StringBuilder sb = new StringBuilder(10); + if (index < pattern.length - 2) { + // one symbol family + if (pattern[index] != '{') { + return "Is" + new String(pattern, nextIndex(), 1); //$NON-NLS-1$ + } + + nextIndex(); + char ch = 0; + while (index < pattern.length - 2 && (ch = pattern[nextIndex()]) != '}') { + sb.append(ch); + } + if (ch != '}') + throw new TPatternSyntaxException("", this.toString(), index); + } + + if (sb.length() == 0) + throw new TPatternSyntaxException("", this.toString(), index); + + String res = sb.toString(); + if (res.length() == 1) + return "Is" + res; + return (res.length() > 3 && (res.startsWith("Is") 
|| res.startsWith("In"))) ? res.substring(2) : res; + } + + /** + * Process given character in assumption that it's quantifier. + */ + private Quantifier processQuantifier(int ch) { + StringBuilder sb = new StringBuilder(4); + int min = -1; + int max = Integer.MAX_VALUE; + while (index < pattern.length && (ch = pattern[nextIndex()]) != '}') { + if (ch == ',' && min < 0) { + try { + min = Integer.parseInt(sb.toString(), 10); + sb.delete(0, sb.length()); + } catch (NumberFormatException nfe) { + throw new TPatternSyntaxException("", this.toString(), index); + } + } else { + sb.append((char)ch); + } + } + if (ch != '}') { + throw new TPatternSyntaxException("", this.toString(), index); + } + if (sb.length() > 0) { + try { + max = Integer.parseInt(sb.toString(), 10); + if (min < 0) + min = max; + } catch (NumberFormatException nfe) { + throw new TPatternSyntaxException("", this.toString(), index); + } + } else if (min < 0) { + throw new TPatternSyntaxException("", this.toString(), index); + } + if ((min | max | max - min) < 0) { + throw new TPatternSyntaxException("", this.toString(), index); + } + + char mod = (index < pattern.length) ? pattern[index] : '*'; + + switch (mod) { + case '+': + lookAhead = Lexer.QUANT_COMP_P; + nextIndex(); + break; + case '?': + lookAhead = Lexer.QUANT_COMP_R; + nextIndex(); + break; + default: + lookAhead = Lexer.QUANT_COMP; + break; + } + return new Quantifier(min, max); + } + + @Override + public String toString() { + return orig; + } + + /** + * Checks if there are any characters in the pattern. + * + * @return true if there are no more characters in the pattern. + */ + public boolean isEmpty() { + return ch == 0 && lookAhead == 0 && index == patternFullLength && !isSpecial(); + } + + /** + * Returns true if current character is plain token. 
+ */ + public static boolean isLetter(int ch) { + + // all supplementary codepoints have integer value that is >= 0; + return ch >= 0; + } + + /** + * Return true if current character is letter, false otherwise; This is + * shortcut to static method isLetter to check the current character. + * + * @return true if current character is letter, false otherwise + */ + public boolean isLetter() { + return !isEmpty() && !isSpecial() && isLetter(ch); + } + + /* + * Note that Character class methods isHighSurrogate(), isLowSurrogate() + * take char parameter while we need an int parameter without truncation to + * char value + */ + public boolean isHighSurrogate() { + return (ch <= 0xDBFF) && (ch >= 0xD800); + } + + public boolean isLowSurrogate() { + return (ch <= 0xDFFF) && (ch >= 0xDC00); + } + + public static boolean isHighSurrogate(int ch) { + return (ch <= 0xDBFF) && (ch >= 0xD800); + } + + public static boolean isLowSurrogate(int ch) { + return (ch <= 0xDFFF) && (ch >= 0xDC00); + } + + /** + * Process hexadecimal integer. + */ + private int readHex(int max) { + StringBuilder st = new StringBuilder(max); + int length = pattern.length - 2; + int i; + for (i = 0; i < max && index < length; i++) { + st.append(pattern[nextIndex()]); + } + if (i == max) { + try { + return Integer.parseInt(st.toString(), 16); + } catch (NumberFormatException nfe) { + } + } + + throw new TPatternSyntaxException("", this.toString(), index); + } + + /** + * Process octal integer. 
+ */ + private int readOctals() { + int max = 3; + int i = 1; + int first; + int res; + int length = pattern.length - 2; + + switch (first = Character.digit(pattern[index], 8)) { + case -1: + throw new TPatternSyntaxException("", this.toString(), index); + default: { + if (first > 3) + max--; + nextIndex(); + res = first; + } + } + + while (i < max && index < length && (first = Character.digit(pattern[index], 8)) >= 0) { + res = res * 8 + first; + nextIndex(); + i++; + } + + return res; + } + + /** + * Process expression flags given with (?idmsux-idmsux) + */ + private int readFlags() { + char ch; + boolean pos = true; + int res = flags; + + while (index < pattern.length) { + ch = pattern[index]; + switch (ch) { + case '-': + if (!pos) { + throw new TPatternSyntaxException("", this.toString(), index); + } + pos = false; + break; + + case 'i': + res = pos ? res | TPattern.CASE_INSENSITIVE : (res ^ TPattern.CASE_INSENSITIVE) & res; + break; + + case 'd': + res = pos ? res | TPattern.UNIX_LINES : (res ^ TPattern.UNIX_LINES) & res; + break; + + case 'm': + res = pos ? res | TPattern.MULTILINE : (res ^ TPattern.MULTILINE) & res; + break; + + case 's': + res = pos ? res | TPattern.DOTALL : (res ^ TPattern.DOTALL) & res; + break; + + case 'u': + res = pos ? res | TPattern.UNICODE_CASE : (res ^ TPattern.UNICODE_CASE) & res; + break; + + case 'x': + res = pos ? res | TPattern.COMMENTS : (res ^ TPattern.COMMENTS) & res; + break; + + case ':': + nextIndex(); + return res; + + case ')': + nextIndex(); + return res | (1 << 8); + + default: + // ignore invalid flags (HARMONY-2127) + } + nextIndex(); + } + throw new TPatternSyntaxException("", this.toString(), index); + } + + /** + * Returns next character index to read and moves pointer to the next one. + * If comments flag is on this method will skip comments and whitespaces. 
+ * + * The following actions are equivalent if comments flag is off ch = + * pattern[index++] == ch = pattern[nextIndex] + * + * @return next character index to read. + */ + private int nextIndex() { + prevNW = index; + if ((flags & TPattern.COMMENTS) != 0) { + skipComments(); + } else { + index++; + } + return prevNW; + } + + /** + * Skips comments and whitespaces + */ + private int skipComments() { + int length = pattern.length - 2; + index++; + do { + while (index < length && Character.isWhitespace(pattern[index])) + index++; + if (index < length && pattern[index] == '#') { + index++; + while (index < length && !isLineSeparator(pattern[index])) + index++; + } else + return index; + } while (true); + } + + private boolean isLineSeparator(int ch) { + return (ch == '\n' || ch == '\r' || ch == '\u0085' || (ch | 1) == '\u2029'); + } + + /** + * Gets decomposition for given codepoint from decomposition mappings table. + * + * @param ch + * - Unicode codepoint + * @return array of codepoints that is a canonical decomposition of ch. + */ + static int[] getDecomposition(int ch) { + return decompTable.get(ch); + } + + /** + * Gets decomposition for given Hangul syllable. This is an implementation + * of Hangul decomposition algorithm according to + * http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf + * "3.12 Conjoining Jamo Behavior". + * + * @param ch + * - given Hangul syllable + * @return canonical decomposition of ch. + */ + static int[] getHangulDecomposition(int ch) { + int SIndex = ch - SBase; + + if (SIndex < 0 || SIndex >= SCount) { + return null; + } else { + int L = LBase + SIndex / NCount; + int V = VBase + (SIndex % NCount) / TCount; + int T = SIndex % TCount; + int decomp[]; + + if (T == 0) { + decomp = new int[] { L, V }; + } else { + T = TBase + T; + decomp = new int[] { L, V, T }; + } + return decomp; + } + } + + /** + * Gets canonical class for given codepoint from decomposition mappings + * table. 
+ * + * @param - ch Unicode codepoint + * @return canonical class for given Unicode codepoint that is represented + * by ch. + */ + static int getCanonicalClass(int ch) { + int canClass = canonClassesTable.get(ch); + + return (canClass == canonClassesTableSize) ? 0 : canClass; + } + + /** + * Tests if given codepoint is a canonical decomposition of another + * codepoint. + * + * @param ch + * - codepoint to test + * @return true if ch is a decomposition. + */ + static boolean hasSingleCodepointDecomposition(int ch) { + int hasSingleDecomp = singleDecompTable.get(ch); + + /* + * singleDecompTable doesn't contain ch == (hasSingleDecomp == + * singleDecompTableSize) + */ + return (hasSingleDecomp == singleDecompTableSize) ? false : true; + } + + /** + * Tests if given codepoint has canonical decomposition and given + * codepoint's canonical class is not 0. + * + * @param ch + * - codepoint to test + * @return true if canonical class is not 0 and ch has a decomposition. + */ + static boolean hasDecompositionNonNullCanClass(int ch) { + return ch == 0x0340 | ch == 0x0341 | ch == 0x0343 | ch == 0x0344; + } + + private int nextCodePoint() { + char high = pattern[nextIndex()]; + + if (Character.isHighSurrogate(high)) { + + // low and high char may be delimited by spaces + int lowExpectedIndex = prevNW + 1; + + if (lowExpectedIndex < pattern.length) { + char low = pattern[lowExpectedIndex]; + if (Character.isLowSurrogate(low)) { + nextIndex(); + return Character.toCodePoint(high, low); + } + } + } + + return high; + } + + /** + * Tests Unicode codepoint if it is a boundary of decomposed Unicode + * codepoint. + * + * @param ch + * - Unicode codepoint to test + * @return true if given codepoint is a boundary. + */ + static boolean isDecomposedCharBoundary(int ch) { + int canClass = canonClassesTable.get(ch); + + // Lexer.getCanonicalClass(ch) == 0 + boolean isBoundary = (canClass == canonClassesTableSize); + + return isBoundary; + } + + /** + * Returns the curr. 
character index. + */ + public int getIndex() { + return curToc; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LowHighSurrogateRangeSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LowHighSurrogateRangeSet.java new file mode 100644 index 000000000..b9db75eb0 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LowHighSurrogateRangeSet.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/* + * This class is a range that contains only surrogate characters. 
+ */ +class LowHighSurrogateRangeSet extends JointSet { + + protected AbstractCharClass surrChars; + + protected boolean alt = false; + + public LowHighSurrogateRangeSet(AbstractCharClass surrChars, AbstractSet next) { + this.surrChars = surrChars.getInstance(); + this.alt = surrChars.alt; + setNext(next); + } + + public LowHighSurrogateRangeSet(AbstractCharClass surrChars) { + this.surrChars = surrChars.getInstance(); + this.alt = surrChars.alt; + } + + /** + * Returns the next. + */ + @Override + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * + * @param next + * The next to set. + */ + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int startStr = matchResult.getLeftBound(); + int strLength = matchResult.getRightBound(); + + if (stringIndex + 1 > strLength) { + matchResult.hitEnd = true; + return -1; + } + + char ch = testString.charAt(stringIndex); + + if (!surrChars.contains(ch)) { + return -1; + } + + if (Character.isHighSurrogate(ch)) { + + if (stringIndex + 1 < strLength) { + char low = testString.charAt(stringIndex + 1); + + if (Character.isLowSurrogate(low)) { + return -1; + } + } + } else if (Character.isLowSurrogate(ch)) { + + if (stringIndex > startStr) { + char high = testString.charAt(stringIndex - 1); + + if (Character.isHighSurrogate(high)) { + return -1; + } + } + } + + return next.matches(stringIndex + 1, testString, matchResult); + } + + @Override + protected String getName() { + return "range:" + (alt ? 
"^ " : " ") + surrChars.toString(); + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } else if (set instanceof SupplRangeSet) { + return false; + } else if (set instanceof SupplCharSet) { + return false; + } + + return true; + } + + protected AbstractCharClass getChars() { + return surrChars; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LowSurrogateCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LowSurrogateCharSet.java new file mode 100644 index 000000000..5af40e9c0 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/LowSurrogateCharSet.java @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * This class represents low surrogate character. + */ +class LowSurrogateCharSet extends JointSet { + + /* + * Note that we can use high and low surrogate characters that don't combine + * into supplementary code point. See + * http://www.unicode.org/reports/tr18/#Supplementary_Characters + */ + private char low; + + public LowSurrogateCharSet(char low) { + this.low = low; + } + + /** + * Returns the next. 
+ */ + @Override + public AbstractSet getNext() { + return this.next; + } + + /** + * Sets next abstract set. + * + * @param next + * The next to set. + */ + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + if (stringIndex + 1 > matchResult.getRightBound()) { + matchResult.hitEnd = true; + return -1; + } + + char low = testString.charAt(stringIndex); + + if (stringIndex > matchResult.getLeftBound()) { + char high = testString.charAt(stringIndex - 1); + + /* + * we consider high surrogate followed by low surrogate as a + * codepoint + */ + if (Character.isHighSurrogate(high)) { + return -1; + } + } + + if (this.low == low) { + return next.matches(stringIndex + 1, testString, matchResult); + } + + return -1; + } + + @Override + public int find(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + if (testString instanceof String) { + String testStr = (String)testString; + int startStr = matchResult.getLeftBound(); + int strLength = matchResult.getRightBound(); + + while (strIndex < strLength) { + + strIndex = testStr.indexOf(low, strIndex); + if (strIndex < 0) + return -1; + + if (strIndex > startStr) { + + /* + * we consider high surrogate followed by low surrogate as a + * codepoint + */ + if (Character.isHighSurrogate(testStr.charAt(strIndex - 1))) { + strIndex++; + continue; + } + } + + if (next.matches(strIndex + 1, testString, matchResult) >= 0) { + return strIndex; + } + strIndex++; + } + + return -1; + } + + return super.find(strIndex, testString, matchResult); + } + + @Override + public int findBack(int strIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + if (testString instanceof String) { + int startStr = matchResult.getLeftBound(); + String testStr = (String)testString; + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(low, lastIndex); + if (lastIndex 
< 0 || lastIndex < strIndex) { + return -1; + } + + if (lastIndex > startStr) { + + /* + * we consider high surrogate followed by low surrogate as a + * codepoint + */ + if (Character.isHighSurrogate(testStr.charAt(lastIndex - 1))) { + lastIndex -= 2; + continue; + } + } + + if (next.matches(lastIndex + 1, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + + return -1; + } + + return super.findBack(strIndex, lastIndex, testString, matchResult); + } + + @Override + protected String getName() { + return "" + low; + } + + protected int getChar() { + return low; + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } else if (set instanceof SupplRangeSet) { + return false; + } else if (set instanceof SupplCharSet) { + return false; + } else if (set instanceof HighSurrogateCharSet) { + return false; + } else if (set instanceof LowSurrogateCharSet) { + return ((LowSurrogateCharSet)set).low == this.low; + } + + return true; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MatchResultImpl.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MatchResultImpl.java new file mode 100644 index 000000000..52d136723 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MatchResultImpl.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.Arrays; + +/** + * Match result implementation Note: probably it might make sense to combine + * this class with Matcher. + * + * @author Nikolay A. Kuznetsov + */ +class MatchResultImpl implements TMatchResult { + + private int[] groupBounds = null; + + private int[] consumers = null; + + private int[] compQuantCounters = null; + + private CharSequence string = null; + + private int groupCount = 0; + + private boolean valid = false; + + private int leftBound; + + private int rightBound; + + int startIndex; + + private boolean transparentBounds = false; + + private boolean anchoringBounds = false; + + boolean hitEnd = false; + + boolean requireEnd = false; + + int previousMatch = -1; + + private int mode; + + MatchResultImpl(CharSequence string, int leftBound, int rightBound, int groupCount, int compQuantCount, + int consumersCount) { + this.groupCount = ++groupCount; + this.groupBounds = new int[groupCount * 2]; + + this.consumers = new int[consumersCount]; + Arrays.fill(consumers, -1); + + if (compQuantCount > 0) + this.compQuantCounters = new int[compQuantCount]; + Arrays.fill(groupBounds, -1); + reset(string, leftBound, rightBound); + } + + TMatchResult cloneImpl() { + MatchResultImpl res = new MatchResultImpl(this.string, this.leftBound, this.rightBound, this.groupCount - 1, 0, + 0); + + res.valid = valid; + if (valid) { + System.arraycopy(groupBounds, 0, res.groupBounds, 0, this.groupBounds.length); + } + return res; + } + + public 
void setConsumed(int counter, int value) { + this.consumers[counter] = value; + } + + public int getConsumed(int counter) { + return this.consumers[counter]; + } + + @Override + public int end() { + return end(0); + } + + @Override + public int end(int group) { + checkGroup(group); + return groupBounds[group * 2 + 1]; + } + + void setStart(int group, int offset) { + groupBounds[group * 2] = offset; + } + + void setEnd(int group, int offset) { + groupBounds[group * 2 + 1] = offset; + } + + int getStart(int group) { + return groupBounds[group * 2]; + } + + int getEnd(int group) { + return groupBounds[group * 2 + 1]; + } + + @Override + public String group() { + return group(0); + } + + @Override + public String group(int group) { + if (start(group) < 0) + return null; + return string.subSequence(start(group), end(group)).toString(); + } + + String getGroupNoCheck(int group) { + int st = getStart(group); + int end = getEnd(group); + if ((end | st | (end - st)) < 0 || end > string.length()) + return null; + + return string.subSequence(st, end).toString(); + } + + @Override + public int groupCount() { + return groupCount - 1; + } + + @Override + public int start() { + return start(0); + } + + @Override + public int start(int group) { + checkGroup(group); + return groupBounds[group * 2]; + } + + /* + * This method being called after any successful match; For now it's being + * used to check zero group for empty match; + */ + public void finalizeMatch() { + if (this.groupBounds[0] == -1) { + this.groupBounds[0] = this.startIndex; + this.groupBounds[1] = this.startIndex; + } + + previousMatch = end(); + } + + public int getEnterCounter(int setCounter) { + return compQuantCounters[setCounter]; + } + + public void setEnterCounter(int setCounter, int value) { + compQuantCounters[setCounter] = value; + } + + private void checkGroup(int group) { + if (!valid) { + throw new IllegalStateException(); + } + + if (group < 0 || group > groupCount) { + throw new 
IndexOutOfBoundsException(String.valueOf(group)); + } + } + + void updateGroup(int index, int srtOffset, int endOffset) { + checkGroup(index); + groupBounds[index * 2] = srtOffset; + groupBounds[index * 2 + 1] = endOffset; + } + + protected void setValid() { + this.valid = true; + } + + protected boolean isValid() { + return this.valid; + } + + protected void reset(CharSequence newSequence, int leftBound, int rightBound) { + valid = false; + mode = TMatcher.MODE_MATCH; + Arrays.fill(groupBounds, -1); + Arrays.fill(consumers, -1); + + if (newSequence != null) + this.string = newSequence; + if (leftBound >= 0) + this.setBounds(leftBound, rightBound); + this.startIndex = this.leftBound; + } + + protected void reset() { + reset(null, -1, -1); + } + + private void setBounds(int leftBound, int rightBound) { + this.leftBound = leftBound; + this.rightBound = rightBound; + } + + protected void setStartIndex(int startIndex) { + this.startIndex = startIndex; + previousMatch = previousMatch >= 0 ? previousMatch : startIndex; + } + + public int getLeftBound() { + return this.leftBound; + } + + public int getRightBound() { + return this.rightBound; + } + + protected void setMode(int mode) { + this.mode = mode; + } + + protected int mode() { + return mode; + } + + protected void useAnchoringBounds(boolean value) { + this.anchoringBounds = value; + } + + protected boolean hasAnchoringBounds() { + return this.anchoringBounds; + } + + protected void useTransparentBounds(boolean value) { + this.transparentBounds = value; + } + + protected boolean hasTransparentBounds() { + return this.transparentBounds; + } + + int getPreviousMatchEnd() { + return previousMatch; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MultiLineEOLSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MultiLineEOLSet.java new file mode 100644 index 000000000..19428b49b --- /dev/null +++ 
b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MultiLineEOLSet.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents multiline version of the dollar sign. + * + * @author Nikolay A. Kuznetsov + */ +class MultiLineEOLSet extends AbstractSet { + + private int consCounter; + + public MultiLineEOLSet(int counter) { + this.consCounter = counter; + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + int strDif = matchResult.hasAnchoringBounds() ? 
matchResult.getLeftBound() - strIndex : testString.length() - + strIndex; + char ch1; + char ch2; + if (strDif == 0) { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } else if (strDif >= 2) { + ch1 = testString.charAt(strIndex); + ch2 = testString.charAt(strIndex + 1); + } else { + ch1 = testString.charAt(strIndex); + ch2 = 'a'; + } + + switch (ch1) { + case '\r': { + if (ch2 == '\n') { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + + case '\n': + case '\u0085': + case '\u2028': + case '\u2029': { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + + default: + return -1; + } + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + int cons; + boolean res = ((cons = matchResult.getConsumed(consCounter)) < 0 || cons > 0); + matchResult.setConsumed(consCounter, -1); + return res; + } + + @Override + protected String getName() { + return ""; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MultiLineSOLSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MultiLineSOLSet.java new file mode 100644 index 000000000..ca49d0069 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/MultiLineSOLSet.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Multiline version of the ^ sign. + * + * @author Nikolay A. Kuznetsov + */ +class MultiLineSOLSet extends AbstractSet { + + private AbstractLineTerminator lt; + + public MultiLineSOLSet(AbstractLineTerminator lt) { + this.lt = lt; + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + if (strIndex != matchResult.getRightBound() && + ((strIndex == 0 || (matchResult.hasAnchoringBounds() && strIndex == matchResult.getLeftBound())) || lt + .isAfterLineTerminator(testString.charAt(strIndex - 1), testString.charAt(strIndex)))) { + return next.matches(strIndex, testString, matchResult); + } + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return "^"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NegativeLookAhead.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NegativeLookAhead.java new file mode 100644 index 000000000..a93dfd4ce --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NegativeLookAhead.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * Negative look ahead node. + * + * @author Nikolay A. Kuznetsov + */ +class NegativeLookAhead extends AtomicJointSet { + + public NegativeLookAhead(ArrayList children, FSet fSet) { + super(children, fSet); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int size = children.size(); + + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + if (e.matches(stringIndex, testString, matchResult) >= 0) + return -1; + } + + return next.matches(stringIndex, testString, matchResult); + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return "NegLookaheadJointSet"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NegativeLookBehind.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NegativeLookBehind.java new file mode 100644 index 000000000..5eb334473 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NegativeLookBehind.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * Negative look behind node. + * + * @author Nikolay A. Kuznetsov + */ +class NegativeLookBehind extends AtomicJointSet { + + public NegativeLookBehind(ArrayList children, FSet fSet) { + super(children, fSet); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + + int size = children.size(); + int shift; + + // fSet will take this index to check if we at the right bound + // and return true if the current index equal to this one + matchResult.setConsumed(groupIndex, stringIndex); + + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + // find limits could be calculated though e.getCharCount() + // fSet will return true only if string index at fSet equal + // to stringIndex + shift = e.findBack(0, stringIndex, testString, matchResult); + if (shift >= 0) { + return -1; + } + } + + return next.matches(stringIndex, testString, matchResult); + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return 
"NegBehindJointSet"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NonCapFSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NonCapFSet.java new file mode 100644 index 000000000..5e07f81d7 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NonCapFSet.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Non-capturing group closing node. + * + * @author Nikolay A. 
Kuznetsov + */ +class NonCapFSet extends FSet { + public NonCapFSet(int groupIndex) { + super(groupIndex); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int gr = getGroupIndex(); + matchResult.setConsumed(gr, stringIndex - matchResult.getConsumed(gr)); + + return next.matches(stringIndex, testString, matchResult); + } + + @Override + protected String getName() { + return "NonCapFSet"; + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return false; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NonCapJointSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NonCapJointSet.java new file mode 100644 index 000000000..3a28acc93 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/NonCapJointSet.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * Node representing non-capturing group + * @author Nikolay A. 
Kuznetsov + */ +class NonCapJointSet extends JointSet { + + protected NonCapJointSet() { + } + + public NonCapJointSet(ArrayList children, FSet fSet) { + super(children, fSet); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + int start = matchResult.getConsumed(groupIndex); + matchResult.setConsumed(groupIndex, stringIndex); + + int size = children.size(); + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + int shift = e.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + } + matchResult.setConsumed(groupIndex, start); + return -1; + } + + @Override + protected String getName() { + return "NonCapJointSet"; //$NON-NLS-1$ + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + int cons = matchResult.getConsumed(groupIndex); + return cons != 0; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosAltGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosAltGroupQuantifierSet.java new file mode 100644 index 000000000..8f9e83767 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosAltGroupQuantifierSet.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive quantifier over group, see java.util.regex.GroupQuantifierSet for + * more details. + * + * @author Nikolay A. Kuznetsov + */ +class PosAltGroupQuantifierSet extends AltGroupQuantifierSet { + + public PosAltGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + ((JointSet)innerSet).setNext(FSet.posFSet); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int nextIndex = innerSet.matches(stringIndex, testString, matchResult); + if (nextIndex > 0) { + stringIndex = nextIndex; + } + return next.matches(stringIndex, testString, matchResult); + } + + @Override + public void setNext(AbstractSet next) { + this.next = next; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosCompositeGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosCompositeGroupQuantifierSet.java new file mode 100644 index 000000000..061bd2018 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosCompositeGroupQuantifierSet.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive composite (i.e. {n,m}) quantifier node over groups. + * + * @author Nikolay A. Kuznetsov + */ +class PosCompositeGroupQuantifierSet extends CompositeGroupQuantifierSet { + + public PosCompositeGroupQuantifierSet(Quantifier quant, AbstractSet innerSet, AbstractSet next, int type, + int setCounter) { + super(quant, innerSet, next, type, setCounter); + innerSet.setNext(FSet.posFSet); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int nextIndex; + int counter = 0; + int max = quantifier.max(); + + while ((nextIndex = innerSet.matches(stringIndex, testString, matchResult)) > stringIndex && counter < max) { + counter++; + stringIndex = nextIndex; + } + + if (nextIndex < 0 && counter < quantifier.min()) { + return -1; + } else { + return next.matches(stringIndex, testString, matchResult); + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosPlusGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosPlusGroupQuantifierSet.java new file mode 100644 index 000000000..265ea0e99 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PosPlusGroupQuantifierSet.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive + quantifier node over groups. + * + * @author Nikolay A. Kuznetsov + */ +class PosPlusGroupQuantifierSet extends GroupQuantifierSet { + + public PosPlusGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + ((JointSet)innerSet).setNext(FSet.posFSet); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int nextIndex; + if ((nextIndex = innerSet.matches(stringIndex, testString, matchResult)) < 0) { + return -1; + } else if (nextIndex > stringIndex) { + stringIndex = nextIndex; + while ((nextIndex = innerSet.matches(stringIndex, testString, matchResult)) > stringIndex) { + stringIndex = nextIndex; + } + } + + return next.matches(stringIndex, testString, matchResult); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PositiveLookAhead.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PositiveLookAhead.java new file mode 100644 index 000000000..66e4ea841 --- /dev/null +++ 
b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PositiveLookAhead.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * Positive lookahead node. + * + * @author Nikolay A. 
Kuznetsov + */ +class PositiveLookAhead extends AtomicJointSet { + public PositiveLookAhead(ArrayList children, FSet fSet) { + super(children, fSet); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int size = children.size(); + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + int shift = e.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + // PosLookaheadFset always returns true, position remains the + // same + // next.match() from; + return next.matches(stringIndex, testString, matchResult); + } + } + + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return "PosLookaheadJointSet"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PositiveLookBehind.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PositiveLookBehind.java new file mode 100644 index 000000000..9392ddac9 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PositiveLookBehind.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * Positive lookbehind node. + * + * @author Nikolay A. Kuznetsov + */ +class PositiveLookBehind extends AtomicJointSet { + + public PositiveLookBehind(ArrayList children, FSet fSet) { + super(children, fSet); + } + + /** + * Returns stringIndex+shift, the next position to match + */ + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + int size = children.size(); + int leftBound = matchResult.hasTransparentBounds() ? 0 : matchResult.getLeftBound(); + + int shift = next.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + // fSet will take this index to check if we at the right bound + // and return true if the current index equal to this one + matchResult.setConsumed(groupIndex, stringIndex); + for (int i = 0; i < size; i++) { + AbstractSet e = children.get(i); + // find limits could be calculated though e.getCharCount() + // fSet will return true only if string index at fSet equal + // to stringIndex + if (e.findBack(leftBound, stringIndex, testString, matchResult) >= 0) { + matchResult.setConsumed(groupIndex, -1); + return shift; + } + } + } + + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return "PosBehindJointSet"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveAltQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveAltQuantifierSet.java new file mode 100644 index 000000000..2c27aa3b4 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveAltQuantifierSet.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive ? quantifier node. + * + * @author Nikolay A. Kuznetsov + */ +class PossessiveAltQuantifierSet extends AltQuantifierSet { + public PossessiveAltQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int shift = 0; + + if (stringIndex + leaf.charCount() <= matchResult.getRightBound() && + (shift = leaf.accepts(stringIndex, testString)) >= 1) { + stringIndex += shift; + } + + return next.matches(stringIndex, testString, matchResult); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveCompositeQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveCompositeQuantifierSet.java new file mode 100644 index 000000000..525bdca12 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveCompositeQuantifierSet.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive composite (i.e. {n, m}) quantifier node. + * + * @author Nikolay A. Kuznetsov + */ +class PossessiveCompositeQuantifierSet extends CompositeQuantifierSet { + public PossessiveCompositeQuantifierSet(Quantifier quant, LeafSet innerSet, AbstractSet next, int type) { + super(quant, innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int min = quantifier.min(); + int max = quantifier.max(); + int i = 0; + + for (; i < min; i++) { + if (stringIndex + leaf.charCount() > matchResult.getRightBound()) { + matchResult.hitEnd = true; + return -1; + } + int shift = leaf.accepts(stringIndex, testString); + if (shift < 1) { + return -1; + } + stringIndex += shift; + } + + for (; i < max; i++) { + int shift; + if (stringIndex + leaf.charCount() > matchResult.getRightBound() || + (shift = leaf.accepts(stringIndex, testString)) < 1) { + break; + } + stringIndex += shift; + } + return next.matches(stringIndex, testString, matchResult); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveGroupQuantifierSet.java 
b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveGroupQuantifierSet.java new file mode 100644 index 000000000..ce748b26a --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveGroupQuantifierSet.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive quantifier set over groups. + * + * @author Nikolay A. 
Kuznetsov + */ +class PossessiveGroupQuantifierSet extends GroupQuantifierSet { + public PossessiveGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + innerSet.setNext(FSet.posFSet); + + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int nextIndex; + while ((nextIndex = innerSet.matches(stringIndex, testString, matchResult)) > 0) { + stringIndex = nextIndex; + } + + return next.matches(stringIndex, testString, matchResult); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveQuantifierSet.java new file mode 100644 index 000000000..5fd7c7072 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PossessiveQuantifierSet.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Possessive quantifier set over LeafSet's + * + * @author Nikolay A. 
Kuznetsov + */ +class PossessiveQuantifierSet extends LeafQuantifierSet { + + public PossessiveQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int shift = 0; + while (stringIndex + leaf.charCount() <= matchResult.getRightBound() && + (shift = leaf.accepts(stringIndex, testString)) >= 1) { + stringIndex += shift; + } + + return next.matches(stringIndex, testString, matchResult); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PreviousMatch.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PreviousMatch.java new file mode 100644 index 000000000..d6368da51 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/PreviousMatch.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Node representing previous match (\G). + * + * @author Nikolay A. 
Kuznetsov + */ +class PreviousMatch extends AbstractSet { + @Override + public int matches(int stringIndex, CharSequence testString, + MatchResultImpl matchResult) { + if (stringIndex == matchResult.getPreviousMatchEnd()) { + return next.matches(stringIndex, testString, matchResult); + } + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return "PreviousMatch"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/Quantifier.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/Quantifier.java new file mode 100644 index 000000000..7b49303fb --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/Quantifier.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents RE quantifier; contains two fields responsible for min and max + * number of repetitions. Negative value for maximum number of repetition + * represents infinity(i.e. +,*) + * + * @author Nikolay A. 
Kuznetsov + */ +class Quantifier extends SpecialToken implements Cloneable { + + private int min; + + private int max; + + private int counter = 0; + + public Quantifier(int min) { + this.min = this.max = min; + } + + public Quantifier(int min, int max) { + this.min = min; + this.max = max; + } + + public void resetCounter() { + counter = 0; + } + + public int getCounter() { + return counter; + } + + public void setCounter(int counter) { + this.counter = counter; + } + + public int min() { + return min; + } + + public int max() { + return max; + } + + @Override + public String toString() { + return "{" + min + "," + ((max == Integer.MAX_VALUE) ? "" : new Integer(max).toString()) + "}"; + } + + @Override + public int getType() { + return SpecialToken.TOK_QUANTIFIER; + } + + @Override + public Object clone() { + return new Quantifier(min, max); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/QuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/QuantifierSet.java new file mode 100644 index 000000000..450e81057 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/QuantifierSet.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Base class for quantifiers. + * + * @author Nikolay A. Kuznetsov + */ +abstract class QuantifierSet extends AbstractSet { + + protected AbstractSet innerSet; + + public QuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(next); + this.innerSet = innerSet; + setType(type); + } + + /** + * Returns the innerSet. + */ + public AbstractSet getInnerSet() { + return innerSet; + } + + /** + * Sets an inner set. + * + * @param innerSet + * The innerSet to set. + */ + public void setInnerSet(AbstractSet innerSet) { + this.innerSet = innerSet; + } + + @Override + public boolean first(AbstractSet set) { + return innerSet.first(set) || next.first(set); + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return true; + } + + /** + * This method is used for traversing nodes after the first stage of + * compilation. + */ + @Override + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (next != null) { + + if (!next.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = next.processBackRefReplacement(); + + if (set != null) { + next.isSecondPassVisited = true; + next = set; + } + + /* + * End code to do during the pass + */ + next.processSecondPass(); + } + } + + if (innerSet != null) { + + if (!innerSet.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = innerSet.processBackRefReplacement(); + + if (set != null) { + innerSet.isSecondPassVisited = true; + innerSet = set; + } + + /* + * End code to do during the pass + */ + innerSet.processSecondPass(); + } else { + + /* + * We reach node through innerSet but it is already traversed. 
+ * You can see this situation for GroupQuantifierSet.innerset if + * we compile smth like "(a)+ when GroupQuantifierSet == + * GroupQuantifierSet.innerset.fSet.next + */ + + /* + * Add here code to do during the pass + */ + if (innerSet instanceof SingleSet && ((FSet)((JointSet)innerSet).fSet).isBackReferenced) { + innerSet = innerSet.next; + } + + /* + * End code to do during the pass + */ + } + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RangeSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RangeSet.java new file mode 100644 index 000000000..23cd8e742 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RangeSet.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character from the given char class. + * + * @author Nikolay A. 
Kuznetsov + */ + +class RangeSet extends LeafSet { + + private AbstractCharClass chars; + + private boolean alt = false; + + public RangeSet(AbstractCharClass cs, AbstractSet next) { + super(next); + this.chars = cs.getInstance(); + this.alt = cs.alt; + } + + public RangeSet(AbstractCharClass cc) { + this.chars = cc.getInstance(); + this.alt = cc.alt; + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + return chars.contains(testString.charAt(strIndex)) ? 1 : -1; + } + + @Override + protected String getName() { + return "range:" + (alt ? "^ " : " ") + chars.toString(); + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return AbstractCharClass.intersects(chars, ((CharSet)set).getChar()); + } else if (set instanceof RangeSet) { + return AbstractCharClass.intersects(chars, ((RangeSet)set).chars); + } else if (set instanceof SupplRangeSet) { + return AbstractCharClass.intersects(chars, ((SupplRangeSet)set).getChars()); + } else if (set instanceof SupplCharSet) { + return false; + } + return true; + } + + protected AbstractCharClass getChars() { + return chars; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RelAltGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RelAltGroupQuantifierSet.java new file mode 100644 index 000000000..49f791174 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RelAltGroupQuantifierSet.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Reluctant version of "?" quantifier set over group. + * + * @author Nikolay A. Kuznetsov + */ +class RelAltGroupQuantifierSet extends AltGroupQuantifierSet { + + public RelAltGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + + if (!innerSet.hasConsumed(matchResult)) + return next.matches(stringIndex, testString, matchResult); + + int nextIndex = next.matches(stringIndex, testString, matchResult); + + if (nextIndex < 0) { + return innerSet.matches(stringIndex, testString, matchResult); + } else { + return nextIndex; + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RelCompositeGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RelCompositeGroupQuantifierSet.java new file mode 100644 index 000000000..b0196527c --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/RelCompositeGroupQuantifierSet.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Reluctant version of composite (i.e. {n,m}) quantifier node over group. + * + * @author Nikolay A. Kuznetsov + */ +class RelCompositeGroupQuantifierSet extends CompositeGroupQuantifierSet { + + public RelCompositeGroupQuantifierSet(Quantifier quant, AbstractSet innerSet, AbstractSet next, int type, + int setCounter) { + super(quant, innerSet, next, type, setCounter); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int enterCounter = matchResult.getEnterCounter(setCounter); + + if (!innerSet.hasConsumed(matchResult)) + return next.matches(stringIndex, testString, matchResult); + + // can't go inner set; + if (enterCounter >= quantifier.max()) { + matchResult.setEnterCounter(setCounter, 0); + return next.matches(stringIndex, testString, matchResult); + } + + int nextIndex; + + if (enterCounter >= quantifier.min()) { + nextIndex = next.matches(stringIndex, testString, matchResult); + if (nextIndex < 0) { + matchResult.setEnterCounter(setCounter, ++enterCounter); + nextIndex = innerSet.matches(stringIndex, testString, matchResult); + } else { + matchResult.setEnterCounter(setCounter, 0); + return nextIndex; + } + } else { + matchResult.setEnterCounter(setCounter, ++enterCounter); + nextIndex = innerSet.matches(stringIndex, 
testString, matchResult); + } + + return nextIndex; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantAltQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantAltQuantifierSet.java new file mode 100644 index 000000000..3548d9574 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantAltQuantifierSet.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * This class represents ?? quantifier over leaf sets. + * + * @author Nikolay A. 
Kuznetsov + */ +class ReluctantAltQuantifierSet extends AltQuantifierSet { + public ReluctantAltQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int shift; + + if ((shift = next.matches(stringIndex, testString, matchResult)) >= 0) { + return shift; + } else { + return innerSet.matches(stringIndex, testString, matchResult); + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantCompositeQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantCompositeQuantifierSet.java new file mode 100644 index 000000000..084a24008 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantCompositeQuantifierSet.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Reluctant version of composite(i.e. {n,m}) quantifier set over leaf nodes. + * + * @author Nikolay A. 
Kuznetsov + */ +class ReluctantCompositeQuantifierSet extends CompositeQuantifierSet { + public ReluctantCompositeQuantifierSet(Quantifier quant, LeafSet innerSet, AbstractSet next, int type) { + super(quant, innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int min = quantifier.min(); + int max = quantifier.max(); + int i = 0; + int shift = 0; + + for (; i < min; i++) { + + if (stringIndex + leaf.charCount() > matchResult.getRightBound()) { + matchResult.hitEnd = true; + return -1; + } + + shift = leaf.accepts(stringIndex, testString); + if (shift < 1) { + return -1; + } + stringIndex += shift; + } + + do { + shift = next.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + + if (stringIndex + leaf.charCount() <= matchResult.getRightBound()) { + shift = leaf.accepts(stringIndex, testString); + stringIndex += shift; + i++; + } + + } while (shift >= 1 && i <= max); + + return -1; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantGroupQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantGroupQuantifierSet.java new file mode 100644 index 000000000..497340f09 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantGroupQuantifierSet.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Relactant version of the group quantifier set. + * + * @author Nikolay A. Kuznetsov + */ +class ReluctantGroupQuantifierSet extends GroupQuantifierSet { + public ReluctantGroupQuantifierSet(AbstractSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + if (!innerSet.hasConsumed(matchResult)) + return next.matches(stringIndex, testString, matchResult); + + int res = next.matches(stringIndex, testString, matchResult); + if (res < 0) { + return innerSet.matches(stringIndex, testString, matchResult); + } else { + return res; + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantQuantifierSet.java new file mode 100644 index 000000000..3dbaf5f3b --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/ReluctantQuantifierSet.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * This class represents [+*]? constructs over LeafSets. + * + * @see java.util.regex.LeafSet + * @author Nikolay A. Kuznetsov + */ +class ReluctantQuantifierSet extends LeafQuantifierSet { + + public ReluctantQuantifierSet(LeafSet innerSet, AbstractSet next, int type) { + super(innerSet, next, type); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int shift = 0; + + do { + shift = next.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + + if (stringIndex + leaf.charCount() <= matchResult.getRightBound()) { + shift = leaf.accepts(stringIndex, testString); + stringIndex += shift; + } + } while (shift >= 1); + + return -1; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SOLSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SOLSet.java new file mode 100644 index 000000000..6e56ceb67 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SOLSet.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character. + * + * @author Nikolay A. Kuznetsov + */ +final class SOLSet extends AbstractSet { + + @Override + public int matches(int strIndex, CharSequence testString, + MatchResultImpl matchResult) { + if (strIndex == 0 + || (matchResult.hasAnchoringBounds() && strIndex == matchResult + .getLeftBound())) { + return next.matches(strIndex, testString, matchResult); + } + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + return false; + } + + @Override + protected String getName() { + return ""; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SequenceSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SequenceSet.java new file mode 100644 index 000000000..b88fe34f7 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SequenceSet.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * This class represents nodes constructed with character sequences. For + * example, lets consider regular expression: ".*word.*". During regular + * expression compilation phase character sequence w-o-r-d, will be represented + * with single node for the entire word. + * + * During the match phase, Moyer-Moore algorithm will be used for fast + * searching. + * + * Please follow the next link for more details about mentioned algorithm: + * http://portal.acm.org/citation.cfm?id=359859 + * + * @author Nikolay A. Kuznetsov + */ +class SequenceSet extends LeafSet { + + private String string = null; + + private IntHash leftToRight; + + private IntHash rightToLeft; + + SequenceSet(StringBuffer substring) { + this.string = substring.toString(); + charCount = substring.length(); + + leftToRight = new IntHash(charCount); + rightToLeft = new IntHash(charCount); + for (int j = 0; j < charCount - 1; j++) { + leftToRight.put(string.charAt(j), charCount - j - 1); + rightToLeft + .put(string.charAt(charCount - j - 1), charCount - j - 1); + } + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + return startsWith(testString, strIndex) ? 
charCount : -1; + } + + @Override + public int find(int strIndex, CharSequence testString, + MatchResultImpl matchResult) { + + int strLength = matchResult.getRightBound(); + + while (strIndex <= strLength) { + strIndex = indexOf(testString, strIndex, strLength); + + if (strIndex < 0) + return -1; + if (next.matches(strIndex + charCount, testString, matchResult) >= 0) + return strIndex; + + strIndex++; + } + + return -1; + } + + @Override + public int findBack(int strIndex, int lastIndex, CharSequence testString, + MatchResultImpl matchResult) { + + while (lastIndex >= strIndex) { + lastIndex = lastIndexOf(testString, strIndex, lastIndex); + + if (lastIndex < 0) + return -1; + if (next.matches(lastIndex + charCount, testString, matchResult) >= 0) + return lastIndex; + + lastIndex--; + } + + return -1; + } + + @Override + public String getName() { + return "sequence: " + string; //$NON-NLS-1$ + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof CharSet) { + return ((CharSet) set).getChar() == string.charAt(0); + } else if (set instanceof RangeSet) { + return ((RangeSet) set).accepts(0, string.substring(0, 1)) > 0; + } else if (set instanceof SupplRangeSet) { + return ((SupplRangeSet) set).contains(string.charAt(0)) + || ((string.length() > 1) && ((SupplRangeSet) set).contains(Character + .toCodePoint(string.charAt(0), string.charAt(1)))); + } else if ((set instanceof SupplCharSet)) { + return (string.length() > 1) + ? 
((SupplCharSet) set).getCodePoint() + == Character.toCodePoint(string.charAt(0), + string.charAt(1)) + : false; + } + + return true; + } + + protected int indexOf(CharSequence str, int from, int to) { + int last = string.charAt(charCount - 1); + int i = from; + + while (i <= to - charCount) { + char ch = str.charAt(i + charCount - 1); + if (ch == last && startsWith(str, i)) { + return i; + } + + i += leftToRight.get(ch); + } + return -1; + } + + protected int lastIndexOf(CharSequence str, int to, int from) { + int first = string.charAt(0); + int size = str.length(); + int delta; + int i = ((delta = size - from - charCount) > 0) ? from : from + delta; + + while (i >= to) { + char ch = str.charAt(i); + if (ch == first && startsWith(str, i)) { + return i; + } + + i -= rightToLeft.get(ch); + } + return -1; + } + + protected boolean startsWith(CharSequence str, int from) { + for (int i = 0; i < charCount; i++) { + if (str.charAt(i + from) != string.charAt(i)) + return false; + } + return true; + } + + static class IntHash { + int[] table, values; + + int mask; + + int size; // <-maximum shift + + public IntHash(int size) { + while (size >= mask) { + mask = (mask << 1) | 1; + } + mask = (mask << 1) | 1; + table = new int[mask + 1]; + values = new int[mask + 1]; + this.size = size; + } + + public void put(int key, int value) { + int i = 0; + int hashCode = key & mask; + + for (;;) { + if (table[hashCode] == 0 // empty + || table[hashCode] == key) {// rewrite + table[hashCode] = key; + values[hashCode] = value; + return; + } + i++; + i &= mask; + + hashCode += i; + hashCode &= mask; + } + } + + public int get(int key) { + + int hashCode = key & mask; + int i = 0; + int storedKey; + + for (;;) { + storedKey = table[hashCode]; + + if (storedKey == 0) { // empty + return size; + } + + if (storedKey == key) { + return values[hashCode]; + } + + i++; + i &= mask; + + hashCode += i; + hashCode &= mask; + } + } + } +} diff --git 
a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SingleSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SingleSet.java new file mode 100644 index 000000000..f733606a2 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SingleSet.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Group node over subexpression w/o alternations. + * + * @author Nikolay A. 
Kuznetsov + */ +class SingleSet extends JointSet { + + protected AbstractSet kid; + + public SingleSet(AbstractSet child, FSet fSet) { + this.kid = child; + this.fSet = fSet; + this.groupIndex = fSet.getGroupIndex(); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int start = matchResult.getStart(groupIndex); + matchResult.setStart(groupIndex, stringIndex); + int shift = kid.matches(stringIndex, testString, matchResult); + if (shift >= 0) { + return shift; + } + matchResult.setStart(groupIndex, start); + return -1; + } + + @Override + public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int res = kid.find(stringIndex, testString, matchResult); + if (res >= 0) + matchResult.setStart(groupIndex, res); + return res; + } + + @Override + public int findBack(int stringIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + int res = kid.findBack(stringIndex, lastIndex, testString, matchResult); + if (res >= 0) + matchResult.setStart(groupIndex, res); + return res; + } + + @Override + public boolean first(AbstractSet set) { + return kid.first(set); + } + + /** + * This method is used for replacement backreferenced sets. + */ + @Override + public JointSet processBackRefReplacement() { + BackReferencedSingleSet set = new BackReferencedSingleSet(this); + + /* + * We will store a reference to created BackReferencedSingleSet in next + * field. This is needed toprocess replacement of sets correctly since + * sometimes we cannot renew all references to detachable set in the + * current point of traverse. See QuantifierSet and AbstractSet + * processSecondPass() methods for more details. + */ + next = set; + return set; + } + + /** + * This method is used for traversing nodes after the first stage of + * compilation. 
+ */ + @Override + public void processSecondPass() { + this.isSecondPassVisited = true; + + if (fSet != null && !fSet.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + + /* + * End code to do during the pass + */ + fSet.processSecondPass(); + } + + if (kid != null && !kid.isSecondPassVisited) { + + /* + * Add here code to do during the pass + */ + JointSet set = kid.processBackRefReplacement(); + + if (set != null) { + kid.isSecondPassVisited = true; + kid = set; + } + + /* + * End code to do during the pass + */ + + kid.processSecondPass(); + } + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SpecialToken.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SpecialToken.java new file mode 100644 index 000000000..f8732e854 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SpecialToken.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * This is base class for special tokens like character classes + * and quantifiers. + * + * @author Nikolay A. 
/**
 * Common base for special tokens such as character classes and quantifiers.
 *
 * @author Nikolay A. Kuznetsov
 */
abstract class SpecialToken {

    /** Type flag of a token that represents a character class. */
    public static final int TOK_CHARCLASS = 0x1;

    /** Type flag of a token that represents a quantifier. */
    public static final int TOK_QUANTIFIER = 0x2;

    /**
     * Reports what kind of token this is.
     *
     * @return the token type: {@link #TOK_CHARCLASS} for a character class,
     *         {@link #TOK_QUANTIFIER} for a quantifier
     */
    public abstract int getType();
}
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single supplementary codepoint. 
+ */ +class SupplCharSet extends LeafSet { + + /* + * UTF-16 encoding of this supplementary codepoint + */ + private char high = 0; + + private char low = 0; + + // int value of this supplementary codepoint + private int ch; + + public SupplCharSet(int ch) { + charCount = 2; + this.ch = ch; + char[] chUTF16 = Character.toChars(ch); + high = chUTF16[0]; + + /* + * we suppose that SupplCharSet is build over supplementary codepoints + * only + */ + low = chUTF16[1]; + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + char high = testString.charAt(strIndex++); + char low = testString.charAt(strIndex); + return ((this.high == high) && (this.low == low)) ? 2 : -1; + } + + @Override + public int find(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + + if (testString instanceof String) { + String testStr = (String)testString; + int strLength = matchResult.getRightBound(); + + while (strIndex < strLength) { + strIndex = testStr.indexOf(high, strIndex); + if (strIndex < 0) + return -1; + + strIndex++; + if (strIndex < strLength) { + char ch = testStr.charAt(strIndex); + + if ((low == ch) && (next.matches(strIndex + 1, testString, matchResult) >= 0)) { + return --strIndex; + } + strIndex++; + } + } + return -1; + } + + return super.find(strIndex, testString, matchResult); + } + + @Override + public int findBack(int strIndex, int lastIndex, CharSequence testString, MatchResultImpl matchResult) { + + if (testString instanceof String) { + String testStr = (String)testString; + + while (lastIndex >= strIndex) { + lastIndex = testStr.lastIndexOf(low, lastIndex); + lastIndex--; + if (lastIndex < 0 || lastIndex < strIndex) { + return -1; + } + + if ((high == testStr.charAt(lastIndex)) && next.matches(lastIndex + 2, testString, matchResult) >= 0) { + return lastIndex; + } + + lastIndex--; + } + return -1; + } + + return super.findBack(strIndex, lastIndex, testString, matchResult); + } + + @Override + protected String getName() { + 
return "" + high + low; + } + + protected int getCodePoint() { + return ch; + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof SupplCharSet) { + return ((SupplCharSet)set).getCodePoint() == ch; + } else if (set instanceof SupplRangeSet) { + return ((SupplRangeSet)set).contains(ch); + } else if (set instanceof CharSet) { + return false; + } else if (set instanceof RangeSet) { + return false; + } + + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SupplRangeSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SupplRangeSet.java new file mode 100644 index 000000000..169ac68ed --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/SupplRangeSet.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. 
Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character from the given char class. This + * character can be supplementary (2 chars needed to represent) or from basic + * multilingual pane (1 needed char to represent it). 
+ */ +class SupplRangeSet extends JointSet { + + protected AbstractCharClass chars; + + protected boolean alt = false; + + public SupplRangeSet(AbstractCharClass cs, AbstractSet next) { + this.chars = cs.getInstance(); + this.alt = cs.alt; + this.next = next; + } + + public SupplRangeSet(AbstractCharClass cc) { + this.chars = cc.getInstance(); + this.alt = cc.alt; + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + int strLength = matchResult.getRightBound(); + int offset = -1; + + if (stringIndex < strLength) { + char high = testString.charAt(stringIndex++); + + if (contains(high) && (offset = next.matches(stringIndex, testString, matchResult)) > 0) { + return offset; + } + + if (stringIndex < strLength) { + char low = testString.charAt(stringIndex++); + + if (Character.isSurrogatePair(high, low) && contains(Character.toCodePoint(high, low))) { + return next.matches(stringIndex, testString, matchResult); + } + } + } + + return -1; + } + + @Override + protected String getName() { + return "range:" + (alt ? 
"^ " : " ") + chars.toString(); + } + + public boolean contains(int ch) { + return chars.contains(ch); + } + + @Override + public boolean first(AbstractSet set) { + if (set instanceof SupplCharSet) { + return AbstractCharClass.intersects(chars, ((SupplCharSet)set).getCodePoint()); + } else if (set instanceof CharSet) { + return AbstractCharClass.intersects(chars, ((CharSet)set).getChar()); + } else if (set instanceof SupplRangeSet) { + return AbstractCharClass.intersects(chars, ((SupplRangeSet)set).chars); + } else if (set instanceof RangeSet) { + return AbstractCharClass.intersects(chars, ((RangeSet)set).getChars()); + } + + return true; + } + + protected AbstractCharClass getChars() { + return chars; + } + + @Override + public AbstractSet getNext() { + return next; + } + + @Override + public void setNext(AbstractSet next) { + this.next = next; + } + + @Override + public boolean hasConsumed(MatchResultImpl mr) { + return true; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TMatchResult.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TMatchResult.java new file mode 100644 index 000000000..73ffa4d01 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TMatchResult.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Holds the results of a successful match of a {@link TPattern} against a + * given string. The result is divided into groups, with one group for each + * pair of parentheses in the regular expression and an additional group for + * the whole regular expression. The start, end, and contents of each group + * can be queried. + * + * @see TMatcher + * @see TMatcher#toMatchResult() + * + * @author Nikolay A. Kuznetsov + */ +public interface TMatchResult { + + /** + * Returns the index of the first character following the text that matched + * the whole regular expression. + * + * @return the character index. + */ + int end(); + + /** + * Returns the index of the first character following the text that matched + * a given group. + * + * @param group + * the group, ranging from 0 to groupCount() - 1, with 0 + * representing the whole pattern. + * + * @return the character index. + */ + int end(int group); + + /** + * Returns the text that matched the whole regular expression. + * + * @return the text. + */ + String group(); + + /** + * Returns the text that matched a given group of the regular expression. + * + * @param group + * the group, ranging from 0 to groupCount() - 1, with 0 + * representing the whole pattern. + * + * @return the text that matched the group. + */ + String group(int group); + + /** + * Returns the number of groups in the result, which is always equal to + * the number of groups in the original regular expression. + * + * @return the number of groups. + */ + int groupCount(); + + /** + * Returns the index of the first character of the text that matched + * the whole regular expression. + * + * @return the character index. + */ + int start(); + + /** + * Returns the index of the first character of the text that matched a given + * group. 
+ * + * @param group + * the group, ranging from 0 to groupCount() - 1, with 0 + * representing the whole pattern. + * + * @return the character index. + */ + int start(int group); +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TMatcher.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TMatcher.java new file mode 100644 index 000000000..2a945a351 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TMatcher.java @@ -0,0 +1,699 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +import java.util.ArrayList; + +/** + * Provides a means of matching regular expressions against a given input, + * finding occurrences of regular expressions in a given input, or replacing + * parts of a given input. A {@code Matcher} instance has an associated + * {@link TPattern} instance and an input text. A typical use case is to + * iteratively find all occurrences of the {@code Pattern}, until the end of the + * input is reached, as the following example illustrates: + * + *

+ * + *

+ * Pattern p = Pattern.compile("[A-Za-z]+");
+ *
+ * Matcher m = p.matcher("Hello, Android!");
+ * while (m.find()) {
+ *     System.out.println(m.group()); // prints "Hello" and "Android"
+ * }
+ * 
+ * + *

+ * + * The {@code Matcher} has a state that results from the previous operations. + * For example, it knows whether the most recent attempt to find the + * {@code Pattern} was successful and at which position the next attempt would + * resume the search. Depending on the application's needs, it may become + * necessary to explicitly {@link #reset()} this state from time to time. + */ +public final class TMatcher implements TMatchResult { + + static int MODE_FIND = 1 << 0; + + static int MODE_MATCH = 1 << 1; + + private TPattern pat = null; + + private AbstractSet start = null; + + private CharSequence string = null; + + private MatchResultImpl matchResult = null; + + // bounds + private int leftBound = -1; + + private int rightBound = -1; + + // replacements + private int appendPos = 0; + + private String replacement = null; + + private String processedRepl = null; + + private ArrayList replacementParts = null; + + /** + * Appends a literal part of the input plus a replacement for the current + * match to a given {@link StringBuffer}. The literal part is exactly the + * part of the input between the previous match and the current match. The + * method can be used in conjunction with {@link #find()} and + * {@link #appendTail(StringBuffer)} to walk through the input and replace + * all occurrences of the {@code Pattern} with something else. + * + * @param buffer + * the {@code StringBuffer} to append to. + * @param replacement + * the replacement text. + * @return the {@code Matcher} itself. + * @throws IllegalStateException + * if no successful match has been made. 
+ */ + public TMatcher appendReplacement(StringBuffer buffer, String replacement) { + processedRepl = processReplacement(replacement); + buffer.append(string.subSequence(appendPos, start())); + buffer.append(processedRepl); + appendPos = end(); + return this; + } + + /** + * Parses replacement string and creates pattern + */ + private String processReplacement(String replacement) { + if (this.replacement != null && this.replacement.equals(replacement)) { + if (replacementParts == null) { + return processedRepl; + } else { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < replacementParts.size(); i++) { + sb.append(replacementParts.get(i)); + } + + return sb.toString(); + } + } else { + this.replacement = replacement; + char[] repl = replacement.toCharArray(); + StringBuilder res = new StringBuilder(); + replacementParts = null; + + int index = 0; + int replacementPos = 0; + boolean nextBackSlashed = false; + + while (index < repl.length) { + + if (repl[index] == '\\' && !nextBackSlashed) { + nextBackSlashed = true; + index++; + } + + if (nextBackSlashed) { + res.append(repl[index]); + nextBackSlashed = false; + } else { + if (repl[index] == '$') { + if (replacementParts == null) { + replacementParts = new ArrayList<>(); + } + try { + final int gr = Integer.parseInt(new String(repl, ++index, 1)); + + if (replacementPos != res.length()) { + replacementParts.add(res.subSequence(replacementPos, res.length())); + replacementPos = res.length(); + } + + replacementParts.add(new Object() { + private final int grN = gr; + + @Override + public String toString() { + return group(grN); + } + }); + String group = group(gr); + replacementPos += group.length(); + res.append(group); + + } catch (IndexOutOfBoundsException iob) { + throw iob; + } catch (Exception e) { + throw new IllegalArgumentException(""); + } + } else { + res.append(repl[index]); + } + } + + index++; + } + + if (replacementParts != null && replacementPos != res.length()) { + 
replacementParts.add(res.subSequence(replacementPos, res.length())); + } + return res.toString(); + } + } + + /** + * Provides a new input and resets the {@code Matcher}. This results in the + * region being set to the whole input. Results of a previous find get lost. + * The next attempt to find an occurrence of the {@link TPattern} in the + * string will start at the beginning of the input. + * + * @param input + * the new input sequence. + * + * @return the {@code Matcher} itself. + */ + public TMatcher reset(CharSequence input) { + if (input == null) { + throw new NullPointerException(""); + } + this.string = input; + return reset(); + } + + /** + * Resets the {@code Matcher}. This results in the region being set to the + * whole input. Results of a previous find get lost. The next attempt to + * find an occurrence of the {@link TPattern} in the string will start at + * the beginning of the input. + * + * @return the {@code Matcher} itself. + */ + public TMatcher reset() { + this.leftBound = 0; + this.rightBound = string.length(); + matchResult.reset(string, leftBound, rightBound); + appendPos = 0; + replacement = null; + matchResult.previousMatch = -1; + return this; + } + + /** + * Resets this matcher and sets a region. Only characters inside the region + * are considered for a match. + * + * @param start + * the first character of the region. + * @param end + * the first character after the end of the region. + * @return the {@code Matcher} itself. + */ + public TMatcher region(int start, int end) { + + if (start > end || start < 0 || end < 0 || start > string.length() || end > string.length()) { + throw new IndexOutOfBoundsException(start + ", " + end); + } + + this.leftBound = start; + this.rightBound = end; + matchResult.reset(null, start, end); + appendPos = 0; + replacement = null; + + return this; + } + + /** + * Appends the (unmatched) remainder of the input to the given + * {@link StringBuffer}. 
The method can be used in conjunction with + * {@link #find()} and {@link #appendReplacement(StringBuffer, String)} to + * walk through the input and replace all matches of the {@code Pattern} + * with something else. + * + * @param buffer + * the {@code StringBuffer} to append to. + * @return the {@code StringBuffer}. + * @throws IllegalStateException + * if no successful match has been made. + */ + public StringBuffer appendTail(StringBuffer buffer) { + return buffer.append(string.subSequence(appendPos, string.length())); + } + + /** + * Replaces the first occurrence of this matcher's pattern in the input with + * a given string. + * + * @param replacement + * the replacement text. + * @return the modified input string. + */ + public String replaceFirst(String replacement) { + reset(); + if (find()) { + StringBuffer sb = new StringBuffer(); + appendReplacement(sb, replacement); + return appendTail(sb).toString(); + } + + return string.toString(); + + } + + /** + * Replaces all occurrences of this matcher's pattern in the input with a + * given string. + * + * @param replacement + * the replacement text. + * @return the modified input string. + */ + public String replaceAll(String replacement) { + StringBuffer sb = new StringBuffer(); + reset(); + while (find()) { + appendReplacement(sb, replacement); + } + + return appendTail(sb).toString(); + } + + /** + * Returns the {@link TPattern} instance used inside this matcher. + * + * @return the {@code Pattern} instance. + */ + public TPattern pattern() { + return pat; + } + + /** + * Returns the text that matched a given group of the regular expression. + * + * @param group + * the group, ranging from 0 to groupCount() - 1, with 0 + * representing the whole pattern. + * @return the text that matched the group. + * @throws IllegalStateException + * if no successful match has been made. 
+ */ + @Override + public String group(int group) { + /* Group 0 is the whole match, so the valid indexes are 0..groupCount() inclusive. */ + if (group < 0 || group > matchResult.groupCount()) { + throw new IndexOutOfBoundsException("Index " + group + " is out of range [0; " + + matchResult.groupCount() + "]"); + } + return matchResult.group(group); + } + + /** + * Returns the text that matched the whole regular expression. + * + * @return the text. + * @throws IllegalStateException + * if no successful match has been made. + */ + @Override + public String group() { + return group(0); + } + + /** + * Returns the next occurrence of the {@link TPattern} in the input. The + * method starts the search from the given character in the input. + * + * @param start + * The index in the input at which the find operation is to + * begin. If this is less than the start of the region, it is + * automatically adjusted to that value. If it is beyond the end + * of the region, the method will fail. + * @return true if (and only if) a match has been found. + */ + public boolean find(int start) { + int stringLength = string.length(); + if (start < 0 || start > stringLength) { + throw new IndexOutOfBoundsException(String.valueOf(start)); + } + + start = findAt(start); + if (start >= 0 && matchResult.isValid()) { + matchResult.finalizeMatch(); + return true; + } + matchResult.startIndex = -1; + return false; + } + + private int findAt(int startIndex) { + matchResult.reset(); + matchResult.setMode(TMatcher.MODE_FIND); + matchResult.setStartIndex(startIndex); + int foundIndex = start.find(startIndex, string, matchResult); + if (foundIndex == -1) { + matchResult.hitEnd = true; + } + return foundIndex; + } + + /** + * Returns the next occurrence of the {@link TPattern} in the input. If a + * previous match was successful, the method continues the search from the + * first character following that match in the input. Otherwise it searches + * either from the region start (if one has been set), or from position 0. + * + * @return true if (and only if) a match has been found.
+ */ + public boolean find() { + int length = string.length(); + if (!hasTransparentBounds()) + length = rightBound; + if (matchResult.startIndex >= 0 && matchResult.mode() == TMatcher.MODE_FIND) { + matchResult.startIndex = matchResult.end(); + if (matchResult.end() == matchResult.start()) { + matchResult.startIndex++; + } + + return matchResult.startIndex <= length ? find(matchResult.startIndex) : false; + } else { + return find(leftBound); + } + } + + /** + * Returns the index of the first character of the text that matched a given + * group. + * + * @param group + * the group, ranging from 0 to groupCount() - 1, with 0 + * representing the whole pattern. + * @return the character index. + * @throws IllegalStateException + * if no successful match has been made. + */ + @Override + public int start(int group) { + return matchResult.start(group); + } + + /** + * Returns the index of the first character following the text that matched + * a given group. + * + * @param group + * the group, ranging from 0 to groupCount() - 1, with 0 + * representing the whole pattern. + * @return the character index. + * @throws IllegalStateException + * if no successful match has been made. + */ + @Override + public int end(int group) { + return matchResult.end(group); + } + + /** + * Tries to match the {@link TPattern} against the entire region (or the + * entire input, if no region has been set). + * + * @return true if (and only if) the {@code Pattern} matches the entire + * region. + */ + public boolean matches() { + return lookingAt(leftBound, TMatcher.MODE_MATCH); + } + + /** + * Returns a replacement string for the given one that has all backslashes + * and dollar signs escaped. + * + * @param s + * the input string. + * @return the input string, with all backslashes and dollar signs having + * been escaped. 
+ */ + public static String quoteReplacement(String s) { + // first check whether we have smth to quote + if (s.indexOf('\\') < 0 && s.indexOf('$') < 0) + return s; + StringBuilder res = new StringBuilder(s.length() * 2); + char ch; + int len = s.length(); + + for (int i = 0; i < len; i++) { + + switch (ch = s.charAt(i)) { + case '$': + res.append('\\'); + res.append('$'); + break; + case '\\': + res.append('\\'); + res.append('\\'); + break; + default: + res.append(ch); + } + } + + return res.toString(); + } + + /** + * Runs match starting from set specified against input + * sequence starting at index specified; Result of the match + * will be stored into matchResult instance; + */ + private boolean runMatch(AbstractSet set, int index, MatchResultImpl matchResult) { + + if (set.matches(index, string, matchResult) >= 0) { + matchResult.finalizeMatch(); + return true; + } + + return false; + } + + /** + * Tries to match the {@link TPattern}, starting from the beginning of the + * region (or the beginning of the input, if no region has been set). + * Doesn't require the {@code Pattern} to match against the whole region. + * + * @return true if (and only if) the {@code Pattern} matches. + */ + public boolean lookingAt() { + return lookingAt(leftBound, TMatcher.MODE_FIND); + } + + private boolean lookingAt(int startIndex, int mode) { + matchResult.reset(); + matchResult.setMode(mode); + matchResult.setStartIndex(startIndex); + return runMatch(start, startIndex, matchResult); + } + + /** + * Returns the index of the first character of the text that matched the + * whole regular expression. + * + * @return the character index. + * @throws IllegalStateException + * if no successful match has been made. + */ + @Override + public int start() { + return start(0); + } + + /** + * Returns the number of groups in the results, which is always equal to the + * number of groups in the original regular expression. + * + * @return the number of groups. 
+ */ + @Override + public int groupCount() { + return matchResult.groupCount(); + } + + /** + * Returns the index of the first character following the text that matched + * the whole regular expression. + * + * @return the character index. + * @throws IllegalStateException + * if no successful match has been made. + */ + @Override + public int end() { + return end(0); + } + + /** + * Converts the current match into a separate {@link TMatchResult} instance + * that is independent from this matcher. The new object is unaffected when + * the state of this matcher changes. + * + * @return the new {@code MatchResult}. + * @throws IllegalStateException + * if no successful match has been made. + */ + public TMatchResult toMatchResult() { + return this.matchResult.cloneImpl(); + } + + /** + * Determines whether this matcher has anchoring bounds enabled or not. When + * anchoring bounds are enabled, the start and end of the input match the + * '^' and '$' meta-characters, otherwise not. Anchoring bounds are enabled + * by default. + * + * @param value + * the new value for anchoring bounds. + * @return the {@code Matcher} itself. + */ + public TMatcher useAnchoringBounds(boolean value) { + matchResult.useAnchoringBounds(value); + return this; + } + + /** + * Indicates whether this matcher has anchoring bounds enabled. When + * anchoring bounds are enabled, the start and end of the input match the + * '^' and '$' meta-characters, otherwise not. Anchoring bounds are enabled + * by default. + * + * @return true if (and only if) the {@code Matcher} uses anchoring bounds. + */ + public boolean hasAnchoringBounds() { + return matchResult.hasAnchoringBounds(); + } + + /** + * Determines whether this matcher has transparent bounds enabled or not. + * When transparent bounds are enabled, the parts of the input outside the + * region are subject to lookahead and lookbehind, otherwise they are not. + * Transparent bounds are disabled by default. 
+ * + * @param value + * the new value for transparent bounds. + * @return the {@code Matcher} itself. + */ + public TMatcher useTransparentBounds(boolean value) { + matchResult.useTransparentBounds(value); + return this; + } + + /** + * Indicates whether this matcher has transparent bounds enabled. When + * transparent bounds are enabled, the parts of the input outside the region + * are subject to lookahead and lookbehind, otherwise they are not. + * Transparent bounds are disabled by default. + * + * @return true if (and only if) the {@code Matcher} uses transparent bounds. + */ + public boolean hasTransparentBounds() { + return matchResult.hasTransparentBounds(); + } + + /** + * Returns this matcher's region start, that is, the first character that is + * considered for a match. + * + * @return the start of the region. + */ + public int regionStart() { + return matchResult.getLeftBound(); + } + + /** + * Returns this matcher's region end, that is, the first character that is + * not considered for a match. + * + * @return the end of the region. + */ + public int regionEnd() { + return matchResult.getRightBound(); + } + + /** + * Indicates whether more input might change a successful match into an + * unsuccessful one. + * + * @return true if (and only if) more input might change a successful match + * into an unsuccessful one. + */ + public boolean requireEnd() { + return matchResult.requireEnd; + } + + /** + * Indicates whether the last match hit the end of the input. + * + * @return true if (and only if) the last match hit the end of the input. + */ + public boolean hitEnd() { + return matchResult.hitEnd; + } + + /** + * Sets a new pattern for the {@code Matcher}. Results of a previous find + * get lost. The next attempt to find an occurrence of the {@link TPattern} + * in the string will start at the beginning of the input. + * + * @param pattern + * the new {@code Pattern}. + * + * @return the {@code Matcher} itself.
+ */ + public TMatcher usePattern(TPattern pattern) { + if (pattern == null) { + throw new IllegalArgumentException(""); + } + int startIndex = matchResult.getPreviousMatchEnd(); + int mode = matchResult.mode(); + this.pat = pattern; + this.start = pattern.start; + matchResult = new MatchResultImpl(this.string, leftBound, rightBound, pattern.groupCount(), + pattern.compCount(), pattern.consCount()); + matchResult.setStartIndex(startIndex); + matchResult.setMode(mode); + return this; + } + + TMatcher(TPattern pat, CharSequence cs) { + this.pat = pat; + this.start = pat.start; + this.string = cs; + this.leftBound = 0; + this.rightBound = string.length(); + matchResult = new MatchResultImpl(cs, leftBound, rightBound, pat.groupCount(), pat.compCount(), pat.consCount()); + } + + @Override + public String toString() { + String lastMatch = ""; + try { + lastMatch = Integer.toString(start()); + } catch (IllegalStateException e) { + } + return "Regex[pattern=" + pat + " region=" + matchResult.getLeftBound() + "," + matchResult.getRightBound() + + " lastmatch=" + lastMatch + "]"; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TPattern.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TPattern.java new file mode 100644 index 000000000..307c6cf41 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TPattern.java @@ -0,0 +1,1362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +import java.io.Serializable; +import java.util.ArrayList; + +/** + * Represents a pattern used for matching, searching, or replacing strings. + * {@code Pattern}s are specified in terms of regular expressions and compiled + * using an instance of this class. They are then used in conjunction with a + * {@link TMatcher} to perform the actual search. + *

+ * A typical use case looks like this: + *

+ * + *

+ * Pattern p = Pattern.compile("Hello, A[a-z]*!");
+ *
+ * Matcher m = p.matcher("Hello, Android!");
+ * boolean b1 = m.matches(); // true
+ *
+ * m.reset("Hello, Robot!");
+ * boolean b2 = m.matches(); // false
+ * 
+ *

+ * The above code could also be written in a more compact fashion, though this + * variant is less efficient, since {@code Pattern} and {@code Matcher} objects + * are created on the fly instead of being reused: + * + *

+ * boolean b1 = Pattern.matches("Hello, A[a-z]*!", "Hello, Android!"); // true
+ * boolean b2 = Pattern.matches("Hello, A[a-z]*!", "Hello, Robot!"); // false
+ * 
+ * + * @see TMatcher + */ +public final class TPattern implements Serializable { + + private static final long serialVersionUID = 5073258162644648461L; + + static final boolean _DEBUG_ = false; + + /** + * This constant specifies that a pattern matches Unix line endings ('\n') + * only against the '.', '^', and '$' meta characters. + */ + public static final int UNIX_LINES = 1 << 0; + + /** + * This constant specifies that a {@code Pattern} is matched + * case-insensitively. That is, the patterns "a+" and "A+" would both match + * the string "aAaAaA". + */ + public static final int CASE_INSENSITIVE = 1 << 1; + + /** + * This constant specifies that a {@code Pattern} may contain whitespace or + * comments. Otherwise comments and whitespace are taken as literal + * characters. + */ + public static final int COMMENTS = 1 << 2; + + /** + * This constant specifies that the meta characters '^' and '$' match only + * the beginning and end of an input line, respectively. Normally, they + * match the beginning and the end of the complete input. + */ + public static final int MULTILINE = 1 << 3; + + /** + * This constant specifies that the whole {@code Pattern} is to be taken + * literally, that is, all meta characters lose their meanings. + */ + public static final int LITERAL = 1 << 4; + + /** + * This constant specifies that the '.' meta character matches arbitrary + * characters, including line endings, which is normally not the case. + */ + public static final int DOTALL = 1 << 5; + + /** + * This constant specifies that a {@code Pattern} is matched + * case-insensitively with regard to all Unicode characters. It is used in + * conjunction with the {@link #CASE_INSENSITIVE} constant to extend its + * meaning to all Unicode characters. + */ + public static final int UNICODE_CASE = 1 << 6; + + /** + * This constant specifies that a character in a {@code Pattern} and a + * character in the input string only match if they are canonically + * equivalent.
+ */ + public static final int CANON_EQ = 1 << 7; + + static final int BACK_REF_NUMBER = 10; + + /** + * Bit mask that includes all defined match flags + */ + static final int flagsBitMask = TPattern.UNIX_LINES | TPattern.CASE_INSENSITIVE | TPattern.COMMENTS | + TPattern.MULTILINE | TPattern.LITERAL | TPattern.DOTALL | TPattern.UNICODE_CASE | TPattern.CANON_EQ; + + /** + * Current pattern to be compiled; + */ + private transient Lexer lexemes = null; + + /** + * Pattern compile flags; + */ + private int flags = 0; + + private String pattern = null; + + /* + * All backreferences that may be used in pattern. + */ + transient private FSet backRefs[] = new FSet[BACK_REF_NUMBER]; + + /* + * Is true if backreferenced sets replacement is needed + */ + transient private boolean needsBackRefReplacement = false; + + transient private int globalGroupIndex = -1; + + transient private int compCount = -1; + + transient private int consCount = -1; + + transient AbstractSet start = null; + + /** + * Returns a {@link TMatcher} for the {@code Pattern} and a given input. The + * {@code Matcher} can be used to match the {@code Pattern} against the + * whole input, find occurrences of the {@code Pattern} in the input, or + * replace parts of the input. + * + * @param input + * the input to process. + * + * @return the resulting {@code Matcher}. + */ + public TMatcher matcher(CharSequence input) { + return new TMatcher(this, input); + } + + /** + * Splits the given input sequence around occurrences of the {@code Pattern} + * . The function first determines all occurrences of the {@code Pattern} + * inside the input sequence. It then builds an array of the + * "remaining" strings before, in-between, and after these + * occurrences. An additional parameter determines the maximal number of + * entries in the resulting array and the handling of trailing empty + * strings. + * + * @param inputSeq + * the input sequence. 
+     * @param limit
+     *            Determines the maximal number of entries in the resulting
+     *            array.
+     *            <ul>
+     *            <li>For n &gt; 0, it is guaranteed that the resulting array
+     *            contains at most n entries.
+     *            <li>For n &lt; 0, the length of the resulting array is exactly
+     *            the number of occurrences of the {@code Pattern} +1. All
+     *            entries are included.
+     *            <li>For n == 0, the length of the resulting array is at most
+     *            the number of occurrences of the {@code Pattern} +1. Empty
+     *            strings at the end of the array are not included.
+     *            </ul>
+ * + * @return the resulting array. + */ + public String[] split(CharSequence inputSeq, int limit) { + ArrayList res = new ArrayList<>(); + TMatcher mat = matcher(inputSeq); + int index = 0; + int curPos = 0; + + if (inputSeq.length() == 0) { + return new String[] { "" }; //$NON-NLS-1$ + } else { + while (mat.find() && (index + 1 < limit || limit <= 0)) { + res.add(inputSeq.subSequence(curPos, mat.start()).toString()); + curPos = mat.end(); + index++; + } + + res.add(inputSeq.subSequence(curPos, inputSeq.length()).toString()); + index++; + + /* + * discard trailing empty strings + */ + if (limit == 0) { + while (--index >= 0 && res.get(index).toString().length() == 0) { + res.remove(index); + } + } + } + return res.toArray(new String[index >= 0 ? index : 0]); + } + + /** + * Splits a given input around occurrences of a regular expression. This is + * a convenience method that is equivalent to calling the method + * {@link #split(java.lang.CharSequence, int)} with a limit of 0. + * + * @param input + * the input sequence. + * + * @return the resulting array. + */ + public String[] split(CharSequence input) { + return split(input, 0); + } + + /** + * Returns the regular expression that was compiled into this + * {@code Pattern}. + * + * @return the regular expression. + */ + public String pattern() { + return lexemes.toString(); + } + + @Override + public String toString() { + return this.pattern(); + } + + /** + * Returns the flags that have been set for this {@code Pattern}. + * + * @return the flags that have been set. A combination of the constants + * defined in this class. + * + * @see #CANON_EQ + * @see #CASE_INSENSITIVE + * @see #COMMENTS + * @see #DOTALL + * @see #LITERAL + * @see #MULTILINE + * @see #UNICODE_CASE + * @see #UNIX_LINES + */ + public int flags() { + return this.flags; + } + + /** + * Compiles a regular expression, creating a new {@code Pattern} instance in + * the process. 
Allows to set some flags that modify the behavior of the + * {@code Pattern}. + * + * @param pattern + * the regular expression. + * @param flags + * the flags to set. Basically, any combination of the constants + * defined in this class is valid. + * + * @return the new {@code Pattern} instance. + * + * @throws TPatternSyntaxException + * if the regular expression is syntactically incorrect. + * + * @see #CANON_EQ + * @see #CASE_INSENSITIVE + * @see #COMMENTS + * @see #DOTALL + * @see #LITERAL + * @see #MULTILINE + * @see #UNICODE_CASE + * @see #UNIX_LINES + */ + public static TPattern compile(String pattern, int flags) throws TPatternSyntaxException { + + if ((flags != 0) && ((flags | flagsBitMask) != flagsBitMask)) { + + throw new IllegalArgumentException(""); + } + + AbstractSet.counter = 1; + + return new TPattern().compileImpl(pattern, flags); + } + + /** + * + * @param pattern + * - Regular expression to be compiled + * @param flags + * - The bit mask including CASE_INSENSITIVE, MULTILINE, DOTALL, + * UNICODE_CASE, and CANON_EQ + * + * @return Compiled pattern + */ + private TPattern compileImpl(String pattern, int flags) throws TPatternSyntaxException { + this.lexemes = new Lexer(pattern, flags); + this.flags = flags; + this.pattern = pattern; + + start = processExpression(-1, this.flags, null); + if (!lexemes.isEmpty()) { + throw new TPatternSyntaxException("", lexemes.toString(), lexemes.getIndex()); + } + finalizeCompile(); + return this; + } + + /** + * A->(a|)+ + */ + private AbstractSet processAlternations(AbstractSet last) { + CharClass auxRange = new CharClass(hasFlag(TPattern.CASE_INSENSITIVE), hasFlag(TPattern.UNICODE_CASE)); + while (!lexemes.isEmpty() && + lexemes.isLetter() && + (lexemes.lookAhead() == 0 || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes.lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS)) { + auxRange.add(lexemes.next()); + if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) + lexemes.next(); + } + AbstractSet rangeSet = 
processRangeSet(auxRange);
+        rangeSet.setNext(last);
+
+        return rangeSet;
+    }
+
+    /**
+     * E->AE; E->S|E; E->S; A->(a|)+ E->S(|S)*
+     *
+     * Parses one (possibly parenthesized) expression: picks the closing
+     * {@code FSet} for the group kind {@code ch}, collects the alternatives
+     * into a child list and wraps them in the matching joint set.
+     */
+    private AbstractSet processExpression(int ch, int newFlags, AbstractSet last) {
+        ArrayList<AbstractSet> children = new ArrayList<>();
+        AbstractSet child;
+        int saveFlags = flags;
+        FSet fSet;
+        boolean saveChangedFlags = false;
+
+        if (newFlags != flags) {
+            flags = newFlags;
+        }
+
+        switch (ch) {
+            case Lexer.CHAR_NONCAP_GROUP:
+                fSet = new NonCapFSet(++consCount);
+                break;
+
+            case Lexer.CHAR_POS_LOOKAHEAD:
+                /* falls through */
+
+            case Lexer.CHAR_NEG_LOOKAHEAD:
+                fSet = new AheadFSet();
+                break;
+
+            case Lexer.CHAR_POS_LOOKBEHIND:
+                /* falls through */
+
+            case Lexer.CHAR_NEG_LOOKBEHIND:
+                fSet = new BehindFSet(++consCount);
+                break;
+
+            case Lexer.CHAR_ATOMIC_GROUP:
+                fSet = new AtomicFSet(++consCount);
+                break;
+
+            default:
+                globalGroupIndex++;
+                if (last == null) {
+
+                    // expr = new StartSet();
+                    fSet = new FinalSet();
+                    // top-level expression: flag changes are kept, not restored below
+                    saveChangedFlags = true;
+                } else {
+
+                    // expr = new JointSet(globalGroupIndex);
+                    fSet = new FSet(globalGroupIndex);
+                }
+                // only groups that fit the backRefs table are recorded
+                if (globalGroupIndex > -1 && globalGroupIndex < BACK_REF_NUMBER) {
+                    backRefs[globalGroupIndex] = fSet;
+                }
+                break;
+        }
+
+        do {
+            if (lexemes.isLetter() && lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR) {
+                child = processAlternations(fSet);
+            } else if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) {
+                child = new EmptySet(fSet);
+                lexemes.next();
+            } else {
+                child = processSubExpression(fSet);
+                if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR) {
+                    lexemes.next();
+                }
+            }
+            if (child != null) {
+
+                // expr.addChild(child);
+                children.add(child);
+            }
+        } while (!(lexemes.isEmpty() || (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS)));
+
+        // a trailing '|' contributes one more, empty, alternative
+        if (lexemes.back() == Lexer.CHAR_VERTICAL_BAR) {
+            children.add(new EmptySet(fSet));
+        }
+
+        if (flags != saveFlags && !saveChangedFlags) {
+            flags = saveFlags;
+            lexemes.restoreFlags(flags);
+        }
+
+        switch (ch) {
+            case Lexer.CHAR_NONCAP_GROUP:
+                return new
NonCapJointSet(children, fSet); + + case Lexer.CHAR_POS_LOOKAHEAD: + return new PositiveLookAhead(children, fSet); + + case Lexer.CHAR_NEG_LOOKAHEAD: + return new NegativeLookAhead(children, fSet); + + case Lexer.CHAR_POS_LOOKBEHIND: + return new PositiveLookBehind(children, fSet); + + case Lexer.CHAR_NEG_LOOKBEHIND: + return new NegativeLookBehind(children, fSet); + + case Lexer.CHAR_ATOMIC_GROUP: + return new AtomicJointSet(children, fSet); + + default: + switch (children.size()) { + case 0: + return new EmptySet(fSet); + + case 1: + return new SingleSet(children.get(0), fSet); + + default: + return new JointSet(children, fSet); + } + } + } + + /** + * T->a+ + */ + private AbstractSet processSequence() { + StringBuffer substring = new StringBuffer(); + + while (!lexemes.isEmpty() && + lexemes.isLetter() && + !lexemes.isHighSurrogate() && + !lexemes.isLowSurrogate() && + ((!lexemes.isNextSpecial() && lexemes.lookAhead() == 0) // end + // of + // pattern + || + (!lexemes.isNextSpecial() && Lexer.isLetter(lexemes.lookAhead())) || + lexemes.lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS || + (lexemes.lookAhead() & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS || + lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes.lookAhead() == Lexer.CHAR_DOLLAR)) { + int ch = lexemes.next(); + + if (Character.isSupplementaryCodePoint(ch)) { + substring.append(Character.toChars(ch)); + } else { + substring.append((char)ch); + } + } + if (!hasFlag(TPattern.CASE_INSENSITIVE)) { + return new SequenceSet(substring); + } else if (!hasFlag(TPattern.UNICODE_CASE)) { + return new CISequenceSet(substring); + } else { + return new UCISequenceSet(substring); + } + } + + /** + * D->a + */ + private AbstractSet processDecomposedChar() { + int[] codePoints = new int[Lexer.MAX_DECOMPOSITION_LENGTH]; + char[] codePointsHangul; + int readCodePoints = 0; + int curSymb = -1; + int curSymbIndex = -1; + + if (!lexemes.isEmpty() && lexemes.isLetter()) { + curSymb = lexemes.next(); + 
codePoints[readCodePoints] = curSymb; + curSymbIndex = curSymb - Lexer.LBase; + } + + /* + * We process decomposed Hangul syllable LV or LVT or process jamo L. + * See http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf + * "3.12 Conjoining Jamo Behavior" + */ + if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.LCount)) { + codePointsHangul = new char[Lexer.MAX_HANGUL_DECOMPOSITION_LENGTH]; + codePointsHangul[readCodePoints++] = (char)curSymb; + + curSymb = lexemes.peek(); + curSymbIndex = curSymb - Lexer.VBase; + if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.VCount)) { + codePointsHangul[readCodePoints++] = (char)curSymb; + lexemes.next(); + curSymb = lexemes.peek(); + curSymbIndex = curSymb - Lexer.TBase; + if ((curSymbIndex >= 0) && (curSymbIndex < Lexer.TCount)) { + codePointsHangul[readCodePoints++] = (char)curSymb; + lexemes.next(); + + // LVT syllable + return new HangulDecomposedCharSet(codePointsHangul, 3); + } else { + + // LV syllable + return new HangulDecomposedCharSet(codePointsHangul, 2); + } + } else { + + // L jamo + if (!hasFlag(TPattern.CASE_INSENSITIVE)) { + return new CharSet(codePointsHangul[0]); + } else if (!hasFlag(TPattern.UNICODE_CASE)) { + return new CICharSet(codePointsHangul[0]); + } else { + return new UCICharSet(codePointsHangul[0]); + } + } + + /* + * We process single codepoint or decomposed codepoint. We collect + * decomposed codepoint and obtain one DecomposedCharSet. + */ + } else { + readCodePoints++; + + while ((readCodePoints < Lexer.MAX_DECOMPOSITION_LENGTH) && !lexemes.isEmpty() && lexemes.isLetter() && + !Lexer.isDecomposedCharBoundary(lexemes.peek())) { + codePoints[readCodePoints++] = lexemes.next(); + } + + /* + * We have read an ordinary symbol. 
+ */ + if (readCodePoints == 1 && !Lexer.hasSingleCodepointDecomposition(codePoints[0])) { + return processCharSet(codePoints[0]); + } else { + if (!hasFlag(TPattern.CASE_INSENSITIVE)) { + return new DecomposedCharSet(codePoints, readCodePoints); + } else if (!hasFlag(TPattern.UNICODE_CASE)) { + return new CIDecomposedCharSet(codePoints, readCodePoints); + } else { + return new UCIDecomposedCharSet(codePoints, readCodePoints); + } + } + } + } + + /** + * S->BS; S->QS; S->Q; B->a+ + */ + private AbstractSet processSubExpression(AbstractSet last) { + AbstractSet cur; + if (lexemes.isLetter() && !lexemes.isNextSpecial() && Lexer.isLetter(lexemes.lookAhead())) { + if (hasFlag(TPattern.CANON_EQ)) { + cur = processDecomposedChar(); + if (!lexemes.isEmpty() + + /* && !pattern.isQuantifier() */ + && (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS || last instanceof FinalSet) && + lexemes.peek() != Lexer.CHAR_VERTICAL_BAR && !lexemes.isLetter()) { + cur = processQuantifier(last, cur); + } + } else if (lexemes.isHighSurrogate() || lexemes.isLowSurrogate()) { + AbstractSet term = processTerminal(last); + cur = processQuantifier(last, term); + } else { + cur = processSequence(); + } + } else if (lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS) { + if (last instanceof FinalSet) { + throw new TPatternSyntaxException("", lexemes.toString(), lexemes.getIndex()); + } else { + cur = new EmptySet(last); + } + } else { + AbstractSet term = processTerminal(last); + cur = processQuantifier(last, term); + } + + if (!lexemes.isEmpty() + // && !pattern.isQuantifier() + && + (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS || last instanceof FinalSet) && + lexemes.peek() != Lexer.CHAR_VERTICAL_BAR) { + AbstractSet next = processSubExpression(last); + if (cur instanceof LeafQuantifierSet + // TODO create personal UnifiedQuantifierSet for composite + // quantifiers + // to take into account Quantifier counters + // //// + && + !(cur instanceof CompositeQuantifierSet) && + !(cur instanceof 
GroupQuantifierSet) && + !(cur instanceof AltQuantifierSet) && !next.first(((LeafQuantifierSet)cur).getInnerSet())) { + cur = new UnifiedQuantifierSet((LeafQuantifierSet)cur); + } + if (((char)next.getType()) == '+') { + cur.setNext(((LeafQuantifierSet)next).getInnerSet()); + } else { + cur.setNext(next); + } + } else if (cur != null) { + cur.setNext(last); + } else { + return null; + } + + if (((char)cur.getType()) == '+') { + return ((QuantifierSet)cur).getInnerSet(); + } else { + return cur; + } + } + + /** + * Q->T(*|+|?...) also do some optimizations. + * + */ + private AbstractSet processQuantifier(AbstractSet last, AbstractSet term) { + int quant = lexemes.peek(); + + if (term != null && !(term instanceof LeafSet)) { + switch (quant) { + case Lexer.QUANT_STAR: + case Lexer.QUANT_PLUS: { + QuantifierSet q; + + lexemes.next(); + if (term.getType() == AbstractSet.TYPE_DOTSET) { + if (!hasFlag(TPattern.DOTALL)) { + q = new DotQuantifierSet(term, last, quant, AbstractLineTerminator.getInstance(flags)); + } else { + q = new DotAllQuantifierSet(term, last, quant); + } + } else { + q = new GroupQuantifierSet(term, last, quant); + } + term.setNext(q); + return q; + } + + case Lexer.QUANT_STAR_R: + case Lexer.QUANT_PLUS_R: { + lexemes.next(); + GroupQuantifierSet q = new ReluctantGroupQuantifierSet(term, last, quant); + term.setNext(q); + return q; + } + + case Lexer.QUANT_PLUS_P: { + lexemes.next(); + // possessive plus will be handled by unique class + // and should not be postprocessed to point previous set + // to the inner one. 
+ // // + return new PosPlusGroupQuantifierSet(term, last, Lexer.QUANT_STAR_P); + } + + case Lexer.QUANT_STAR_P: { + lexemes.next(); + return new PossessiveGroupQuantifierSet(term, last, quant); + } + + case Lexer.QUANT_ALT: { + lexemes.next(); + AltGroupQuantifierSet q = new AltGroupQuantifierSet(term, last, Lexer.QUANT_ALT); + term.setNext(last); + return q; + } + + case Lexer.QUANT_ALT_P: { + lexemes.next(); + return new PosAltGroupQuantifierSet(term, last, Lexer.QUANT_ALT); + } + + case Lexer.QUANT_ALT_R: { + lexemes.next(); + RelAltGroupQuantifierSet q = new RelAltGroupQuantifierSet(term, last, Lexer.QUANT_ALT); + term.setNext(last); + return q; + } + + case Lexer.QUANT_COMP: { + CompositeGroupQuantifierSet q = new CompositeGroupQuantifierSet((Quantifier)lexemes.nextSpecial(), + term, last, Lexer.QUANT_ALT, ++compCount); + term.setNext(q); + return q; + } + + case Lexer.QUANT_COMP_P: { + return new PosCompositeGroupQuantifierSet((Quantifier)lexemes.nextSpecial(), term, last, + Lexer.QUANT_ALT, ++compCount); + } + + case Lexer.QUANT_COMP_R: { + RelCompositeGroupQuantifierSet q = new RelCompositeGroupQuantifierSet( + (Quantifier)lexemes.nextSpecial(), term, last, Lexer.QUANT_ALT, ++compCount); + term.setNext(q); + return q; + } + + default: + return term; + } + } else { + LeafSet leaf = null; + if (term != null) + leaf = (LeafSet)term; + switch (quant) { + case Lexer.QUANT_STAR: + case Lexer.QUANT_PLUS: { + lexemes.next(); + LeafQuantifierSet q = new LeafQuantifierSet(leaf, last, quant); + leaf.setNext(q); + return q; + } + + case Lexer.QUANT_STAR_R: + case Lexer.QUANT_PLUS_R: { + lexemes.next(); + ReluctantQuantifierSet q = new ReluctantQuantifierSet(leaf, last, quant); + leaf.setNext(q); + return q; + } + + case Lexer.QUANT_PLUS_P: + case Lexer.QUANT_STAR_P: { + lexemes.next(); + PossessiveQuantifierSet q = new PossessiveQuantifierSet(leaf, last, quant); + leaf.setNext(q); + return q; + } + + case Lexer.QUANT_ALT: { + lexemes.next(); + return new 
AltQuantifierSet(leaf, last, Lexer.QUANT_ALT); + } + + case Lexer.QUANT_ALT_P: { + lexemes.next(); + return new PossessiveAltQuantifierSet(leaf, last, Lexer.QUANT_ALT_P); + } + + case Lexer.QUANT_ALT_R: { + lexemes.next(); + return new ReluctantAltQuantifierSet(leaf, last, Lexer.QUANT_ALT_R); + } + + case Lexer.QUANT_COMP: { + return new CompositeQuantifierSet((Quantifier)lexemes.nextSpecial(), leaf, last, Lexer.QUANT_COMP); + } + + case Lexer.QUANT_COMP_P: { + return new PossessiveCompositeQuantifierSet((Quantifier)lexemes.nextSpecial(), leaf, last, + Lexer.QUANT_COMP_P); + } + case Lexer.QUANT_COMP_R: { + return new ReluctantCompositeQuantifierSet((Quantifier)lexemes.nextSpecial(), leaf, last, + Lexer.QUANT_COMP_R); + } + + default: + return term; + } + } + } + + /** + * T-> letter|[range]|{char-class}|(E) + */ + private AbstractSet processTerminal(AbstractSet last) { + int ch; + AbstractSet term = null; + do { + ch = lexemes.peek(); + if ((ch & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS) { + int newFlags; + lexemes.next(); + newFlags = (ch & 0x00ff0000) >> 16; + ch = ch & 0xff00ffff; + if (ch == Lexer.CHAR_FLAGS) { + flags = newFlags; + } else { + newFlags = (ch == Lexer.CHAR_NONCAP_GROUP) ? 
newFlags : flags; + term = processExpression(ch, newFlags, last); + if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS) { + throw new TPatternSyntaxException("", lexemes.toString(), lexemes.getIndex()); + } + lexemes.next(); + } + } else + switch (ch) { + case Lexer.CHAR_LEFT_SQUARE_BRACKET: { + lexemes.next(); + boolean negative = false; + if (lexemes.peek() == Lexer.CHAR_CARET) { + negative = true; + lexemes.next(); + } + + term = processRange(negative, last); + if (lexemes.peek() != Lexer.CHAR_RIGHT_SQUARE_BRACKET) + throw new TPatternSyntaxException("", lexemes.toString(), lexemes.getIndex()); + lexemes.setMode(Lexer.MODE_PATTERN); + lexemes.next(); + break; + } + + case Lexer.CHAR_DOT: { + lexemes.next(); + + if (!hasFlag(TPattern.DOTALL)) { + term = new DotSet(AbstractLineTerminator.getInstance(flags)); + } else { + term = new DotAllSet(); + } + + break; + } + + case Lexer.CHAR_CARET: { + lexemes.next(); + consCount++; + if (!hasFlag(TPattern.MULTILINE)) { + term = new SOLSet(); + } else { + term = new MultiLineSOLSet(AbstractLineTerminator.getInstance(flags)); + } + + break; + } + + case Lexer.CHAR_DOLLAR: { + lexemes.next(); + consCount++; + if (!hasFlag(TPattern.MULTILINE)) { + if (!hasFlag(TPattern.UNIX_LINES)) { + term = new EOLSet(consCount); + } else { + term = new UEOLSet(consCount); + } + } else { + if (!hasFlag(TPattern.UNIX_LINES)) { + term = new MultiLineEOLSet(consCount); + } else { + term = new UMultiLineEOLSet(consCount); + } + } + + break; + } + + case Lexer.CHAR_WORD_BOUND: { + lexemes.next(); + term = new WordBoundary(true); + break; + } + + case Lexer.CHAR_NONWORD_BOUND: { + lexemes.next(); + term = new WordBoundary(false); + break; + } + + case Lexer.CHAR_END_OF_INPUT: { + lexemes.next(); + term = new EOISet(); + break; + } + + case Lexer.CHAR_END_OF_LINE: { + lexemes.next(); + term = new EOLSet(++consCount); + break; + } + + case Lexer.CHAR_START_OF_INPUT: { + lexemes.next(); + term = new SOLSet(); + break; + } + + case 
Lexer.CHAR_PREVIOUS_MATCH: { + lexemes.next(); + term = new PreviousMatch(); + break; + } + + case 0x80000000 | '1': + case 0x80000000 | '2': + case 0x80000000 | '3': + case 0x80000000 | '4': + case 0x80000000 | '5': + case 0x80000000 | '6': + case 0x80000000 | '7': + case 0x80000000 | '8': + case 0x80000000 | '9': { + int number = (ch & 0x7FFFFFFF) - '0'; + if (globalGroupIndex >= number) { + lexemes.next(); + consCount++; + if (!hasFlag(TPattern.CASE_INSENSITIVE)) { + term = new BackReferenceSet(number, consCount); + } else if (!hasFlag(TPattern.UNICODE_CASE)) { + term = new CIBackReferenceSet(number, consCount); + } else { + term = new UCIBackReferenceSet(number, consCount); + } + (backRefs[number]).isBackReferenced = true; + needsBackRefReplacement = true; + break; + } else { + throw new TPatternSyntaxException("", lexemes.toString(), lexemes.getIndex()); + } + } + + case 0: { + AbstractCharClass cc = null; + if ((cc = (AbstractCharClass)lexemes.peekSpecial()) != null) { + term = processRangeSet(cc); + } else if (!lexemes.isEmpty()) { + + // ch == 0 + term = new CharSet((char)ch); + } else { + term = new EmptySet(last); + break; + } + lexemes.next(); + break; + } + + default: { + if (ch >= 0 && !lexemes.isSpecial()) { + term = processCharSet(ch); + lexemes.next(); + } else if (ch == Lexer.CHAR_VERTICAL_BAR) { + term = new EmptySet(last); + } else if (ch == Lexer.CHAR_RIGHT_PARENTHESIS) { + if (last instanceof FinalSet) { + throw new TPatternSyntaxException("", lexemes.toString(), lexemes.getIndex()); + } else { + term = new EmptySet(last); + } + } else { + throw new TPatternSyntaxException((lexemes.isSpecial() ? 
lexemes.peekSpecial().toString() + : Character.toString((char)ch)), lexemes.toString(), lexemes.getIndex()); + } + } + } + } while (ch == Lexer.CHAR_FLAGS); + return term; + } + + private AbstractSet processRange(boolean negative, AbstractSet last) { + AbstractCharClass res = processRangeExpression(negative); + AbstractSet rangeSet = processRangeSet(res); + rangeSet.setNext(last); + + return rangeSet; + } + + /** + * process [...] ranges + */ + private CharClass processRangeExpression(boolean alt) { + CharClass res = new CharClass(alt, hasFlag(TPattern.CASE_INSENSITIVE), hasFlag(TPattern.UNICODE_CASE)); + int buffer = -1; + boolean intersection = false; + boolean notClosed = false; + boolean firstInClass = true; + + while (!lexemes.isEmpty() && (notClosed = (lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET || firstInClass)) { + switch (lexemes.peek()) { + + case Lexer.CHAR_RIGHT_SQUARE_BRACKET: { + if (buffer >= 0) + res.add(buffer); + buffer = ']'; + lexemes.next(); + break; + } + case Lexer.CHAR_LEFT_SQUARE_BRACKET: { + if (buffer >= 0) { + res.add(buffer); + buffer = -1; + } + lexemes.next(); + boolean negative = false; + if (lexemes.peek() == Lexer.CHAR_CARET) { + lexemes.next(); + negative = true; + } + + if (intersection) + res.intersection(processRangeExpression(negative)); + else + res.union(processRangeExpression(negative)); + intersection = false; + lexemes.next(); + break; + } + + case Lexer.CHAR_AMPERSAND: { + if (buffer >= 0) + res.add(buffer); + buffer = lexemes.next(); + + /* + * if there is a start for subrange we will do an + * intersection otherwise treat '&' as a normal character + */ + if (lexemes.peek() == Lexer.CHAR_AMPERSAND) { + if (lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET) { + lexemes.next(); + intersection = true; + buffer = -1; + } else { + lexemes.next(); + if (firstInClass) { + + // skip "&&" at "[&&...]" or "[^&&...]" + res = processRangeExpression(false); + } else { + + // ignore "&&" at "[X&&]" ending where X != + // 
empty string + if (!(lexemes.peek() == Lexer.CHAR_RIGHT_SQUARE_BRACKET)) { + res.intersection(processRangeExpression(false)); + } + } + + } + } else { + + // treat '&' as a normal character + buffer = '&'; + } + + break; + } + + case Lexer.CHAR_HYPHEN: { + if (firstInClass || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET || + lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0) { + // treat hypen as normal character + if (buffer >= 0) + res.add(buffer); + buffer = '-'; + lexemes.next(); + // range + } else { + lexemes.next(); + int cur = lexemes.peek(); + + if (!lexemes.isSpecial() && + (cur >= 0 || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET || + lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0)) { + + try { + if (!Lexer.isLetter(cur)) { + cur = cur & 0xFFFF; + } + res.add(buffer, cur); + } catch (Exception e) { + throw new TPatternSyntaxException("", pattern(), lexemes.getIndex()); + } + lexemes.next(); + buffer = -1; + } else { + throw new TPatternSyntaxException("", pattern(), lexemes.getIndex()); + } + } + + break; + } + + case Lexer.CHAR_CARET: { + if (buffer >= 0) + res.add(buffer); + buffer = '^'; + lexemes.next(); + break; + } + + case 0: { + if (buffer >= 0) + res.add(buffer); + AbstractCharClass cs = (AbstractCharClass)lexemes.peekSpecial(); + if (cs != null) { + res.add(cs); + buffer = -1; + } else { + buffer = 0; + } + + lexemes.next(); + break; + } + + default: { + if (buffer >= 0) + res.add(buffer); + buffer = lexemes.next(); + break; + } + } + + firstInClass = false; + } + if (notClosed) { + throw new TPatternSyntaxException("", pattern(), lexemes.getIndex() - 1); + } + if (buffer >= 0) + res.add(buffer); + return res; + } + + private AbstractSet processCharSet(int ch) { + boolean isSupplCodePoint = Character.isSupplementaryCodePoint(ch); + + if (hasFlag(TPattern.CASE_INSENSITIVE)) { + + if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { + return new CICharSet((char)ch); + } else if 
(hasFlag(TPattern.UNICODE_CASE) && ch > 128) { + if (isSupplCodePoint) { + return new UCISupplCharSet(ch); + } else if (Lexer.isLowSurrogate(ch)) { + + // we need no UCILowSurrogateCharSet + return new LowSurrogateCharSet((char)ch); + } else if (Lexer.isHighSurrogate(ch)) { + + // we need no UCIHighSurrogateCharSet + return new HighSurrogateCharSet((char)ch); + } else { + return new UCICharSet((char)ch); + } + } + } + + if (isSupplCodePoint) { + return new SupplCharSet(ch); + } else if (Lexer.isLowSurrogate(ch)) { + return new LowSurrogateCharSet((char)ch); + } else if (Lexer.isHighSurrogate(ch)) { + return new HighSurrogateCharSet((char)ch); + } else { + return new CharSet((char)ch); + } + } + + private AbstractSet processRangeSet(AbstractCharClass charClass) { + if (charClass.hasLowHighSurrogates()) { + AbstractCharClass surrogates = charClass.getSurrogates(); + LowHighSurrogateRangeSet lowHighSurrRangeSet = new LowHighSurrogateRangeSet(surrogates); + + if (charClass.mayContainSupplCodepoints()) { + if (!charClass.hasUCI()) { + return new CompositeRangeSet(new SupplRangeSet(charClass.getWithoutSurrogates()), + lowHighSurrRangeSet); + } else { + return new CompositeRangeSet(new UCISupplRangeSet(charClass.getWithoutSurrogates()), + lowHighSurrRangeSet); + } + } + + if (!charClass.hasUCI()) { + return new CompositeRangeSet(new RangeSet(charClass.getWithoutSurrogates()), lowHighSurrRangeSet); + } else { + return new CompositeRangeSet(new UCIRangeSet(charClass.getWithoutSurrogates()), lowHighSurrRangeSet); + } + } + + if (charClass.mayContainSupplCodepoints()) { + if (!charClass.hasUCI()) { + return new SupplRangeSet(charClass); + } else { + return new UCISupplRangeSet(charClass); + } + } + + if (!charClass.hasUCI()) { + return new RangeSet(charClass); + } else { + return new UCIRangeSet(charClass); + } + } + + /** + * Compiles a regular expression, creating a new Pattern instance in the + * process. 
This is actually a convenience method that calls
+     * {@link #compile(String, int)} with a {@code flags} value of zero.
+     *
+     * @param pattern
+     *            the regular expression.
+     *
+     * @return the new {@code Pattern} instance.
+     *
+     * @throws TPatternSyntaxException
+     *             if the regular expression is syntactically incorrect.
+     */
+    public static TPattern compile(String pattern) {
+        return compile(pattern, 0);
+    }
+
+    /*
+     * Finishes compilation: runs the second pass over the compiled nodes,
+     * starting at the root, when a back reference was seen during parsing.
+     */
+    private void finalizeCompile() {
+
+        /*
+         * Processing second pass
+         */
+        if (needsBackRefReplacement) { // || needsReason1 || needsReason2) {
+            start.processSecondPass();
+        }
+
+    }
+
+    /**
+     * Tries to match a given regular expression against a given input. This is
+     * actually nothing but a convenience method that compiles the regular
+     * expression into a {@code Pattern}, builds a {@link TMatcher} for it, and
+     * then does the match. If the same regular expression is used for multiple
+     * operations, it is recommended to compile it into a {@code Pattern}
+     * explicitly and request a reusable {@code Matcher}.
+     *
+     * @param regex
+     *            the regular expression.
+     * @param input
+     *            the input to process.
+     *
+     * @return true if and only if the {@code Pattern} matches the input.
+     *
+     * @see TPattern#compile(java.lang.String, int)
+     * @see TMatcher#matches()
+     */
+    public static boolean matches(String regex, CharSequence input) {
+        return TPattern.compile(regex).matcher(input).matches();
+    }
+
+    /**
+     * Quotes a given string using "\Q" and "\E", so that all other
+     * meta-characters lose their special meaning. If the string is used for a
+     * {@code Pattern} afterwards, it can only be matched literally.
+     *
+     * @param s
+     *            the string to quote.
+     *
+     * @return the quoted string.
+     */
+    public static String quote(String s) {
+        StringBuilder sb = new StringBuilder().append("\\Q"); //$NON-NLS-1$
+        int apos = 0;
+        int k;
+        // Each "\E" already in the input is copied through (closing the quoted
+        // region), followed by an escaped backslash, a literal 'E' and a new "\Q".
+        while ((k = s.indexOf("\\E", apos)) >= 0) { //$NON-NLS-1$
+            sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q"); //$NON-NLS-1$
+            apos = k + 2;
+        }
+
+        return sb.append(s.substring(apos)).append("\\E").toString(); //$NON-NLS-1$
+    }
+
+    /**
+     * Returns the group counter accumulated at compile time
+     * ({@code globalGroupIndex}).
+     */
+    int groupCount() {
+        return globalGroupIndex;
+    }
+
+    /**
+     * Returns the composite-quantifier counter plus one.
+     */
+    int compCount() {
+        return this.compCount + 1;
+    }
+
+    /**
+     * Returns the {@code consCount} counter plus one.
+     */
+    int consCount() {
+        return this.consCount + 1;
+    }
+
+    /**
+     * Returns the opposite-case counterpart of an ASCII letter: 'a'..'z' map
+     * to 'A'..'Z' and vice versa. Any other character is returned unchanged.
+     */
+    static char getSupplement(char ch) {
+        char res = ch;
+        if (ch >= 'a' && ch <= 'z') {
+            res -= 32;
+        } else if (ch >= 'A' && ch <= 'Z') {
+            res += 32;
+        }
+
+        return res;
+    }
+
+    /**
+     * @return true if every bit of {@code flag} is set in the current flags
+     */
+    private boolean hasFlag(int flag) {
+        return (flags & flag) == flag;
+    }
+
+    /**
+     * Dismiss public constructor.
+     *
+     */
+    private TPattern() {
+    }
+
+    /**
+     * Serialization support: after the default fields ({@code flags},
+     * {@code pattern}) are read, the transient counters are reset to their
+     * initial values and the pattern is compiled again.
+     */
+    private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException {
+        s.defaultReadObject();
+        AbstractSet.counter = 1;
+        globalGroupIndex = -1;
+        compCount = -1;
+        consCount = -1;
+        backRefs = new FSet[BACK_REF_NUMBER];
+
+        compileImpl(pattern, flags);
+
+    }
+}
diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TPatternSyntaxException.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TPatternSyntaxException.java
new file mode 100644
index 000000000..6ad5dbd51
--- /dev/null
+++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/TPatternSyntaxException.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Nikolay A. Kuznetsov
+ */
+package org.teavm.classlib.java.util.regex;
+
+import java.util.Arrays;
+
+/**
+ * Encapsulates a syntax error that occurred during the compilation of a
+ * {@link TPattern}. Might include a detailed description, the original regular
+ * expression, and the index at which the error occurred.
+ *
+ * @see TPattern#compile(String)
+ * @see TPattern#compile(java.lang.String,int)
+ *
+ * @author Nikolay A. Kuznetsov
+ */
+public class TPatternSyntaxException extends IllegalArgumentException {
+
+    private static final long serialVersionUID = -3864639126226059218L;
+
+    /**
+     * Holds the description of the syntax error, or null if the description is
+     * not known.
+     */
+    private String desc;
+
+    /**
+     * Holds the syntactically incorrect regular expression, or null if the
+     * regular expression is not known.
+     */
+    private String pattern;
+
+    /**
+     * Holds the index around which the error occurred, or -1, in case it is
+     * unknown.
+     */
+    private int index = -1;
+
+    /**
+     * Creates a new PatternSyntaxException for a given message, pattern, and
+     * error index.
+     *
+     * @param description
+     *            the description of the syntax error, or {@code null} if the
+     *            description is not known.
+ * @param pattern + * the syntactically incorrect regular expression, or + * {@code null} if the regular expression is not known. + * @param index + * the character index around which the error occurred, or -1 if + * the index is not known. + */ + public TPatternSyntaxException(String description, String pattern, int index) { + this.desc = description; + this.pattern = pattern; + this.index = index; + } + + /** + * Returns the syntactically incorrect regular expression. + * + * @return the regular expression. + * + */ + public String getPattern() { + return pattern; + } + + /** + * Returns a detailed error message for the exception. The message is + * potentially multi-line, and it might include a detailed description, the + * original regular expression, and the index at which the error occured. + * + * @return the error message. + */ + @Override + public String getMessage() { + String filler = ""; + if (index >= 1) { + char[] temp = new char[index]; + Arrays.fill(temp, ' '); + filler = new String(temp); + } + return desc + ((pattern != null && pattern.length() != 0) ? index + ", " + pattern + ", " + filler : ""); + } + + /** + * Returns the description of the syntax error, or {@code null} if the + * description is not known. + * + * @return the description. + */ + public String getDescription() { + return desc; + } + + /** + * Returns the character index around which the error occurred, or -1 if the + * index is not known. + * + * @return the index. 
+ * + */ + public int getIndex() { + return index; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIBackReferenceSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIBackReferenceSet.java new file mode 100644 index 000000000..cb71d0173 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIBackReferenceSet.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Unicode case insensitive back reference (i.e. \1-9) node. + * + * @author Nikolay A. 
Kuznetsov + */ +class UCIBackReferenceSet extends CIBackReferenceSet { + + int groupIndex; + + public UCIBackReferenceSet(int groupIndex, int consCounter) { + super(groupIndex, consCounter); + } + + @Override + public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) { + String group = getString(matchResult); + + if (group == null || (stringIndex + group.length()) > matchResult.getRightBound()) + return -1; + + for (int i = 0; i < group.length(); i++) { + if (Character.toLowerCase(Character.toUpperCase(group.charAt(i))) != Character.toLowerCase(Character + .toUpperCase(testString.charAt(stringIndex + i)))) { + return -1; + } + } + matchResult.setConsumed(consCounter, group.length()); + return next.matches(stringIndex + group.length(), testString, matchResult); + } + + @Override + public String getName() { + return "UCI back reference: " + this.groupIndex; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCICharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCICharSet.java new file mode 100644 index 000000000..cec7ceb9b --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCICharSet.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character in unicode case + * insensitive manner. + * + * @author Nikolay A. Kuznetsov + */ +class UCICharSet extends LeafSet { + + private char ch; + + public UCICharSet(char ch) { + this.ch = Character.toLowerCase(Character.toUpperCase(ch)); + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + return (this.ch == Character.toLowerCase(Character + .toUpperCase(testString.charAt(strIndex)))) ? 1 : -1; + } + + @Override + protected String getName() { + return "UCI " + ch; + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIDecomposedCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIDecomposedCharSet.java new file mode 100644 index 000000000..e95b6cd6b --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIDecomposedCharSet.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents Unicode case insensitive + * canonical decomposition of + * Unicode character. Is used when + * CANON_EQ flag of Pattern class + * is specified. + */ +class UCIDecomposedCharSet extends DecomposedCharSet{ + + /* + * Just only a stub + */ + public UCIDecomposedCharSet(int [] decomp, int decomposedCharLength) { + super(decomp, decomposedCharLength); + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIRangeSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIRangeSet.java new file mode 100644 index 000000000..1ef12365f --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCIRangeSet.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character from the given char class. Note, + * this class contains normalized characters fo unicode case, asci case is + * supported through adding both symbols to the range. + * + * @author Nikolay A. 
Kuznetsov + */ +class UCIRangeSet extends LeafSet { + + private AbstractCharClass chars; + + private boolean alt = false; + + public UCIRangeSet(AbstractCharClass cs, AbstractSet next) { + super(next); + this.chars = cs.getInstance(); + this.alt = cs.alt; + } + + public UCIRangeSet(AbstractCharClass cc) { + this.chars = cc.getInstance(); + this.alt = cc.alt; + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + return (chars.contains(Character.toLowerCase(Character + .toUpperCase(testString.charAt(strIndex))))) ? 1 : -1; + } + + @Override + protected String getName() { + return "UCI range:" + (alt ? "^ " : " ") + chars.toString(); + } +} \ No newline at end of file diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISequenceSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISequenceSet.java new file mode 100644 index 000000000..f05441e01 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISequenceSet.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. 
Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Node accepting substrings in unicode case insensitive manner. + * + * @author Nikolay A. Kuznetsov + */ +class UCISequenceSet extends LeafSet { + + private String string = null; + + UCISequenceSet(StringBuffer substring) { + StringBuilder res = new StringBuilder(); + for (int i = 0; i < substring.length(); i++) { + res.append(Character.toLowerCase(Character.toUpperCase(substring.charAt(i)))); + } + this.string = res.toString(); + this.charCount = res.length(); + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + for (int i = 0; i < string.length(); i++) { + if (string.charAt(i) != Character.toLowerCase(Character.toUpperCase(testString.charAt(strIndex + i)))) { + return -1; + } + } + + return string.length(); + + } + + @Override + public String getName() { + return "UCI sequence: " + string; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISupplCharSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISupplCharSet.java new file mode 100644 index 000000000..ea64b1661 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISupplCharSet.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. 
Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single supplementary codepoint in Unicode case + * insensitive manner. + */ +class UCISupplCharSet extends LeafSet { + + // int value of this supplementary codepoint + private int ch; + + public UCISupplCharSet(int ch) { + charCount = 2; + this.ch = Character.toLowerCase(Character.toUpperCase(ch)); + } + + @Override + public int accepts(int strIndex, CharSequence testString) { + char high = testString.charAt(strIndex++); + char low = testString.charAt(strIndex); + return (this.ch == Character.toLowerCase(Character.toUpperCase(Character.toCodePoint(high, low)))) ? 2 : -1; + } + + @Override + protected String getName() { + return "UCI " + new String(Character.toChars(ch)); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISupplRangeSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISupplRangeSet.java new file mode 100644 index 000000000..8aa2f2837 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UCISupplRangeSet.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * Portions, Copyright © 1991-2005 Unicode, Inc. The following applies to Unicode. + * + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright © 1991-2005 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html. Permission is + * hereby granted, free of charge, to any person obtaining a copy of the + * Unicode data files and any associated documentation (the "Data Files") + * or Unicode software and any associated documentation (the "Software") + * to deal in the Data Files or Software without restriction, including without + * limitation the rights to use, copy, modify, merge, publish, distribute, + * and/or sell copies of the Data Files or Software, and to permit persons + * to whom the Data Files or Software are furnished to do so, provided that + * (a) the above copyright notice(s) and this permission notice appear with + * all copies of the Data Files or Software, (b) both the above copyright + * notice(s) and this permission notice appear in associated documentation, + * and (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or Software + * that the data or software has been modified. + + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + * OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written + * authorization of the copyright holder. + * + * 2. Additional terms from the Database: + * + * Copyright © 1995-1999 Unicode, Inc. All Rights reserved. + * + * Disclaimer + * + * The Unicode Character Database is provided as is by Unicode, Inc. + * No claims are made as to fitness for any particular purpose. No warranties + * of any kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been purchased + * on magnetic or optical media from Unicode, Inc., the sole remedy for any claim + * will be exchange of defective media within 90 days of receipt. This disclaimer + * is applicable for all other data files accompanying the Unicode Character Database, + * some of which have been compiled by the Unicode Consortium, and some of which + * have been supplied by other sources. + * + * Limitations on Rights to Redistribute This Data + * + * Recipient is granted the right to make copies in any form for internal + * distribution and to freely use the information supplied in the creation of + * products supporting the UnicodeTM Standard. The files in + * the Unicode Character Database can be redistributed to third parties or other + * organizations (whether for profit or not) as long as this notice and the disclaimer + * notice are retained. 
Information can be extracted from these files and used + * in documentation or programs, as long as there is an accompanying notice + * indicating the source. + */ + +package org.teavm.classlib.java.util.regex; + +/** + * Represents node accepting single character from the given char class + * in Unicode case insensitive manner. + * This character can be supplementary (2 chars to represent) or from + * basic multilingual pane (1 char to represent). + */ +class UCISupplRangeSet extends SupplRangeSet{ + + public UCISupplRangeSet(AbstractCharClass cs, AbstractSet next) { + super(cs, next); + } + + public UCISupplRangeSet(AbstractCharClass cc) { + super(cc); + } + + @Override + public boolean contains(int ch) { + return chars.contains(Character.toLowerCase(Character.toUpperCase(ch))); + } + + @Override + protected String getName() { + return "UCI range:" + (alt ? "^ " : " ") + chars.toString(); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UEOLSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UEOLSet.java new file mode 100644 index 000000000..f34431102 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UEOLSet.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Unix line terminator, accepting only \n. + * + * @author Nikolay A. Kuznetsov + */ +final class UEOLSet extends AbstractSet { + + private int consCounter; + + public UEOLSet(int counter) { + this.consCounter = counter; + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + int rightBound = matchResult.hasAnchoringBounds() ? matchResult.getRightBound() : testString.length(); + + if (strIndex >= rightBound) { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } + // check final line terminator; + + if ((rightBound - strIndex) == 1 && testString.charAt(strIndex) == '\n') { + matchResult.setConsumed(consCounter, 1); + return next.matches(strIndex + 1, testString, matchResult); + } + + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + int cons; + boolean res = ((cons = matchResult.getConsumed(consCounter)) < 0 || cons > 0); + matchResult.setConsumed(consCounter, -1); + return res; + } + + @Override + protected String getName() { + return ""; //$NON-NLS-1$ + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UMultiLineEOLSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UMultiLineEOLSet.java new file mode 100644 index 000000000..9c278274e --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UMultiLineEOLSet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Unix style multiline end-of-line node. + * + * @author Nikolay A. Kuznetsov + */ +class UMultiLineEOLSet extends AbstractSet { + + private int consCounter; + + public UMultiLineEOLSet(int counter) { + this.consCounter = counter; + } + + @Override + public int matches(int strIndex, CharSequence testString, MatchResultImpl matchResult) { + int strDif = matchResult.hasAnchoringBounds() ? 
matchResult.getRightBound() - strIndex : testString.length() - + strIndex; + if (strDif <= 0) { + matchResult.setConsumed(consCounter, 0); + return next.matches(strIndex, testString, matchResult); + } else if (testString.charAt(strIndex) == '\n') { + matchResult.setConsumed(consCounter, 1); + return next.matches(strIndex + 1, testString, matchResult); + } + return -1; + } + + @Override + public boolean hasConsumed(MatchResultImpl matchResult) { + int cons; + boolean res = ((cons = matchResult.getConsumed(consCounter)) < 0 || cons > 0); + matchResult.setConsumed(consCounter, -1); + return res; + } + + @Override + protected String getName() { + return ""; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnicodeCategory.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnicodeCategory.java new file mode 100644 index 000000000..310352af5 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnicodeCategory.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Unicode category (i.e. Ll, Lu). + * + * @author Nikolay A. 
Kuznetsov + */ +class UnicodeCategory extends AbstractCharClass { + + protected int category; + + public UnicodeCategory(int category) { + this.category = category; + } + + @Override + public boolean contains(int ch) { + return alt ^ (category == Character.getType((char) ch)); + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnicodeCategoryScope.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnicodeCategoryScope.java new file mode 100644 index 000000000..ad6360d37 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnicodeCategoryScope.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Unicode category scope (i.e IsL, IsM, ...) + * @author Nikolay A. 
Kuznetsov + */ +class UnicodeCategoryScope extends UnicodeCategory { + + public UnicodeCategoryScope(int category) { + super(category); + } + + @Override + public boolean contains(int ch) { + return alt ^ ((category >> Character.getType((char) ch)) & 1) != 0; + } +} diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnifiedQuantifierSet.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnifiedQuantifierSet.java new file mode 100644 index 000000000..d9bf00315 --- /dev/null +++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/UnifiedQuantifierSet.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @author Nikolay A. Kuznetsov + */ +package org.teavm.classlib.java.util.regex; + +/** + * Greedy quantifier node for the case where there is no intersection with next + * node and normal quantifiers could be treated as greedy and possessive. + * + * @author Nikolay A. 
Kuznetsov
+ */
+class UnifiedQuantifierSet extends LeafQuantifierSet {
+
+    public UnifiedQuantifierSet(LeafSet innerSet, AbstractSet next, int type) {
+        super(innerSet, next, type);
+    }
+
+    public UnifiedQuantifierSet(LeafQuantifierSet quant) { // rebuilds an existing leaf quantifier from its inner set, next node and type
+        super((LeafSet)quant.getInnerSet(), quant.getNext(), quant.getType());
+        innerSet.setNext(this); // point the reused inner set's next link at this node
+
+    }
+
+    @Override
+    public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) {
+        while (stringIndex + leaf.charCount() <= matchResult.getRightBound() && // one more leaf still fits before the right bound
+                leaf.accepts(stringIndex, testString) > 0)
+            stringIndex += leaf.charCount(); // consume it; the index is never moved back in this method
+
+        return next.matches(stringIndex, testString, matchResult); // single attempt at the furthest position reached
+    }
+
+    @Override
+    public int find(int stringIndex, CharSequence testString, MatchResultImpl matchResult) {
+        int startSearch = next.find(stringIndex, testString, matchResult); // locate the continuation first
+        if (startSearch < 0)
+            return -1; // continuation not found
+        int newSearch = startSearch - leaf.charCount();
+        while (newSearch >= stringIndex && leaf.accepts(newSearch, testString) > 0) { // step back one leaf at a time, never before stringIndex
+            startSearch = newSearch;
+            newSearch -= leaf.charCount();
+        }
+
+        return startSearch; // earliest position from which accepted leaves run up to the continuation
+    }
+}
diff --git a/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/WordBoundary.java b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/WordBoundary.java
new file mode 100644
index 000000000..68d71e9e7
--- /dev/null
+++ b/teavm-classlib/src/main/java/org/teavm/classlib/java/util/regex/WordBoundary.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Nikolay A. Kuznetsov
+ */
+package org.teavm.classlib.java.util.regex;
+
+/**
+ * Word boundary assertion: classifies the current and the previous character
+ * and, when {@code positive}, lets matching continue only if the classes differ.
+ *
+ * @author Nikolay A. Kuznetsov
+ */
+class WordBoundary extends AbstractSet {
+
+    boolean positive; // true: continue only at a boundary; false: continue only where there is none (see matches)
+
+    public WordBoundary(boolean positive) {
+        this.positive = positive;
+    }
+
+    @Override
+    public int matches(int stringIndex, CharSequence testString, MatchResultImpl matchResult) {
+        boolean left; // non-word status of the character at stringIndex
+        boolean right; // non-word status of the character just before stringIndex
+
+        char ch1 = stringIndex >= matchResult.getRightBound() ? ' ' : testString.charAt(stringIndex); // ' ' stands in at or past the right bound
+        char ch2 = stringIndex == 0 ? ' ' : testString.charAt(stringIndex - 1); // ' ' stands in before the first character
+
+        int leftBound = matchResult.hasTransparentBounds() ? 0 : matchResult.getLeftBound(); // how far back isSpace may scan
+        left = (ch1 == ' ') || isSpace(ch1, stringIndex, leftBound, testString);
+        right = (ch2 == ' ') || isSpace(ch2, stringIndex - 1, leftBound, testString);
+        return ((left ^ right) ^ positive) ? -1 : next.matches(stringIndex, testString, matchResult); // -1 rejects; otherwise delegate at the same index
+    }
+
+    /**
+     * Returns false, because a word boundary does not consume any characters
+     * and does not move the string index.
+     */
+    @Override
+    public boolean hasConsumed(MatchResultImpl matchResult) {
+        // only checks the boundary, does not consume characters
+        return false;
+    }
+
+    @Override
+    protected String getName() {
+        return "WordBoundary"; //$NON-NLS-1$
+    }
+
+    private boolean isSpace(char ch, int index, int leftBound, CharSequence testString) { // true when ch at index counts as a non-word character
+        if (Character.isLetterOrDigit(ch) || ch == '_')
+            return false; // letters, digits and '_' are word characters
+        if (Character.getType(ch) == Character.NON_SPACING_MARK) {
+            for (; --index >= leftBound;) { // walk back over the characters preceding the mark, down to leftBound
+                ch = testString.charAt(index);
+                if (Character.isLetterOrDigit(ch))
+                    return false; // the mark follows a letter or digit, so it is part of a word
+                if (Character.getType(ch) != Character.NON_SPACING_MARK)
+                    return true; // the run of marks is preceded by some other non-word character
+            }
+        }
+        return true;
+    }
+}
diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/Matcher2Test.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/Matcher2Test.java
new file mode 100644
index 000000000..ece7d0ba3
--- /dev/null
+++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/Matcher2Test.java
@@ -0,0 +1,234 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.teavm.classlib.java.util.regex; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import junit.framework.TestCase; + +/** + * Tests Matcher methods + */ +@SuppressWarnings("nls") +public class Matcher2Test extends TestCase { + public void test_toString() { + Pattern p = Pattern.compile("foo"); + Matcher m = p.matcher("bar"); + assertNotNull(m.toString()); + } + + public void testErrorConditions() throws PatternSyntaxException { + // Test match cursors in absence of a match + Pattern p = Pattern.compile("foo"); + Matcher m = p.matcher("bar"); + assertFalse(m.matches()); + + try { + m.start(); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.end(); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.group(); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.start(1); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.end(1); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.group(1); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + // regression test for HARMONY-2418 + try { + m.usePattern(null); + fail("IllegalArgumentException expected"); + } catch (IllegalArgumentException e) { + // PASSED + } + } + + public void testErrorConditions2() throws PatternSyntaxException { + // Test match cursors in absence of a match + Pattern p = Pattern.compile("(foo[0-9])(bar[a-z])"); + Matcher m = p.matcher("foo1barzfoo2baryfoozbar5"); + + assertTrue(m.find()); + assertEquals(0, m.start()); + assertEquals(8, m.end()); + assertEquals(0, m.start(1)); + assertEquals(4, m.end(1)); + assertEquals(4, m.start(2)); + assertEquals(8, m.end(2)); + + try { + m.start(3); + fail("IndexOutOfBoundsException expected"); + } catch 
(IndexOutOfBoundsException e) { + } + + try { + m.end(3); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.group(3); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.start(-1); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.end(-1); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.group(-1); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + assertTrue(m.find()); + assertEquals(8, m.start()); + assertEquals(16, m.end()); + assertEquals(8, m.start(1)); + assertEquals(12, m.end(1)); + assertEquals(12, m.start(2)); + assertEquals(16, m.end(2)); + + try { + m.start(3); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.end(3); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.group(3); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.start(-1); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.end(-1); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + try { + m.group(-1); + fail("IndexOutOfBoundsException expected"); + } catch (IndexOutOfBoundsException e) { + } + + assertFalse(m.find()); + + try { + m.start(3); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.end(3); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.group(3); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.start(-1); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.end(-1); 
+ fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + + try { + m.group(-1); + fail("IllegalStateException expected"); + } catch (IllegalStateException e) { + } + } + + /* + * Regression test for HARMONY-997 + */ + public void testReplacementBackSlash() { + String str = "replace me"; + String replacedString = "me"; + String substitutionString = "\\"; + Pattern pat = Pattern.compile(replacedString); + Matcher mat = pat.matcher(str); + try { + mat.replaceAll(substitutionString); + fail("IndexOutOfBoundsException should be thrown"); + } catch (IndexOutOfBoundsException e) { + } + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/MatcherTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/MatcherTest.java new file mode 100644 index 000000000..12af6b879 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/MatcherTest.java @@ -0,0 +1,772 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.teavm.classlib.java.util.regex; + +import static org.junit.Assert.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.junit.Test; + + +@SuppressWarnings("nls") +public class MatcherTest { + String[] testPatterns = { + "(a|b)*abb", + "(1*2*3*4*)*567", + "(a|b|c|d)*aab", + "(1|2|3|4|5|6|7|8|9|0)(1|2|3|4|5|6|7|8|9|0)*", + "(abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ)*", + "(a|b)*(a|b)*A(a|b)*lice.*", + "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)(a|b|c|d|e|f|g|h|" + + "i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)*(1|2|3|4|5|6|7|8|9|0)*|while|for|struct|if|do" }; + + String[] groupPatterns = { "(a|b)*aabb", "((a)|b)*aabb", "((a|b)*)a(abb)", + "(((a)|(b))*)aabb", "(((a)|(b))*)aa(b)b", "(((a)|(b))*)a(a(b)b)" }; + + @Test + public void testRegionsIntInt() { + Pattern p = Pattern.compile("x*"); + Matcher m = p.matcher("axxxxxa"); + assertFalse(m.matches()); + + m.region(1, 6); + assertEquals(1, m.regionStart()); + assertEquals(6, m.regionEnd()); + assertTrue(m.matches()); + + try { + m.region(1, 0); + fail("expected an IOOBE"); + } catch(IndexOutOfBoundsException e) { + } + + try { + m.region(-1, 2); + fail("expected an IOOBE"); + } catch(IndexOutOfBoundsException e) { + } + + try { + m.region(10, 11); + fail("expected an IOOBE"); + } catch(IndexOutOfBoundsException e) { + } + + try { + m.region(1, 10); + fail("expected an IOOBE"); + } catch(IndexOutOfBoundsException e) { + } + } + + @Test + public void testAppendReplacement() { + Pattern pat = Pattern.compile("XX"); + Matcher m = pat.matcher("Today is XX-XX-XX ..."); + StringBuffer sb = new StringBuffer(); + + for (int i = 0; m.find(); i++) { + m.appendReplacement(sb, new Integer(i * 10 + i).toString()); + } + m.appendTail(sb); + assertEquals("Today is 0-11-22 ...", sb.toString()); + } + + @Test + public void testAppendReplacementRef() { + Pattern p = Pattern.compile("xx (rur|\\$)"); + Matcher m = p.matcher("xx $ equals to xx rur."); + StringBuffer sb = new 
StringBuffer(); + for (int i = 1; m.find(); i *= 30) { + String rep = new Integer(i).toString() + " $1"; + m.appendReplacement(sb, rep); + } + m.appendTail(sb); + assertEquals("1 $ equals to 30 rur.", sb.toString()); + } + + @Test + public void testReplaceAll() { + String input = "aabfooaabfooabfoob"; + String pattern = "a*b"; + Pattern pat = Pattern.compile(pattern); + Matcher mat = pat.matcher(input); + + assertEquals("-foo-foo-foo-", mat.replaceAll("-")); + } + + @Test + public void testResetCharSequence() { + Pattern p = Pattern.compile("abcd"); + Matcher m = p.matcher("abcd"); + assertTrue(m.matches()); + m.reset("efgh"); + assertFalse(m.matches()); + + try { + m.reset(null); + fail("expected a NPE"); + } catch (NullPointerException e) { + } + } + + @Test + public void testAppendSlashes() { + Pattern p = Pattern.compile("\\\\"); + Matcher m = p.matcher("one\\cat\\two\\cats\\in\\the\\yard"); + StringBuffer sb = new StringBuffer(); + while (m.find()) { + m.appendReplacement(sb, "\\\\"); + } + m.appendTail(sb); + assertEquals("one\\cat\\two\\cats\\in\\the\\yard", sb.toString()); + + } + + @Test + public void testReplaceFirst() { + String input = "zzzdogzzzdogzzz"; + String pattern = "dog"; + Pattern pat = Pattern.compile(pattern); + Matcher mat = pat.matcher(input); + + assertEquals("zzzcatzzzdogzzz", mat.replaceFirst("cat")); + } + + @Test + public void testPattern() { + for (String element : testPatterns) { + Pattern test = Pattern.compile(element); + assertEquals(test, test.matcher("aaa").pattern()); + } + + for (String element : testPatterns) { + assertEquals(element, Pattern.compile(element).matcher("aaa") + .pattern().toString()); + } + } + + /* + * Class under test for Matcher reset() + */ + public void testReset() { + } + + @Test + public void testGroupint() { + String positiveTestString = "ababababbaaabb"; + + // test IndexOutOfBoundsException + // // + for (int i = 0; i < groupPatterns.length; i++) { + Pattern test = Pattern.compile(groupPatterns[i]); + 
Matcher mat = test.matcher(positiveTestString); + mat.matches(); + try { + // groupPattern equals to number of groups + // of the specified pattern + // // + mat.group(i + 2); + fail("IndexOutBoundsException expected"); + mat.group(i + 100); + fail("IndexOutBoundsException expected"); + mat.group(-1); + fail("IndexOutBoundsException expected"); + mat.group(-100); + fail("IndexOutBoundsException expected"); + } catch (IndexOutOfBoundsException iobe) { + } + } + + String[][] groupResults = { { "a" }, { "a", "a" }, + { "ababababba", "a", "abb" }, { "ababababba", "a", "a", "b" }, + { "ababababba", "a", "a", "b", "b" }, + { "ababababba", "a", "a", "b", "abb", "b" }, }; + + for (int i = 0; i < groupPatterns.length; i++) { + Pattern test = Pattern.compile(groupPatterns[i]); + Matcher mat = test.matcher(positiveTestString); + mat.matches(); + for (int j = 0; j < groupResults[i].length; j++) { + assertEquals("i: " + i + " j: " + j, groupResults[i][j], mat + .group(j + 1)); + } + + } + + } + + @Test + public void testGroup() { + String positiveTestString = "ababababbaaabb"; + String negativeTestString = "gjhfgdsjfhgcbv"; + for (String element : groupPatterns) { + Pattern test = Pattern.compile(element); + Matcher mat = test.matcher(positiveTestString); + mat.matches(); + // test result + assertEquals(positiveTestString, mat.group()); + + // test equal to group(0) result + assertEquals(mat.group(0), mat.group()); + } + + for (String element : groupPatterns) { + Pattern test = Pattern.compile(element); + Matcher mat = test.matcher(negativeTestString); + mat.matches(); + try { + mat.group(); + fail("IllegalStateException expected for matches result"); + } catch (IllegalStateException ise) { + } + } + } + + @Test + public void testGroupPossessive() { + Pattern pat = Pattern.compile("((a)|(b))++c"); + Matcher mat = pat.matcher("aac"); + + mat.matches(); + assertEquals("a", mat.group(1)); + } + + /* + * Class under test for boolean find(int) + */ + public void testFindint() { + } 
+ + /* + * Class under test for int start(int) + */ + public void testStartint() { + } + + /* + * Class under test for int end(int) + */ + public void testEndint() { + } + + @Test + public void testMatchesMisc() { + String[][] posSeq = { + { "abb", "ababb", "abababbababb", "abababbababbabababbbbbabb" }, + { "213567", "12324567", "1234567", "213213567", + "21312312312567", "444444567" }, + { "abcdaab", "aab", "abaab", "cdaab", "acbdadcbaab" }, + { "213234567", "3458", "0987654", "7689546432", "0398576", + "98432", "5" }, + { + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" }, + { "ababbaAabababblice", "ababbaAliceababab", "ababbAabliceaaa", + "abbbAbbbliceaaa", "Alice" }, + { "a123", "bnxnvgds156", "for", "while", "if", "struct" } + + }; + + for (int i = 0; i < testPatterns.length; i++) { + Pattern pat = Pattern.compile(testPatterns[i]); + for (int j = 0; j < posSeq[i].length; j++) { + Matcher mat = pat.matcher(posSeq[i][j]); + assertTrue("Incorrect match: " + testPatterns[i] + " vs " + + posSeq[i][j], mat.matches()); + } + } + } + + @Test + public void testMatchesQuantifiers() { + String[] testPatternsSingles = { "a{5}", "a{2,4}", "a{3,}" }; + String[] testPatternsMultiple = { "((a)|(b)){1,2}abb", + "((a)|(b)){2,4}", "((a)|(b)){3,}" }; + + String[][] stringSingles = { { "aaaaa", "aaa" }, + { "aa", "a", "aaa", "aaaaaa", "aaaa", "aaaaa" }, + { "aaa", "a", "aaaa", "aa" }, }; + + String[][] stringMultiples = { { "ababb", "aba" }, + { "ab", "b", "bab", "ababa", "abba", "abababbb" }, + { "aba", "b", "abaa", "ba" }, }; + + for (int i = 0; i < testPatternsSingles.length; i++) { + Pattern pat = Pattern.compile(testPatternsSingles[i]); + for (int j = 0; j < stringSingles.length / 2; j++) { + assertTrue("Match expected, but failed: " + pat.pattern() + + " : " + stringSingles[i][j], pat.matcher( + stringSingles[i][j * 2]).matches()); + assertFalse("Match 
failure expected, but match succeed: " + + pat.pattern() + " : " + stringSingles[i][j * 2 + 1], + pat.matcher(stringSingles[i][j * 2 + 1]).matches()); + } + } + + for (int i = 0; i < testPatternsMultiple.length; i++) { + Pattern pat = Pattern.compile(testPatternsMultiple[i]); + for (int j = 0; j < stringMultiples.length / 2; j++) { + assertTrue("Match expected, but failed: " + pat.pattern() + + " : " + stringMultiples[i][j], pat.matcher( + stringMultiples[i][j * 2]).matches()); + assertFalse( + "Match failure expected, but match succeed: " + + pat.pattern() + " : " + + stringMultiples[i][j * 2 + 1], pat.matcher( + stringMultiples[i][j * 2 + 1]).matches()); + } + } + } + + @Test + public void testQuantVsGroup() { + String patternString = "(d{1,3})((a|c)*)(d{1,3})((a|c)*)(d{1,3})"; + String testString = "dacaacaacaaddaaacaacaaddd"; + + Pattern pat = Pattern.compile(patternString); + Matcher mat = pat.matcher(testString); + + mat.matches(); + assertEquals("dacaacaacaaddaaacaacaaddd", mat.group()); + assertEquals("d", mat.group(1)); + assertEquals("acaacaacaa", mat.group(2)); + assertEquals("dd", mat.group(4)); + assertEquals("aaacaacaa", mat.group(5)); + assertEquals("ddd", mat.group(7)); + } + + public void testLookingAt() { + } + + /* + * Class under test for boolean find() + */ + public void testFind() { + String testPattern = "(abb)"; + String testString = "cccabbabbabbabbabb"; + Pattern pat = Pattern.compile(testPattern); + Matcher mat = pat.matcher(testString); + int start = 3; + int end = 6; + while (mat.find()) { + assertEquals(start, mat.start(1)); + assertEquals(end, mat.end(1)); + + start = end; + end += 3; + } + + testPattern = "(\\d{1,3})"; + testString = "aaaa123456789045"; + + Pattern pat2 = Pattern.compile(testPattern); + Matcher mat2 = pat2.matcher(testString); + start = 4; + int length = 3; + while (mat2.find()) { + assertEquals(testString.substring(start, start + length), mat2 + .group(1)); + start += length; + } + } + + @Test + public void 
testSEOLsymbols() { + Pattern pat = Pattern.compile("^a\\(bb\\[$"); + Matcher mat = pat.matcher("a(bb["); + + assertTrue(mat.matches()); + } + + /* + * Class under test for int start() + */ + public void testStart() { + } + + @Test + public void testGroupCount() { + for (int i = 0; i < groupPatterns.length; i++) { + Pattern test = Pattern.compile(groupPatterns[i]); + Matcher mat = test.matcher("ababababbaaabb"); + mat.matches(); + assertEquals(i + 1, mat.groupCount()); + + } + } + + @Test + public void testRelactantQuantifiers() { + Pattern pat = Pattern.compile("(ab*)*b"); + Matcher mat = pat.matcher("abbbb"); + + if (mat.matches()) { + assertEquals("abbb", mat.group(1)); + } else { + fail("Match expected: (ab*)*b vs abbbb"); + } + } + + @Test + public void testEnhancedFind() { + String input = "foob"; + String pattern = "a*b"; + Pattern pat = Pattern.compile(pattern); + Matcher mat = pat.matcher(input); + + mat.find(); + assertEquals("b", mat.group()); + } + + @Test + public void testPosCompositeGroup() { + String[] posExamples = { "aabbcc", "aacc", "bbaabbcc" }; + String[] negExamples = { "aabb", "bb", "bbaabb" }; + Pattern posPat = Pattern.compile("(aa|bb){1,3}+cc"); + Pattern negPat = Pattern.compile("(aa|bb){1,3}+bb"); + + Matcher mat; + for (String element : posExamples) { + mat = posPat.matcher(element); + assertTrue(mat.matches()); + } + + for (String element : negExamples) { + mat = negPat.matcher(element); + assertFalse(mat.matches()); + } + + assertTrue(Pattern.matches("(aa|bb){1,3}+bb", "aabbaabb")); + + } + + @Test + public void testPosAltGroup() { + String[] posExamples = { "aacc", "bbcc", "cc" }; + String[] negExamples = { "bb", "aa" }; + Pattern posPat = Pattern.compile("(aa|bb)?+cc"); + Pattern negPat = Pattern.compile("(aa|bb)?+bb"); + + Matcher mat; + for (String element : posExamples) { + mat = posPat.matcher(element); + assertTrue(posPat.toString() + " vs: " + element, mat.matches()); + } + + for (String element : negExamples) { + mat = 
negPat.matcher(element); + assertFalse(mat.matches()); + } + + assertTrue(Pattern.matches("(aa|bb)?+bb", "aabb")); + } + + @Test + public void testRelCompGroup() { + + Matcher mat; + Pattern pat; + String res = ""; + for (int i = 0; i < 4; i++) { + pat = Pattern.compile("((aa|bb){" + i + ",3}?).*cc"); + mat = pat.matcher("aaaaaacc"); + assertTrue(pat.toString() + " vs: " + "aaaaaacc", mat.matches()); + assertEquals(res, mat.group(1)); + res += "aa"; + } + } + + @Test + public void testRelAltGroup() { + + Matcher mat; + Pattern pat; + + pat = Pattern.compile("((aa|bb)??).*cc"); + mat = pat.matcher("aacc"); + assertTrue(pat.toString() + " vs: " + "aacc", mat.matches()); + assertEquals("", mat.group(1)); + + pat = Pattern.compile("((aa|bb)??)cc"); + mat = pat.matcher("aacc"); + assertTrue(pat.toString() + " vs: " + "aacc", mat.matches()); + assertEquals("aa", mat.group(1)); + } + + @Test + public void testIgnoreCase() { + Pattern pat = Pattern.compile("(aa|bb)*", Pattern.CASE_INSENSITIVE); + Matcher mat = pat.matcher("aAbb"); + + assertTrue(mat.matches()); + + pat = Pattern.compile("(a|b|c|d|e)*", Pattern.CASE_INSENSITIVE); + mat = pat.matcher("aAebbAEaEdebbedEccEdebbedEaedaebEbdCCdbBDcdcdADa"); + assertTrue(mat.matches()); + + pat = Pattern.compile("[a-e]*", Pattern.CASE_INSENSITIVE); + mat = pat.matcher("aAebbAEaEdebbedEccEdebbedEaedaebEbdCCdbBDcdcdADa"); + assertTrue(mat.matches()); + + } + + @Test + public void testQuoteReplacement() { + assertEquals("\\\\aaCC\\$1", Matcher.quoteReplacement("\\aaCC$1")); + } + + @Test + public void testOverFlow() { + Pattern tp = Pattern.compile("(a*)*"); + Matcher tm = tp.matcher("aaa"); + assertTrue(tm.matches()); + assertEquals("", tm.group(1)); + + assertTrue(Pattern.matches("(1+)\\1+", "11")); + assertTrue(Pattern.matches("(1+)(2*)\\2+", "11")); + + Pattern pat = Pattern.compile("(1+)\\1*"); + Matcher mat = pat.matcher("11"); + + assertTrue(mat.matches()); + assertEquals("11", mat.group(1)); + + pat = 
Pattern.compile("((1+)|(2+))(\\2+)"); + mat = pat.matcher("11"); + + assertTrue(mat.matches()); + assertEquals("1", mat.group(2)); + assertEquals("1", mat.group(1)); + assertEquals("1", mat.group(4)); + assertNull(mat.group(3)); + + } + + @Test + public void testUnicode() { + + assertTrue(Pattern.matches("\\x61a", "aa")); + assertTrue(Pattern.matches("\\u0061a", "aa")); + assertTrue(Pattern.matches("\\0141a", "aa")); + assertTrue(Pattern.matches("\\0777", "?7")); + + } + + @Test + public void testUnicodeCategory() { + assertTrue(Pattern.matches("\\p{Ll}", "k")); // Unicode lower case + assertTrue(Pattern.matches("\\P{Ll}", "K")); // Unicode non-lower + // case + assertTrue(Pattern.matches("\\p{Lu}", "K")); // Unicode upper case + assertTrue(Pattern.matches("\\P{Lu}", "k")); // Unicode non-upper + // case + // combinations + assertTrue(Pattern.matches("[\\p{L}&&[^\\p{Lu}]]", "k")); + assertTrue(Pattern.matches("[\\p{L}&&[^\\p{Ll}]]", "K")); + assertFalse(Pattern.matches("[\\p{L}&&[^\\p{Lu}]]", "K")); + assertFalse(Pattern.matches("[\\p{L}&&[^\\p{Ll}]]", "k")); + + // category/character combinations + assertFalse(Pattern.matches("[\\p{L}&&[^a-z]]", "k")); + assertTrue(Pattern.matches("[\\p{L}&&[^a-z]]", "K")); + + assertTrue(Pattern.matches("[\\p{Lu}a-z]", "k")); + assertTrue(Pattern.matches("[a-z\\p{Lu}]", "k")); + + assertFalse(Pattern.matches("[\\p{Lu}a-d]", "k")); + assertTrue(Pattern.matches("[a-d\\p{Lu}]", "K")); + + // assertTrue(Pattern.matches("[\\p{L}&&[^\\p{Lu}&&[^K]]]", "K")); + assertFalse(Pattern.matches("[\\p{L}&&[^\\p{Lu}&&[^G]]]", "K")); + + } + + @Test + public void testSplitEmpty() { + + Pattern pat = Pattern.compile(""); + String[] s = pat.split("", -1); + + assertEquals(1, s.length); + assertEquals("", s[0]); + } + + @Test + public void testFindDollar() { + Matcher mat = Pattern.compile("a$").matcher("a\n"); + assertTrue(mat.find()); + assertEquals("a", mat.group()); + } + + @Test + public void testMatchesRegionChanged() { + // Regression for 
HARMONY-610 + String input = " word "; + Pattern pattern = Pattern.compile("\\w+"); + Matcher matcher = pattern.matcher(input); + matcher.region(1, 5); + assertTrue(matcher.matches()); + } + + @Test + public void testAllCodePoints() { + // Regression for HARMONY-3145 + int[] codePoint = new int[1]; + Pattern p = Pattern.compile("(\\p{all})+"); + boolean res = true; + int cnt = 0; + String s; + for (int i = 0; i < 0x110000; i++) { + codePoint[0] = i; + s = new String(codePoint, 0, 1); + if (!s.matches(p.toString())) { + cnt++; + res = false; + } + } + assertTrue(res); + assertEquals(0, cnt); + + p = Pattern.compile("(\\P{all})+"); + res = true; + cnt = 0; + + for (int i = 0; i < 0x110000; i++) { + codePoint[0] = i; + s = new String(codePoint, 0, 1); + if (!s.matches(p.toString())) { + cnt++; + res = false; + } + } + + assertFalse(res); + assertEquals(0x110000, cnt); + } + + @Test + public void testFindRegionChanged() { + // Regression for HARMONY-625 + Pattern pattern = Pattern.compile("(?s).*"); + Matcher matcher = pattern.matcher("abcde"); + matcher.find(); + assertEquals("abcde", matcher.group()); + + matcher = pattern.matcher("abcde"); + matcher.region(0, 2); + matcher.find(); + assertEquals("ab", matcher.group()); + + } + + @Test + public void testFindRegionChanged2() { + // Regression for HARMONY-713 + Pattern pattern = Pattern.compile("c"); + + String inputStr = "aabb.c"; + Matcher matcher = pattern.matcher(inputStr); + matcher.region(0, 3); + + assertFalse(matcher.find()); + } + + @Test + public void testPatternMatcher() throws Exception { + Pattern pattern = Pattern.compile("(?:\\d+)(?:pt)"); + assertTrue(pattern.matcher("14pt").matches()); + } + + @Test + public void test3360() { + String str = "!\"#%&'(),-./"; + Pattern p = Pattern.compile("\\s"); + Matcher m = p.matcher(str); + + assertFalse(m.find()); + } + + @Test + public void testGeneralPunctuationCategory() { + String[] s = { ",", "!", "\"", "#", "%", "&", "'", "(", ")", "-", ".", + "/" }; + String 
regexp = "\\p{P}"; + + for (int i = 0; i < s.length; i++) { + Pattern pattern = Pattern.compile(regexp); + Matcher matcher = pattern.matcher(s[i]); + assertTrue(matcher.find()); + } + } + + @Test + public void testHitEndAfterFind() { + hitEndTest(true, "#01.0", "r((ege)|(geg))x", "regexx", false); + hitEndTest(true, "#01.1", "r((ege)|(geg))x", "regex", false); + hitEndTest(true, "#01.2", "r((ege)|(geg))x", "rege", true); + hitEndTest(true, "#01.2", "r((ege)|(geg))x", "xregexx", false); + + hitEndTest(true, "#02.0", "regex", "rexreger", true); + hitEndTest(true, "#02.1", "regex", "raxregexr", false); + + String floatRegex = getHexFloatRegex(); + hitEndTest(true, "#03.0", floatRegex, Double.toHexString(-1.234d), true); + hitEndTest(true, "#03.1", floatRegex, "1 ABC" + + Double.toHexString(Double.NaN) + "buhuhu", false); + hitEndTest(true, "#03.2", floatRegex, Double.toHexString(-0.0) + "--", + false); + hitEndTest(true, "#03.3", floatRegex, "--" + + Double.toHexString(Double.MIN_VALUE) + "--", false); + + hitEndTest(true, "#04.0", "(\\d+) fish (\\d+) fish (\\w+) fish (\\d+)", + "1 fish 2 fish red fish 5", true); + hitEndTest(true, "#04.1", "(\\d+) fish (\\d+) fish (\\w+) fish (\\d+)", + "----1 fish 2 fish red fish 5----", false); + } + + @Test + public void testToString() { + String result = Pattern.compile("(\\d{1,3})").matcher( + "aaaa123456789045").toString(); + assertTrue("The result doesn't contain pattern info", result + .contains("(\\d{1,3})")); + } + + private void hitEndTest(boolean callFind, String testNo, String regex, + String input, boolean hit) { + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(input); + if (callFind) { + matcher.find(); + } else { + matcher.matches(); + } + boolean h = matcher.hitEnd(); + + assertTrue(testNo, h == hit); + } + + private String getHexFloatRegex() { + String hexDecimal = "(-|\\+)?0[xX][0-9a-fA-F]*\\.[0-9a-fA-F]+([pP](-|\\+)?[0-9]+)?"; + String notANumber = "((-|\\+)?Infinity)|([nN]a[nN])"; + 
return new StringBuilder("((").append(hexDecimal).append(")|(").append( + notANumber).append("))").toString(); + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/ModeTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/ModeTest.java new file mode 100644 index 000000000..b8b95395e --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/ModeTest.java @@ -0,0 +1,111 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.teavm.classlib.java.util.regex; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import junit.framework.TestCase; + +/** + * Tests Pattern compilation modes and modes triggered in pattern strings + */ +@SuppressWarnings("nls") +public class ModeTest extends TestCase { + public void testCase() throws PatternSyntaxException { + Pattern p; + Matcher m; + + p = Pattern.compile("([a-z]+)[0-9]+"); + m = p.matcher("cAT123#dog345"); + assertTrue(m.find()); + assertEquals("dog", m.group(1)); + assertFalse(m.find()); + + p = Pattern.compile("([a-z]+)[0-9]+", Pattern.CASE_INSENSITIVE); + m = p.matcher("cAt123#doG345"); + assertTrue(m.find()); + assertEquals("cAt", m.group(1)); + assertTrue(m.find()); + assertEquals("doG", m.group(1)); + assertFalse(m.find()); + + p = Pattern.compile("(?i)([a-z]+)[0-9]+"); + m = p.matcher("cAt123#doG345"); + assertTrue(m.find()); + assertEquals("cAt", m.group(1)); + assertTrue(m.find()); + assertEquals("doG", m.group(1)); + assertFalse(m.find()); + } + + public void testMultiline() throws PatternSyntaxException { + Pattern p; + Matcher m; + + p = Pattern.compile("^foo"); + m = p.matcher("foobar"); + assertTrue(m.find()); + assertTrue(m.start() == 0 && m.end() == 3); + assertFalse(m.find()); + + m = p.matcher("barfoo"); + assertFalse(m.find()); + + p = Pattern.compile("foo$"); + m = p.matcher("foobar"); + assertFalse(m.find()); + + m = p.matcher("barfoo"); + assertTrue(m.find()); + assertTrue(m.start() == 3 && m.end() == 6); + assertFalse(m.find()); + + p = Pattern.compile("^foo([0-9]*)", Pattern.MULTILINE); + m = p.matcher("foo1bar\nfoo2foo3\nbarfoo4"); + assertTrue(m.find()); + assertEquals("1", m.group(1)); + assertTrue(m.find()); + assertEquals("2", m.group(1)); + assertFalse(m.find()); + + p = Pattern.compile("foo([0-9]*)$", Pattern.MULTILINE); + m = p.matcher("foo1bar\nfoo2foo3\nbarfoo4"); + assertTrue(m.find()); + assertEquals("3", m.group(1)); + 
assertTrue(m.find()); + assertEquals("4", m.group(1)); + assertFalse(m.find()); + + p = Pattern.compile("(?m)^foo([0-9]*)"); + m = p.matcher("foo1bar\nfoo2foo3\nbarfoo4"); + assertTrue(m.find()); + assertEquals("1", m.group(1)); + assertTrue(m.find()); + assertEquals("2", m.group(1)); + assertFalse(m.find()); + + p = Pattern.compile("(?m)foo([0-9]*)$"); + m = p.matcher("foo1bar\nfoo2foo3\nbarfoo4"); + assertTrue(m.find()); + assertEquals("3", m.group(1)); + assertTrue(m.find()); + assertEquals("4", m.group(1)); + assertFalse(m.find()); + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/Pattern2Test.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/Pattern2Test.java new file mode 100644 index 000000000..dfacec463 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/Pattern2Test.java @@ -0,0 +1,1412 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.teavm.classlib.java.util.regex; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import junit.framework.TestCase; + +/** + * Tests simple Pattern compilation and Matcher methods + */ +@SuppressWarnings("nls") +public class Pattern2Test extends TestCase { + public void testSimpleMatch() throws PatternSyntaxException { + Pattern p = Pattern.compile("foo.*"); + + Matcher m1 = p.matcher("foo123"); + assertTrue(m1.matches()); + assertTrue(m1.find(0)); + assertTrue(m1.lookingAt()); + + Matcher m2 = p.matcher("fox"); + assertFalse(m2.matches()); + assertFalse(m2.find(0)); + assertFalse(m2.lookingAt()); + + assertTrue(Pattern.matches("foo.*", "foo123")); + assertFalse(Pattern.matches("foo.*", "fox")); + + assertFalse(Pattern.matches("bar", "foobar")); + + assertTrue(Pattern.matches("", "")); + } + + public void testCursors() { + Pattern p; + Matcher m; + + try { + p = Pattern.compile("foo"); + + m = p.matcher("foobar"); + assertTrue(m.find()); + assertEquals(0, m.start()); + assertEquals(3, m.end()); + assertFalse(m.find()); + + // Note: also testing reset here + m.reset(); + assertTrue(m.find()); + assertEquals(0, m.start()); + assertEquals(3, m.end()); + assertFalse(m.find()); + + m.reset("barfoobar"); + assertTrue(m.find()); + assertEquals(3, m.start()); + assertEquals(6, m.end()); + assertFalse(m.find()); + + m.reset("barfoo"); + assertTrue(m.find()); + assertEquals(3, m.start()); + assertEquals(6, m.end()); + assertFalse(m.find()); + + m.reset("foobarfoobarfoo"); + assertTrue(m.find()); + assertEquals(0, m.start()); + assertEquals(3, m.end()); + assertTrue(m.find()); + assertEquals(6, m.start()); + assertEquals(9, m.end()); + assertTrue(m.find()); + assertEquals(12, m.start()); + assertEquals(15, m.end()); + assertFalse(m.find()); + assertTrue(m.find(0)); + assertEquals(0, m.start()); + assertEquals(3, m.end()); + assertTrue(m.find(4)); + assertEquals(6, m.start()); + 
assertEquals(9, m.end()); + } catch (PatternSyntaxException e) { + System.out.println(e.getMessage()); + fail(); + } + } + + public void testGroups() throws PatternSyntaxException { + Pattern p; + Matcher m; + + p = Pattern.compile("(p[0-9]*)#?(q[0-9]*)"); + + m = p.matcher("p1#q3p2q42p5p71p63#q888"); + assertTrue(m.find()); + assertEquals(0, m.start()); + assertEquals(5, m.end()); + assertEquals(2, m.groupCount()); + assertEquals(0, m.start(0)); + assertEquals(5, m.end(0)); + assertEquals(0, m.start(1)); + assertEquals(2, m.end(1)); + assertEquals(3, m.start(2)); + assertEquals(5, m.end(2)); + assertEquals("p1#q3", m.group()); + assertEquals("p1#q3", m.group(0)); + assertEquals("p1", m.group(1)); + assertEquals("q3", m.group(2)); + + assertTrue(m.find()); + assertEquals(5, m.start()); + assertEquals(10, m.end()); + assertEquals(2, m.groupCount()); + assertEquals(10, m.end(0)); + assertEquals(5, m.start(1)); + assertEquals(7, m.end(1)); + assertEquals(7, m.start(2)); + assertEquals(10, m.end(2)); + assertEquals("p2q42", m.group()); + assertEquals("p2q42", m.group(0)); + assertEquals("p2", m.group(1)); + assertEquals("q42", m.group(2)); + + assertTrue(m.find()); + assertEquals(15, m.start()); + assertEquals(23, m.end()); + assertEquals(2, m.groupCount()); + assertEquals(15, m.start(0)); + assertEquals(23, m.end(0)); + assertEquals(15, m.start(1)); + assertEquals(18, m.end(1)); + assertEquals(19, m.start(2)); + assertEquals(23, m.end(2)); + assertEquals("p63#q888", m.group()); + assertEquals("p63#q888", m.group(0)); + assertEquals("p63", m.group(1)); + assertEquals("q888", m.group(2)); + assertFalse(m.find()); + } + + public void testReplace() throws PatternSyntaxException { + Pattern p; + Matcher m; + + // Note: examples from book, + // Hitchens, Ron, 2002, "Java NIO", O'Reilly, page 171 + p = Pattern.compile("a*b"); + + m = p.matcher("aabfooaabfooabfoob"); + assertTrue(m.replaceAll("-").equals("-foo-foo-foo-")); + 
assertTrue(m.replaceFirst("-").equals("-fooaabfooabfoob")); + + /* + * p = Pattern.compile ("\\p{Blank}"); + * + * m = p.matcher ("fee fie foe fum"); assertTrue + * (m.replaceFirst("-").equals ("fee-fie foe fum")); assertTrue + * (m.replaceAll("-").equals ("fee-fie-foe-fum")); + */ + + p = Pattern.compile("([bB])yte"); + + m = p.matcher("Byte for byte"); + assertTrue(m.replaceFirst("$1ite").equals("Bite for byte")); + assertTrue(m.replaceAll("$1ite").equals("Bite for bite")); + + p = Pattern.compile("\\d\\d\\d\\d([- ])"); + + m = p.matcher("card #1234-5678-1234"); + assertTrue(m.replaceFirst("xxxx$1").equals("card #xxxx-5678-1234")); + assertTrue(m.replaceAll("xxxx$1").equals("card #xxxx-xxxx-1234")); + + p = Pattern.compile("(up|left)( *)(right|down)"); + + m = p.matcher("left right, up down"); + assertTrue(m.replaceFirst("$3$2$1").equals("right left, up down")); + assertTrue(m.replaceAll("$3$2$1").equals("right left, down up")); + + p = Pattern.compile("([CcPp][hl]e[ea]se)"); + + m = p.matcher("I want cheese. Please."); + assertTrue(m.replaceFirst(" $1 ").equals( + "I want cheese . Please.")); + assertTrue(m.replaceAll(" $1 ").equals( + "I want cheese . 
Please .")); + } + + public void testEscapes() throws PatternSyntaxException { + Pattern p; + Matcher m; + + // Test \\ sequence + p = Pattern.compile("([a-z]+)\\\\([a-z]+);"); + m = p.matcher("fred\\ginger;abbott\\costello;jekell\\hyde;"); + assertTrue(m.find()); + assertEquals("fred", m.group(1)); + assertEquals("ginger", m.group(2)); + assertTrue(m.find()); + assertEquals("abbott", m.group(1)); + assertEquals("costello", m.group(2)); + assertTrue(m.find()); + assertEquals("jekell", m.group(1)); + assertEquals("hyde", m.group(2)); + assertFalse(m.find()); + + // Test \n, \t, \r, \f, \e, \a sequences + p = Pattern.compile("([a-z]+)[\\n\\t\\r\\f\\e\\a]+([a-z]+)"); + m = p.matcher("aa\nbb;cc\u0009\rdd;ee\u000C\u001Bff;gg\n\u0007hh"); + assertTrue(m.find()); + assertEquals("aa", m.group(1)); + assertEquals("bb", m.group(2)); + assertTrue(m.find()); + assertEquals("cc", m.group(1)); + assertEquals("dd", m.group(2)); + assertTrue(m.find()); + assertEquals("ee", m.group(1)); + assertEquals("ff", m.group(2)); + assertTrue(m.find()); + assertEquals("gg", m.group(1)); + assertEquals("hh", m.group(2)); + assertFalse(m.find()); + + // Test \\u and \\x sequences +p = Pattern.compile("([0-9]+)[\\u0020:\\x21];"); + m = p.matcher("11:;22 ;33-;44!;"); + assertTrue(m.find()); + assertEquals("11", m.group(1)); + assertTrue(m.find()); + assertEquals("22", m.group(1)); + assertTrue(m.find()); + assertEquals("44", m.group(1)); + assertFalse(m.find()); + + // Test invalid unicode sequences + try { + p = Pattern.compile("\\u"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\u;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\u002"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\u002;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { 
+ } + + // Test invalid hex sequences + try { + p = Pattern.compile("\\x"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\x;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\xa"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\xa;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // Test \0 (octal) sequences (1, 2 and 3 digit) + p = Pattern.compile("([0-9]+)[\\07\\040\\0160];"); + m = p.matcher("11\u0007;22:;33 ;44p;"); + assertTrue(m.find()); + assertEquals("11", m.group(1)); + assertTrue(m.find()); + assertEquals("33", m.group(1)); + assertTrue(m.find()); + assertEquals("44", m.group(1)); + assertFalse(m.find()); + + // Test invalid octal sequences + try { + p = Pattern.compile("\\08"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // originally contributed test did not check the result + // TODO: check what RI does here + // try { + // p = Pattern.compile("\\0477"); + // fail("PatternSyntaxException expected"); + // } catch (PatternSyntaxException e) { + // } + + try { + p = Pattern.compile("\\0"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\0;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // Test \c (control character) sequence + p = Pattern.compile("([0-9]+)[\\cA\\cB\\cC\\cD];"); + m = p.matcher("11\u0001;22:;33\u0002;44p;55\u0003;66\u0004;"); + assertTrue(m.find()); + assertEquals("11", m.group(1)); + assertTrue(m.find()); + assertEquals("33", m.group(1)); + assertTrue(m.find()); + assertEquals("55", m.group(1)); + assertTrue(m.find()); + assertEquals("66", m.group(1)); + assertFalse(m.find()); + + // More thorough control escape 
test + // Ensure that each escape matches exactly the corresponding + // character + // code and no others (well, from 0-255 at least) + int i, j; + for (i = 0; i < 26; i++) { + p = Pattern.compile("\\c" + Character.toString((char) ('A' + i))); + int match_char = -1; + for (j = 0; j < 255; j++) { + m = p.matcher(Character.toString((char) j)); + if (m.matches()) { + assertEquals(-1, match_char); + match_char = j; + } + } + assertTrue(match_char == i + 1); + } + + // Test invalid control escapes + try { + p = Pattern.compile("\\c"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // originally contributed test did not check the result + // TODO: check what RI does here + // try { + // p = Pattern.compile("\\c;"); + // fail("PatternSyntaxException expected"); + // } catch (PatternSyntaxException e) { + // } + // + // try { + // p = Pattern.compile("\\ca;"); + // fail("PatternSyntaxException expected"); + // } catch (PatternSyntaxException e) { + // } + // + // try { + // p = Pattern.compile("\\c4;"); + // fail("PatternSyntaxException expected"); + // } catch (PatternSyntaxException e) { + // } + } + + public void testCharacterClasses() throws PatternSyntaxException { + Pattern p; + Matcher m; + + // Test one character range + p = Pattern.compile("[p].*[l]"); + m = p.matcher("paul"); + assertTrue(m.matches()); + m = p.matcher("pool"); + assertTrue(m.matches()); + m = p.matcher("pong"); + assertFalse(m.matches()); + m = p.matcher("pl"); + assertTrue(m.matches()); + + // Test two character range + p = Pattern.compile("[pm].*[lp]"); + m = p.matcher("prop"); + assertTrue(m.matches()); + m = p.matcher("mall"); + assertTrue(m.matches()); + m = p.matcher("pong"); + assertFalse(m.matches()); + m = p.matcher("pill"); + assertTrue(m.matches()); + + // Test range including [ and ] + p = Pattern.compile("[<\\[].*[\\]>]"); + m = p.matcher(""); + assertTrue(m.matches()); + m = p.matcher("[bar]"); + assertTrue(m.matches()); + m = 
p.matcher("{foobar]"); + assertFalse(m.matches()); + m = p.matcher(""); + m = p.matcher(""); + assertTrue(m.matches()); + m = p.matcher(""); + assertFalse(m.matches()); + m = p + .matcher("xyz zzz"); + assertTrue(m.find()); + assertTrue(m.find()); + assertFalse(m.find()); + + // Test \S (not whitespace) + p = Pattern.compile("<[a-z] \\S[0-9][\\S\n]+[^\\S]221>"); + m = p.matcher(""); + assertTrue(m.matches()); + m = p.matcher(""); + assertTrue(m.matches()); + m = p.matcher(""); + assertFalse(m.matches()); + m = p.matcher(""); + assertTrue(m.matches()); + p = Pattern.compile("<[a-z] \\S[0-9][\\S\n]+[^\\S]221[\\S&&[^abc]]>"); + m = p.matcher(""); + assertTrue(m.matches()); + m = p.matcher(""); + assertTrue(m.matches()); + m = p.matcher(""); + assertFalse(m.matches()); + m = p.matcher(""); + assertFalse(m.matches()); + m = p.matcher(""); + assertFalse(m.matches()); + m = p.matcher(""); + assertTrue(m.matches()); + + // Test \w (ascii word) + p = Pattern.compile("<\\w+\\s[0-9]+;[^\\w]\\w+/[\\w$]+;"); + m = p.matcher(""); + * m = p.matcher(""); assertTrue(m.matches()); m = p.matcher(""); + * assertTrue(m.matches()); m = p.matcher(""); + * assertFalse(m.matches()); + */ + p = Pattern.compile("\\p{Lower}+"); + m = p.matcher("abcdefghijklmnopqrstuvwxyz"); + assertTrue(m.matches()); + + // Invalid uses of \p{Lower} + try { + p = Pattern.compile("\\p"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\p;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\p{"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\p{;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\p{Lower"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = 
Pattern.compile("\\p{Lower;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // Test \p{Upper} + /* + * FIXME: Requires complex range processing p = Pattern.compile("<\\p{Upper}\\d\\P{Upper}:[\\p{Upper}z]\\s[^\\P{Upper}]>"); + * m = p.matcher(""); assertTrue(m.matches()); m = p.matcher(""); + * assertTrue(m.matches()); m = p.matcher(""); + * assertFalse(m.matches()); + */ + p = Pattern.compile("\\p{Upper}+"); + m = p.matcher("ABCDEFGHIJKLMNOPQRSTUVWXYZ"); + assertTrue(m.matches()); + + // Invalid uses of \p{Upper} + try { + p = Pattern.compile("\\p{Upper"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\p{Upper;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // Test \p{ASCII} + /* + * FIXME: Requires complex range processing p = Pattern.compile("<\\p{ASCII}\\d\\P{ASCII}:[\\p{ASCII}\u1234]\\s[^\\P{ASCII}]>"); + * m = p.matcher(""); assertTrue(m.matches()); m = + * p.matcher(""); assertTrue(m.matches()); m = + * p.matcher("<\u00846#:E E>"); assertFalse(m.matches()) + */ + int i; + p = Pattern.compile("\\p{ASCII}"); + for (i = 0; i < 0x80; i++) { + m = p.matcher(Character.toString((char) i)); + assertTrue(m.matches()); + } + for (; i < 0xff; i++) { + m = p.matcher(Character.toString((char) i)); + assertFalse(m.matches()); + } + + // Invalid uses of \p{ASCII} + try { + p = Pattern.compile("\\p{ASCII"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + try { + p = Pattern.compile("\\p{ASCII;"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + } + + // Test \p{Alpha} + // TODO + + // Test \p{Digit} + // TODO + + // Test \p{XDigit} + // TODO + + // Test \p{Alnum} + // TODO + + // Test \p{Punct} + // TODO + + // Test \p{Graph} + // TODO + + // Test \p{Print} + // TODO + + // Test \p{Blank} + // TODO + + // Test \p{Space} + // TODO + + // 
Test \p{Cntrl} + // TODO + } + + public void testUnicodeCategories() throws PatternSyntaxException { + // Test Unicode categories using \p and \P + // One letter codes: L, M, N, P, S, Z, C + // Two letter codes: Lu, Nd, Sc, Sm, ... + // See java.lang.Character and Unicode standard for complete list + // TODO + // Test \p{L} + // TODO + + // Test \p{N} + // TODO + + // ... etc + + // Test two letter codes: + // From unicode.org: + // Lu + // Ll + // Lt + // Lm + // Lo + // Mn + // Mc + // Me + // Nd + // Nl + // No + // Pc + // Pd + // Ps + // Pe + // Pi + // Pf + // Po + // Sm + // Sc + // Sk + // So + // Zs + // Zl + // Zp + // Cc + // Cf + // Cs + // Co + // Cn + } + + public void testUnicodeBlocks() throws PatternSyntaxException { + Pattern p; + Matcher m; + int i, j; + + // Test Unicode blocks using \p and \P + // FIXME: + // Note that LatinExtended-B and ArabicPresentations-B are unrecognized + // by the reference JDK. + for (i = 0; i < UBlocks.length; i++) { + /* + * p = Pattern.compile("\\p{"+UBlocks[i].name+"}"); + * + * if (UBlocks[i].low > 0) { m = + * p.matcher(Character.toString((char)(UBlocks[i].low-1))); + * assertFalse(m.matches()); } for (j=UBlocks[i].low; j <= + * UBlocks[i].high; j++) { m = + * p.matcher(Character.toString((char)j)); assertTrue(m.matches()); } + * if (UBlocks[i].high < 0xFFFF) { m = + * p.matcher(Character.toString((char)(UBlocks[i].high+1))); + * assertFalse(m.matches()); } + * + * p = Pattern.compile("\\P{"+UBlocks[i].name+"}"); + * + * if (UBlocks[i].low > 0) { m = + * p.matcher(Character.toString((char)(UBlocks[i].low-1))); + * assertTrue(m.matches()); } for (j=UBlocks[i].low; j < + * UBlocks[i].high; j++) { m = + * p.matcher(Character.toString((char)j)); assertFalse(m.matches()); } + * if (UBlocks[i].high < 0xFFFF) { m = + * p.matcher(Character.toString((char)(UBlocks[i].high+1))); + * assertTrue(m.matches()); } + */ + + p = Pattern.compile("\\p{In" + UBlocks[i].name + "}"); + + if (UBlocks[i].low > 0) { + m = 
p.matcher(Character.toString((char) (UBlocks[i].low - 1))); + assertFalse(m.matches()); + } + for (j = UBlocks[i].low; j <= UBlocks[i].high; j++) { + m = p.matcher(Character.toString((char) j)); + // TODO investigate, why this fails and uncomment + //assertTrue(m.matches()); + } + if (UBlocks[i].high < 0xFFFF) { + m = p.matcher(Character.toString((char) (UBlocks[i].high + 1))); + // TODO investigate, why this fails and uncomment + //assertFalse(m.matches()); + } + + p = Pattern.compile("\\P{In" + UBlocks[i].name + "}"); + + if (UBlocks[i].low > 0) { + m = p.matcher(Character.toString((char) (UBlocks[i].low - 1))); + assertTrue(m.matches()); + } + for (j = UBlocks[i].low; j < UBlocks[i].high; j++) { + m = p.matcher(Character.toString((char) j)); + assertFalse(m.matches()); + } + if (UBlocks[i].high < 0xFFFF) { + m = p.matcher(Character.toString((char) (UBlocks[i].high + 1))); + // TODO investigate, why this fails and uncomment + //assertTrue(m.matches()); + } + } + } + + public void testCapturingGroups() throws PatternSyntaxException { + // Test simple capturing groups + // TODO + + // Test grouping without capture (?:...) + // TODO + + // Test combination of grouping and capture + // TODO + + // Test \ sequence with capturing and non-capturing groups + // TODO + + // Test \ with out of range + // TODO + } + + public void testRepeats() { + // Test ? + // TODO + + // Test * + // TODO + + // Test + + // TODO + + // Test {}, including 0, 1 and more + // TODO + + // Test {,}, including 0, 1 and more + // TODO + + // Test {,}, with n1 < n2, n1 = n2 and n1 > n2 (illegal?) 
+ // TODO + } + + public void testAnchors() throws PatternSyntaxException { + // Test ^, default and MULTILINE + // TODO + + // Test $, default and MULTILINE + // TODO + + // Test \b (word boundary) + // TODO + + // Test \B (not a word boundary) + // TODO + + // Test \A (beginning of string) + // TODO + + // Test \Z (end of string) + // TODO + + // Test \z (end of string) + // TODO + + // Test \G + // TODO + + // Test positive lookahead using (?=...) + // TODO + + // Test negative lookahead using (?!...) + // TODO + + // Test positive lookbehind using (?<=...) + // TODO + + // Test negative lookbehind using (?...) + // TODO + + // Test (?onflags-offflags) + // Valid flags are i,m,d,s,u,x + // TODO + + // Test (?onflags-offflags:...) + // TODO + + // Test \Q, \E + p = Pattern.compile("[a-z]+;\\Q[a-z]+;\\Q(foo.*);\\E[0-9]+"); + m = p.matcher("abc;[a-z]+;\\Q(foo.*);411"); + assertTrue(m.matches()); + m = p.matcher("abc;def;foo42;555"); + assertFalse(m.matches()); + m = p.matcher("abc;\\Qdef;\\Qfoo99;\\E123"); + assertFalse(m.matches()); + + p = Pattern.compile("[a-z]+;(foo[0-9]-\\Q(...)\\E);[0-9]+"); + m = p.matcher("abc;foo5-(...);123"); + assertTrue(m.matches()); + assertEquals("foo5-(...)", m.group(1)); + m = p.matcher("abc;foo9-(xxx);789"); + assertFalse(m.matches()); + + p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\Q$-\\E]+);[0-9]+"); + m = p.matcher("abc;bar0-def$-;123"); + assertTrue(m.matches()); + + // FIXME: + // This should work the same as the pattern above but fails with the + // the reference JDK + p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\Q-$\\E]+);[0-9]+"); + m = p.matcher("abc;bar0-def$-;123"); + // assertTrue(m.matches()); + + // FIXME: + // This should work too .. 
it looks as if just about anything that + // has more + // than one character between \Q and \E is broken in the the reference + // JDK + p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\Q[0-9]\\E]+);[0-9]+"); + m = p.matcher("abc;bar0-def[99]-]0x[;123"); + // assertTrue(m.matches()); + + // This is the same as above but with explicit escapes .. and this + // does work + // on the the reference JDK + p = Pattern.compile("[a-z]+;(bar[0-9]-[a-z\\[0\\-9\\]]+);[0-9]+"); + m = p.matcher("abc;bar0-def[99]-]0x[;123"); + assertTrue(m.matches()); + + // Test # + // TODO + } + + public void testCompile1() throws PatternSyntaxException { + Pattern pattern = Pattern + .compile("[0-9A-Za-z][0-9A-Za-z\\x2e\\x3a\\x2d\\x5f]*"); + String name = "iso-8859-1"; + assertTrue(pattern.matcher(name).matches()); + } + + public void testCompile2() throws PatternSyntaxException { + String findString = "\\Qimport\\E"; + + Pattern pattern = Pattern.compile(findString, 0); + Matcher matcher = pattern.matcher(new String( + "import a.A;\n\n import b.B;\nclass C {}")); + + assertTrue(matcher.find(0)); + } + + public void testCompile3() throws PatternSyntaxException { + Pattern p; + Matcher m; + p = Pattern.compile("a$"); + m = p.matcher("a\n"); + assertTrue(m.find()); + assertEquals("a", m.group()); + assertFalse(m.find()); + + p = Pattern.compile("(a$)"); + m = p.matcher("a\n"); + assertTrue(m.find()); + assertEquals("a", m.group()); + assertEquals("a", m.group(1)); + assertFalse(m.find()); + + p = Pattern.compile("^.*$", Pattern.MULTILINE); + + m = p.matcher("a\n"); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertEquals("a", m.group()); + assertFalse(m.find()); + + m = p.matcher("a\nb\n"); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertEquals("a", m.group()); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertEquals("b", m.group()); + assertFalse(m.find()); + + m = p.matcher("a\nb"); + assertTrue(m.find()); + // 
System.out.println("["+m.group()+"]"); + assertEquals("a", m.group()); + assertTrue(m.find()); + assertEquals("b", m.group()); + assertFalse(m.find()); + + m = p.matcher("\naa\r\nbb\rcc\n\n"); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertTrue(m.group().equals("")); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertEquals("aa", m.group()); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertEquals("bb", m.group()); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertEquals("cc", m.group()); + assertTrue(m.find()); + // System.out.println("["+m.group()+"]"); + assertTrue(m.group().equals("")); + assertFalse(m.find()); + + m = p.matcher("a"); + assertTrue(m.find()); + assertEquals("a", m.group()); + assertFalse(m.find()); + + m = p.matcher(""); + // FIXME: This matches the reference behaviour but is + // inconsistent with matching "a" - ie. the end of the + // target string should match against $ always but this + // appears to work with the null string only when not in + // multiline mode (see below) + assertFalse(m.find()); + + p = Pattern.compile("^.*$"); + m = p.matcher(""); + assertTrue(m.find()); + assertTrue(m.group().equals("")); + assertFalse(m.find()); + } + + public void testCompile4() throws PatternSyntaxException { + String findString = "\\Qpublic\\E"; + StringBuffer text = new StringBuffer(" public class Class {\n" + + " public class Class {"); + + Pattern pattern = Pattern.compile(findString, 0); + Matcher matcher = pattern.matcher(text); + + boolean found = matcher.find(); + assertTrue(found); + assertEquals(4, matcher.start()); + if (found) { + // modify text + text.delete(0, text.length()); + text.append("Text have been changed."); + matcher.reset(text); + } + + found = matcher.find(); + assertFalse(found); + } + + public void testCompile5() throws PatternSyntaxException { + Pattern p = Pattern.compile("^[0-9]"); + String s[] = p.split("12", -1); 
+ assertEquals("", s[0]); + assertEquals("2", s[1]); + assertEquals(2, s.length); + } + + // public void testCompile6() { + // String regex = "[\\p{L}[\\p{Mn}[\\p{Pc}[\\p{Nd}[\\p{Nl}[\\p{Sc}]]]]]]+"; + // String regex = "[\\p{L}\\p{Mn}\\p{Pc}\\p{Nd}\\p{Nl}\\p{Sc}]+"; + // try { + // Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE); + // assertTrue(true); + // } catch (PatternSyntaxException e) { + // System.out.println(e.getMessage()); + // assertTrue(false); + // } + // } + + private static class UBInfo { + public UBInfo(int low, int high, String name) { + this.name = name; + this.low = low; + this.high = high; + } + + public String name; + + public int low, high; + } + + // A table representing the unicode categories + // private static UBInfo[] UCategories = { + // Lu + // Ll + // Lt + // Lm + // Lo + // Mn + // Mc + // Me + // Nd + // Nl + // No + // Pc + // Pd + // Ps + // Pe + // Pi + // Pf + // Po + // Sm + // Sc + // Sk + // So + // Zs + // Zl + // Zp + // Cc + // Cf + // Cs + // Co + // Cn + // }; + + // A table representing the unicode character blocks + private static UBInfo[] UBlocks = { + /* 0000; 007F; Basic Latin */ + new UBInfo(0x0000, 0x007F, "BasicLatin"), // Character.UnicodeBlock.BASIC_LATIN + /* 0080; 00FF; Latin-1 Supplement */ + new UBInfo(0x0080, 0x00FF, "Latin-1Supplement"), // Character.UnicodeBlock.LATIN_1_SUPPLEMENT + /* 0100; 017F; Latin Extended-A */ + new UBInfo(0x0100, 0x017F, "LatinExtended-A"), // Character.UnicodeBlock.LATIN_EXTENDED_A + /* 0180; 024F; Latin Extended-B */ + // new UBInfo (0x0180,0x024F,"InLatinExtended-B"), // + // Character.UnicodeBlock.LATIN_EXTENDED_B + /* 0250; 02AF; IPA Extensions */ + new UBInfo(0x0250, 0x02AF, "IPAExtensions"), // Character.UnicodeBlock.IPA_EXTENSIONS + /* 02B0; 02FF; Spacing Modifier Letters */ + new UBInfo(0x02B0, 0x02FF, "SpacingModifierLetters"), // Character.UnicodeBlock.SPACING_MODIFIER_LETTERS + /* 0300; 036F; Combining Diacritical Marks */ + new UBInfo(0x0300, 0x036F, 
"CombiningDiacriticalMarks"), // Character.UnicodeBlock.COMBINING_DIACRITICAL_MARKS + /* 0370; 03FF; Greek */ + new UBInfo(0x0370, 0x03FF, "Greek"), // Character.UnicodeBlock.GREEK + /* 0400; 04FF; Cyrillic */ + new UBInfo(0x0400, 0x04FF, "Cyrillic"), // Character.UnicodeBlock.CYRILLIC + /* 0530; 058F; Armenian */ + new UBInfo(0x0530, 0x058F, "Armenian"), // Character.UnicodeBlock.ARMENIAN + /* 0590; 05FF; Hebrew */ + new UBInfo(0x0590, 0x05FF, "Hebrew"), // Character.UnicodeBlock.HEBREW + /* 0600; 06FF; Arabic */ + new UBInfo(0x0600, 0x06FF, "Arabic"), // Character.UnicodeBlock.ARABIC + /* 0700; 074F; Syriac */ + new UBInfo(0x0700, 0x074F, "Syriac"), // Character.UnicodeBlock.SYRIAC + /* 0780; 07BF; Thaana */ + new UBInfo(0x0780, 0x07BF, "Thaana"), // Character.UnicodeBlock.THAANA + /* 0900; 097F; Devanagari */ + new UBInfo(0x0900, 0x097F, "Devanagari"), // Character.UnicodeBlock.DEVANAGARI + /* 0980; 09FF; Bengali */ + new UBInfo(0x0980, 0x09FF, "Bengali"), // Character.UnicodeBlock.BENGALI + /* 0A00; 0A7F; Gurmukhi */ + new UBInfo(0x0A00, 0x0A7F, "Gurmukhi"), // Character.UnicodeBlock.GURMUKHI + /* 0A80; 0AFF; Gujarati */ + new UBInfo(0x0A80, 0x0AFF, "Gujarati"), // Character.UnicodeBlock.GUJARATI + /* 0B00; 0B7F; Oriya */ + new UBInfo(0x0B00, 0x0B7F, "Oriya"), // Character.UnicodeBlock.ORIYA + /* 0B80; 0BFF; Tamil */ + new UBInfo(0x0B80, 0x0BFF, "Tamil"), // Character.UnicodeBlock.TAMIL + /* 0C00; 0C7F; Telugu */ + new UBInfo(0x0C00, 0x0C7F, "Telugu"), // Character.UnicodeBlock.TELUGU + /* 0C80; 0CFF; Kannada */ + new UBInfo(0x0C80, 0x0CFF, "Kannada"), // Character.UnicodeBlock.KANNADA + /* 0D00; 0D7F; Malayalam */ + new UBInfo(0x0D00, 0x0D7F, "Malayalam"), // Character.UnicodeBlock.MALAYALAM + /* 0D80; 0DFF; Sinhala */ + new UBInfo(0x0D80, 0x0DFF, "Sinhala"), // Character.UnicodeBlock.SINHALA + /* 0E00; 0E7F; Thai */ + new UBInfo(0x0E00, 0x0E7F, "Thai"), // Character.UnicodeBlock.THAI + /* 0E80; 0EFF; Lao */ + new UBInfo(0x0E80, 0x0EFF, "Lao"), // 
Character.UnicodeBlock.LAO + /* 0F00; 0FFF; Tibetan */ + new UBInfo(0x0F00, 0x0FFF, "Tibetan"), // Character.UnicodeBlock.TIBETAN + /* 1000; 109F; Myanmar */ + new UBInfo(0x1000, 0x109F, "Myanmar"), // Character.UnicodeBlock.MYANMAR + /* 10A0; 10FF; Georgian */ + new UBInfo(0x10A0, 0x10FF, "Georgian"), // Character.UnicodeBlock.GEORGIAN + /* 1100; 11FF; Hangul Jamo */ + new UBInfo(0x1100, 0x11FF, "HangulJamo"), // Character.UnicodeBlock.HANGUL_JAMO + /* 1200; 137F; Ethiopic */ + new UBInfo(0x1200, 0x137F, "Ethiopic"), // Character.UnicodeBlock.ETHIOPIC + /* 13A0; 13FF; Cherokee */ + new UBInfo(0x13A0, 0x13FF, "Cherokee"), // Character.UnicodeBlock.CHEROKEE + /* 1400; 167F; Unified Canadian Aboriginal Syllabics */ + new UBInfo(0x1400, 0x167F, "UnifiedCanadianAboriginalSyllabics"), // Character.UnicodeBlock.UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS + /* 1680; 169F; Ogham */ + new UBInfo(0x1680, 0x169F, "Ogham"), // Character.UnicodeBlock.OGHAM + /* 16A0; 16FF; Runic */ + new UBInfo(0x16A0, 0x16FF, "Runic"), // Character.UnicodeBlock.RUNIC + /* 1780; 17FF; Khmer */ + new UBInfo(0x1780, 0x17FF, "Khmer"), // Character.UnicodeBlock.KHMER + /* 1800; 18AF; Mongolian */ + new UBInfo(0x1800, 0x18AF, "Mongolian"), // Character.UnicodeBlock.MONGOLIAN + /* 1E00; 1EFF; Latin Extended Additional */ + new UBInfo(0x1E00, 0x1EFF, "LatinExtendedAdditional"), // Character.UnicodeBlock.LATIN_EXTENDED_ADDITIONAL + /* 1F00; 1FFF; Greek Extended */ + new UBInfo(0x1F00, 0x1FFF, "GreekExtended"), // Character.UnicodeBlock.GREEK_EXTENDED + /* 2000; 206F; General Punctuation */ + new UBInfo(0x2000, 0x206F, "GeneralPunctuation"), // Character.UnicodeBlock.GENERAL_PUNCTUATION + /* 2070; 209F; Superscripts and Subscripts */ + new UBInfo(0x2070, 0x209F, "SuperscriptsandSubscripts"), // Character.UnicodeBlock.SUPERSCRIPTS_AND_SUBSCRIPTS + /* 20A0; 20CF; Currency Symbols */ + new UBInfo(0x20A0, 0x20CF, "CurrencySymbols"), // Character.UnicodeBlock.CURRENCY_SYMBOLS + /* 20D0; 20FF; Combining Marks for 
Symbols */ + new UBInfo(0x20D0, 0x20FF, "CombiningMarksforSymbols"), // Character.UnicodeBlock.COMBINING_MARKS_FOR_SYMBOLS + /* 2100; 214F; Letterlike Symbols */ + new UBInfo(0x2100, 0x214F, "LetterlikeSymbols"), // Character.UnicodeBlock.LETTERLIKE_SYMBOLS + /* 2150; 218F; Number Forms */ + new UBInfo(0x2150, 0x218F, "NumberForms"), // Character.UnicodeBlock.NUMBER_FORMS + /* 2190; 21FF; Arrows */ + new UBInfo(0x2190, 0x21FF, "Arrows"), // Character.UnicodeBlock.ARROWS + /* 2200; 22FF; Mathematical Operators */ + new UBInfo(0x2200, 0x22FF, "MathematicalOperators"), // Character.UnicodeBlock.MATHEMATICAL_OPERATORS + /* 2300; 23FF; Miscellaneous Technical */ + new UBInfo(0x2300, 0x23FF, "MiscellaneousTechnical"), // Character.UnicodeBlock.MISCELLANEOUS_TECHNICAL + /* 2400; 243F; Control Pictures */ + new UBInfo(0x2400, 0x243F, "ControlPictures"), // Character.UnicodeBlock.CONTROL_PICTURES + /* 2440; 245F; Optical Character Recognition */ + new UBInfo(0x2440, 0x245F, "OpticalCharacterRecognition"), // Character.UnicodeBlock.OPTICAL_CHARACTER_RECOGNITION + /* 2460; 24FF; Enclosed Alphanumerics */ + new UBInfo(0x2460, 0x24FF, "EnclosedAlphanumerics"), // Character.UnicodeBlock.ENCLOSED_ALPHANUMERICS + /* 2500; 257F; Box Drawing */ + new UBInfo(0x2500, 0x257F, "BoxDrawing"), // Character.UnicodeBlock.BOX_DRAWING + /* 2580; 259F; Block Elements */ + new UBInfo(0x2580, 0x259F, "BlockElements"), // Character.UnicodeBlock.BLOCK_ELEMENTS + /* 25A0; 25FF; Geometric Shapes */ + new UBInfo(0x25A0, 0x25FF, "GeometricShapes"), // Character.UnicodeBlock.GEOMETRIC_SHAPES + /* 2600; 26FF; Miscellaneous Symbols */ + new UBInfo(0x2600, 0x26FF, "MiscellaneousSymbols"), // Character.UnicodeBlock.MISCELLANEOUS_SYMBOLS + /* 2700; 27BF; Dingbats */ + new UBInfo(0x2700, 0x27BF, "Dingbats"), // Character.UnicodeBlock.DINGBATS + /* 2800; 28FF; Braille Patterns */ + new UBInfo(0x2800, 0x28FF, "BraillePatterns"), // Character.UnicodeBlock.BRAILLE_PATTERNS + /* 2E80; 2EFF; CJK Radicals 
Supplement */ + new UBInfo(0x2E80, 0x2EFF, "CJKRadicalsSupplement"), // Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT + /* 2F00; 2FDF; Kangxi Radicals */ + new UBInfo(0x2F00, 0x2FDF, "KangxiRadicals"), // Character.UnicodeBlock.KANGXI_RADICALS + /* 2FF0; 2FFF; Ideographic Description Characters */ + new UBInfo(0x2FF0, 0x2FFF, "IdeographicDescriptionCharacters"), // Character.UnicodeBlock.IDEOGRAPHIC_DESCRIPTION_CHARACTERS + /* 3000; 303F; CJK Symbols and Punctuation */ + new UBInfo(0x3000, 0x303F, "CJKSymbolsandPunctuation"), // Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION + /* 3040; 309F; Hiragana */ + new UBInfo(0x3040, 0x309F, "Hiragana"), // Character.UnicodeBlock.HIRAGANA + /* 30A0; 30FF; Katakana */ + new UBInfo(0x30A0, 0x30FF, "Katakana"), // Character.UnicodeBlock.KATAKANA + /* 3100; 312F; Bopomofo */ + new UBInfo(0x3100, 0x312F, "Bopomofo"), // Character.UnicodeBlock.BOPOMOFO + /* 3130; 318F; Hangul Compatibility Jamo */ + new UBInfo(0x3130, 0x318F, "HangulCompatibilityJamo"), // Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO + /* 3190; 319F; Kanbun */ + new UBInfo(0x3190, 0x319F, "Kanbun"), // Character.UnicodeBlock.KANBUN + /* 31A0; 31BF; Bopomofo Extended */ + new UBInfo(0x31A0, 0x31BF, "BopomofoExtended"), // Character.UnicodeBlock.BOPOMOFO_EXTENDED + /* 3200; 32FF; Enclosed CJK Letters and Months */ + new UBInfo(0x3200, 0x32FF, "EnclosedCJKLettersandMonths"), // Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS + /* 3300; 33FF; CJK Compatibility */ + new UBInfo(0x3300, 0x33FF, "CJKCompatibility"), // Character.UnicodeBlock.CJK_COMPATIBILITY + /* 3400; 4DB5; CJK Unified Ideographs Extension A */ + new UBInfo(0x3400, 0x4DB5, "CJKUnifiedIdeographsExtensionA"), // Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A + /* 4E00; 9FFF; CJK Unified Ideographs */ + new UBInfo(0x4E00, 0x9FFF, "CJKUnifiedIdeographs"), // Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + /* A000; A48F; Yi Syllables */ + new UBInfo(0xA000, 0xA48F, "YiSyllables"), 
// Character.UnicodeBlock.YI_SYLLABLES + /* A490; A4CF; Yi Radicals */ + new UBInfo(0xA490, 0xA4CF, "YiRadicals"), // Character.UnicodeBlock.YI_RADICALS + /* AC00; D7A3; Hangul Syllables */ + new UBInfo(0xAC00, 0xD7A3, "HangulSyllables"), // Character.UnicodeBlock.HANGUL_SYLLABLES + /* D800; DB7F; High Surrogates */ + /* DB80; DBFF; High Private Use Surrogates */ + /* DC00; DFFF; Low Surrogates */ + /* E000; F8FF; Private Use */ + /* F900; FAFF; CJK Compatibility Ideographs */ + new UBInfo(0xF900, 0xFAFF, "CJKCompatibilityIdeographs"), // Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS + /* FB00; FB4F; Alphabetic Presentation Forms */ + new UBInfo(0xFB00, 0xFB4F, "AlphabeticPresentationForms"), // Character.UnicodeBlock.ALPHABETIC_PRESENTATION_FORMS + /* FB50; FDFF; Arabic Presentation Forms-A */ + new UBInfo(0xFB50, 0xFDFF, "ArabicPresentationForms-A"), // Character.UnicodeBlock.ARABIC_PRESENTATION_FORMS_A + /* FE20; FE2F; Combining Half Marks */ + new UBInfo(0xFE20, 0xFE2F, "CombiningHalfMarks"), // Character.UnicodeBlock.COMBINING_HALF_MARKS + /* FE30; FE4F; CJK Compatibility Forms */ + new UBInfo(0xFE30, 0xFE4F, "CJKCompatibilityForms"), // Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS + /* FE50; FE6F; Small Form Variants */ + new UBInfo(0xFE50, 0xFE6F, "SmallFormVariants"), // Character.UnicodeBlock.SMALL_FORM_VARIANTS + /* FE70; FEFE; Arabic Presentation Forms-B */ + // new UBInfo (0xFE70,0xFEFE,"InArabicPresentationForms-B"), // + // Character.UnicodeBlock.ARABIC_PRESENTATION_FORMS_B + /* FEFF; FEFF; Specials */ + new UBInfo(0xFEFF, 0xFEFF, "Specials"), // Character.UnicodeBlock.SPECIALS + /* FF00; FFEF; Halfwidth and Fullwidth Forms */ + new UBInfo(0xFF00, 0xFFEF, "HalfwidthandFullwidthForms"), // Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS + /* FFF0; FFFD; Specials */ + new UBInfo(0xFFF0, 0xFFFD, "Specials") // Character.UnicodeBlock.SPECIALS + }; +} \ No newline at end of file diff --git 
a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternErrorTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternErrorTest.java new file mode 100644 index 000000000..a73f91895 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternErrorTest.java @@ -0,0 +1,67 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.teavm.classlib.java.util.regex; + +import java.util.regex.Pattern; + +import junit.framework.TestCase; + +/** + * Tests boundary and error conditions of java.util.regex.Pattern.compile: null and empty regexes and flag combinations. + */ +@SuppressWarnings("nls") +public class PatternErrorTest extends TestCase { + public void testCompileErrors() throws Exception { + // null regex string - should get NullPointerException + try { + Pattern.compile(null); + fail("NullPointerException expected"); + } catch (NullPointerException e) { // expected: a null regex must be rejected + } + + // empty regex string - no exception should be thrown + Pattern.compile(""); + + // note: invalid regex syntax is checked in PatternSyntaxExceptionTest + + // flags = 0 should raise no exception + int flags = 0; + Pattern.compile("foo", flags); + + // check that all valid flags are accepted without exception + flags |= Pattern.UNIX_LINES; + flags |= Pattern.CASE_INSENSITIVE; + flags |= Pattern.MULTILINE; + flags |= Pattern.CANON_EQ; + flags |= Pattern.COMMENTS; + flags |= Pattern.DOTALL; + flags |= Pattern.UNICODE_CASE; + Pattern.compile("foo", flags); + + // add invalid flags - should get IllegalArgumentException + // regression test for HARMONY-4248 + flags |= 0xFFFFFFFF; + // TODO: investigate why the check below fails, then re-enable it (flags is otherwise unused from here on) + /* + try { + Pattern.compile("foo", flags); + fail("Expected IllegalArgumentException to be thrown"); + } catch (IllegalArgumentException e) { + // This is the expected exception + }*/ + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternSyntaxExceptionTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternSyntaxExceptionTest.java new file mode 100644 index 000000000..c9e53a633 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternSyntaxExceptionTest.java @@ -0,0 +1,61 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import junit.framework.TestCase; + + +/** + * Tests that compiling a malformed regex throws PatternSyntaxException carrying the expected error index and the original pattern. + */ +@SuppressWarnings("nls") +public class PatternSyntaxExceptionTest extends TestCase { + public void testCase() { + String regex = "("; + try { + Pattern.compile(regex); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + // TOFIX: the description/message assertions below are commented out + // TOFIX: decide whether exception strings should be matched at all + // assertEquals("Unclosed group", e.getDescription()); + assertEquals(1, e.getIndex()); + // assertEquals("Unclosed group near index 1\n(\n ^", + // e.getMessage()); + assertEquals(regex, e.getPattern()); + } + } + + public void testCase2() { + String regex = "[4-"; + try { + Pattern.compile(regex); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException e) { + // TOFIX: the description/message assertions below are commented out + // TOFIX: decide whether exception strings should be matched at all
+ // assertEquals("Illegal character range", e.getDescription()); + assertEquals(3, e.getIndex()); + // assertEquals("Illegal character range near index 3\n[4-\n ^", + // e.getMessage()); + assertEquals(regex, e.getPattern()); + } + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternTest.java new file mode 100644 index 000000000..121520e82 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/PatternTest.java @@ -0,0 +1,1353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.teavm.classlib.java.util.regex; + +import static org.junit.Assert.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.junit.Test; + +public class PatternTest { + String[] testPatterns = { + "(a|b)*abb", + "(1*2*3*4*)*567", + "(a|b|c|d)*aab", + "(1|2|3|4|5|6|7|8|9|0)(1|2|3|4|5|6|7|8|9|0)*", + "(abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ)*", + "(a|b)*(a|b)*A(a|b)*lice.*", + "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)(a|b|c|d|e|f|g|h|" + + "i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)*(1|2|3|4|5|6|7|8|9|0)*|while|for|struct|if|do", + "x(?c)y", "x(?cc)y", "x(?:c)y" + + }; + + @Test + public void testCommentsInPattern() { + Pattern p = Pattern.compile("ab# this is a comment\ncd", Pattern.COMMENTS); + assertTrue(p.matcher("abcd").matches()); + } + + @Test + public void testSplitCharSequenceint() { + // splitting CharSequence which ends with pattern + // bug6193 + assertEquals(",,".split(",", 3).length, 3); + assertEquals(",,".split(",", 4).length, 3); + // bug6193 + // bug5391 + assertEquals(Pattern.compile("o").split("boo:and:foo", 5).length, 5); + assertEquals(Pattern.compile("b").split("ab", -1).length, 2); + // bug5391 + String s[]; + Pattern pat = Pattern.compile("x"); + s = pat.split("zxx:zzz:zxx", 10); + assertEquals(s.length, 5); + s = pat.split("zxx:zzz:zxx", 3); + assertEquals(s.length, 3); + s = pat.split("zxx:zzz:zxx", -1); + assertEquals(s.length, 5); + s = pat.split("zxx:zzz:zxx", 0); + assertEquals(s.length, 3); + // other splitting + // negative limit + pat = Pattern.compile("b"); + s = pat.split("abccbadfebb", -1); + assertEquals(s.length, 5); + s = pat.split("", -1); + assertEquals(s.length, 1); + pat = Pattern.compile(""); + s = pat.split("", -1); + assertEquals(s.length, 1); + s = pat.split("abccbadfe", -1); + assertEquals(s.length, 11); + // zero limit + pat = Pattern.compile("b"); + s = pat.split("abccbadfebb", 0); + assertEquals(s.length, 
3); + s = pat.split("", 0); + assertEquals(s.length, 1); + pat = Pattern.compile(""); + s = pat.split("", 0); + assertEquals(s.length, 1); + s = pat.split("abccbadfe", 0); + assertEquals(s.length, 10); + // positive limit + pat = Pattern.compile("b"); + s = pat.split("abccbadfebb", 12); + assertEquals(s.length, 5); + s = pat.split("", 6); + assertEquals(s.length, 1); + pat = Pattern.compile(""); + s = pat.split("", 11); + assertEquals(s.length, 1); + s = pat.split("abccbadfe", 15); + assertEquals(s.length, 11); + + pat = Pattern.compile("b"); + s = pat.split("abccbadfebb", 5); + assertEquals(s.length, 5); + s = pat.split("", 1); + assertEquals(s.length, 1); + pat = Pattern.compile(""); + s = pat.split("", 1); + assertEquals(s.length, 1); + s = pat.split("abccbadfe", 11); + assertEquals(s.length, 11); + + pat = Pattern.compile("b"); + s = pat.split("abccbadfebb", 3); + assertEquals(s.length, 3); + pat = Pattern.compile(""); + s = pat.split("abccbadfe", 5); + assertEquals(s.length, 5); + } + + @Test + public void testSplitCharSequence() { + String s[]; + Pattern pat = Pattern.compile("b"); + s = pat.split("abccbadfebb"); + assertEquals(s.length, 3); + s = pat.split(""); + assertEquals(s.length, 1); + pat = Pattern.compile(""); + s = pat.split(""); + assertEquals(s.length, 1); + s = pat.split("abccbadfe"); + assertEquals(s.length, 10); + // bug6544 + String s1 = ""; + String[] arr = s1.split(":"); + assertEquals(arr.length, 1); + // bug6544 + } + + public void testPattern() { + } + + @Test + public void testFlags() { + String baseString; + String testString; + Pattern pat; + Matcher mat; + + baseString = "((?i)|b)a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)a|b"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat 
= pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "c|(?i)a|b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|(?s)b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|(?-i)b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)a|(?-i)c|b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)a|(?-i)c|(?i)b"; + testString = "B"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)a|(?-i)b"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "((?i))a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "|(?i)|a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)((?s)a.)"; + testString = "A\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)((?-i)a)"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + baseString = "(?i)(?s:a.)"; + testString = "A\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)fgh(?s:aa)"; + testString = "fghAA"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?i)((?-i))a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = 
pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "abc(?i)d"; + testString = "ABCD"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "abcD"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "a(?i)a(?-i)a(?i)a(?-i)a"; + testString = "aAaAa"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "aAAAa"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + } + + @Test + public void testFlagsMethod() { + String baseString; + Pattern pat; + + /* + * These tests are for compatibility with RI only. Logically we have to + * return only flags specified during the compilation. For example + * pat.flags() == 0 when we compile Pattern pat = + * Pattern.compile("(?i)abc(?-i)"); but the whole expression is compiled + * in a case insensitive manner. So there is little sense to do calls to + * flags() now. + */ + baseString = "(?-i)"; + pat = Pattern.compile(baseString); + + baseString = "(?idmsux)abc(?-i)vg(?-dmu)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.DOTALL | Pattern.COMMENTS); + + baseString = "(?idmsux)abc|(?-i)vg|(?-dmu)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.DOTALL | Pattern.COMMENTS); + + baseString = "(?is)a((?x)b.)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + + baseString = "(?i)a((?-i))"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), Pattern.CASE_INSENSITIVE); + + baseString = "((?i)a)"; + pat = Pattern.compile(baseString); + assertEquals(pat.flags(), 0); + + pat = Pattern.compile("(?is)abc"); + assertEquals(pat.flags(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL); + } + + @Test + public void testCompileStringint() { + /* + * this tests are needed to verify that appropriate exceptions are + * thrown + */ + String pattern = 
"b)a"; + try { + Pattern.compile(pattern); + fail("Expected a PatternSyntaxException when compiling pattern: " + + pattern); + } catch (PatternSyntaxException e) { + // pass + } + pattern = "bcde)a"; + try { + Pattern.compile(pattern); + fail("Expected a PatternSyntaxException when compiling pattern: " + + pattern); + } catch (PatternSyntaxException e) { + // pass + } + pattern = "bbg())a"; + try { + Pattern.compile(pattern); + fail("Expected a PatternSyntaxException when compiling pattern: " + + pattern); + } catch (PatternSyntaxException e) { + // pass + } + + pattern = "cdb(?i))a"; + try { + Pattern.compile(pattern); + fail("Expected a PatternSyntaxException when compiling pattern: " + + pattern); + } catch (PatternSyntaxException e) { + // pass + } + + /* + * This pattern should compile - HARMONY-2127 + */ + pattern = "x(?c)y"; + Pattern.compile(pattern); + + /* + * this pattern doesn't match any string, but should be compiled anyway + */ + pattern = "(b\\1)a"; + Pattern.compile(pattern); + } + + @Test + public void testQuantCompileNeg() { + String[] patterns = { "5{,2}", "{5asd", "{hgdhg", "{5,hjkh", "{,5hdsh", + "{5,3shdfkjh}" }; + for (String element : patterns) { + try { + Pattern.compile(element); + fail("PatternSyntaxException was expected, but compilation succeeds"); + } catch (PatternSyntaxException pse) { + continue; + } + } + // Regression for HARMONY-1365 + String pattern = "(?![^\\\\G*?)(?![^|\\]\\070\\ne\\{\\t\\[\\053\\?\\\\\\x51\\a\\075\\0023-\\[&&[|\\022-\\xEA\\00-\\u41C2&&[^|a-\\xCC&&[^\\037\\uECB3\\u3D9A\\x31\\|\\[^\\016\\r\\{\\,\\uA29D\\034\\02[\\02-\\[|\\t\\056\\uF599\\x62\\e\\<\\032\\uF0AC\\0026\\0205Q\\|\\\\\\06\\0164[|\\057-\\u7A98&&[\\061-g|\\|\\0276\\n\\042\\011\\e\\xE8\\x64B\\04\\u6D0EDW^\\p{Lower}]]]]?)(?<=[^\\n\\\\\\t\\u8E13\\,\\0114\\u656E\\xA5\\]&&[\\03-\\026|\\uF39D\\01\\{i\\u3BC2\\u14FE]])(?<=[^|\\uAE62\\054H\\|\\}&&^\\p{Space}])(?sxx)(?<=[\\f\\006\\a\\r\\xB4]*+)|(?x-xd:^{5}+)()"; + assertNotNull(Pattern.compile(pattern)); + } + + 
@Test + public void testQuantCompilePos() { + String[] patterns = { "abc{2,}", "abc{5}" }; + for (String element : patterns) { + Pattern.compile(element); + } + } + + @Test + public void testQuantComposition() { + String pattern = "(a{1,3})aab"; + java.util.regex.Pattern pat = java.util.regex.Pattern.compile(pattern); + java.util.regex.Matcher mat = pat.matcher("aaab"); + mat.matches(); + mat.start(1); + mat.group(1); + } + + @Test + public void testMatches() { + String[][] posSeq = { + { "abb", "ababb", "abababbababb", "abababbababbabababbbbbabb" }, + { "213567", "12324567", "1234567", "213213567", + "21312312312567", "444444567" }, + { "abcdaab", "aab", "abaab", "cdaab", "acbdadcbaab" }, + { "213234567", "3458", "0987654", "7689546432", "0398576", + "98432", "5" }, + { + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" }, + { "ababbaAabababblice", "ababbaAliceababab", "ababbAabliceaaa", + "abbbAbbbliceaaa", "Alice" }, + { "a123", "bnxnvgds156", "for", "while", "if", "struct" }, + { "xy" }, { "xy" }, { "xcy" } + + }; + + for (int i = 0; i < testPatterns.length; i++) { + for (int j = 0; j < posSeq[i].length; j++) { + assertTrue("Incorrect match: " + testPatterns[i] + " vs " + + posSeq[i][j], Pattern.matches(testPatterns[i], + posSeq[i][j])); + } + } + } + + @Test + public void testTimeZoneIssue() { + Pattern p = Pattern.compile("GMT(\\+|\\-)(\\d+)(:(\\d+))?"); + Matcher m = p.matcher("GMT-9:45"); + assertTrue(m.matches()); + assertEquals("-", m.group(1)); + assertEquals("9", m.group(2)); + assertEquals(":45", m.group(3)); + assertEquals("45", m.group(4)); + } + + @Test + public void testCompileRanges() { + String[] correctTestPatterns = { "[^]*abb]*", "[^a-d[^m-p]]*abb", + "[a-d\\d]*abb", "[abc]*abb", "[a-e&&[de]]*abb", "[^abc]*abb", + "[a-e&&[^de]]*abb", "[a-z&&[^m-p]]*abb", "[a-d[m-p]]*abb", + "[a-zA-Z]*abb", "[+*?]*abb", "[^+*?]*abb" }; + + 
String[] inputSecuence = { "kkkk", "admpabb", "abcabcd124654abb", + "abcabccbacababb", "dededededededeedabb", "gfdhfghgdfghabb", + "accabacbcbaabb", "acbvfgtyabb", "adbcacdbmopabcoabb", + "jhfkjhaSDFGHJkdfhHNJMjkhfabb", "+*??+*abb", "sdfghjkabb" }; + + for (int i = 0; i < correctTestPatterns.length; i++) { + assertTrue("pattern: " + correctTestPatterns[i] + " input: " + + inputSecuence[i], Pattern.matches(correctTestPatterns[i], + inputSecuence[i])); + + } + + String[] wrongInputSecuence = { "]", "admpkk", "abcabcd124k654abb", + "abwcabccbacababb", "abababdeababdeabb", "abcabcacbacbabb", + "acdcbecbaabb", "acbotyabb", "adbcaecdbmopabcoabb", + "jhfkjhaSDFGHJk;dfhHNJMjkhfabb", "+*?a?+*abb", "sdf+ghjkabb" }; + + for (int i = 0; i < correctTestPatterns.length; i++) { + assertFalse("pattern: " + correctTestPatterns[i] + " input: " + + wrongInputSecuence[i], Pattern.matches( + correctTestPatterns[i], wrongInputSecuence[i])); + + } + } + + @Test + public void testRangesSpecialCases() { + String neg_patterns[] = { "[a-&&[b-c]]", "[a-\\w]", "[b-a]", "[]" }; + + for (String element : neg_patterns) { + try { + Pattern.compile(element); + fail("PatternSyntaxException was expected: " + element); + } catch (PatternSyntaxException pse) { + } + } + + String pos_patterns[] = { "[-]+", "----", "[a-]+", "a-a-a-a-aa--", + "[\\w-a]+", "123-2312--aaa-213", "[a-]]+", "-]]]]]]]]]]]]]]]" }; + + for (int i = 0; i < pos_patterns.length; i++) { + String pat = pos_patterns[i++]; + String inp = pos_patterns[i]; + assertTrue("pattern: " + pat + " input: " + inp, Pattern.matches( + pat, inp)); + } + } + + @Test + public void testZeroSymbols() { + assertTrue(Pattern.matches("[\0]*abb", "\0\0\0\0\0\0abb")); + } + + @Test + public void testEscapes() { + Pattern pat = Pattern.compile("\\Q{]()*?"); + Matcher mat = pat.matcher("{]()*?"); + + assertTrue(mat.matches()); + } + + @Test + public void testRegressions() { + // Bug 181 + Pattern.compile("[\\t-\\r]"); + + // HARMONY-4472 + 
Pattern.compile("a*.+"); + + // Bug187 + Pattern + .compile("|(?idmsux-idmsux)|(?idmsux-idmsux)|[^|\\[-\\0274|\\,-\\\\[^|W\\}\\nq\\x65\\002\\xFE\\05\\06\\00\\x66\\x47i\\,\\xF2\\=\\06\\u0EA4\\x9B\\x3C\\f\\|\\{\\xE5\\05\\r\\u944A\\xCA\\e|\\x19\\04\\x07\\04\\u607B\\023\\0073\\x91Tr\\0150\\x83]]?(?idmsux-idmsux:\\p{Alpha}{7}?)||(?<=[^\\uEC47\\01\\02\\u3421\\a\\f\\a\\013q\\035w\\e])(?<=\\p{Punct}{0,}?)(?=^\\p{Lower})(?!\\b{8,14})(?[\\x3E-\\]])|(?idmsux-idmsux:\\p{Punct})|(?[|\\n\\042\\uB09F\\06\\u0F2B\\uC96D\\x89\\uC166\\xAA|\\04-\\][^|\\a\\|\\rx\\04\\uA770\\n\\02\\t\\052\\056\\0274\\|\\=\\07\\e|\\00-\\x1D&&[^\\005\\uB15B\\uCDAC\\n\\x74\\0103\\0147\\uD91B\\n\\062G\\u9B4B\\077\\}\\0324&&[^\\0302\\,\\0221\\04\\u6D16\\04xy\\uD193\\[\\061\\06\\045\\x0F|\\e\\xBB\\f\\u1B52\\023\\u3AD2\\033\\007\\022\\}\\x66\\uA63FJ-\\0304]]]]{0,0})||(?^+)|(?![^|\\|\\nJ\\t\\<\\04E\\\\\\t\\01\\\\\\02\\|\\=\\}\\xF3\\uBEC2\\032K\\014\\uCC5F\\072q\\|\\0153\\xD9\\0322\\uC6C8[^\\t\\0342\\x34\\x91\\06\\{\\xF1\\a\\u1710\\?\\xE7\\uC106\\02pF\\<&&[^|\\]\\064\\u381D\\u50CF\\eO&&[^|\\06\\x2F\\04\\045\\032\\u8536W\\0377\\0017|\\x06\\uE5FA\\05\\xD4\\020\\04c\\xFC\\02H\\x0A\\r]]]]+?)(?idmsux-idmsux)|(?[\\{-\\0207|\\06-\\0276\\p{XDigit}])(?idmsux-idmsux:[^|\\x52\\0012\\]u\\xAD\\0051f\\0142\\\\l\\|\\050\\05\\f\\t\\u7B91\\r\\u7763\\{|h\\0104\\a\\f\\0234\\u2D4F&&^\\P{InGreek}]))"); + // HARMONY-5858 + Pattern.compile("\\u6211", Pattern.LITERAL); + } + + @Test + public void testOrphanQuantifiers() { + try { + Pattern.compile("+++++"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException pse) { // expected: a dangling quantifier must be rejected + } + } + + @Test + public void testOrphanQuantifiers2() { + try { + Pattern.compile("\\d+*"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException pse) { // expected: a quantifier applied to a quantifier must be rejected + } + } + + @Test + public void testBug197() { // table-driven split() check: vals holds (regex, limit, expected pieces) triples, all applied to "boo:and:foo" + Object[] vals = { ":", new Integer(2), + new String[] { "boo", "and:foo" }, ":", new Integer(5), + new String[] { "boo", "and", "foo" }, ":", new Integer(-2), + new String[] {
"boo", "and", "foo" }, ":", new Integer(3), + new String[] { "boo", "and", "foo" }, ":", new Integer(1), + new String[] { "boo:and:foo" }, "o", new Integer(5), + new String[] { "b", "", ":and:f", "", "" }, "o", + new Integer(4), new String[] { "b", "", ":and:f", "o" }, "o", + new Integer(-2), new String[] { "b", "", ":and:f", "", "" }, + "o", new Integer(0), new String[] { "b", "", ":and:f" } }; + + for (int i = 0; i < vals.length;) { // i advances by three per case, so bound on the full length (length / 3 ran only the first third of the table) + String[] res = Pattern.compile(vals[i++].toString()).split( + "boo:and:foo", ((Integer) vals[i++]).intValue()); + String[] expectedRes = (String[]) vals[i++]; + + assertEquals(expectedRes.length, res.length); + + for (int j = 0; j < expectedRes.length; j++) { + assertEquals(expectedRes[j], res[j]); + } + } + } + + @Test + public void testURIPatterns() { + String URI_REGEXP_STR = "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"; + String SCHEME_REGEXP_STR = "^[a-zA-Z]{1}[\\w+-.]+$"; + String REL_URI_REGEXP_STR = "^(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"; + String IPV6_REGEXP_STR = "^[0-9a-fA-F\\:\\.]+(\\%\\w+)?$"; + String IPV6_REGEXP_STR2 = "^\\[[0-9a-fA-F\\:\\.]+(\\%\\w+)?\\]$"; + String IPV4_REGEXP_STR = "^[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}$"; + String HOSTNAME_REGEXP_STR = "\\w+[\\w\\-\\.]*"; + + Pattern.compile(URI_REGEXP_STR); + Pattern.compile(REL_URI_REGEXP_STR); + Pattern.compile(SCHEME_REGEXP_STR); + Pattern.compile(IPV4_REGEXP_STR); + Pattern.compile(IPV6_REGEXP_STR); + Pattern.compile(IPV6_REGEXP_STR2); + Pattern.compile(HOSTNAME_REGEXP_STR); + } + + @Test + public void testFindBoundaryCases1() { + Pattern pat = Pattern.compile(".*\n"); + Matcher mat = pat.matcher("a\n"); + + mat.find(); + assertEquals("a\n", mat.group()); + + } + + @Test + public void testFindBoundaryCases2() { + Pattern pat = Pattern.compile(".*A"); + Matcher mat = pat.matcher("aAa"); + + mat.find(); + assertEquals("aA", mat.group()); + + } + + @Test + public void testFindBoundaryCases3() { + Pattern pat =
Pattern.compile(".*A"); + Matcher mat = pat.matcher("a\naA\n"); + + mat.find(); + assertEquals("aA", mat.group()); + + } + + @Test + public void testFindBoundaryCases4() { + Pattern pat = Pattern.compile("A.*"); + Matcher mat = pat.matcher("A\n"); + + mat.find(); + assertEquals("A", mat.group()); + + } + + @Test + public void testFindBoundaryCases5() { + Pattern pat = Pattern.compile(".*A.*"); + Matcher mat = pat.matcher("\nA\naaa\nA\naaAaa\naaaA\n"); + // Matcher mat = pat.matcher("\nA\n"); + String[] res = { "A", "A", "aaAaa", "aaaA" }; + int k = 0; + for (; mat.find(); k++) { + assertEquals(res[k], mat.group()); + } + } + + @Test + public void testFindBoundaryCases6() { + String[] res = { "", "a", "", "" }; + Pattern pat = Pattern.compile(".*"); + Matcher mat = pat.matcher("\na\n"); + int k = 0; + + for (; mat.find(); k++) { + assertEquals(res[k], mat.group()); + } + } + + public void _testFindBoundaryCases7() { + Pattern pat = Pattern.compile(".*"); + Matcher mat = pat.matcher("\na\n"); + + while (mat.find()) { + System.out.println(mat.group()); + System.out.flush(); + } + } + + @Test + public void testBackReferences() { + Pattern pat = Pattern.compile("(\\((\\w*):(.*):(\\2)\\))"); + Matcher mat = pat + .matcher("(start1: word :start1)(start2: word :start2)"); + int k = 1; + for (; mat.find(); k++) { + assertEquals("start" + k, mat.group(2)); + assertEquals(" word ", mat.group(3)); + assertEquals("start" + k, mat.group(4)); + + } + + assertEquals(3, k); + pat = Pattern.compile(".*(.)\\1"); + mat = pat.matcher("saa"); + assertTrue(mat.matches()); + } + + public void _testBackReferences1() { + Pattern pat = Pattern.compile("(\\((\\w*):(.*):(\\2)\\))"); + Matcher mat = pat + .matcher("(start1: word :start1)(start2: word :start2)"); + int k = 1; + for (; mat.find(); k++) { + System.out.println(mat.group(2)); + System.out.println(mat.group(3)); + System.out.println(mat.group(4)); + + } + + assertEquals(3, k); + } + + @Test + public void testNewLine() { + Pattern pat 
= Pattern.compile("(^$)*\n", Pattern.MULTILINE); + Matcher mat = pat.matcher("\r\n\n"); + int counter = 0; + while (mat.find()) { + counter++; + } + assertEquals(2, counter); + } + + @Test + public void testFindGreedy() { + Pattern pat = Pattern.compile(".*aaa", Pattern.DOTALL); + Matcher mat = pat.matcher("aaaa\naaa\naaaaaa"); + mat.matches(); + assertEquals(15, mat.end()); + } + + @Test + public void testSOLQuant() { + Pattern pat = Pattern.compile("$*", Pattern.MULTILINE); + Matcher mat = pat.matcher("\n\n"); + int counter = 0; + while (mat.find()) { + counter++; + } + + assertEquals(3, counter); + } + + @Test + public void testIllegalEscape() { + try { + Pattern.compile("\\y"); + fail("PatternSyntaxException expected"); + } catch (PatternSyntaxException pse) { + } + } + + @Test + public void testEmptyFamily() { + Pattern.compile("\\p{Lower}"); + } + + @Test + public void testNonCaptConstr() { + // Flags + Pattern pat = Pattern.compile("(?i)b*(?-i)a*"); + assertTrue(pat.matcher("bBbBaaaa").matches()); + assertFalse(pat.matcher("bBbBAaAa").matches()); + + // Non-capturing groups + pat = Pattern.compile("(?i:b*)a*"); + assertTrue(pat.matcher("bBbBaaaa").matches()); + assertFalse(pat.matcher("bBbBAaAa").matches()); + + pat = Pattern + // 1 2 3 4 5 6 7 8 9 10 11 + .compile("(?:-|(-?\\d+\\d\\d\\d))?(?:-|-(\\d\\d))?(?:-|-(\\d\\d))?(T)?(?:(\\d\\d):(\\d\\d):(\\d\\d)(\\.\\d+)?)?(?:(?:((?:\\+|\\-)\\d\\d):(\\d\\d))|(Z))?"); + Matcher mat = pat.matcher("-1234-21-31T41:51:61.789+71:81"); + assertTrue(mat.matches()); + assertEquals("-1234", mat.group(1)); + assertEquals("21", mat.group(2)); + assertEquals("31", mat.group(3)); + assertEquals("T", mat.group(4)); + assertEquals("41", mat.group(5)); + assertEquals("51", mat.group(6)); + assertEquals("61", mat.group(7)); + assertEquals(".789", mat.group(8)); + assertEquals("+71", mat.group(9)); + assertEquals("81", mat.group(10)); + + // positive lookahead + pat = Pattern.compile(".*\\.(?=log$).*$"); + 
assertTrue(pat.matcher("a.b.c.log").matches()); + assertFalse(pat.matcher("a.b.c.log.").matches()); + + // negative lookahead + pat = Pattern.compile(".*\\.(?!log$).*$"); + assertFalse(pat.matcher("abc.log").matches()); + assertTrue(pat.matcher("abc.logg").matches()); + + // positive lookbehind + pat = Pattern.compile(".*(?<=abc)\\.log$"); + assertFalse(pat.matcher("cde.log").matches()); + assertTrue(pat.matcher("abc.log").matches()); + + // negative lookbehind + pat = Pattern.compile(".*(?a*)abb"); + assertFalse(pat.matcher("aaabb").matches()); + pat = Pattern.compile("(?>a*)bb"); + assertTrue(pat.matcher("aaabb").matches()); + + pat = Pattern.compile("(?>a|aa)aabb"); + assertTrue(pat.matcher("aaabb").matches()); + pat = Pattern.compile("(?>aa|a)aabb"); + assertFalse(pat.matcher("aaabb").matches()); + + // quantifiers over look ahead + pat = Pattern.compile(".*(?<=abc)*\\.log$"); + assertTrue(pat.matcher("cde.log").matches()); + pat = Pattern.compile(".*(?<=abc)+\\.log$"); + assertFalse(pat.matcher("cde.log").matches()); + + } + + public void _testCorrectReplacementBackreferencedJointSet() { + Pattern.compile("ab(a)*\\1"); + Pattern.compile("abc(cd)fg"); + Pattern.compile("aba*cd"); + Pattern.compile("ab(a)*+cd"); + Pattern.compile("ab(a)*?cd"); + Pattern.compile("ab(a)+cd"); + Pattern.compile(".*(.)\\1"); + Pattern.compile("ab((a)|c|d)e"); + Pattern.compile("abc((a(b))cd)"); + Pattern.compile("ab(a)++cd"); + Pattern.compile("ab(a)?(c)d"); + Pattern.compile("ab(a)?+cd"); + Pattern.compile("ab(a)??cd"); + Pattern.compile("ab(a)??cd"); + Pattern.compile("ab(a){1,3}?(c)d"); + } + + @Test + public void testCompilePatternWithTerminatorMark() { + Pattern pat = Pattern.compile("a\u0000\u0000cd"); + Matcher mat = pat.matcher("a\u0000\u0000cd"); + assertTrue(mat.matches()); + } + + @Test + public void testAlternations() { + String baseString = "|a|bc"; + Pattern pat = Pattern.compile(baseString); + Matcher mat = pat.matcher(""); + + assertTrue(mat.matches()); + + 
baseString = "a||bc"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a|bc|"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a|b|"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a(|b|cd)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(b||cd)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(b|cd|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(b|c|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a(|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "|"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "a(?:|)e"; + pat = Pattern.compile(baseString); + mat = pat.matcher("ae"); + assertTrue(mat.matches()); + + baseString = "a||||bc"; + pat = Pattern.compile(baseString); + mat = pat.matcher(""); + assertTrue(mat.matches()); + + baseString = "(?i-is)|a"; + pat = Pattern.compile(baseString); + mat = pat.matcher("a"); + assertTrue(mat.matches()); + } + + @Test + public void testMatchWithGroups() { + String baseString = "jwkerhjwehrkwjehrkwjhrwkjehrjwkehrjkwhrkwehrkwhrkwrhwkhrwkjehr"; + String pattern = ".*(..).*\\1.*"; + assertTrue(Pattern.compile(pattern).matcher(baseString).matches()); + + baseString = "saa"; + pattern = ".*(.)\\1"; + assertTrue(Pattern.compile(pattern).matcher(baseString).matches()); + assertTrue(Pattern.compile(pattern).matcher(baseString).find()); + } + + @Test + public void testSplitEmptyCharSequence() { + String s1 = ""; + String[] arr = s1.split(":"); + assertEquals(arr.length, 
1); + } + + @Test + public void testSplitEndsWithPattern() { + assertEquals(",,".split(",", 3).length, 3); + assertEquals(",,".split(",", 4).length, 3); + + assertEquals(Pattern.compile("o").split("boo:and:foo", 5).length, 5); + assertEquals(Pattern.compile("b").split("ab", -1).length, 2); + } + + @Test + public void testCaseInsensitiveFlag() { + assertTrue(Pattern.matches("(?i-:AbC)", "ABC")); + } + + @Test + public void testEmptyGroups() { + Pattern pat = Pattern.compile("ab(?>)cda"); + Matcher mat = pat.matcher("abcda"); + assertTrue(mat.matches()); + + pat = Pattern.compile("ab()"); + mat = pat.matcher("ab"); + assertTrue(mat.matches()); + + pat = Pattern.compile("abc(?:)(..)"); + mat = pat.matcher("abcgf"); + assertTrue(mat.matches()); + } + + @Test + public void testCompileNonCaptGroup() { + boolean isCompiled = false; + + try { + Pattern.compile("(?:)", Pattern.CANON_EQ); + Pattern.compile("(?:)", Pattern.CANON_EQ | Pattern.DOTALL); + Pattern + .compile("(?:)", Pattern.CANON_EQ + | Pattern.CASE_INSENSITIVE); + Pattern.compile("(?:)", Pattern.CANON_EQ | Pattern.COMMENTS + | Pattern.UNIX_LINES); + isCompiled = true; + } catch (PatternSyntaxException e) { + System.out.println(e); + } + assertTrue(isCompiled); + } + + @Test + public void testEmbeddedFlags() { + String baseString = "(?i)((?s)a)"; + String testString = "A"; + Pattern pat = Pattern.compile(baseString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?x)(?i)(?s)(?d)a"; + testString = "A"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "(?x)(?i)(?s)(?d)a."; + testString = "a\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "abc(?x:(?i)(?s)(?d)a.)"; + testString = "abcA\n"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + baseString = "abc((?x)d)(?i)(?s)a"; + testString 
= "abcdA"; + pat = Pattern.compile(baseString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + @Test + public void testAltWithFlags() { + Pattern.compile("|(?i-xi)|()"); + } + + @Test + public void testRestoreFlagsAfterGroup() { + String baseString = "abc((?x)d) a"; + String testString = "abcd a"; + Pattern pat = Pattern.compile(baseString); + Matcher mat = pat.matcher(testString); + + assertTrue(mat.matches()); + } + + @Test + public void testCompileCharacterClass() { + // Regression for HARMONY-606, 696 + Pattern pattern = Pattern.compile("\\p{javaLowerCase}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaUpperCase}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaWhitespace}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaMirrored}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaDefined}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaDigit}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaIdentifierIgnorable}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaISOControl}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaJavaIdentifierPart}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaJavaIdentifierStart}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaLetter}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaLetterOrDigit}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaSpaceChar}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaTitleCase}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaUnicodeIdentifierPart}"); + assertNotNull(pattern); + + pattern = Pattern.compile("\\p{javaUnicodeIdentifierStart}"); + assertNotNull(pattern); + } + + @Test + public void testRangesWithSurrogatesSupplementary() { + String patString = "[abc\uD8D2]"; + String testString = "\uD8D2"; + 
Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "a"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "ef\uD8D2\uDD71gh"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "ef\uD8D2gh"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[abc\uD8D3&&[c\uD8D3]]"; + testString = "c"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "a"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "ef\uD8D3\uDD71gh"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "ef\uD8D3gh"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[abc\uD8D3\uDBEE\uDF0C&&[c\uD8D3\uDBEE\uDF0C]]"; + testString = "c"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDBEE\uDF0C"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "ef\uD8D3\uDD71gh"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "ef\uD8D3gh"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[abc\uDBFC]\uDDC2cd"; + testString = "\uDBFC\uDDC2cd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "a\uDDC2cd"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + @Test + public void testSequencesWithSurrogatesSupplementary() { + String patString = "abcd\uD8D3"; + String testString = "abcd\uD8D3\uDFFC"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "abcd\uD8D3abc"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "ab\uDBEFcd"; + testString = "ab\uDBEFcd"; + pat = Pattern.compile(patString); + mat 
= pat.matcher(testString); + assertTrue(mat.matches()); + + patString = "\uDFFCabcd"; + testString = "\uD8D3\uDFFCabcd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "abc\uDFFCabcdecd"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "\uD8D3\uDFFCabcd"; + testString = "abc\uD8D3\uD8D3\uDFFCabcd"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + } + + @Test + public void testPredefinedClassesWithSurrogatesSupplementary() { + String patString = "[123\\D]"; + String testString = "a"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "5"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "3"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + // low surrogate + testString = "\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + // high surrogate + testString = "\uDADA"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "\uDADA\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[123[^\\p{javaDigit}]]"; + testString = "a"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "5"; + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "3"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + // low surrogate + testString = "\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + // high surrogate + testString = "\uDADA"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "\uDADA\uDFC4"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + // surrogate characters + patString = "\\p{Cs}"; + testString = "\uD916\uDE27"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + + // swap low 
and high surrogates + testString = "\uDE27\uD916"; + mat = pat.matcher(testString); + assertTrue(mat.find()); + + patString = "[\uD916\uDE271\uD91623&&[^\\p{Cs}]]"; + testString = "1"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + + testString = "\uD916"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.find()); + + testString = "\uD916\uDE27"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.find()); + + // \uD9A0\uDE8E=\u7828E + // \u78281=\uD9A0\uDE81 + patString = "[a-\uD9A0\uDE8E]"; + testString = "\uD9A0\uDE81"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + @Test + public void testDotConstructionWithSurrogatesSupplementary() { + String patString = "."; + String testString = "\uD9A0\uDE81"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uD9A0"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\n"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + patString = ".*\uDE81"; + testString = "\uD9A0\uDE81\uD9A0\uDE81\uD9A0\uDE81"; + pat = Pattern.compile(patString); + mat = pat.matcher(testString); + assertFalse(mat.matches()); + + testString = "\uD9A0\uDE81\uD9A0\uDE81\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + patString = ".*"; + testString = "\uD9A0\uDE81\n\uD9A0\uDE81\uD9A0\n\uDE81"; + pat = Pattern.compile(patString, Pattern.DOTALL); + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + @Test + public void testQuantifiersWithSurrogatesSupplementary() { + String patString = "\uD9A0\uDE81*abc"; + String testString = "\uD9A0\uDE81\uD9A0\uDE81abc"; + Pattern pat = Pattern.compile(patString); + Matcher mat = 
pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "abc"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + } + + @Test + public void testAlternationsWithSurrogatesSupplementary() { + String patString = "\uDE81|\uD9A0\uDE81|\uD9A0"; + String testString = "\uD9A0"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uD9A0\uDE81"; + mat = pat.matcher(testString); + assertTrue(mat.matches()); + + testString = "\uDE81\uD9A0"; + mat = pat.matcher(testString); + assertFalse(mat.matches()); + } + + @Test + public void testGroupsWithSurrogatesSupplementary() { + + // this pattern matches nothing + String patString = "(\uD9A0)\uDE81"; + String testString = "\uD9A0\uDE81"; + Pattern pat = Pattern.compile(patString); + Matcher mat = pat.matcher(testString); + assertFalse(mat.matches()); + + patString = "(\uD9A0)"; + testString = "\uD9A0\uDE81"; + pat = Pattern.compile(patString, Pattern.DOTALL); + mat = pat.matcher(testString); + assertFalse(mat.find()); + } + + @Test + public void testUnicodeCategoryWithSurrogatesSupplementary() { + Pattern p = Pattern.compile("\\p{javaLowerCase}"); + Matcher matcher = p.matcher("\uD801\uDC28"); + assertTrue(matcher.find()); + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/ReplaceTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/ReplaceTest.java new file mode 100644 index 000000000..2f2e122e7 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/ReplaceTest.java @@ -0,0 +1,92 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.teavm.classlib.java.util.regex; + +import static org.junit.Assert.assertEquals; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.junit.Test; + +public class ReplaceTest { + + @Test + public void testSimpleReplace() throws PatternSyntaxException { + String target, pattern, repl; + + target = "foobarfobarfoofo1"; + pattern = "fo[^o]"; + repl = "xxx"; + + Pattern p = Pattern.compile(pattern); + Matcher m = p.matcher(target); + + assertEquals("foobarxxxarfoofo1", m.replaceFirst(repl)); + assertEquals("foobarxxxarfooxxx", m.replaceAll(repl)); + } + + @Test + public void testCaptureReplace() { + String target, pattern, repl, s; + Pattern p = null; + Matcher m; + + target = "[31]foo;bar[42];[99]xyz"; + pattern = "\\[([0-9]+)\\]([a-z]+)"; + repl = "$2[$1]"; + + p = Pattern.compile(pattern); + m = p.matcher(target); + s = m.replaceFirst(repl); + assertEquals("foo[31];bar[42];[99]xyz", s); + s = m.replaceAll(repl); + assertEquals("foo[31];bar[42];xyz[99]", s); + + target = "[31]foo(42)bar{63}zoo;[12]abc(34)def{56}ghi;{99}xyz[88]xyz(77)xyz;"; + pattern = "\\[([0-9]+)\\]([a-z]+)\\(([0-9]+)\\)([a-z]+)\\{([0-9]+)\\}([a-z]+)"; + repl = "[$5]$6($3)$4{$1}$2"; + p = Pattern.compile(pattern); + m = p.matcher(target); + s = m.replaceFirst(repl); + // System.out.println(s); + assertEquals( + 
"[63]zoo(42)bar{31}foo;[12]abc(34)def{56}ghi;{99}xyz[88]xyz(77)xyz;", + s); + s = m.replaceAll(repl); + // System.out.println(s); + assertEquals( + "[63]zoo(42)bar{31}foo;[56]ghi(34)def{12}abc;{99}xyz[88]xyz(77)xyz;", + s); + } + + @Test + public void testEscapeReplace() { + String target, pattern, repl, s; + + target = "foo'bar''foo"; + pattern = "'"; + repl = "\\'"; + s = target.replaceAll(pattern, repl); + assertEquals("foo'bar''foo", s); + repl = "\\\\'"; + s = target.replaceAll(pattern, repl); + assertEquals("foo\\'bar\\'\\'foo", s); + repl = "\\$3"; + s = target.replaceAll(pattern, repl); + assertEquals("foo$3bar$3$3foo", s); + } +} diff --git a/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/SplitTest.java b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/SplitTest.java new file mode 100644 index 000000000..48c534687 --- /dev/null +++ b/teavm-classlib/src/test/java/org/teavm/classlib/java/util/regex/SplitTest.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/

package org.teavm.classlib.java.util.regex;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.junit.Test;

/**
 * Tests for {@link Pattern#split(CharSequence)} and
 * {@link Pattern#split(CharSequence, int)}.
 *
 * <p>The cases cover single-character separators with positive, zero and
 * negative limits, the empty pattern, and an input that contains a
 * surrogate pair.
 */
public class SplitTest {

    /**
     * Asserts that {@code actual} holds exactly the {@code expected} tokens,
     * in order.
     *
     * <p>The lengths are compared first so that a wrong token count is
     * reported as such instead of surfacing as an index error. Expected
     * values are always passed first to {@code assertEquals} so that failure
     * messages label the two sides correctly.
     *
     * @param actual the array returned by {@code Pattern.split}
     * @param expected the tokens the split is required to produce
     */
    private static void assertTokens(String[] actual, String... expected) {
        assertEquals(expected.length, actual.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], actual[i]);
        }
    }

    /** Splits a path-like string on a separator that occurs several times. */
    @Test
    public void testSimple() {
        Pattern p = Pattern.compile("/");
        assertTokens(p.split("have/you/done/it/right"),
                "have", "you", "done", "it", "right");
    }

    /**
     * Exercises the {@code limit} argument (1, 2, 5, -2, 0 and the no-limit
     * overload) against separators that occur once and separators that occur
     * repeatedly, including at the end of the input.
     */
    @Test
    public void testSplit1() throws PatternSyntaxException {
        String input = "poodle zoo";

        // " " occurs once: limit 1 returns the input untouched, every other
        // limit yields the same two tokens.
        Pattern p = Pattern.compile(" ");
        assertTokens(p.split(input, 1), input);
        assertTokens(p.split(input, 2), "poodle", "zoo");
        assertTokens(p.split(input, 5), "poodle", "zoo");
        assertTokens(p.split(input, -2), "poodle", "zoo");
        assertTokens(p.split(input, 0), "poodle", "zoo");
        assertTokens(p.split(input), "poodle", "zoo");

        // "d" also occurs once, but in the middle of a word.
        p = Pattern.compile("d");
        assertTokens(p.split(input, 1), input);
        assertTokens(p.split(input, 2), "poo", "le zoo");
        assertTokens(p.split(input, 5), "poo", "le zoo");
        assertTokens(p.split(input, -2), "poo", "le zoo");
        assertTokens(p.split(input, 0), "poo", "le zoo");
        assertTokens(p.split(input), "poo", "le zoo");

        // "o" occurs four times, twice at the very end of the input, so the
        // limit decides whether trailing empty tokens are kept (5, -2) or
        // dropped (0 and the no-limit overload).
        p = Pattern.compile("o");
        assertTokens(p.split(input, 1), input);
        assertTokens(p.split(input, 2), "p", "odle zoo");
        assertTokens(p.split(input, 5), "p", "", "dle z", "", "");
        assertTokens(p.split(input, -2), "p", "", "dle z", "", "");
        assertTokens(p.split(input, 0), "p", "", "dle z");
        assertTokens(p.split(input), "p", "", "dle z");
    }

    /**
     * Splits on the empty pattern with a negative limit.
     *
     * <p>NOTE(review): the expectations include a leading empty token for the
     * zero-width match at index 0. JDK 8 and later document that such a match
     * never produces a leading empty substring - confirm which behaviour the
     * implementation under test is meant to follow.
     */
    @Test
    public void testSplit2() {
        Pattern p = Pattern.compile("");

        assertTokens(p.split("a", -1), "", "a", "");
        assertTokens(p.split("", -1), "");
        assertTokens(p.split("abcd", -1), "", "a", "b", "c", "d", "");
    }

    /**
     * Splits a string containing a surrogate pair on the empty pattern.
     *
     * <p>See http://www.unicode.org/reports/tr18/#Supplementary_Characters:
     * text should be treated as code points, not code units.
     *
     * <p>NOTE(review): the expected tokens below place the high and the low
     * surrogate in separate tokens, i.e. the split is asserted per UTF-16
     * code unit - confirm this is the intended expectation.
     */
    @Test
    public void testSplitSupplementaryWithEmptyString() {
        Pattern p = Pattern.compile("");

        assertTokens(p.split("a\ud869\uded6b", -1),
                "", "a", "\ud869", "\uded6", "b", "");
    }
}