284 lines
9.8 KiB
Java
284 lines
9.8 KiB
Java
/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
package com.google.common.escape;
|
|
|
|
import static com.google.common.base.Preconditions.checkNotNull;
|
|
|
|
import java.util.HashMap;
|
|
import java.util.Map;
|
|
|
|
import javax.annotation.Nullable;
|
|
|
|
import com.google.common.annotations.Beta;
|
|
import com.google.common.annotations.GwtCompatible;
|
|
|
|
/**
|
|
* Static utility methods pertaining to {@link Escaper} instances.
|
|
*
|
|
* @author Sven Mawson
|
|
* @author David Beaumont
|
|
* @since 15.0
|
|
*/
|
|
@Beta
|
|
@GwtCompatible
|
|
public final class Escapers {
|
|
private Escapers() {
|
|
}
|
|
|
|
/**
|
|
* Returns an {@link Escaper} that does no escaping, passing all character data
|
|
* through unchanged.
|
|
*/
|
|
public static Escaper nullEscaper() {
|
|
return NULL_ESCAPER;
|
|
}
|
|
|
|
// An Escaper that efficiently performs no escaping.
|
|
// Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
|
|
private static final Escaper NULL_ESCAPER = new CharEscaper() {
|
|
@Override
|
|
public String escape(String string) {
|
|
return checkNotNull(string);
|
|
}
|
|
|
|
@Override
|
|
protected char[] escape(char c) {
|
|
// TODO: Fix tests not to call this directly and make it throw an error.
|
|
return null;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Returns a builder for creating simple, fast escapers. A builder instance can
|
|
* be reused and each escaper that is created will be a snapshot of the current
|
|
* builder state. Builders are not thread safe.
|
|
*
|
|
* <p>
|
|
* The initial state of the builder is such that:
|
|
* <ul>
|
|
* <li>There are no replacement mappings
|
|
* <li>
|
|
* <li>{@code safeMin == Character.MIN_VALUE}</li>
|
|
* <li>{@code safeMax == Character.MAX_VALUE}</li>
|
|
* <li>{@code unsafeReplacement == null}</li>
|
|
* </ul>
|
|
* <p>
|
|
* For performance reasons escapers created by this builder are not Unicode
|
|
* aware and will not validate the well-formedness of their input.
|
|
*/
|
|
public static Builder builder() {
|
|
return new Builder();
|
|
}
|
|
|
|
/**
|
|
* A builder for simple, fast escapers.
|
|
*
|
|
* <p>
|
|
* Typically an escaper needs to deal with the escaping of high valued
|
|
* characters or code points. In these cases it is necessary to extend either
|
|
* {@link ArrayBasedCharEscaper} or {@link ArrayBasedUnicodeEscaper} to provide
|
|
* the desired behavior. However this builder is suitable for creating escapers
|
|
* that replace a relative small set of characters.
|
|
*
|
|
* @author David Beaumont
|
|
* @since 15.0
|
|
*/
|
|
@Beta
|
|
public static final class Builder {
|
|
private final Map<Character, String> replacementMap = new HashMap<Character, String>();
|
|
private char safeMin = Character.MIN_VALUE;
|
|
private char safeMax = Character.MAX_VALUE;
|
|
private String unsafeReplacement = null;
|
|
|
|
// The constructor is exposed via the builder() method above.
|
|
private Builder() {
|
|
}
|
|
|
|
/**
|
|
* Sets the safe range of characters for the escaper. Characters in this range
|
|
* that have no explicit replacement are considered 'safe' and remain unescaped
|
|
* in the output. If {@code safeMax < safeMin} then the safe range is empty.
|
|
*
|
|
* @param safeMin the lowest 'safe' character
|
|
* @param safeMax the highest 'safe' character
|
|
* @return the builder instance
|
|
*/
|
|
public Builder setSafeRange(char safeMin, char safeMax) {
|
|
this.safeMin = safeMin;
|
|
this.safeMax = safeMax;
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Sets the replacement string for any characters outside the 'safe' range that
|
|
* have no explicit replacement. If {@code unsafeReplacement} is {@code null}
|
|
* then no replacement will occur, if it is {@code ""} then the unsafe
|
|
* characters are removed from the output.
|
|
*
|
|
* @param unsafeReplacement the string to replace unsafe chracters
|
|
* @return the builder instance
|
|
*/
|
|
public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
|
|
this.unsafeReplacement = unsafeReplacement;
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Adds a replacement string for the given input character. The specified
|
|
* character will be replaced by the given string whenever it occurs in the
|
|
* input, irrespective of whether it lies inside or outside the 'safe' range.
|
|
*
|
|
* @param c the character to be replaced
|
|
* @param replacement the string to replace the given character
|
|
* @return the builder instance
|
|
* @throws NullPointerException if {@code replacement} is null
|
|
*/
|
|
public Builder addEscape(char c, String replacement) {
|
|
checkNotNull(replacement);
|
|
// This can replace an existing character (the builder is re-usable).
|
|
replacementMap.put(c, replacement);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Returns a new escaper based on the current state of the builder.
|
|
*/
|
|
public Escaper build() {
|
|
return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
|
|
private final char[] replacementChars = unsafeReplacement != null ? unsafeReplacement.toCharArray()
|
|
: null;
|
|
|
|
@Override
|
|
protected char[] escapeUnsafe(char c) {
|
|
return replacementChars;
|
|
}
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns a {@link UnicodeEscaper} equivalent to the given escaper instance. If
|
|
* the escaper is already a UnicodeEscaper then it is simply returned, otherwise
|
|
* it is wrapped in a UnicodeEscaper.
|
|
*
|
|
* <p>
|
|
* When a {@link CharEscaper} escaper is wrapped by this method it acquires
|
|
* extra behavior with respect to the well-formedness of Unicode character
|
|
* sequences and will throw {@link IllegalArgumentException} when given bad
|
|
* input.
|
|
*
|
|
* @param escaper the instance to be wrapped
|
|
* @return a UnicodeEscaper with the same behavior as the given instance
|
|
* @throws NullPointerException if escaper is null
|
|
* @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a
|
|
* CharEscaper
|
|
*/
|
|
static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
|
|
checkNotNull(escaper);
|
|
if (escaper instanceof UnicodeEscaper) {
|
|
return (UnicodeEscaper) escaper;
|
|
} else if (escaper instanceof CharEscaper) {
|
|
return wrap((CharEscaper) escaper);
|
|
}
|
|
// In practice this shouldn't happen because it would be very odd not to
|
|
// extend either CharEscaper or UnicodeEscaper for non trivial cases.
|
|
throw new IllegalArgumentException("Cannot create a UnicodeEscaper from: " + escaper.getClass().getName());
|
|
}
|
|
|
|
/**
|
|
* Returns a string that would replace the given character in the specified
|
|
* escaper, or {@code null} if no replacement should be made. This method is
|
|
* intended for use in tests through the {@code EscaperAsserts} class;
|
|
* production users of {@link CharEscaper} should limit themselves to its public
|
|
* interface.
|
|
*
|
|
* @param c the character to escape if necessary
|
|
* @return the replacement string, or {@code null} if no escaping was needed
|
|
*/
|
|
public static String computeReplacement(CharEscaper escaper, char c) {
|
|
return stringOrNull(escaper.escape(c));
|
|
}
|
|
|
|
/**
|
|
* Returns a string that would replace the given character in the specified
|
|
* escaper, or {@code null} if no replacement should be made. This method is
|
|
* intended for use in tests through the {@code EscaperAsserts} class;
|
|
* production users of {@link UnicodeEscaper} should limit themselves to its
|
|
* public interface.
|
|
*
|
|
* @param cp the Unicode code point to escape if necessary
|
|
* @return the replacement string, or {@code null} if no escaping was needed
|
|
*/
|
|
public static String computeReplacement(UnicodeEscaper escaper, int cp) {
|
|
return stringOrNull(escaper.escape(cp));
|
|
}
|
|
|
|
private static String stringOrNull(char[] in) {
|
|
return (in == null) ? null : new String(in);
|
|
}
|
|
|
|
/** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
|
|
private static UnicodeEscaper wrap(final CharEscaper escaper) {
|
|
return new UnicodeEscaper() {
|
|
@Override
|
|
protected char[] escape(int cp) {
|
|
// If a code point maps to a single character, just escape that.
|
|
if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
|
|
return escaper.escape((char) cp);
|
|
}
|
|
// Convert the code point to a surrogate pair and escape them both.
|
|
// Note: This code path is horribly slow and typically allocates 4 new
|
|
// char[] each time it is invoked. However this avoids any
|
|
// synchronization issues and makes the escaper thread safe.
|
|
char[] surrogateChars = new char[2];
|
|
Character.toChars(cp, surrogateChars, 0);
|
|
char[] hiChars = escaper.escape(surrogateChars[0]);
|
|
char[] loChars = escaper.escape(surrogateChars[1]);
|
|
|
|
// If either hiChars or lowChars are non-null, the CharEscaper is trying
|
|
// to escape the characters of a surrogate pair separately. This is
|
|
// uncommon and applies only to escapers that assume UCS-2 rather than
|
|
// UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
|
|
if (hiChars == null && loChars == null) {
|
|
// We expect this to be the common code path for most escapers.
|
|
return null;
|
|
}
|
|
// Combine the characters and/or escaped sequences into a single array.
|
|
int hiCount = hiChars != null ? hiChars.length : 1;
|
|
int loCount = loChars != null ? loChars.length : 1;
|
|
char[] output = new char[hiCount + loCount];
|
|
if (hiChars != null) {
|
|
// TODO: Is this faster than System.arraycopy() for small arrays?
|
|
for (int n = 0; n < hiChars.length; ++n) {
|
|
output[n] = hiChars[n];
|
|
}
|
|
} else {
|
|
output[0] = surrogateChars[0];
|
|
}
|
|
if (loChars != null) {
|
|
for (int n = 0; n < loChars.length; ++n) {
|
|
output[hiCount + n] = loChars[n];
|
|
}
|
|
} else {
|
|
output[hiCount] = surrogateChars[1];
|
|
}
|
|
return output;
|
|
}
|
|
};
|
|
}
|
|
}
|