1 /*
2 * Copyright (C) 2009 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package com.google.common.escape;
18
19 import static com.google.common.base.Preconditions.checkNotNull;
20
21 import com.google.common.annotations.Beta;
22 import com.google.common.annotations.GwtCompatible;
23
24 import java.util.HashMap;
25 import java.util.Map;
26
27 import javax.annotation.Nullable;
28
29 /**
30 * Static utility methods pertaining to {@link Escaper} instances.
31 *
32 * @author Sven Mawson
33 * @author David Beaumont
34 * @since 15.0
35 */
36 @Beta
37 @GwtCompatible
38 public final class Escapers {
39 private Escapers() {}
40
41 /**
42 * Returns an {@link Escaper} that does no escaping, passing all character
43 * data through unchanged.
44 */
45 public static Escaper nullEscaper() {
46 return NULL_ESCAPER;
47 }
48
49 // An Escaper that efficiently performs no escaping.
50 // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
51 private static final Escaper NULL_ESCAPER = new CharEscaper() {
52 @Override public String escape(String string) {
53 return checkNotNull(string);
54 }
55
56 @Override protected char[] escape(char c) {
57 // TODO: Fix tests not to call this directly and make it throw an error.
58 return null;
59 }
60 };
61
62 /**
63 * Returns a builder for creating simple, fast escapers. A builder instance
64 * can be reused and each escaper that is created will be a snapshot of the
65 * current builder state. Builders are not thread safe.
66 *
67 * <p>The initial state of the builder is such that:
68 * <ul>
69 * <li>There are no replacement mappings<li>
70 * <li>{@code safeMin == Character.MIN_VALUE}</li>
71 * <li>{@code safeMax == Character.MAX_VALUE}</li>
72 * <li>{@code unsafeReplacement == null}</li>
73 * </ul>
74 * <p>For performance reasons escapers created by this builder are not
75 * Unicode aware and will not validate the well-formedness of their input.
76 */
77 public static Builder builder() {
78 return new Builder();
79 }
80
81 /**
82 * A builder for simple, fast escapers.
83 *
84 * <p>Typically an escaper needs to deal with the escaping of high valued
85 * characters or code points. In these cases it is necessary to extend either
86 * {@link ArrayBasedCharEscaper} or {@link ArrayBasedUnicodeEscaper} to
87 * provide the desired behavior. However this builder is suitable for creating
88 * escapers that replace a relative small set of characters.
89 *
90 * @author David Beaumont
91 * @since 15.0
92 */
93 @Beta
94 public static final class Builder {
95 private final Map<Character, String> replacementMap =
96 new HashMap<Character, String>();
97 private char safeMin = Character.MIN_VALUE;
98 private char safeMax = Character.MAX_VALUE;
99 private String unsafeReplacement = null;
100
101 // The constructor is exposed via the builder() method above.
102 private Builder() {}
103
104 /**
105 * Sets the safe range of characters for the escaper. Characters in this
106 * range that have no explicit replacement are considered 'safe' and remain
107 * unescaped in the output. If {@code safeMax < safeMin} then the safe range
108 * is empty.
109 *
110 * @param safeMin the lowest 'safe' character
111 * @param safeMax the highest 'safe' character
112 * @return the builder instance
113 */
114 public Builder setSafeRange(char safeMin, char safeMax) {
115 this.safeMin = safeMin;
116 this.safeMax = safeMax;
117 return this;
118 }
119
120 /**
121 * Sets the replacement string for any characters outside the 'safe' range
122 * that have no explicit replacement. If {@code unsafeReplacement} is
123 * {@code null} then no replacement will occur, if it is {@code ""} then
124 * the unsafe characters are removed from the output.
125 *
126 * @param unsafeReplacement the string to replace unsafe chracters
127 * @return the builder instance
128 */
129 public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
130 this.unsafeReplacement = unsafeReplacement;
131 return this;
132 }
133
134 /**
135 * Adds a replacement string for the given input character. The specified
136 * character will be replaced by the given string whenever it occurs in the
137 * input, irrespective of whether it lies inside or outside the 'safe'
138 * range.
139 *
140 * @param c the character to be replaced
141 * @param replacement the string to replace the given character
142 * @return the builder instance
143 * @throws NullPointerException if {@code replacement} is null
144 */
145 public Builder addEscape(char c, String replacement) {
146 checkNotNull(replacement);
147 // This can replace an existing character (the builder is re-usable).
148 replacementMap.put(c, replacement);
149 return this;
150 }
151
152 /**
153 * Returns a new escaper based on the current state of the builder.
154 */
155 public Escaper build() {
156 return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
157 private final char[] replacementChars =
158 unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;
159 @Override protected char[] escapeUnsafe(char c) {
160 return replacementChars;
161 }
162 };
163 }
164 }
165
166 /**
167 * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance.
168 * If the escaper is already a UnicodeEscaper then it is simply returned,
169 * otherwise it is wrapped in a UnicodeEscaper.
170 *
171 * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires
172 * extra behavior with respect to the well-formedness of Unicode character
173 * sequences and will throw {@link IllegalArgumentException} when given bad
174 * input.
175 *
176 * @param escaper the instance to be wrapped
177 * @return a UnicodeEscaper with the same behavior as the given instance
178 * @throws NullPointerException if escaper is null
179 * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a
180 * CharEscaper
181 */
182 static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
183 checkNotNull(escaper);
184 if (escaper instanceof UnicodeEscaper) {
185 return (UnicodeEscaper) escaper;
186 } else if (escaper instanceof CharEscaper) {
187 return wrap((CharEscaper) escaper);
188 }
189 // In practice this shouldn't happen because it would be very odd not to
190 // extend either CharEscaper or UnicodeEscaper for non trivial cases.
191 throw new IllegalArgumentException("Cannot create a UnicodeEscaper from: " +
192 escaper.getClass().getName());
193 }
194
195 /**
196 * Returns a string that would replace the given character in the specified
197 * escaper, or {@code null} if no replacement should be made. This method is
198 * intended for use in tests through the {@code EscaperAsserts} class;
199 * production users of {@link CharEscaper} should limit themselves to its
200 * public interface.
201 *
202 * @param c the character to escape if necessary
203 * @return the replacement string, or {@code null} if no escaping was needed
204 */
205 public static String computeReplacement(CharEscaper escaper, char c) {
206 return stringOrNull(escaper.escape(c));
207 }
208
209 /**
210 * Returns a string that would replace the given character in the specified
211 * escaper, or {@code null} if no replacement should be made. This method is
212 * intended for use in tests through the {@code EscaperAsserts} class;
213 * production users of {@link UnicodeEscaper} should limit themselves to its
214 * public interface.
215 *
216 * @param cp the Unicode code point to escape if necessary
217 * @return the replacement string, or {@code null} if no escaping was needed
218 */
219 public static String computeReplacement(UnicodeEscaper escaper, int cp) {
220 return stringOrNull(escaper.escape(cp));
221 }
222
223 private static String stringOrNull(char[] in) {
224 return (in == null) ? null : new String(in);
225 }
226
227 /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
228 private static UnicodeEscaper wrap(final CharEscaper escaper) {
229 return new UnicodeEscaper() {
230 @Override protected char[] escape(int cp) {
231 // If a code point maps to a single character, just escape that.
232 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
233 return escaper.escape((char) cp);
234 }
235 // Convert the code point to a surrogate pair and escape them both.
236 // Note: This code path is horribly slow and typically allocates 4 new
237 // char[] each time it is invoked. However this avoids any
238 // synchronization issues and makes the escaper thread safe.
239 char[] surrogateChars = new char[2];
240 Character.toChars(cp, surrogateChars, 0);
241 char[] hiChars = escaper.escape(surrogateChars[0]);
242 char[] loChars = escaper.escape(surrogateChars[1]);
243
244 // If either hiChars or lowChars are non-null, the CharEscaper is trying
245 // to escape the characters of a surrogate pair separately. This is
246 // uncommon and applies only to escapers that assume UCS-2 rather than
247 // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
248 if (hiChars == null && loChars == null) {
249 // We expect this to be the common code path for most escapers.
250 return null;
251 }
252 // Combine the characters and/or escaped sequences into a single array.
253 int hiCount = hiChars != null ? hiChars.length : 1;
254 int loCount = loChars != null ? loChars.length : 1;
255 char[] output = new char[hiCount + loCount];
256 if (hiChars != null) {
257 // TODO: Is this faster than System.arraycopy() for small arrays?
258 for (int n = 0; n < hiChars.length; ++n) {
259 output[n] = hiChars[n];
260 }
261 } else {
262 output[0] = surrogateChars[0];
263 }
264 if (loChars != null) {
265 for (int n = 0; n < loChars.length; ++n) {
266 output[hiCount + n] = loChars[n];
267 }
268 } else {
269 output[hiCount] = surrogateChars[1];
270 }
271 return output;
272 }
273 };
274 }
275 }