View Javadoc
1   /*
2    * Copyright (C) 2008 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.escape;
18  
19  import com.google.common.annotations.Beta;
20  import com.google.common.annotations.GwtCompatible;
21  import com.google.common.base.Function;
22  
23  /**
24   * An object that converts literal text into a format safe for inclusion in a particular context
25   * (such as an XML document). Typically (but not always), the inverse process of "unescaping" the
26   * text is performed automatically by the relevant parser.
27   *
28   * <p>For example, an XML escaper would convert the literal string {@code "Foo<Bar>"} into {@code
29   * "Foo&lt;Bar&gt;"} to prevent {@code "<Bar>"} from being confused with an XML tag. When the
30   * resulting XML document is parsed, the parser API will return this text as the original literal
31   * string {@code "Foo<Bar>"}.
32   *
33   * <p>An {@code Escaper} instance is required to be stateless, and safe when used concurrently by
34   * multiple threads.
35   *
36   * <p>Because, in general, escaping operates on the code points of a string and not on its
37   * individual {@code char} values, it is not safe to assume that {@code escape(s)} is equivalent to
38   * {@code escape(s.substring(0, n)) + escape(s.substring(n))} for arbitrary {@code n}. This is
39   * because of the possibility of splitting a surrogate pair. The only case in which it is safe to
40   * escape strings and concatenate the results is if you can rule out this possibility, either by
41   * splitting an existing long string into short strings adaptively around {@linkplain
42   * Character#isHighSurrogate surrogate} {@linkplain Character#isLowSurrogate pairs}, or by starting
43   * with short strings already known to be free of unpaired surrogates.
44   *
45   * <p>The two primary implementations of this interface are {@link CharEscaper} and {@link
46   * UnicodeEscaper}. They are heavily optimized for performance and greatly simplify the task of
47   * implementing new escapers. It is strongly recommended that when implementing a new escaper you
48   * extend one of these classes. If you find that you are unable to achieve the desired behavior
49   * using either of these classes, please contact the Java libraries team for advice.
50   *
51   * <p>Several popular escapers are defined as constants in classes like {@link
52   * com.google.common.html.HtmlEscapers}, {@link com.google.common.xml.XmlEscapers}, and {@link
53   * SourceCodeEscapers}. To create your own escapers, use {@link CharEscaperBuilder}, or extend
54   * {@code CharEscaper} or {@code UnicodeEscaper}.
55   *
56   * @author David Beaumont
57   * @since 15.0
58   */
59  @Beta
60  @GwtCompatible
61  public abstract class Escaper {
62    // TODO(user): evaluate custom implementations, considering package private constructor.
63    /** Constructor for use by subclasses. */
64    protected Escaper() {}
65  
66    /**
67     * Returns the escaped form of a given literal string.
68     *
69     * <p>Note that this method may treat input characters differently depending on the specific
70     * escaper implementation.
71     *
72     * <ul>
73     * <li>{@link UnicodeEscaper} handles <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>
74     *     correctly, including surrogate character pairs. If the input is badly formed the escaper
75     *     should throw {@link IllegalArgumentException}.
76     * <li>{@link CharEscaper} handles Java characters independently and does not verify the input for
77     *     well formed characters. A {@code CharEscaper} should not be used in situations where input
78     *     is not guaranteed to be restricted to the Basic Multilingual Plane (BMP).
79     * </ul>
80     *
81     * @param string the literal string to be escaped
82     * @return the escaped form of {@code string}
83     * @throws NullPointerException if {@code string} is null
84     * @throws IllegalArgumentException if {@code string} contains badly formed UTF-16 or cannot be
85     *         escaped for any other reason
86     */
87    public abstract String escape(String string);
88  
89    private final Function<String, String> asFunction =
90        new Function<String, String>() {
91          @Override
92          public String apply(String from) {
93            return escape(from);
94          }
95        };
96  
97    /**
98     * Returns a {@link Function} that invokes {@link #escape(String)} on this escaper.
99     */
100   public final Function<String, String> asFunction() {
101     return asFunction;
102   }
103 }