View Javadoc
1   /*
2    * Copyright (C) 2008 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.base;
18  
19  import static com.google.common.base.Preconditions.checkArgument;
20  import static com.google.common.base.Preconditions.checkNotNull;
21  
22  import com.google.common.annotations.Beta;
23  import com.google.common.annotations.GwtCompatible;
24  import com.google.common.annotations.GwtIncompatible;
25  
26  import java.util.Arrays;
27  import java.util.BitSet;
28  
29  import javax.annotation.CheckReturnValue;
30  
31  /**
32   * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
33   * for any {@link Object}. Also offers basic text processing methods based on this function.
34   * Implementations are strongly encouraged to be side-effect-free and immutable.
35   *
36   * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
37   * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
38   *
39   * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
40   * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
41   * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
42   * treats these just as two separate characters.
43   *
44   * <p>Example usages: <pre>
45   *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
46   *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
47   *
48   * <p>See the Guava User Guide article on <a href=
49   * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
50   * {@code CharMatcher}</a>.
51   *
52   * @author Kevin Bourrillion
53   * @since 1.0
54   */
55  @Beta // Possibly change from chars to code points; decide constants vs. methods
56  @GwtCompatible(emulated = true)
57  public abstract class CharMatcher implements Predicate<Character> {
58  
59    // Constants
60    /**
61     * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
62     * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
63     * discussion of that term.
64     *
65     * @since 2.0
66     */
67    public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() {
68      @Override
69      public boolean matches(char c) {
70        switch (c) {
71          case '\t':
72          case '\n':
73          case '\013':
74          case '\f':
75          case '\r':
76          case ' ':
77          case '\u0085':
78          case '\u1680':
79          case '\u2028':
80          case '\u2029':
81          case '\u205f':
82          case '\u3000':
83            return true;
84          case '\u2007':
85            return false;
86          default:
87            return c >= '\u2000' && c <= '\u200a';
88        }
89      }
90  
91      @Override
92      public String toString() {
93        return "CharMatcher.BREAKING_WHITESPACE";
94      }
95    };
96  
97    /**
98     * Determines whether a character is ASCII, meaning that its code point is less than 128.
99     */
100   public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
101 
102   private static class RangesMatcher extends CharMatcher {
103     private final char[] rangeStarts;
104     private final char[] rangeEnds;
105 
106     RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
107       super(description);
108       this.rangeStarts = rangeStarts;
109       this.rangeEnds = rangeEnds;
110       checkArgument(rangeStarts.length == rangeEnds.length);
111       for (int i = 0; i < rangeStarts.length; i++) {
112         checkArgument(rangeStarts[i] <= rangeEnds[i]);
113         if (i + 1 < rangeStarts.length) {
114           checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
115         }
116       }
117     }
118 
119     @Override
120     public boolean matches(char c) {
121       int index = Arrays.binarySearch(rangeStarts, c);
122       if (index >= 0) {
123         return true;
124       } else {
125         index = ~index - 1;
126         return index >= 0 && c <= rangeEnds[index];
127       }
128     }
129   }
130 
131   // Must be in ascending order.
132   private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6"
133       + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0"
134       + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
135 
136   private static final String NINES;
137   static {
138     StringBuilder builder = new StringBuilder(ZEROES.length());
139     for (int i = 0; i < ZEROES.length(); i++) {
140       builder.append((char) (ZEROES.charAt(i) + 9));
141     }
142     NINES = builder.toString();
143   }
144 
145   /**
146    * Determines whether a character is a digit according to
147    * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
148    * If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
149    */
150   public static final CharMatcher DIGIT = new RangesMatcher(
151       "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray());
152 
153   /**
154    * Determines whether a character is a digit according to {@linkplain Character#isDigit(char)
155    * Java's definition}. If you only care to match ASCII digits, you can use {@code
156    * inRange('0', '9')}.
157    */
158   public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
159     @Override public boolean matches(char c) {
160       return Character.isDigit(c);
161     }
162   };
163 
164   /**
165    * Determines whether a character is a letter according to {@linkplain Character#isLetter(char)
166    * Java's definition}. If you only care to match letters of the Latin alphabet, you can use {@code
167    * inRange('a', 'z').or(inRange('A', 'Z'))}.
168    */
169   public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
170     @Override public boolean matches(char c) {
171       return Character.isLetter(c);
172     }
173   };
174 
175   /**
176    * Determines whether a character is a letter or digit according to {@linkplain
177    * Character#isLetterOrDigit(char) Java's definition}.
178    */
179   public static final CharMatcher JAVA_LETTER_OR_DIGIT =
180       new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
181     @Override public boolean matches(char c) {
182       return Character.isLetterOrDigit(c);
183     }
184   };
185 
186   /**
187    * Determines whether a character is upper case according to {@linkplain
188    * Character#isUpperCase(char) Java's definition}.
189    */
190   public static final CharMatcher JAVA_UPPER_CASE =
191       new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
192     @Override public boolean matches(char c) {
193       return Character.isUpperCase(c);
194     }
195   };
196 
197   /**
198    * Determines whether a character is lower case according to {@linkplain
199    * Character#isLowerCase(char) Java's definition}.
200    */
201   public static final CharMatcher JAVA_LOWER_CASE =
202       new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
203     @Override public boolean matches(char c) {
204       return Character.isLowerCase(c);
205     }
206   };
207 
208   /**
209    * Determines whether a character is an ISO control character as specified by {@link
210    * Character#isISOControl(char)}.
211    */
212   public static final CharMatcher JAVA_ISO_CONTROL =
213       inRange('\u0000', '\u001f')
214       .or(inRange('\u007f', '\u009f'))
215       .withToString("CharMatcher.JAVA_ISO_CONTROL");
216 
217   /**
218    * Determines whether a character is invisible; that is, if its Unicode category is any of
219    * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
220    * PRIVATE_USE according to ICU4J.
221    */
222   public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", (
223       "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u2066\u2067\u2068"
224       + "\u2069\u206a\u3000\ud800\ufeff\ufff9\ufffa").toCharArray(), (
225       "\u0020\u00a0\u00ad\u0604\u061c\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u2066\u2067\u2068"
226       + "\u2069\u206f\u3000\uf8ff\ufeff\ufff9\ufffb").toCharArray());
227 
228   private static String showCharacter(char c) {
229     String hex = "0123456789ABCDEF";
230     char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
231     for (int i = 0; i < 4; i++) {
232       tmp[5 - i] = hex.charAt(c & 0xF);
233       c >>= 4;
234     }
235     return String.copyValueOf(tmp);
236 
237   }
238 
239   /**
240    * Determines whether a character is single-width (not double-width). When in doubt, this matcher
241    * errs on the side of returning {@code false} (that is, it tends to assume a character is
242    * double-width).
243    *
244    * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
245    * date.
246    */
247   public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH",
248       "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
249       "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
250 
251   /** Matches any character. */
252   public static final CharMatcher ANY =
253       new FastMatcher("CharMatcher.ANY") {
254         @Override public boolean matches(char c) {
255           return true;
256         }
257 
258         @Override public int indexIn(CharSequence sequence) {
259           return (sequence.length() == 0) ? -1 : 0;
260         }
261 
262         @Override public int indexIn(CharSequence sequence, int start) {
263           int length = sequence.length();
264           Preconditions.checkPositionIndex(start, length);
265           return (start == length) ? -1 : start;
266         }
267 
268         @Override public int lastIndexIn(CharSequence sequence) {
269           return sequence.length() - 1;
270         }
271 
272         @Override public boolean matchesAllOf(CharSequence sequence) {
273           checkNotNull(sequence);
274           return true;
275         }
276 
277         @Override public boolean matchesNoneOf(CharSequence sequence) {
278           return sequence.length() == 0;
279         }
280 
281         @Override public String removeFrom(CharSequence sequence) {
282           checkNotNull(sequence);
283           return "";
284         }
285 
286         @Override public String replaceFrom(CharSequence sequence, char replacement) {
287           char[] array = new char[sequence.length()];
288           Arrays.fill(array, replacement);
289           return new String(array);
290         }
291 
292         @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
293           StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
294           for (int i = 0; i < sequence.length(); i++) {
295             retval.append(replacement);
296           }
297           return retval.toString();
298         }
299 
300         @Override public String collapseFrom(CharSequence sequence, char replacement) {
301           return (sequence.length() == 0) ? "" : String.valueOf(replacement);
302         }
303 
304         @Override public String trimFrom(CharSequence sequence) {
305           checkNotNull(sequence);
306           return "";
307         }
308 
309         @Override public int countIn(CharSequence sequence) {
310           return sequence.length();
311         }
312 
313         @Override public CharMatcher and(CharMatcher other) {
314           return checkNotNull(other);
315         }
316 
317         @Override public CharMatcher or(CharMatcher other) {
318           checkNotNull(other);
319           return this;
320         }
321 
322         @Override public CharMatcher negate() {
323           return NONE;
324         }
325       };
326 
327   /** Matches no characters. */
328   public static final CharMatcher NONE =
329       new FastMatcher("CharMatcher.NONE") {
330         @Override public boolean matches(char c) {
331           return false;
332         }
333 
334         @Override public int indexIn(CharSequence sequence) {
335           checkNotNull(sequence);
336           return -1;
337         }
338 
339         @Override public int indexIn(CharSequence sequence, int start) {
340           int length = sequence.length();
341           Preconditions.checkPositionIndex(start, length);
342           return -1;
343         }
344 
345         @Override public int lastIndexIn(CharSequence sequence) {
346           checkNotNull(sequence);
347           return -1;
348         }
349 
350         @Override public boolean matchesAllOf(CharSequence sequence) {
351           return sequence.length() == 0;
352         }
353 
354         @Override public boolean matchesNoneOf(CharSequence sequence) {
355           checkNotNull(sequence);
356           return true;
357         }
358 
359         @Override public String removeFrom(CharSequence sequence) {
360           return sequence.toString();
361         }
362 
363         @Override public String replaceFrom(CharSequence sequence, char replacement) {
364           return sequence.toString();
365         }
366 
367         @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
368           checkNotNull(replacement);
369           return sequence.toString();
370         }
371 
372         @Override public String collapseFrom(CharSequence sequence, char replacement) {
373           return sequence.toString();
374         }
375 
376         @Override public String trimFrom(CharSequence sequence) {
377           return sequence.toString();
378         }
379 
380         @Override
381         public String trimLeadingFrom(CharSequence sequence) {
382           return sequence.toString();
383         }
384 
385         @Override
386         public String trimTrailingFrom(CharSequence sequence) {
387           return sequence.toString();
388         }
389 
390         @Override public int countIn(CharSequence sequence) {
391           checkNotNull(sequence);
392           return 0;
393         }
394 
395         @Override public CharMatcher and(CharMatcher other) {
396           checkNotNull(other);
397           return this;
398         }
399 
400         @Override public CharMatcher or(CharMatcher other) {
401           return checkNotNull(other);
402         }
403 
404         @Override public CharMatcher negate() {
405           return ANY;
406         }
407       };
408 
409   // Static factories
410 
411   /**
412    * Returns a {@code char} matcher that matches only one specified character.
413    */
414   public static CharMatcher is(final char match) {
415     String description = "CharMatcher.is('" + showCharacter(match) + "')";
416     return new FastMatcher(description) {
417       @Override public boolean matches(char c) {
418         return c == match;
419       }
420 
421       @Override public String replaceFrom(CharSequence sequence, char replacement) {
422         return sequence.toString().replace(match, replacement);
423       }
424 
425       @Override public CharMatcher and(CharMatcher other) {
426         return other.matches(match) ? this : NONE;
427       }
428 
429       @Override public CharMatcher or(CharMatcher other) {
430         return other.matches(match) ? other : super.or(other);
431       }
432 
433       @Override public CharMatcher negate() {
434         return isNot(match);
435       }
436 
437       @GwtIncompatible("java.util.BitSet")
438       @Override
439       void setBits(BitSet table) {
440         table.set(match);
441       }
442     };
443   }
444 
445   /**
446    * Returns a {@code char} matcher that matches any character except the one specified.
447    *
448    * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
449    */
450   public static CharMatcher isNot(final char match) {
451     String description = "CharMatcher.isNot('" + showCharacter(match) + "')";
452     return new FastMatcher(description) {
453       @Override public boolean matches(char c) {
454         return c != match;
455       }
456 
457       @Override public CharMatcher and(CharMatcher other) {
458         return other.matches(match) ? super.and(other) : other;
459       }
460 
461       @Override public CharMatcher or(CharMatcher other) {
462         return other.matches(match) ? ANY : this;
463       }
464 
465       @GwtIncompatible("java.util.BitSet")
466       @Override
467       void setBits(BitSet table) {
468         table.set(0, match);
469         table.set(match + 1, Character.MAX_VALUE + 1);
470       }
471 
472       @Override public CharMatcher negate() {
473         return is(match);
474       }
475     };
476   }
477 
478   /**
479    * Returns a {@code char} matcher that matches any character present in the given character
480    * sequence.
481    */
482   public static CharMatcher anyOf(final CharSequence sequence) {
483     switch (sequence.length()) {
484       case 0:
485         return NONE;
486       case 1:
487         return is(sequence.charAt(0));
488       case 2:
489         return isEither(sequence.charAt(0), sequence.charAt(1));
490       default:
491         // continue below to handle the general case
492     }
493     // TODO(user): is it potentially worth just going ahead and building a precomputed matcher?
494     final char[] chars = sequence.toString().toCharArray();
495     Arrays.sort(chars);
496     StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
497     for (char c : chars) {
498       description.append(showCharacter(c));
499     }
500     description.append("\")");
501     return new CharMatcher(description.toString()) {
502       @Override public boolean matches(char c) {
503         return Arrays.binarySearch(chars, c) >= 0;
504       }
505 
506       @Override
507       @GwtIncompatible("java.util.BitSet")
508       void setBits(BitSet table) {
509         for (char c : chars) {
510           table.set(c);
511         }
512       }
513     };
514   }
515 
516   private static CharMatcher isEither(
517       final char match1,
518       final char match2) {
519     String description = "CharMatcher.anyOf(\"" +
520         showCharacter(match1) + showCharacter(match2) + "\")";
521     return new FastMatcher(description) {
522       @Override public boolean matches(char c) {
523         return c == match1 || c == match2;
524       }
525 
526       @GwtIncompatible("java.util.BitSet")
527       @Override void setBits(BitSet table) {
528         table.set(match1);
529         table.set(match2);
530       }
531     };
532   }
533 
534   /**
535    * Returns a {@code char} matcher that matches any character not present in the given character
536    * sequence.
537    */
538   public static CharMatcher noneOf(CharSequence sequence) {
539     return anyOf(sequence).negate();
540   }
541 
542   /**
543    * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
544    * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
545    * CharMatcher.inRange('a', 'z')}.
546    *
547    * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
548    */
549   public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
550     checkArgument(endInclusive >= startInclusive);
551     String description = "CharMatcher.inRange('" +
552         showCharacter(startInclusive) + "', '" +
553         showCharacter(endInclusive) + "')";
554     return inRange(startInclusive, endInclusive, description);
555   }
556 
557   static CharMatcher inRange(final char startInclusive, final char endInclusive,
558       String description) {
559     return new FastMatcher(description) {
560       @Override public boolean matches(char c) {
561         return startInclusive <= c && c <= endInclusive;
562       }
563 
564       @GwtIncompatible("java.util.BitSet")
565       @Override void setBits(BitSet table) {
566         table.set(startInclusive, endInclusive + 1);
567       }
568     };
569   }
570 
571   /**
572    * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
573    * which operates on primitive {@code char} instances instead.
574    */
575   public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
576     checkNotNull(predicate);
577     if (predicate instanceof CharMatcher) {
578       return (CharMatcher) predicate;
579     }
580     String description = "CharMatcher.forPredicate(" + predicate + ")";
581     return new CharMatcher(description) {
582       @Override public boolean matches(char c) {
583         return predicate.apply(c);
584       }
585 
586       @Override public boolean apply(Character character) {
587         return predicate.apply(checkNotNull(character));
588       }
589     };
590   }
591 
592   // State
593   final String description;
594 
595   // Constructors
596 
597   /**
598    * Sets the {@code toString()} from the given description.
599    */
600   CharMatcher(String description) {
601     this.description = description;
602   }
603 
604   /**
605    * Constructor for use by subclasses. When subclassing, you may want to override
606    * {@code toString()} to provide a useful description.
607    */
608   protected CharMatcher() {
609     description = super.toString();
610   }
611 
612   // Abstract methods
613 
614   /** Determines a true or false value for the given character. */
615   public abstract boolean matches(char c);
616 
617   // Non-static factories
618 
619   /**
620    * Returns a matcher that matches any character not matched by this matcher.
621    */
622   public CharMatcher negate() {
623     return new NegatedMatcher(this);
624   }
625 
626   private static class NegatedMatcher extends CharMatcher {
627     final CharMatcher original;
628 
629     NegatedMatcher(String toString, CharMatcher original) {
630       super(toString);
631       this.original = original;
632     }
633 
634     NegatedMatcher(CharMatcher original) {
635       this(original + ".negate()", original);
636     }
637 
638     @Override public boolean matches(char c) {
639       return !original.matches(c);
640     }
641 
642     @Override public boolean matchesAllOf(CharSequence sequence) {
643       return original.matchesNoneOf(sequence);
644     }
645 
646     @Override public boolean matchesNoneOf(CharSequence sequence) {
647       return original.matchesAllOf(sequence);
648     }
649 
650     @Override public int countIn(CharSequence sequence) {
651       return sequence.length() - original.countIn(sequence);
652     }
653 
654     @GwtIncompatible("java.util.BitSet")
655     @Override
656     void setBits(BitSet table) {
657       BitSet tmp = new BitSet();
658       original.setBits(tmp);
659       tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
660       table.or(tmp);
661     }
662 
663     @Override public CharMatcher negate() {
664       return original;
665     }
666 
667     @Override
668     CharMatcher withToString(String description) {
669       return new NegatedMatcher(description, original);
670     }
671   }
672 
673   /**
674    * Returns a matcher that matches any character matched by both this matcher and {@code other}.
675    */
676   public CharMatcher and(CharMatcher other) {
677     return new And(this, checkNotNull(other));
678   }
679 
680   private static class And extends CharMatcher {
681     final CharMatcher first;
682     final CharMatcher second;
683 
684     And(CharMatcher a, CharMatcher b) {
685       this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
686     }
687 
688     And(CharMatcher a, CharMatcher b, String description) {
689       super(description);
690       first = checkNotNull(a);
691       second = checkNotNull(b);
692     }
693 
694     @Override
695     public boolean matches(char c) {
696       return first.matches(c) && second.matches(c);
697     }
698 
699     @GwtIncompatible("java.util.BitSet")
700     @Override
701     void setBits(BitSet table) {
702       BitSet tmp1 = new BitSet();
703       first.setBits(tmp1);
704       BitSet tmp2 = new BitSet();
705       second.setBits(tmp2);
706       tmp1.and(tmp2);
707       table.or(tmp1);
708     }
709 
710     @Override
711     CharMatcher withToString(String description) {
712       return new And(first, second, description);
713     }
714   }
715 
716   /**
717    * Returns a matcher that matches any character matched by either this matcher or {@code other}.
718    */
719   public CharMatcher or(CharMatcher other) {
720     return new Or(this, checkNotNull(other));
721   }
722 
723   private static class Or extends CharMatcher {
724     final CharMatcher first;
725     final CharMatcher second;
726 
727     Or(CharMatcher a, CharMatcher b, String description) {
728       super(description);
729       first = checkNotNull(a);
730       second = checkNotNull(b);
731     }
732 
733     Or(CharMatcher a, CharMatcher b) {
734       this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
735     }
736 
737     @GwtIncompatible("java.util.BitSet")
738     @Override
739     void setBits(BitSet table) {
740       first.setBits(table);
741       second.setBits(table);
742     }
743 
744     @Override
745     public boolean matches(char c) {
746       return first.matches(c) || second.matches(c);
747     }
748 
749     @Override
750     CharMatcher withToString(String description) {
751       return new Or(first, second, description);
752     }
753   }
754 
755   /**
756    * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
757    * query than the original; your mileage may vary. Precomputation takes time and is likely to be
758    * worthwhile only if the precomputed matcher is queried many thousands of times.
759    *
760    * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
761    * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
762    * worthwhile tradeoff in a browser.
763    */
764   public CharMatcher precomputed() {
765     return Platform.precomputeCharMatcher(this);
766   }
767 
768   /**
769    * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
770    * but with their {@code toString} method overridden with the new description.
771    *
772    * <p>This is unsupported by default.
773    */
774   CharMatcher withToString(String description) {
775     throw new UnsupportedOperationException();
776   }
777 
778   private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
779 
780   /**
781    * This is the actual implementation of {@link #precomputed}, but we bounce calls through a
782    * method on {@link Platform} so that we can have different behavior in GWT.
783    *
784    * <p>This implementation tries to be smart in a number of ways.  It recognizes cases where
785    * the negation is cheaper to precompute than the matcher itself; it tries to build small
786    * hash tables for matchers that only match a few characters, and so on.  In the worst-case
787    * scenario, it constructs an eight-kilobyte bit array and queries that.
788    * In many situations this produces a matcher which is faster to query than the original.
789    */
790   @GwtIncompatible("java.util.BitSet")
791   CharMatcher precomputedInternal() {
792     final BitSet table = new BitSet();
793     setBits(table);
794     int totalCharacters = table.cardinality();
795     if (totalCharacters * 2 <= DISTINCT_CHARS) {
796       return precomputedPositive(totalCharacters, table, description);
797     } else {
798       // TODO(user): is it worth it to worry about the last character of large matchers?
799       table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
800       int negatedCharacters = DISTINCT_CHARS - totalCharacters;
801       String suffix = ".negate()";
802       String negatedDescription = description.endsWith(suffix)
803           ? description.substring(0, description.length() - suffix.length())
804           : description + suffix;
805       return new NegatedFastMatcher(toString(),
806           precomputedPositive(negatedCharacters, table, negatedDescription));
807     }
808   }
809 
810   /**
811    * A matcher for which precomputation will not yield any significant benefit.
812    */
813   abstract static class FastMatcher extends CharMatcher {
814     FastMatcher() {
815       super();
816     }
817 
818     FastMatcher(String description) {
819       super(description);
820     }
821 
822     @Override
823     public final CharMatcher precomputed() {
824       return this;
825     }
826 
827     @Override
828     public CharMatcher negate() {
829       return new NegatedFastMatcher(this);
830     }
831   }
832 
833   static final class NegatedFastMatcher extends NegatedMatcher {
834     NegatedFastMatcher(CharMatcher original) {
835       super(original);
836     }
837 
838     NegatedFastMatcher(String toString, CharMatcher original) {
839       super(toString, original);
840     }
841 
842     @Override
843     public final CharMatcher precomputed() {
844       return this;
845     }
846 
847     @Override
848     CharMatcher withToString(String description) {
849       return new NegatedFastMatcher(description, original);
850     }
851   }
852 
853   /**
854    * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
855    */
856   @GwtIncompatible("java.util.BitSet")
857   private static CharMatcher precomputedPositive(
858       int totalCharacters,
859       BitSet table,
860       String description) {
861     switch (totalCharacters) {
862       case 0:
863         return NONE;
864       case 1:
865         return is((char) table.nextSetBit(0));
866       case 2:
867         char c1 = (char) table.nextSetBit(0);
868         char c2 = (char) table.nextSetBit(c1 + 1);
869         return isEither(c1, c2);
870       default:
871         return isSmall(totalCharacters, table.length())
872             ? SmallCharMatcher.from(table, description)
873             : new BitSetMatcher(table, description);
874     }
875   }
876 
877   @GwtIncompatible("SmallCharMatcher")
878   private static boolean isSmall(int totalCharacters, int tableLength) {
879     return totalCharacters <= SmallCharMatcher.MAX_SIZE
880         && tableLength > (totalCharacters * 4 * Character.SIZE);
881         // err on the side of BitSetMatcher
882   }
883 
884   @GwtIncompatible("java.util.BitSet")
885   private static class BitSetMatcher extends FastMatcher {
886     private final BitSet table;
887 
888     private BitSetMatcher(BitSet table, String description) {
889       super(description);
890       if (table.length() + Long.SIZE < table.size()) {
891         table = (BitSet) table.clone();
892         // If only we could actually call BitSet.trimToSize() ourselves...
893       }
894       this.table = table;
895     }
896 
897     @Override public boolean matches(char c) {
898       return table.get(c);
899     }
900 
901     @Override
902     void setBits(BitSet bitSet) {
903       bitSet.or(table);
904     }
905   }
906 
907   /**
908    * Sets bits in {@code table} matched by this matcher.
909    */
910   @GwtIncompatible("java.util.BitSet")
911   void setBits(BitSet table) {
912     for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
913       if (matches((char) c)) {
914         table.set(c);
915       }
916     }
917   }
918 
919   // Text processing routines
920 
921   /**
922    * Returns {@code true} if a character sequence contains at least one matching character.
923    * Equivalent to {@code !matchesNoneOf(sequence)}.
924    *
925    * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
926    * character, until this returns {@code true} or the end is reached.
927    *
928    * @param sequence the character sequence to examine, possibly empty
929    * @return {@code true} if this matcher matches at least one character in the sequence
930    * @since 8.0
931    */
932   public boolean matchesAnyOf(CharSequence sequence) {
933     return !matchesNoneOf(sequence);
934   }
935 
936   /**
937    * Returns {@code true} if a character sequence contains only matching characters.
938    *
939    * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
940    * character, until this returns {@code false} or the end is reached.
941    *
942    * @param sequence the character sequence to examine, possibly empty
943    * @return {@code true} if this matcher matches every character in the sequence, including when
944    *         the sequence is empty
945    */
946   public boolean matchesAllOf(CharSequence sequence) {
947     for (int i = sequence.length() - 1; i >= 0; i--) {
948       if (!matches(sequence.charAt(i))) {
949         return false;
950       }
951     }
952     return true;
953   }
954 
955   /**
956    * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
957    * {@code !matchesAnyOf(sequence)}.
958    *
959    * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
960    * character, until this returns {@code false} or the end is reached.
961    *
962    * @param sequence the character sequence to examine, possibly empty
963    * @return {@code true} if this matcher matches every character in the sequence, including when
964    *         the sequence is empty
965    */
966   public boolean matchesNoneOf(CharSequence sequence) {
967     return indexIn(sequence) == -1;
968   }
969 
970   /**
971    * Returns the index of the first matching character in a character sequence, or {@code -1} if no
972    * matching character is present.
973    *
974    * <p>The default implementation iterates over the sequence in forward order calling {@link
975    * #matches} for each character.
976    *
977    * @param sequence the character sequence to examine from the beginning
978    * @return an index, or {@code -1} if no character matches
979    */
980   public int indexIn(CharSequence sequence) {
981     int length = sequence.length();
982     for (int i = 0; i < length; i++) {
983       if (matches(sequence.charAt(i))) {
984         return i;
985       }
986     }
987     return -1;
988   }
989 
990   /**
991    * Returns the index of the first matching character in a character sequence, starting from a
992    * given position, or {@code -1} if no character matches after that position.
993    *
994    * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
995    * start}, calling {@link #matches} for each character.
996    *
997    * @param sequence the character sequence to examine
998    * @param start the first index to examine; must be nonnegative and no greater than {@code
999    *        sequence.length()}
1000    * @return the index of the first matching character, guaranteed to be no less than {@code start},
1001    *         or {@code -1} if no character matches
1002    * @throws IndexOutOfBoundsException if start is negative or greater than {@code
1003    *         sequence.length()}
1004    */
1005   public int indexIn(CharSequence sequence, int start) {
1006     int length = sequence.length();
1007     Preconditions.checkPositionIndex(start, length);
1008     for (int i = start; i < length; i++) {
1009       if (matches(sequence.charAt(i))) {
1010         return i;
1011       }
1012     }
1013     return -1;
1014   }
1015 
1016   /**
1017    * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1018    * matching character is present.
1019    *
1020    * <p>The default implementation iterates over the sequence in reverse order calling {@link
1021    * #matches} for each character.
1022    *
1023    * @param sequence the character sequence to examine from the end
1024    * @return an index, or {@code -1} if no character matches
1025    */
1026   public int lastIndexIn(CharSequence sequence) {
1027     for (int i = sequence.length() - 1; i >= 0; i--) {
1028       if (matches(sequence.charAt(i))) {
1029         return i;
1030       }
1031     }
1032     return -1;
1033   }
1034 
1035   /**
1036    * Returns the number of matching characters found in a character sequence.
1037    */
1038   public int countIn(CharSequence sequence) {
1039     int count = 0;
1040     for (int i = 0; i < sequence.length(); i++) {
1041       if (matches(sequence.charAt(i))) {
1042         count++;
1043       }
1044     }
1045     return count;
1046   }
1047 
1048   /**
1049    * Returns a string containing all non-matching characters of a character sequence, in order. For
1050    * example: <pre>   {@code
1051    *
1052    *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
1053    *
1054    * ... returns {@code "bzr"}.
1055    */
1056   @CheckReturnValue
1057   public String removeFrom(CharSequence sequence) {
1058     String string = sequence.toString();
1059     int pos = indexIn(string);
1060     if (pos == -1) {
1061       return string;
1062     }
1063 
1064     char[] chars = string.toCharArray();
1065     int spread = 1;
1066 
1067     // This unusual loop comes from extensive benchmarking
1068     OUT: while (true) {
1069       pos++;
1070       while (true) {
1071         if (pos == chars.length) {
1072           break OUT;
1073         }
1074         if (matches(chars[pos])) {
1075           break;
1076         }
1077         chars[pos - spread] = chars[pos];
1078         pos++;
1079       }
1080       spread++;
1081     }
1082     return new String(chars, 0, pos - spread);
1083   }
1084 
1085   /**
1086    * Returns a string containing all matching characters of a character sequence, in order. For
1087    * example: <pre>   {@code
1088    *
1089    *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1090    *
1091    * ... returns {@code "aaa"}.
1092    */
1093   @CheckReturnValue
1094   public String retainFrom(CharSequence sequence) {
1095     return negate().removeFrom(sequence);
1096   }
1097 
1098   /**
1099    * Returns a string copy of the input character sequence, with each character that matches this
1100    * matcher replaced by a given replacement character. For example: <pre>   {@code
1101    *
1102    *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1103    *
1104    * ... returns {@code "rodor"}.
1105    *
1106    * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1107    * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1108    * character.
1109    *
1110    * @param sequence the character sequence to replace matching characters in
1111    * @param replacement the character to append to the result string in place of each matching
1112    *        character in {@code sequence}
1113    * @return the new string
1114    */
1115   @CheckReturnValue
1116   public String replaceFrom(CharSequence sequence, char replacement) {
1117     String string = sequence.toString();
1118     int pos = indexIn(string);
1119     if (pos == -1) {
1120       return string;
1121     }
1122     char[] chars = string.toCharArray();
1123     chars[pos] = replacement;
1124     for (int i = pos + 1; i < chars.length; i++) {
1125       if (matches(chars[i])) {
1126         chars[i] = replacement;
1127       }
1128     }
1129     return new String(chars);
1130   }
1131 
1132   /**
1133    * Returns a string copy of the input character sequence, with each character that matches this
1134    * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1135    *
1136    *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1137    *
1138    * ... returns {@code "yoohoo"}.
1139    *
1140    * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1141    * off calling {@link #replaceFrom(CharSequence, char)} directly.
1142    *
1143    * @param sequence the character sequence to replace matching characters in
1144    * @param replacement the characters to append to the result string in place of each matching
1145    *        character in {@code sequence}
1146    * @return the new string
1147    */
1148   @CheckReturnValue
1149   public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1150     int replacementLen = replacement.length();
1151     if (replacementLen == 0) {
1152       return removeFrom(sequence);
1153     }
1154     if (replacementLen == 1) {
1155       return replaceFrom(sequence, replacement.charAt(0));
1156     }
1157 
1158     String string = sequence.toString();
1159     int pos = indexIn(string);
1160     if (pos == -1) {
1161       return string;
1162     }
1163 
1164     int len = string.length();
1165     StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1166 
1167     int oldpos = 0;
1168     do {
1169       buf.append(string, oldpos, pos);
1170       buf.append(replacement);
1171       oldpos = pos + 1;
1172       pos = indexIn(string, oldpos);
1173     } while (pos != -1);
1174 
1175     buf.append(string, oldpos, len);
1176     return buf.toString();
1177   }
1178 
1179   /**
1180    * Returns a substring of the input character sequence that omits all characters this matcher
1181    * matches from the beginning and from the end of the string. For example: <pre>   {@code
1182    *
1183    *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1184    *
1185    * ... returns {@code "cat"}.
1186    *
1187    * <p>Note that: <pre>   {@code
1188    *
1189    *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1190    *
1191    * ... is equivalent to {@link String#trim()}.
1192    */
1193   @CheckReturnValue
1194   public String trimFrom(CharSequence sequence) {
1195     int len = sequence.length();
1196     int first;
1197     int last;
1198 
1199     for (first = 0; first < len; first++) {
1200       if (!matches(sequence.charAt(first))) {
1201         break;
1202       }
1203     }
1204     for (last = len - 1; last > first; last--) {
1205       if (!matches(sequence.charAt(last))) {
1206         break;
1207       }
1208     }
1209 
1210     return sequence.subSequence(first, last + 1).toString();
1211   }
1212 
1213   /**
1214    * Returns a substring of the input character sequence that omits all characters this matcher
1215    * matches from the beginning of the string. For example: <pre> {@code
1216    *
1217    *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1218    *
1219    * ... returns {@code "catbab"}.
1220    */
1221   @CheckReturnValue
1222   public String trimLeadingFrom(CharSequence sequence) {
1223     int len = sequence.length();
1224     for (int first = 0; first < len; first++) {
1225       if (!matches(sequence.charAt(first))) {
1226         return sequence.subSequence(first, len).toString();
1227       }
1228     }
1229     return "";
1230   }
1231 
1232   /**
1233    * Returns a substring of the input character sequence that omits all characters this matcher
1234    * matches from the end of the string. For example: <pre> {@code
1235    *
1236    *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1237    *
1238    * ... returns {@code "abacat"}.
1239    */
1240   @CheckReturnValue
1241   public String trimTrailingFrom(CharSequence sequence) {
1242     int len = sequence.length();
1243     for (int last = len - 1; last >= 0; last--) {
1244       if (!matches(sequence.charAt(last))) {
1245         return sequence.subSequence(0, last + 1).toString();
1246       }
1247     }
1248     return "";
1249   }
1250 
1251   /**
1252    * Returns a string copy of the input character sequence, with each group of consecutive
1253    * characters that match this matcher replaced by a single replacement character. For example:
1254    * <pre>   {@code
1255    *
1256    *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1257    *
1258    * ... returns {@code "b-p-r"}.
1259    *
1260    * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1261    * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1262    * character.
1263    *
1264    * @param sequence the character sequence to replace matching groups of characters in
1265    * @param replacement the character to append to the result string in place of each group of
1266    *        matching characters in {@code sequence}
1267    * @return the new string
1268    */
1269   @CheckReturnValue
1270   public String collapseFrom(CharSequence sequence, char replacement) {
1271     // This implementation avoids unnecessary allocation.
1272     int len = sequence.length();
1273     for (int i = 0; i < len; i++) {
1274       char c = sequence.charAt(i);
1275       if (matches(c)) {
1276         if (c == replacement
1277             && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1278           // a no-op replacement
1279           i++;
1280         } else {
1281           StringBuilder builder = new StringBuilder(len)
1282               .append(sequence.subSequence(0, i))
1283               .append(replacement);
1284           return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1285         }
1286       }
1287     }
1288     // no replacement needed
1289     return sequence.toString();
1290   }
1291 
1292   /**
1293    * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1294    * groups of matching characters at the start or end of the sequence are removed without
1295    * replacement.
1296    */
1297   @CheckReturnValue
1298   public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1299     // This implementation avoids unnecessary allocation.
1300     int len = sequence.length();
1301     int first;
1302     int last;
1303 
1304     for (first = 0; first < len && matches(sequence.charAt(first)); first++) {}
1305     for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {}
1306 
1307     return (first == 0 && last == len - 1)
1308         ? collapseFrom(sequence, replacement)
1309         : finishCollapseFrom(
1310               sequence, first, last + 1, replacement,
1311               new StringBuilder(last + 1 - first),
1312               false);
1313   }
1314 
1315   private String finishCollapseFrom(
1316       CharSequence sequence, int start, int end, char replacement,
1317       StringBuilder builder, boolean inMatchingGroup) {
1318     for (int i = start; i < end; i++) {
1319       char c = sequence.charAt(i);
1320       if (matches(c)) {
1321         if (!inMatchingGroup) {
1322           builder.append(replacement);
1323           inMatchingGroup = true;
1324         }
1325       } else {
1326         builder.append(c);
1327         inMatchingGroup = false;
1328       }
1329     }
1330     return builder.toString();
1331   }
1332 
1333   /**
1334    * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
1335    *     instead.
1336    */
1337   @Deprecated
1338   @Override
1339   public boolean apply(Character character) {
1340     return matches(character);
1341   }
1342 
1343   /**
1344    * Returns a string representation of this {@code CharMatcher}, such as
1345    * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1346    */
1347   @Override
1348   public String toString() {
1349     return description;
1350   }
1351 
1352   static final String WHITESPACE_TABLE = ""
1353       + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
1354       + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
1355       + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
1356       + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
1357   static final int WHITESPACE_MULTIPLIER = 1682554634;
1358   static final int WHITESPACE_SHIFT = Integer.numberOfLeadingZeros(WHITESPACE_TABLE.length() - 1);
1359 
1360   /**
1361    * Determines whether a character is whitespace according to the latest Unicode standard, as
1362    * illustrated
1363    * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1364    * This is not the same definition used by other Java APIs. (See a
1365    * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1366    * definitions of "whitespace"</a>.)
1367    *
1368    * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1369    * to date.
1370    */
1371   public static final CharMatcher WHITESPACE = new FastMatcher("WHITESPACE") {
1372     @Override
1373     public boolean matches(char c) {
1374       return WHITESPACE_TABLE.charAt((WHITESPACE_MULTIPLIER * c) >>> WHITESPACE_SHIFT) == c;
1375     }
1376 
1377     @GwtIncompatible("java.util.BitSet")
1378     @Override
1379     void setBits(BitSet table) {
1380       for (int i = 0; i < WHITESPACE_TABLE.length(); i++) {
1381         table.set(WHITESPACE_TABLE.charAt(i));
1382       }
1383     }
1384   };
1385 }