View Javadoc
1   /*
2    * Copyright (C) 2009 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.base;
18  
19  import static com.google.common.base.Preconditions.checkArgument;
20  import static com.google.common.base.Preconditions.checkNotNull;
21  
22  import com.google.common.annotations.Beta;
23  import com.google.common.annotations.GwtCompatible;
24  import com.google.common.annotations.GwtIncompatible;
25  
26  import java.util.ArrayList;
27  import java.util.Collections;
28  import java.util.Iterator;
29  import java.util.LinkedHashMap;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import javax.annotation.CheckReturnValue;
36  
37  /**
38   * Extracts non-overlapping substrings from an input string, typically by
39   * recognizing appearances of a <i>separator</i> sequence. This separator can be
40   * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
41   * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
42   * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
43   * all, a splitter can extract adjacent substrings of a given {@linkplain
44   * #fixedLength fixed length}.
45   *
46   * <p>For example, this expression: <pre>   {@code
47   *
48   *   Splitter.on(',').split("foo,bar,qux")}</pre>
49   *
50   * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
51   * {@code "qux"}, in that order.
52   *
53   * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
54   * following expression: <pre>   {@code
55   *
56   *   Splitter.on(',').split(" foo,,,  bar ,")}</pre>
57   *
58   * ... yields the substrings {@code [" foo", "", "", "  bar ", ""]}. If this
59   * is not the desired behavior, use configuration methods to obtain a <i>new</i>
60   * splitter instance with modified behavior: <pre>   {@code
61   *
62   *   private static final Splitter MY_SPLITTER = Splitter.on(',')
63   *       .trimResults()
64   *       .omitEmptyStrings();}</pre>
65   *
66   * <p>Now {@code MY_SPLITTER.split("foo,,,  bar ,")} returns just {@code ["foo",
67   * "bar"]}. Note that the order in which these configuration methods are called
68   * is never significant.
69   *
70   * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
71   * method has no effect on the receiving instance; you must store and use the
72   * new splitter instance it returns instead. <pre>   {@code
73   *
74   *   // Do NOT do this
75   *   Splitter splitter = Splitter.on('/');
76   *   splitter.trimResults(); // does nothing!
77   *   return splitter.split("wrong / wrong / wrong");}</pre>
78   *
79   * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
80   * input string containing {@code n} occurrences of the separator naturally
81   * yields an iterable of size {@code n + 1}. So if the separator does not occur
82   * anywhere in the input, a single substring is returned containing the entire
83   * input. Consequently, all splitters split the empty string to {@code [""]}
84   * (note: even fixed-length splitters).
85   *
86   * <p>Splitter instances are thread-safe immutable, and are therefore safe to
87   * store as {@code static final} constants.
88   *
89   * <p>The {@link Joiner} class provides the inverse operation to splitting, but
90   * note that a round-trip between the two should be assumed to be lossy.
91   *
92   * <p>See the Guava User Guide article on <a href=
93   * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
94   * {@code Splitter}</a>.
95   *
96   * @author Julien Silland
97   * @author Jesse Wilson
98   * @author Kevin Bourrillion
99   * @author Louis Wasserman
100  * @since 1.0
101  */
102 @GwtCompatible(emulated = true)
103 public final class Splitter {
104   private final CharMatcher trimmer;
105   private final boolean omitEmptyStrings;
106   private final Strategy strategy;
107   private final int limit;
108 
109   private Splitter(Strategy strategy) {
110     this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
111   }
112 
113   private Splitter(Strategy strategy, boolean omitEmptyStrings,
114       CharMatcher trimmer, int limit) {
115     this.strategy = strategy;
116     this.omitEmptyStrings = omitEmptyStrings;
117     this.trimmer = trimmer;
118     this.limit = limit;
119   }
120 
121   /**
122    * Returns a splitter that uses the given single-character separator. For
123    * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
124    * containing {@code ["foo", "", "bar"]}.
125    *
126    * @param separator the character to recognize as a separator
127    * @return a splitter, with default settings, that recognizes that separator
128    */
129   public static Splitter on(char separator) {
130     return on(CharMatcher.is(separator));
131   }
132 
133   /**
134    * Returns a splitter that considers any single character matched by the
135    * given {@code CharMatcher} to be a separator. For example, {@code
136    * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
137    * iterable containing {@code ["foo", "", "bar", "quux"]}.
138    *
139    * @param separatorMatcher a {@link CharMatcher} that determines whether a
140    *     character is a separator
141    * @return a splitter, with default settings, that uses this matcher
142    */
143   public static Splitter on(final CharMatcher separatorMatcher) {
144     checkNotNull(separatorMatcher);
145 
146     return new Splitter(new Strategy() {
147       @Override public SplittingIterator iterator(
148           Splitter splitter, final CharSequence toSplit) {
149         return new SplittingIterator(splitter, toSplit) {
150           @Override int separatorStart(int start) {
151             return separatorMatcher.indexIn(toSplit, start);
152           }
153 
154           @Override int separatorEnd(int separatorPosition) {
155             return separatorPosition + 1;
156           }
157         };
158       }
159     });
160   }
161 
162   /**
163    * Returns a splitter that uses the given fixed string as a separator. For
164    * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
165    * iterable containing {@code ["foo", "bar,baz"]}.
166    *
167    * @param separator the literal, nonempty string to recognize as a separator
168    * @return a splitter, with default settings, that recognizes that separator
169    */
170   public static Splitter on(final String separator) {
171     checkArgument(separator.length() != 0,
172         "The separator may not be the empty string.");
173 
174     return new Splitter(new Strategy() {
175       @Override public SplittingIterator iterator(
176           Splitter splitter, CharSequence toSplit) {
177         return new SplittingIterator(splitter, toSplit) {
178           @Override public int separatorStart(int start) {
179             int separatorLength = separator.length();
180 
181             positions:
182             for (int p = start, last = toSplit.length() - separatorLength;
183                 p <= last; p++) {
184               for (int i = 0; i < separatorLength; i++) {
185                 if (toSplit.charAt(i + p) != separator.charAt(i)) {
186                   continue positions;
187                 }
188               }
189               return p;
190             }
191             return -1;
192           }
193 
194           @Override public int separatorEnd(int separatorPosition) {
195             return separatorPosition + separator.length();
196           }
197         };
198       }
199     });
200   }
201 
202   /**
203    * Returns a splitter that considers any subsequence matching {@code
204    * pattern} to be a separator. For example, {@code
205    * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
206    * into lines whether it uses DOS-style or UNIX-style line terminators.
207    *
208    * @param separatorPattern the pattern that determines whether a subsequence
209    *     is a separator. This pattern may not match the empty string.
210    * @return a splitter, with default settings, that uses this pattern
211    * @throws IllegalArgumentException if {@code separatorPattern} matches the
212    *     empty string
213    */
214   @GwtIncompatible("java.util.regex")
215   public static Splitter on(final Pattern separatorPattern) {
216     checkNotNull(separatorPattern);
217     checkArgument(!separatorPattern.matcher("").matches(),
218         "The pattern may not match the empty string: %s", separatorPattern);
219 
220     return new Splitter(new Strategy() {
221       @Override public SplittingIterator iterator(
222           final Splitter splitter, CharSequence toSplit) {
223         final Matcher matcher = separatorPattern.matcher(toSplit);
224         return new SplittingIterator(splitter, toSplit) {
225           @Override public int separatorStart(int start) {
226             return matcher.find(start) ? matcher.start() : -1;
227           }
228 
229           @Override public int separatorEnd(int separatorPosition) {
230             return matcher.end();
231           }
232         };
233       }
234     });
235   }
236 
237   /**
238    * Returns a splitter that considers any subsequence matching a given
239    * pattern (regular expression) to be a separator. For example, {@code
240    * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
241    * whether it uses DOS-style or UNIX-style line terminators. This is
242    * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
243    *
244    * @param separatorPattern the pattern that determines whether a subsequence
245    *     is a separator. This pattern may not match the empty string.
246    * @return a splitter, with default settings, that uses this pattern
247    * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
248    *     is a malformed expression
249    * @throws IllegalArgumentException if {@code separatorPattern} matches the
250    *     empty string
251    */
252   @GwtIncompatible("java.util.regex")
253   public static Splitter onPattern(String separatorPattern) {
254     return on(Pattern.compile(separatorPattern));
255   }
256 
257   /**
258    * Returns a splitter that divides strings into pieces of the given length.
259    * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
260    * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
261    * smaller than {@code length} but will never be empty.
262    *
263    * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
264    * split("")} does not yield an empty iterable, but an iterable containing
265    * {@code ""}. This is the only case in which {@code
266    * Iterables.size(split(input))} does not equal {@code
267    * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
268    * use {@code omitEmptyStrings}.
269    *
270    * @param length the desired length of pieces after splitting, a positive
271    *     integer
272    * @return a splitter, with default settings, that can split into fixed sized
273    *     pieces
274    * @throws IllegalArgumentException if {@code length} is zero or negative
275    */
276   public static Splitter fixedLength(final int length) {
277     checkArgument(length > 0, "The length may not be less than 1");
278 
279     return new Splitter(new Strategy() {
280       @Override public SplittingIterator iterator(
281           final Splitter splitter, CharSequence toSplit) {
282         return new SplittingIterator(splitter, toSplit) {
283           @Override public int separatorStart(int start) {
284             int nextChunkStart = start + length;
285             return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
286           }
287 
288           @Override public int separatorEnd(int separatorPosition) {
289             return separatorPosition;
290           }
291         };
292       }
293     });
294   }
295 
296   /**
297    * Returns a splitter that behaves equivalently to {@code this} splitter, but
298    * automatically omits empty strings from the results. For example, {@code
299    * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
300    * iterable containing only {@code ["a", "b", "c"]}.
301    *
302    * <p>If either {@code trimResults} option is also specified when creating a
303    * splitter, that splitter always trims results first before checking for
304    * emptiness. So, for example, {@code
305    * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
306    * an empty iterable.
307    *
308    * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
309    * to return an empty iterable, but when using this option, it can (if the
310    * input sequence consists of nothing but separators).
311    *
312    * @return a splitter with the desired configuration
313    */
314   @CheckReturnValue
315   public Splitter omitEmptyStrings() {
316     return new Splitter(strategy, true, trimmer, limit);
317   }
318 
319   /**
320    * Returns a splitter that behaves equivalently to {@code this} splitter but
321    * stops splitting after it reaches the limit.
322    * The limit defines the maximum number of items returned by the iterator.
323    *
324    * <p>For example,
325    * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
326    * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
327    * omitted strings do no count.  Hence,
328    * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
329    * returns an iterable containing {@code ["a", "b", "c,d"}.
330    * When trim is requested, all entries, including the last are trimmed.  Hence
331    * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
332    * results in @{code ["a", "b", "c , d"]}.
333    *
334    * @param limit the maximum number of items returns
335    * @return a splitter with the desired configuration
336    * @since 9.0
337    */
338   @CheckReturnValue
339   public Splitter limit(int limit) {
340     checkArgument(limit > 0, "must be greater than zero: %s", limit);
341     return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
342   }
343 
344   /**
345    * Returns a splitter that behaves equivalently to {@code this} splitter, but
346    * automatically removes leading and trailing {@linkplain
347    * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
348    * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
349    * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
350    * containing {@code ["a", "b", "c"]}.
351    *
352    * @return a splitter with the desired configuration
353    */
354   @CheckReturnValue
355   public Splitter trimResults() {
356     return trimResults(CharMatcher.WHITESPACE);
357   }
358 
359   /**
360    * Returns a splitter that behaves equivalently to {@code this} splitter, but
361    * removes all leading or trailing characters matching the given {@code
362    * CharMatcher} from each returned substring. For example, {@code
363    * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
364    * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
365    *
366    * @param trimmer a {@link CharMatcher} that determines whether a character
367    *     should be removed from the beginning/end of a subsequence
368    * @return a splitter with the desired configuration
369    */
370   // TODO(kevinb): throw if a trimmer was already specified!
371   @CheckReturnValue
372   public Splitter trimResults(CharMatcher trimmer) {
373     checkNotNull(trimmer);
374     return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
375   }
376 
377   /**
378    * Splits {@code sequence} into string components and makes them available
379    * through an {@link Iterator}, which may be lazily evaluated. If you want
380    * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}.
381    *
382    * @param sequence the sequence of characters to split
383    * @return an iteration over the segments split from the parameter.
384    */
385   public Iterable<String> split(final CharSequence sequence) {
386     checkNotNull(sequence);
387 
388     return new Iterable<String>() {
389       @Override public Iterator<String> iterator() {
390         return splittingIterator(sequence);
391       }
392       @Override public String toString() {
393         return Joiner.on(", ")
394             .appendTo(new StringBuilder().append('['), this)
395             .append(']')
396             .toString();
397       }
398     };
399   }
400 
401   private Iterator<String> splittingIterator(CharSequence sequence) {
402     return strategy.iterator(this, sequence);
403   }
404 
405   /**
406    * Splits {@code sequence} into string components and returns them as
407    * an immutable list. If you want an {@link Iterable} which may be lazily
408    * evaluated, use {@link #split(CharSequence)}.
409    *
410    * @param sequence the sequence of characters to split
411    * @return an immutable list of the segments split from the parameter
412    * @since 15.0
413    */
414   @Beta
415   public List<String> splitToList(CharSequence sequence) {
416     checkNotNull(sequence);
417 
418     Iterator<String> iterator = splittingIterator(sequence);
419     List<String> result = new ArrayList<String>();
420 
421     while (iterator.hasNext()) {
422       result.add(iterator.next());
423     }
424 
425     return Collections.unmodifiableList(result);
426   }
427 
428   /**
429    * Returns a {@code MapSplitter} which splits entries based on this splitter,
430    * and splits entries into keys and values using the specified separator.
431    *
432    * @since 10.0
433    */
434   @CheckReturnValue
435   @Beta
436   public MapSplitter withKeyValueSeparator(String separator) {
437     return withKeyValueSeparator(on(separator));
438   }
439 
440   /**
441    * Returns a {@code MapSplitter} which splits entries based on this splitter,
442    * and splits entries into keys and values using the specified separator.
443    *
444    * @since 14.0
445    */
446   @CheckReturnValue
447   @Beta
448   public MapSplitter withKeyValueSeparator(char separator) {
449     return withKeyValueSeparator(on(separator));
450   }
451 
452   /**
453    * Returns a {@code MapSplitter} which splits entries based on this splitter,
454    * and splits entries into keys and values using the specified key-value
455    * splitter.
456    *
457    * @since 10.0
458    */
459   @CheckReturnValue
460   @Beta
461   public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
462     return new MapSplitter(this, keyValueSplitter);
463   }
464 
465   /**
466    * An object that splits strings into maps as {@code Splitter} splits
467    * iterables and lists. Like {@code Splitter}, it is thread-safe and
468    * immutable.
469    *
470    * @since 10.0
471    */
472   @Beta
473   public static final class MapSplitter {
474     private static final String INVALID_ENTRY_MESSAGE =
475         "Chunk [%s] is not a valid entry";
476     private final Splitter outerSplitter;
477     private final Splitter entrySplitter;
478 
479     private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
480       this.outerSplitter = outerSplitter; // only "this" is passed
481       this.entrySplitter = checkNotNull(entrySplitter);
482     }
483 
484     /**
485      * Splits {@code sequence} into substrings, splits each substring into
486      * an entry, and returns an unmodifiable map with each of the entries. For
487      * example, <code>
488      * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
489      * .split("a=>b ; c=>b")
490      * </code> will return a mapping from {@code "a"} to {@code "b"} and
491      * {@code "c"} to {@code b}.
492      *
493      * <p>The returned map preserves the order of the entries from
494      * {@code sequence}.
495      *
496      * @throws IllegalArgumentException if the specified sequence does not split
497      *         into valid map entries, or if there are duplicate keys
498      */
499     public Map<String, String> split(CharSequence sequence) {
500       Map<String, String> map = new LinkedHashMap<String, String>();
501       for (String entry : outerSplitter.split(sequence)) {
502         Iterator<String> entryFields = entrySplitter.splittingIterator(entry);
503 
504         checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
505         String key = entryFields.next();
506         checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
507 
508         checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
509         String value = entryFields.next();
510         map.put(key, value);
511 
512         checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
513       }
514       return Collections.unmodifiableMap(map);
515     }
516   }
517 
518   private interface Strategy {
519     Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
520   }
521 
522   private abstract static class SplittingIterator extends AbstractIterator<String> {
523     final CharSequence toSplit;
524     final CharMatcher trimmer;
525     final boolean omitEmptyStrings;
526 
527     /**
528      * Returns the first index in {@code toSplit} at or after {@code start}
529      * that contains the separator.
530      */
531     abstract int separatorStart(int start);
532 
533     /**
534      * Returns the first index in {@code toSplit} after {@code
535      * separatorPosition} that does not contain a separator. This method is only
536      * invoked after a call to {@code separatorStart}.
537      */
538     abstract int separatorEnd(int separatorPosition);
539 
540     int offset = 0;
541     int limit;
542 
543     protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
544       this.trimmer = splitter.trimmer;
545       this.omitEmptyStrings = splitter.omitEmptyStrings;
546       this.limit = splitter.limit;
547       this.toSplit = toSplit;
548     }
549 
550     @Override protected String computeNext() {
551       /*
552        * The returned string will be from the end of the last match to the
553        * beginning of the next one. nextStart is the start position of the
554        * returned substring, while offset is the place to start looking for a
555        * separator.
556        */
557       int nextStart = offset;
558       while (offset != -1) {
559         int start = nextStart;
560         int end;
561 
562         int separatorPosition = separatorStart(offset);
563         if (separatorPosition == -1) {
564           end = toSplit.length();
565           offset = -1;
566         } else {
567           end = separatorPosition;
568           offset = separatorEnd(separatorPosition);
569         }
570         if (offset == nextStart) {
571           /*
572            * This occurs when some pattern has an empty match, even if it
573            * doesn't match the empty string -- for example, if it requires
574            * lookahead or the like. The offset must be increased to look for
575            * separators beyond this point, without changing the start position
576            * of the next returned substring -- so nextStart stays the same.
577            */
578           offset++;
579           if (offset >= toSplit.length()) {
580             offset = -1;
581           }
582           continue;
583         }
584 
585         while (start < end && trimmer.matches(toSplit.charAt(start))) {
586           start++;
587         }
588         while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
589           end--;
590         }
591 
592         if (omitEmptyStrings && start == end) {
593           // Don't include the (unused) separator in next split string.
594           nextStart = offset;
595           continue;
596         }
597 
598         if (limit == 1) {
599           // The limit has been reached, return the rest of the string as the
600           // final item.  This is tested after empty string removal so that
601           // empty strings do not count towards the limit.
602           end = toSplit.length();
603           offset = -1;
604           // Since we may have changed the end, we need to trim it again.
605           while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
606             end--;
607           }
608         } else {
609           limit--;
610         }
611 
612         return toSplit.subSequence(start, end).toString();
613       }
614       return endOfData();
615     }
616   }
617 }