001 /*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021 import static com.google.common.base.Preconditions.checkState;
022
023 import com.google.common.annotations.Beta;
024 import com.google.common.annotations.GwtCompatible;
025 import com.google.common.annotations.GwtIncompatible;
026
027 import java.util.Collections;
028 import java.util.Iterator;
029 import java.util.LinkedHashMap;
030 import java.util.Map;
031 import java.util.NoSuchElementException;
032 import java.util.regex.Matcher;
033 import java.util.regex.Pattern;
034
035 import javax.annotation.CheckReturnValue;
036
037 /**
038 * An object that divides strings (or other instances of {@code CharSequence})
039 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
040 * which can be expressed as a single character, literal string, regular
041 * expression, {@code CharMatcher}, or by using a fixed substring length. This
042 * class provides the complementary functionality to {@link Joiner}.
043 *
044 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code
045 *
046 * Splitter.on(',').split("foo,bar")}</pre>
047 *
048 * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
049 * and {@code "bar"}, in that order.
050 *
051 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code
052 *
053 * Splitter.on(',').split("foo,,bar, quux")}</pre>
054 *
055 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
056 * Notice that the splitter does not assume that you want empty strings removed,
057 * or that you wish to trim whitespace. If you want features like these, simply
058 * ask for them: <pre> {@code
059 *
060 * private static final Splitter MY_SPLITTER = Splitter.on(',')
061 * .trimResults()
062 * .omitEmptyStrings();}</pre>
063 *
064 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
065 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
066 * the configuration methods are called is never significant; for instance,
067 * trimming is always applied first before checking for an empty result,
068 * regardless of the order in which the {@link #trimResults()} and
069 * {@link #omitEmptyStrings()} methods were invoked.
070 *
071 * <p><b>Warning: splitter instances are always immutable</b>; a configuration
072 * method such as {@code omitEmptyStrings} has no effect on the instance it
073 * is invoked on! You must store and use the new splitter instance returned by
074 * the method. This makes splitters thread-safe, and safe to store as {@code
075 * static final} constants (as illustrated above). <pre> {@code
076 *
077 * // Bad! Do not do this!
078 * Splitter splitter = Splitter.on('/');
079 * splitter.trimResults(); // does nothing!
080 * return splitter.split("wrong / wrong / wrong");}</pre>
081 *
082 * The separator recognized by the splitter does not have to be a single
083 * literal character as in the examples above. See the methods {@link
084 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
085 * of other ways to specify separators.
086 *
087 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
088 * similar JDK methods; for instance, it does not silently discard trailing
089 * separators, as does {@link String#split(String)}, nor does it have a default
090 * behavior of using five particular whitespace characters as separators, like
091 * {@link java.util.StringTokenizer}.
092 *
093 * @author Julien Silland
094 * @author Jesse Wilson
095 * @author Kevin Bourrillion
096 * @author Louis Wasserman
097 * @since 1.0
098 */
099 @GwtCompatible(emulated = true)
100 public final class Splitter {
/** Locates separators inside the sequence being split. */
private final Strategy strategy;
/** When {@code true}, empty pieces are left out of the results. */
private final boolean omitEmptyStrings;
/** Characters matched by this matcher are stripped from both ends of each piece. */
private final CharMatcher trimmer;
/** Upper bound on the number of pieces produced for one input. */
private final int limit;

/**
 * Creates a splitter with the default settings: empty strings are kept,
 * nothing is trimmed ({@code CharMatcher.NONE}) and the limit is
 * {@code Integer.MAX_VALUE}.
 */
private Splitter(Strategy strategy) {
  this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
}

/** Creates a splitter from an explicit value for every setting. */
private Splitter(Strategy strategy, boolean omitEmptyStrings,
    CharMatcher trimmer, int limit) {
  this.limit = limit;
  this.trimmer = trimmer;
  this.omitEmptyStrings = omitEmptyStrings;
  this.strategy = strategy;
}
117
118 /**
119 * Returns a splitter that uses the given single-character separator. For
120 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
121 * containing {@code ["foo", "", "bar"]}.
122 *
123 * @param separator the character to recognize as a separator
124 * @return a splitter, with default settings, that recognizes that separator
125 */
126 public static Splitter on(char separator) {
127 return on(CharMatcher.is(separator));
128 }
129
130 /**
131 * Returns a splitter that considers any single character matched by the
132 * given {@code CharMatcher} to be a separator. For example, {@code
133 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
134 * iterable containing {@code ["foo", "", "bar", "quux"]}.
135 *
136 * @param separatorMatcher a {@link CharMatcher} that determines whether a
137 * character is a separator
138 * @return a splitter, with default settings, that uses this matcher
139 */
140 public static Splitter on(final CharMatcher separatorMatcher) {
141 checkNotNull(separatorMatcher);
142
143 return new Splitter(new Strategy() {
144 @Override public SplittingIterator iterator(
145 Splitter splitter, final CharSequence toSplit) {
146 return new SplittingIterator(splitter, toSplit) {
147 @Override int separatorStart(int start) {
148 return separatorMatcher.indexIn(toSplit, start);
149 }
150
151 @Override int separatorEnd(int separatorPosition) {
152 return separatorPosition + 1;
153 }
154 };
155 }
156 });
157 }
158
159 /**
160 * Returns a splitter that uses the given fixed string as a separator. For
161 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
162 * iterable containing {@code ["foo", "bar", "baz,qux"]}.
163 *
164 * @param separator the literal, nonempty string to recognize as a separator
165 * @return a splitter, with default settings, that recognizes that separator
166 */
167 public static Splitter on(final String separator) {
168 checkArgument(separator.length() != 0,
169 "The separator may not be the empty string.");
170
171 return new Splitter(new Strategy() {
172 @Override public SplittingIterator iterator(
173 Splitter splitter, CharSequence toSplit) {
174 return new SplittingIterator(splitter, toSplit) {
175 @Override public int separatorStart(int start) {
176 int delimeterLength = separator.length();
177
178 positions:
179 for (int p = start, last = toSplit.length() - delimeterLength;
180 p <= last; p++) {
181 for (int i = 0; i < delimeterLength; i++) {
182 if (toSplit.charAt(i + p) != separator.charAt(i)) {
183 continue positions;
184 }
185 }
186 return p;
187 }
188 return -1;
189 }
190
191 @Override public int separatorEnd(int separatorPosition) {
192 return separatorPosition + separator.length();
193 }
194 };
195 }
196 });
197 }
198
199 /**
200 * Returns a splitter that considers any subsequence matching {@code
201 * pattern} to be a separator. For example, {@code
202 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
203 * into lines whether it uses DOS-style or UNIX-style line terminators.
204 *
205 * @param separatorPattern the pattern that determines whether a subsequence
206 * is a separator. This pattern may not match the empty string.
207 * @return a splitter, with default settings, that uses this pattern
208 * @throws IllegalArgumentException if {@code separatorPattern} matches the
209 * empty string
210 */
211 @GwtIncompatible("java.util.regex")
212 public static Splitter on(final Pattern separatorPattern) {
213 checkNotNull(separatorPattern);
214 checkArgument(!separatorPattern.matcher("").matches(),
215 "The pattern may not match the empty string: %s", separatorPattern);
216
217 return new Splitter(new Strategy() {
218 @Override public SplittingIterator iterator(
219 final Splitter splitter, CharSequence toSplit) {
220 final Matcher matcher = separatorPattern.matcher(toSplit);
221 return new SplittingIterator(splitter, toSplit) {
222 @Override public int separatorStart(int start) {
223 return matcher.find(start) ? matcher.start() : -1;
224 }
225
226 @Override public int separatorEnd(int separatorPosition) {
227 return matcher.end();
228 }
229 };
230 }
231 });
232 }
233
234 /**
235 * Returns a splitter that considers any subsequence matching a given
236 * pattern (regular expression) to be a separator. For example, {@code
237 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
238 * whether it uses DOS-style or UNIX-style line terminators. This is
239 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
240 *
241 * @param separatorPattern the pattern that determines whether a subsequence
242 * is a separator. This pattern may not match the empty string.
243 * @return a splitter, with default settings, that uses this pattern
244 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
245 * is a malformed expression
246 * @throws IllegalArgumentException if {@code separatorPattern} matches the
247 * empty string
248 */
249 @GwtIncompatible("java.util.regex")
250 public static Splitter onPattern(String separatorPattern) {
251 return on(Pattern.compile(separatorPattern));
252 }
253
254 /**
255 * Returns a splitter that divides strings into pieces of the given length.
256 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
257 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
258 * smaller than {@code length} but will never be empty.
259 *
260 * @param length the desired length of pieces after splitting
261 * @return a splitter, with default settings, that can split into fixed sized
262 * pieces
263 */
264 public static Splitter fixedLength(final int length) {
265 checkArgument(length > 0, "The length may not be less than 1");
266
267 return new Splitter(new Strategy() {
268 @Override public SplittingIterator iterator(
269 final Splitter splitter, CharSequence toSplit) {
270 return new SplittingIterator(splitter, toSplit) {
271 @Override public int separatorStart(int start) {
272 int nextChunkStart = start + length;
273 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
274 }
275
276 @Override public int separatorEnd(int separatorPosition) {
277 return separatorPosition;
278 }
279 };
280 }
281 });
282 }
283
284 /**
285 * Returns a splitter that behaves equivalently to {@code this} splitter, but
286 * automatically omits empty strings from the results. For example, {@code
287 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
288 * iterable containing only {@code ["a", "b", "c"]}.
289 *
290 * <p>If either {@code trimResults} option is also specified when creating a
291 * splitter, that splitter always trims results first before checking for
292 * emptiness. So, for example, {@code
293 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
294 * an empty iterable.
295 *
296 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
297 * to return an empty iterable, but when using this option, it can (if the
298 * input sequence consists of nothing but separators).
299 *
300 * @return a splitter with the desired configuration
301 */
302 @CheckReturnValue
303 public Splitter omitEmptyStrings() {
304 return new Splitter(strategy, true, trimmer, limit);
305 }
306
307 /**
308 * Returns a splitter that behaves equivalently to {@code this} splitter but
309 * stops splitting after it reaches the limit.
310 * The limit defines the maximum number of items returned by the iterator.
311 *
312 * <p>For example,
313 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
314 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the
315 * omitted strings do no count. Hence,
316 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
317 * returns an iterable containing {@code ["a", "b", "c,d"}.
318 * When trim is requested, all entries, including the last are trimmed. Hence
319 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
320 * results in @{code ["a", "b", "c , d"]}.
321 *
322 * @param limit the maximum number of items returns
323 * @return a splitter with the desired configuration
324 * @since 9.0
325 */
326 @CheckReturnValue
327 public Splitter limit(int limit) {
328 checkArgument(limit > 0, "must be greater then zero: %s", limit);
329 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
330 }
331
332 /**
333 * Returns a splitter that behaves equivalently to {@code this} splitter, but
334 * automatically removes leading and trailing {@linkplain
335 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
336 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
337 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
338 * containing {@code ["a", "b", "c"]}.
339 *
340 * @return a splitter with the desired configuration
341 */
342 @CheckReturnValue
343 public Splitter trimResults() {
344 return trimResults(CharMatcher.WHITESPACE);
345 }
346
347 /**
348 * Returns a splitter that behaves equivalently to {@code this} splitter, but
349 * removes all leading or trailing characters matching the given {@code
350 * CharMatcher} from each returned substring. For example, {@code
351 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
352 * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
353 *
354 * @param trimmer a {@link CharMatcher} that determines whether a character
355 * should be removed from the beginning/end of a subsequence
356 * @return a splitter with the desired configuration
357 */
358 // TODO(kevinb): throw if a trimmer was already specified!
359 @CheckReturnValue
360 public Splitter trimResults(CharMatcher trimmer) {
361 checkNotNull(trimmer);
362 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
363 }
364
365 /**
366 * Splits {@code sequence} into string components and makes them available
367 * through an {@link Iterator}, which may be lazily evaluated.
368 *
369 * @param sequence the sequence of characters to split
370 * @return an iteration over the segments split from the parameter.
371 */
372 public Iterable<String> split(final CharSequence sequence) {
373 checkNotNull(sequence);
374
375 return new Iterable<String>() {
376 @Override public Iterator<String> iterator() {
377 return spliterator(sequence);
378 }
379 };
380 }
381
382 private Iterator<String> spliterator(CharSequence sequence) {
383 return strategy.iterator(this, sequence);
384 }
385
386 /**
387 * Returns a {@code MapSplitter} which splits entries based on this splitter,
388 * and splits entries into keys and values using the specified separator.
389 *
390 * @since 10.0
391 */
392 @CheckReturnValue
393 @Beta
394 public MapSplitter withKeyValueSeparator(String separator) {
395 return withKeyValueSeparator(on(separator));
396 }
397
398 /**
399 * Returns a {@code MapSplitter} which splits entries based on this splitter,
400 * and splits entries into keys and values using the specified key-value
401 * splitter.
402 *
403 * @since 10.0
404 */
405 @CheckReturnValue
406 @Beta
407 public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
408 return new MapSplitter(this, keyValueSplitter);
409 }
410
411 /**
412 * An object that splits strings into maps as {@code Splitter} splits
413 * iterables and lists. Like {@code Splitter}, it is thread-safe and
414 * immutable.
415 *
416 * @since 10.0
417 */
418 @Beta
419 public static final class MapSplitter {
420 private static final String INVALID_ENTRY_MESSAGE =
421 "Chunk [%s] is not a valid entry";
422 private final Splitter outerSplitter;
423 private final Splitter entrySplitter;
424
425 private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
426 this.outerSplitter = outerSplitter; // only "this" is passed
427 this.entrySplitter = checkNotNull(entrySplitter);
428 }
429
430 /**
431 * Splits {@code sequence} into substrings, splits each substring into
432 * an entry, and returns an unmodifiable map with each of the entries. For
433 * example, <code>
434 * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
435 * .split("a=>b ; c=>b")
436 * </code> will return a mapping from {@code "a"} to {@code "b"} and
437 * {@code "c"} to {@code b}.
438 *
439 * <p>The returned map preserves the order of the entries from
440 * {@code sequence}.
441 *
442 * @throws IllegalArgumentException if the specified sequence does not split
443 * into valid map entries, or if there are duplicate keys
444 */
445 public Map<String, String> split(CharSequence sequence) {
446 Map<String, String> map = new LinkedHashMap<String, String>();
447 for (String entry : outerSplitter.split(sequence)) {
448 Iterator<String> entryFields = entrySplitter.spliterator(entry);
449
450 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
451 String key = entryFields.next();
452 checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
453
454 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
455 String value = entryFields.next();
456 map.put(key, value);
457
458 checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
459 }
460 return Collections.unmodifiableMap(map);
461 }
462 }
463
/**
 * Factory for the iterator that performs the actual splitting. Each static
 * factory method of {@code Splitter} supplies its own implementation that
 * knows how to locate its kind of separator.
 */
private interface Strategy {
  /**
   * Returns a new iterator over the pieces of {@code toSplit}.
   *
   * @param splitter the splitter on whose behalf the iteration runs; it is
   *     handed to the iterator, which reads its settings
   * @param toSplit the character sequence to split
   */
  Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
}
467
468 private abstract static class SplittingIterator
469 extends AbstractIterator<String> {
470 final CharSequence toSplit;
471 final CharMatcher trimmer;
472 final boolean omitEmptyStrings;
473
474 /**
475 * Returns the first index in {@code toSplit} at or after {@code start}
476 * that contains the separator.
477 */
478 abstract int separatorStart(int start);
479
480 /**
481 * Returns the first index in {@code toSplit} after {@code
482 * separatorPosition} that does not contain a separator. This method is only
483 * invoked after a call to {@code separatorStart}.
484 */
485 abstract int separatorEnd(int separatorPosition);
486
487 int offset = 0;
488 int limit;
489
490 protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
491 this.trimmer = splitter.trimmer;
492 this.omitEmptyStrings = splitter.omitEmptyStrings;
493 this.limit = splitter.limit;
494 this.toSplit = toSplit;
495 }
496
497 @Override protected String computeNext() {
498 while (offset != -1) {
499 int start = offset;
500 int end;
501
502 int separatorPosition = separatorStart(offset);
503 if (separatorPosition == -1) {
504 end = toSplit.length();
505 offset = -1;
506 } else {
507 end = separatorPosition;
508 offset = separatorEnd(separatorPosition);
509 }
510
511 while (start < end && trimmer.matches(toSplit.charAt(start))) {
512 start++;
513 }
514 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
515 end--;
516 }
517
518 if (omitEmptyStrings && start == end) {
519 continue;
520 }
521
522 if (limit == 1) {
523 // The limit has been reached, return the rest of the string as the
524 // final item. This is tested after empty string removal so that
525 // empty strings do not count towards the limit.
526 end = toSplit.length();
527 offset = -1;
528 // Since we may have changed the end, we need to trim it again.
529 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
530 end--;
531 }
532 } else {
533 limit--;
534 }
535
536 return toSplit.subSequence(start, end).toString();
537 }
538 return endOfData();
539 }
540 }
541
542 /*
543 * Copied from common.collect.AbstractIterator. TODO(kevinb): un-fork if these
544 * packages are ever combined into a single library.
545 */
546 private abstract static class AbstractIterator<T> implements Iterator<T> {
547 State state = State.NOT_READY;
548
549 enum State {
550 READY, NOT_READY, DONE, FAILED,
551 }
552
553 T next;
554
555 protected abstract T computeNext();
556
557 protected final T endOfData() {
558 state = State.DONE;
559 return null;
560 }
561
562 @Override
563 public final boolean hasNext() {
564 checkState(state != State.FAILED);
565 switch (state) {
566 case DONE:
567 return false;
568 case READY:
569 return true;
570 default:
571 }
572 return tryToComputeNext();
573 }
574
575 boolean tryToComputeNext() {
576 state = State.FAILED; // temporary pessimism
577 next = computeNext();
578 if (state != State.DONE) {
579 state = State.READY;
580 return true;
581 }
582 return false;
583 }
584
585 @Override
586 public final T next() {
587 if (!hasNext()) {
588 throw new NoSuchElementException();
589 }
590 state = State.NOT_READY;
591 return next;
592 }
593
594 @Override public void remove() {
595 throw new UnsupportedOperationException();
596 }
597 }
598 }