001 /* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 import static com.google.common.base.Preconditions.checkState; 022 023 import com.google.common.annotations.Beta; 024 import com.google.common.annotations.GwtCompatible; 025 import com.google.common.annotations.GwtIncompatible; 026 027 import java.util.Collections; 028 import java.util.Iterator; 029 import java.util.LinkedHashMap; 030 import java.util.Map; 031 import java.util.NoSuchElementException; 032 import java.util.regex.Matcher; 033 import java.util.regex.Pattern; 034 035 import javax.annotation.CheckReturnValue; 036 037 /** 038 * An object that divides strings (or other instances of {@code CharSequence}) 039 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter") 040 * which can be expressed as a single character, literal string, regular 041 * expression, {@code CharMatcher}, or by using a fixed substring length. This 042 * class provides the complementary functionality to {@link Joiner}. 043 * 044 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code 045 * 046 * Splitter.on(',').split("foo,bar")}</pre> 047 * 048 * This invocation returns an {@code Iterable<String>} containing {@code "foo"} 049 * and {@code "bar"}, in that order. 050 * 051 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code 052 * 053 * Splitter.on(',').split("foo,,bar, quux")}</pre> 054 * 055 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}. 056 * Notice that the splitter does not assume that you want empty strings removed, 057 * or that you wish to trim whitespace. If you want features like these, simply 058 * ask for them: <pre> {@code 059 * 060 * private static final Splitter MY_SPLITTER = Splitter.on(',') 061 * .trimResults() 062 * .omitEmptyStrings();}</pre> 063 * 064 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable 065 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which 066 * the configuration methods are called is never significant; for instance, 067 * trimming is always applied first before checking for an empty result, 068 * regardless of the order in which the {@link #trimResults()} and 069 * {@link #omitEmptyStrings()} methods were invoked. 070 * 071 * <p><b>Warning: splitter instances are always immutable</b>; a configuration 072 * method such as {@code omitEmptyStrings} has no effect on the instance it 073 * is invoked on! You must store and use the new splitter instance returned by 074 * the method. This makes splitters thread-safe, and safe to store as {@code 075 * static final} constants (as illustrated above). <pre> {@code 076 * 077 * // Bad! Do not do this! 078 * Splitter splitter = Splitter.on('/'); 079 * splitter.trimResults(); // does nothing! 080 * return splitter.split("wrong / wrong / wrong");}</pre> 081 * 082 * The separator recognized by the splitter does not have to be a single 083 * literal character as in the examples above. See the methods {@link 084 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples 085 * of other ways to specify separators. 086 * 087 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of 088 * similar JDK methods; for instance, it does not silently discard trailing 089 * separators, as does {@link String#split(String)}, nor does it have a default 090 * behavior of using five particular whitespace characters as separators, like 091 * {@link java.util.StringTokenizer}. 092 * 093 * @author Julien Silland 094 * @author Jesse Wilson 095 * @author Kevin Bourrillion 096 * @author Louis Wasserman 097 * @since 1.0 098 */ 099 @GwtCompatible(emulated = true) 100 public final class Splitter { 101 private final CharMatcher trimmer; 102 private final boolean omitEmptyStrings; 103 private final Strategy strategy; 104 private final int limit; 105 106 private Splitter(Strategy strategy) { 107 this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE); 108 } 109 110 private Splitter(Strategy strategy, boolean omitEmptyStrings, 111 CharMatcher trimmer, int limit) { 112 this.strategy = strategy; 113 this.omitEmptyStrings = omitEmptyStrings; 114 this.trimmer = trimmer; 115 this.limit = limit; 116 } 117 118 /** 119 * Returns a splitter that uses the given single-character separator. For 120 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 121 * containing {@code ["foo", "", "bar"]}. 122 * 123 * @param separator the character to recognize as a separator 124 * @return a splitter, with default settings, that recognizes that separator 125 */ 126 public static Splitter on(char separator) { 127 return on(CharMatcher.is(separator)); 128 } 129 130 /** 131 * Returns a splitter that considers any single character matched by the 132 * given {@code CharMatcher} to be a separator. For example, {@code 133 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 134 * iterable containing {@code ["foo", "", "bar", "quux"]}. 135 * 136 * @param separatorMatcher a {@link CharMatcher} that determines whether a 137 * character is a separator 138 * @return a splitter, with default settings, that uses this matcher 139 */ 140 public static Splitter on(final CharMatcher separatorMatcher) { 141 checkNotNull(separatorMatcher); 142 143 return new Splitter(new Strategy() { 144 @Override public SplittingIterator iterator( 145 Splitter splitter, final CharSequence toSplit) { 146 return new SplittingIterator(splitter, toSplit) { 147 @Override int separatorStart(int start) { 148 return separatorMatcher.indexIn(toSplit, start); 149 } 150 151 @Override int separatorEnd(int separatorPosition) { 152 return separatorPosition + 1; 153 } 154 }; 155 } 156 }); 157 } 158 159 /** 160 * Returns a splitter that uses the given fixed string as a separator. For 161 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an 162 * iterable containing {@code ["foo", "bar", "baz,qux"]}. 163 * 164 * @param separator the literal, nonempty string to recognize as a separator 165 * @return a splitter, with default settings, that recognizes that separator 166 */ 167 public static Splitter on(final String separator) { 168 checkArgument(separator.length() != 0, 169 "The separator may not be the empty string."); 170 171 return new Splitter(new Strategy() { 172 @Override public SplittingIterator iterator( 173 Splitter splitter, CharSequence toSplit) { 174 return new SplittingIterator(splitter, toSplit) { 175 @Override public int separatorStart(int start) { 176 int delimeterLength = separator.length(); 177 178 positions: 179 for (int p = start, last = toSplit.length() - delimeterLength; 180 p <= last; p++) { 181 for (int i = 0; i < delimeterLength; i++) { 182 if (toSplit.charAt(i + p) != separator.charAt(i)) { 183 continue positions; 184 } 185 } 186 return p; 187 } 188 return -1; 189 } 190 191 @Override public int separatorEnd(int separatorPosition) { 192 return separatorPosition + separator.length(); 193 } 194 }; 195 } 196 }); 197 } 198 199 /** 200 * Returns a splitter that considers any subsequence matching {@code 201 * pattern} to be a separator. For example, {@code 202 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string 203 * into lines whether it uses DOS-style or UNIX-style line terminators. 204 * 205 * @param separatorPattern the pattern that determines whether a subsequence 206 * is a separator. This pattern may not match the empty string. 207 * @return a splitter, with default settings, that uses this pattern 208 * @throws IllegalArgumentException if {@code separatorPattern} matches the 209 * empty string 210 */ 211 @GwtIncompatible("java.util.regex") 212 public static Splitter on(final Pattern separatorPattern) { 213 checkNotNull(separatorPattern); 214 checkArgument(!separatorPattern.matcher("").matches(), 215 "The pattern may not match the empty string: %s", separatorPattern); 216 217 return new Splitter(new Strategy() { 218 @Override public SplittingIterator iterator( 219 final Splitter splitter, CharSequence toSplit) { 220 final Matcher matcher = separatorPattern.matcher(toSplit); 221 return new SplittingIterator(splitter, toSplit) { 222 @Override public int separatorStart(int start) { 223 return matcher.find(start) ? matcher.start() : -1; 224 } 225 226 @Override public int separatorEnd(int separatorPosition) { 227 return matcher.end(); 228 } 229 }; 230 } 231 }); 232 } 233 234 /** 235 * Returns a splitter that considers any subsequence matching a given 236 * pattern (regular expression) to be a separator. For example, {@code 237 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines 238 * whether it uses DOS-style or UNIX-style line terminators. This is 239 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}. 240 * 241 * @param separatorPattern the pattern that determines whether a subsequence 242 * is a separator. This pattern may not match the empty string. 243 * @return a splitter, with default settings, that uses this pattern 244 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern} 245 * is a malformed expression 246 * @throws IllegalArgumentException if {@code separatorPattern} matches the 247 * empty string 248 */ 249 @GwtIncompatible("java.util.regex") 250 public static Splitter onPattern(String separatorPattern) { 251 return on(Pattern.compile(separatorPattern)); 252 } 253 254 /** 255 * Returns a splitter that divides strings into pieces of the given length. 256 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an 257 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 258 * smaller than {@code length} but will never be empty. 259 * 260 * @param length the desired length of pieces after splitting 261 * @return a splitter, with default settings, that can split into fixed sized 262 * pieces 263 */ 264 public static Splitter fixedLength(final int length) { 265 checkArgument(length > 0, "The length may not be less than 1"); 266 267 return new Splitter(new Strategy() { 268 @Override public SplittingIterator iterator( 269 final Splitter splitter, CharSequence toSplit) { 270 return new SplittingIterator(splitter, toSplit) { 271 @Override public int separatorStart(int start) { 272 int nextChunkStart = start + length; 273 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 274 } 275 276 @Override public int separatorEnd(int separatorPosition) { 277 return separatorPosition; 278 } 279 }; 280 } 281 }); 282 } 283 284 /** 285 * Returns a splitter that behaves equivalently to {@code this} splitter, but 286 * automatically omits empty strings from the results. For example, {@code 287 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 288 * iterable containing only {@code ["a", "b", "c"]}. 289 * 290 * <p>If either {@code trimResults} option is also specified when creating a 291 * splitter, that splitter always trims results first before checking for 292 * emptiness. So, for example, {@code 293 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 294 * an empty iterable. 295 * 296 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 297 * to return an empty iterable, but when using this option, it can (if the 298 * input sequence consists of nothing but separators). 299 * 300 * @return a splitter with the desired configuration 301 */ 302 @CheckReturnValue 303 public Splitter omitEmptyStrings() { 304 return new Splitter(strategy, true, trimmer, limit); 305 } 306 307 /** 308 * Returns a splitter that behaves equivalently to {@code this} splitter but 309 * stops splitting after it reaches the limit. 310 * The limit defines the maximum number of items returned by the iterator. 311 * 312 * <p>For example, 313 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable 314 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the 315 * omitted strings do no count. Hence, 316 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")} 317 * returns an iterable containing {@code ["a", "b", "c,d"}. 318 * When trim is requested, all entries, including the last are trimmed. Hence 319 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")} 320 * results in @{code ["a", "b", "c , d"]}. 321 * 322 * @param limit the maximum number of items returns 323 * @return a splitter with the desired configuration 324 * @since 9.0 325 */ 326 @CheckReturnValue 327 public Splitter limit(int limit) { 328 checkArgument(limit > 0, "must be greater then zero: %s", limit); 329 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 330 } 331 332 /** 333 * Returns a splitter that behaves equivalently to {@code this} splitter, but 334 * automatically removes leading and trailing {@linkplain 335 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 336 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 337 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 338 * containing {@code ["a", "b", "c"]}. 339 * 340 * @return a splitter with the desired configuration 341 */ 342 @CheckReturnValue 343 public Splitter trimResults() { 344 return trimResults(CharMatcher.WHITESPACE); 345 } 346 347 /** 348 * Returns a splitter that behaves equivalently to {@code this} splitter, but 349 * removes all leading or trailing characters matching the given {@code 350 * CharMatcher} from each returned substring. For example, {@code 351 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 352 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 353 * 354 * @param trimmer a {@link CharMatcher} that determines whether a character 355 * should be removed from the beginning/end of a subsequence 356 * @return a splitter with the desired configuration 357 */ 358 // TODO(kevinb): throw if a trimmer was already specified! 359 @CheckReturnValue 360 public Splitter trimResults(CharMatcher trimmer) { 361 checkNotNull(trimmer); 362 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 363 } 364 365 /** 366 * Splits {@code sequence} into string components and makes them available 367 * through an {@link Iterator}, which may be lazily evaluated. 368 * 369 * @param sequence the sequence of characters to split 370 * @return an iteration over the segments split from the parameter. 371 */ 372 public Iterable<String> split(final CharSequence sequence) { 373 checkNotNull(sequence); 374 375 return new Iterable<String>() { 376 @Override public Iterator<String> iterator() { 377 return spliterator(sequence); 378 } 379 }; 380 } 381 382 private Iterator<String> spliterator(CharSequence sequence) { 383 return strategy.iterator(this, sequence); 384 } 385 386 /** 387 * Returns a {@code MapSplitter} which splits entries based on this splitter, 388 * and splits entries into keys and values using the specified separator. 389 * 390 * @since 10.0 391 */ 392 @CheckReturnValue 393 @Beta 394 public MapSplitter withKeyValueSeparator(String separator) { 395 return withKeyValueSeparator(on(separator)); 396 } 397 398 /** 399 * Returns a {@code MapSplitter} which splits entries based on this splitter, 400 * and splits entries into keys and values using the specified key-value 401 * splitter. 402 * 403 * @since 10.0 404 */ 405 @CheckReturnValue 406 @Beta 407 public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) { 408 return new MapSplitter(this, keyValueSplitter); 409 } 410 411 /** 412 * An object that splits strings into maps as {@code Splitter} splits 413 * iterables and lists. Like {@code Splitter}, it is thread-safe and 414 * immutable. 415 * 416 * @since 10.0 417 */ 418 @Beta 419 public static final class MapSplitter { 420 private static final String INVALID_ENTRY_MESSAGE = 421 "Chunk [%s] is not a valid entry"; 422 private final Splitter outerSplitter; 423 private final Splitter entrySplitter; 424 425 private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) { 426 this.outerSplitter = outerSplitter; // only "this" is passed 427 this.entrySplitter = checkNotNull(entrySplitter); 428 } 429 430 /** 431 * Splits {@code sequence} into substrings, splits each substring into 432 * an entry, and returns an unmodifiable map with each of the entries. For 433 * example, <code> 434 * Splitter.on(';').trimResults().withKeyValueSeparator("=>") 435 * .split("a=>b ; c=>b") 436 * </code> will return a mapping from {@code "a"} to {@code "b"} and 437 * {@code "c"} to {@code b}. 438 * 439 * <p>The returned map preserves the order of the entries from 440 * {@code sequence}. 441 * 442 * @throws IllegalArgumentException if the specified sequence does not split 443 * into valid map entries, or if there are duplicate keys 444 */ 445 public Map<String, String> split(CharSequence sequence) { 446 Map<String, String> map = new LinkedHashMap<String, String>(); 447 for (String entry : outerSplitter.split(sequence)) { 448 Iterator<String> entryFields = entrySplitter.spliterator(entry); 449 450 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 451 String key = entryFields.next(); 452 checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key); 453 454 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 455 String value = entryFields.next(); 456 map.put(key, value); 457 458 checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry); 459 } 460 return Collections.unmodifiableMap(map); 461 } 462 } 463 464 private interface Strategy { 465 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 466 } 467 468 private abstract static class SplittingIterator 469 extends AbstractIterator<String> { 470 final CharSequence toSplit; 471 final CharMatcher trimmer; 472 final boolean omitEmptyStrings; 473 474 /** 475 * Returns the first index in {@code toSplit} at or after {@code start} 476 * that contains the separator. 477 */ 478 abstract int separatorStart(int start); 479 480 /** 481 * Returns the first index in {@code toSplit} after {@code 482 * separatorPosition} that does not contain a separator. This method is only 483 * invoked after a call to {@code separatorStart}. 484 */ 485 abstract int separatorEnd(int separatorPosition); 486 487 int offset = 0; 488 int limit; 489 490 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 491 this.trimmer = splitter.trimmer; 492 this.omitEmptyStrings = splitter.omitEmptyStrings; 493 this.limit = splitter.limit; 494 this.toSplit = toSplit; 495 } 496 497 @Override protected String computeNext() { 498 while (offset != -1) { 499 int start = offset; 500 int end; 501 502 int separatorPosition = separatorStart(offset); 503 if (separatorPosition == -1) { 504 end = toSplit.length(); 505 offset = -1; 506 } else { 507 end = separatorPosition; 508 offset = separatorEnd(separatorPosition); 509 } 510 511 while (start < end && trimmer.matches(toSplit.charAt(start))) { 512 start++; 513 } 514 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 515 end--; 516 } 517 518 if (omitEmptyStrings && start == end) { 519 continue; 520 } 521 522 if (limit == 1) { 523 // The limit has been reached, return the rest of the string as the 524 // final item. This is tested after empty string removal so that 525 // empty strings do not count towards the limit. 526 end = toSplit.length(); 527 offset = -1; 528 // Since we may have changed the end, we need to trim it again. 529 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 530 end--; 531 } 532 } else { 533 limit--; 534 } 535 536 return toSplit.subSequence(start, end).toString(); 537 } 538 return endOfData(); 539 } 540 } 541 542 /* 543 * Copied from common.collect.AbstractIterator. TODO(kevinb): un-fork if these 544 * packages are ever combined into a single library. 545 */ 546 private abstract static class AbstractIterator<T> implements Iterator<T> { 547 State state = State.NOT_READY; 548 549 enum State { 550 READY, NOT_READY, DONE, FAILED, 551 } 552 553 T next; 554 555 protected abstract T computeNext(); 556 557 protected final T endOfData() { 558 state = State.DONE; 559 return null; 560 } 561 562 @Override 563 public final boolean hasNext() { 564 checkState(state != State.FAILED); 565 switch (state) { 566 case DONE: 567 return false; 568 case READY: 569 return true; 570 default: 571 } 572 return tryToComputeNext(); 573 } 574 575 boolean tryToComputeNext() { 576 state = State.FAILED; // temporary pessimism 577 next = computeNext(); 578 if (state != State.DONE) { 579 state = State.READY; 580 return true; 581 } 582 return false; 583 } 584 585 @Override 586 public final T next() { 587 if (!hasNext()) { 588 throw new NoSuchElementException(); 589 } 590 state = State.NOT_READY; 591 return next; 592 } 593 594 @Override public void remove() { 595 throw new UnsupportedOperationException(); 596 } 597 } 598 }