001 /* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 022 import com.google.common.annotations.Beta; 023 import com.google.common.annotations.GwtCompatible; 024 025 import java.util.ArrayList; 026 import java.util.Arrays; 027 import java.util.List; 028 029 import javax.annotation.CheckReturnValue; 030 031 /** 032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 033 * for any {@link Object}. Also offers basic text processing methods based on this function. 034 * Implementations are strongly encouraged to be side-effect-free and immutable. 035 * 036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 038 * 039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 042 * treats these just as two separate characters. 043 * 044 * @author Kevin Bourrillion 045 * @since 1.0 046 */ 047 @Beta // Possibly change from chars to code points; decide constants vs. methods 048 @GwtCompatible 049 public abstract class CharMatcher implements Predicate<Character> { 050 // Constants 051 052 // Excludes 2000-2000a, which is handled as a range 053 private static final String BREAKING_WHITESPACE_CHARS = 054 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000"; 055 056 // Excludes 2007, which is handled as a gap in a pair of ranges 057 private static final String NON_BREAKING_WHITESPACE_CHARS = 058 "\u00a0\u180e\u202f"; 059 060 /** 061 * Determines whether a character is whitespace according to the latest Unicode standard, as 062 * illustrated 063 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 064 * This is not the same definition used by other Java APIs. (See a 065 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 066 * definitions of "whitespace"</a>.) 067 * 068 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 069 * to date. 070 */ 071 public static final CharMatcher WHITESPACE = 072 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS) 073 .or(inRange('\u2000', '\u200a')) 074 .precomputed(); 075 076 /** 077 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 078 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 079 * discussion of that term. 080 * 081 * @since 2.0 082 */ 083 public static final CharMatcher BREAKING_WHITESPACE = 084 anyOf(BREAKING_WHITESPACE_CHARS) 085 .or(inRange('\u2000', '\u2006')) 086 .or(inRange('\u2008', '\u200a')) 087 .precomputed(); 088 089 /** 090 * Determines whether a character is ASCII, meaning that its code point is less than 128. 091 */ 092 public static final CharMatcher ASCII = inRange('\0', '\u007f'); 093 094 /** 095 * Determines whether a character is a digit according to 096 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 097 */ 098 public static final CharMatcher DIGIT; 099 100 static { 101 CharMatcher digit = inRange('0', '9'); 102 String zeroes = 103 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 104 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 105 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 106 for (char base : zeroes.toCharArray()) { 107 digit = digit.or(inRange(base, (char) (base + 9))); 108 } 109 DIGIT = digit.precomputed(); 110 } 111 112 /** 113 * Determines whether a character is whitespace according to {@link Character#isWhitespace(char) 114 * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a 115 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 116 * definitions of "whitespace"</a>.) 117 */ 118 public static final CharMatcher JAVA_WHITESPACE = 119 inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal 120 .or(inRange('\u001c', '\u0020')) 121 .or(is('\u1680')) 122 .or(is('\u180e')) 123 .or(inRange('\u2000', '\u2006')) 124 .or(inRange('\u2008', '\u200b')) 125 .or(inRange('\u2028', '\u2029')) 126 .or(is('\u205f')) 127 .or(is('\u3000')) 128 .precomputed(); 129 130 /** 131 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 132 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 133 */ 134 public static final CharMatcher JAVA_DIGIT = new CharMatcher() { 135 @Override public boolean matches(char c) { 136 return Character.isDigit(c); 137 } 138 }; 139 140 /** 141 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 142 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 143 * inRange('a', 'z').or(inRange('A', 'Z'))}. 144 */ 145 public static final CharMatcher JAVA_LETTER = new CharMatcher() { 146 @Override public boolean matches(char c) { 147 return Character.isLetter(c); 148 } 149 }; 150 151 /** 152 * Determines whether a character is a letter or digit according to {@link 153 * Character#isLetterOrDigit(char) Java's definition}. 154 */ 155 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() { 156 @Override public boolean matches(char c) { 157 return Character.isLetterOrDigit(c); 158 } 159 }; 160 161 /** 162 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 163 * Java's definition}. 164 */ 165 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() { 166 @Override public boolean matches(char c) { 167 return Character.isUpperCase(c); 168 } 169 }; 170 171 /** 172 * Determines whether a character is lower case according to {@link Character#isLowerCase(char) 173 * Java's definition}. 174 */ 175 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() { 176 @Override public boolean matches(char c) { 177 return Character.isLowerCase(c); 178 } 179 }; 180 181 /** 182 * Determines whether a character is an ISO control character as specified by {@link 183 * Character#isISOControl(char)}. 184 */ 185 public static final CharMatcher JAVA_ISO_CONTROL = 186 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f')); 187 188 /** 189 * Determines whether a character is invisible; that is, if its Unicode category is any of 190 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 191 * PRIVATE_USE according to ICU4J. 192 */ 193 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020') 194 .or(inRange('\u007f', '\u00a0')) 195 .or(is('\u00ad')) 196 .or(inRange('\u0600', '\u0603')) 197 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e")) 198 .or(inRange('\u2000', '\u200f')) 199 .or(inRange('\u2028', '\u202f')) 200 .or(inRange('\u205f', '\u2064')) 201 .or(inRange('\u206a', '\u206f')) 202 .or(is('\u3000')) 203 .or(inRange('\ud800', '\uf8ff')) 204 .or(anyOf("\ufeff\ufff9\ufffa\ufffb")) 205 .precomputed(); 206 207 /** 208 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 209 * errs on the side of returning {@code false} (that is, it tends to assume a character is 210 * double-width). 211 * 212 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 213 * date. 214 */ 215 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 216 .or(is('\u05be')) 217 .or(inRange('\u05d0', '\u05ea')) 218 .or(is('\u05f3')) 219 .or(is('\u05f4')) 220 .or(inRange('\u0600', '\u06ff')) 221 .or(inRange('\u0750', '\u077f')) 222 .or(inRange('\u0e00', '\u0e7f')) 223 .or(inRange('\u1e00', '\u20af')) 224 .or(inRange('\u2100', '\u213a')) 225 .or(inRange('\ufb50', '\ufdff')) 226 .or(inRange('\ufe70', '\ufeff')) 227 .or(inRange('\uff61', '\uffdc')) 228 .precomputed(); 229 230 /** Matches any character. */ 231 public static final CharMatcher ANY = 232 new CharMatcher() { 233 @Override public boolean matches(char c) { 234 return true; 235 } 236 237 @Override public int indexIn(CharSequence sequence) { 238 return (sequence.length() == 0) ? -1 : 0; 239 } 240 241 @Override public int indexIn(CharSequence sequence, int start) { 242 int length = sequence.length(); 243 Preconditions.checkPositionIndex(start, length); 244 return (start == length) ? -1 : start; 245 } 246 247 @Override public int lastIndexIn(CharSequence sequence) { 248 return sequence.length() - 1; 249 } 250 251 @Override public boolean matchesAllOf(CharSequence sequence) { 252 checkNotNull(sequence); 253 return true; 254 } 255 256 @Override public boolean matchesNoneOf(CharSequence sequence) { 257 return sequence.length() == 0; 258 } 259 260 @Override public String removeFrom(CharSequence sequence) { 261 checkNotNull(sequence); 262 return ""; 263 } 264 265 @Override public String replaceFrom(CharSequence sequence, char replacement) { 266 char[] array = new char[sequence.length()]; 267 Arrays.fill(array, replacement); 268 return new String(array); 269 } 270 271 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 272 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 273 for (int i = 0; i < sequence.length(); i++) { 274 retval.append(replacement); 275 } 276 return retval.toString(); 277 } 278 279 @Override public String collapseFrom(CharSequence sequence, char replacement) { 280 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 281 } 282 283 @Override public String trimFrom(CharSequence sequence) { 284 checkNotNull(sequence); 285 return ""; 286 } 287 288 @Override public int countIn(CharSequence sequence) { 289 return sequence.length(); 290 } 291 292 @Override public CharMatcher and(CharMatcher other) { 293 return checkNotNull(other); 294 } 295 296 @Override public CharMatcher or(CharMatcher other) { 297 checkNotNull(other); 298 return this; 299 } 300 301 @Override public CharMatcher negate() { 302 return NONE; 303 } 304 305 @Override public CharMatcher precomputed() { 306 return this; 307 } 308 }; 309 310 /** Matches no characters. */ 311 public static final CharMatcher NONE = 312 new CharMatcher() { 313 @Override public boolean matches(char c) { 314 return false; 315 } 316 317 @Override public int indexIn(CharSequence sequence) { 318 checkNotNull(sequence); 319 return -1; 320 } 321 322 @Override public int indexIn(CharSequence sequence, int start) { 323 int length = sequence.length(); 324 Preconditions.checkPositionIndex(start, length); 325 return -1; 326 } 327 328 @Override public int lastIndexIn(CharSequence sequence) { 329 checkNotNull(sequence); 330 return -1; 331 } 332 333 @Override public boolean matchesAllOf(CharSequence sequence) { 334 return sequence.length() == 0; 335 } 336 337 @Override public boolean matchesNoneOf(CharSequence sequence) { 338 checkNotNull(sequence); 339 return true; 340 } 341 342 @Override public String removeFrom(CharSequence sequence) { 343 return sequence.toString(); 344 } 345 346 @Override public String replaceFrom(CharSequence sequence, char replacement) { 347 return sequence.toString(); 348 } 349 350 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 351 checkNotNull(replacement); 352 return sequence.toString(); 353 } 354 355 @Override public String collapseFrom(CharSequence sequence, char replacement) { 356 return sequence.toString(); 357 } 358 359 @Override public String trimFrom(CharSequence sequence) { 360 return sequence.toString(); 361 } 362 363 @Override public int countIn(CharSequence sequence) { 364 checkNotNull(sequence); 365 return 0; 366 } 367 368 @Override public CharMatcher and(CharMatcher other) { 369 checkNotNull(other); 370 return this; 371 } 372 373 @Override public CharMatcher or(CharMatcher other) { 374 return checkNotNull(other); 375 } 376 377 @Override public CharMatcher negate() { 378 return ANY; 379 } 380 381 @Override void setBits(LookupTable table) {} 382 383 @Override public CharMatcher precomputed() { 384 return this; 385 } 386 }; 387 388 // Static factories 389 390 /** 391 * Returns a {@code char} matcher that matches only one specified character. 392 */ 393 public static CharMatcher is(final char match) { 394 return new CharMatcher() { 395 @Override public boolean matches(char c) { 396 return c == match; 397 } 398 399 @Override public String replaceFrom(CharSequence sequence, char replacement) { 400 return sequence.toString().replace(match, replacement); 401 } 402 403 @Override public CharMatcher and(CharMatcher other) { 404 return other.matches(match) ? this : NONE; 405 } 406 407 @Override public CharMatcher or(CharMatcher other) { 408 return other.matches(match) ? other : super.or(other); 409 } 410 411 @Override public CharMatcher negate() { 412 return isNot(match); 413 } 414 415 @Override void setBits(LookupTable table) { 416 table.set(match); 417 } 418 419 @Override public CharMatcher precomputed() { 420 return this; 421 } 422 }; 423 } 424 425 /** 426 * Returns a {@code char} matcher that matches any character except the one specified. 427 * 428 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 429 */ 430 public static CharMatcher isNot(final char match) { 431 return new CharMatcher() { 432 @Override public boolean matches(char c) { 433 return c != match; 434 } 435 436 @Override public CharMatcher and(CharMatcher other) { 437 return other.matches(match) ? super.and(other) : other; 438 } 439 440 @Override public CharMatcher or(CharMatcher other) { 441 return other.matches(match) ? ANY : this; 442 } 443 444 @Override public CharMatcher negate() { 445 return is(match); 446 } 447 }; 448 } 449 450 /** 451 * Returns a {@code char} matcher that matches any character present in the given character 452 * sequence. 453 */ 454 public static CharMatcher anyOf(final CharSequence sequence) { 455 switch (sequence.length()) { 456 case 0: 457 return NONE; 458 case 1: 459 return is(sequence.charAt(0)); 460 case 2: 461 final char match1 = sequence.charAt(0); 462 final char match2 = sequence.charAt(1); 463 return new CharMatcher() { 464 @Override public boolean matches(char c) { 465 return c == match1 || c == match2; 466 } 467 468 @Override void setBits(LookupTable table) { 469 table.set(match1); 470 table.set(match2); 471 } 472 473 @Override public CharMatcher precomputed() { 474 return this; 475 } 476 }; 477 } 478 479 final char[] chars = sequence.toString().toCharArray(); 480 Arrays.sort(chars); // not worth collapsing duplicates 481 482 return new CharMatcher() { 483 @Override public boolean matches(char c) { 484 return Arrays.binarySearch(chars, c) >= 0; 485 } 486 487 @Override void setBits(LookupTable table) { 488 for (char c : chars) { 489 table.set(c); 490 } 491 } 492 }; 493 } 494 495 /** 496 * Returns a {@code char} matcher that matches any character not present in the given character 497 * sequence. 498 */ 499 public static CharMatcher noneOf(CharSequence sequence) { 500 return anyOf(sequence).negate(); 501 } 502 503 /** 504 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 505 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 506 * CharMatcher.inRange('a', 'z')}. 507 * 508 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 509 */ 510 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 511 checkArgument(endInclusive >= startInclusive); 512 return new CharMatcher() { 513 @Override public boolean matches(char c) { 514 return startInclusive <= c && c <= endInclusive; 515 } 516 517 @Override void setBits(LookupTable table) { 518 char c = startInclusive; 519 while (true) { 520 table.set(c); 521 if (c++ == endInclusive) { 522 break; 523 } 524 } 525 } 526 527 @Override public CharMatcher precomputed() { 528 return this; 529 } 530 }; 531 } 532 533 /** 534 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 535 * which operates on primitive {@code char} instances instead. 536 */ 537 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 538 checkNotNull(predicate); 539 if (predicate instanceof CharMatcher) { 540 return (CharMatcher) predicate; 541 } 542 return new CharMatcher() { 543 @Override public boolean matches(char c) { 544 return predicate.apply(c); 545 } 546 547 @Override public boolean apply(Character character) { 548 return predicate.apply(checkNotNull(character)); 549 } 550 }; 551 } 552 553 // Abstract methods 554 555 /** Determines a true or false value for the given character. */ 556 public abstract boolean matches(char c); 557 558 // Non-static factories 559 560 /** 561 * Returns a matcher that matches any character not matched by this matcher. 562 */ 563 public CharMatcher negate() { 564 final CharMatcher original = this; 565 return new CharMatcher() { 566 @Override public boolean matches(char c) { 567 return !original.matches(c); 568 } 569 570 @Override public boolean matchesAllOf(CharSequence sequence) { 571 return original.matchesNoneOf(sequence); 572 } 573 574 @Override public boolean matchesNoneOf(CharSequence sequence) { 575 return original.matchesAllOf(sequence); 576 } 577 578 @Override public int countIn(CharSequence sequence) { 579 return sequence.length() - original.countIn(sequence); 580 } 581 582 @Override public CharMatcher negate() { 583 return original; 584 } 585 }; 586 } 587 588 /** 589 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 590 */ 591 public CharMatcher and(CharMatcher other) { 592 return new And(Arrays.asList(this, checkNotNull(other))); 593 } 594 595 private static class And extends CharMatcher { 596 List<CharMatcher> components; 597 598 And(List<CharMatcher> components) { 599 this.components = components; // Skip defensive copy (private) 600 } 601 602 @Override public boolean matches(char c) { 603 for (CharMatcher matcher : components) { 604 if (!matcher.matches(c)) { 605 return false; 606 } 607 } 608 return true; 609 } 610 611 @Override public CharMatcher and(CharMatcher other) { 612 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 613 newComponents.add(checkNotNull(other)); 614 return new And(newComponents); 615 } 616 } 617 618 /** 619 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 620 */ 621 public CharMatcher or(CharMatcher other) { 622 return new Or(Arrays.asList(this, checkNotNull(other))); 623 } 624 625 private static class Or extends CharMatcher { 626 List<CharMatcher> components; 627 628 Or(List<CharMatcher> components) { 629 this.components = components; // Skip defensive copy (private) 630 } 631 632 @Override public boolean matches(char c) { 633 for (CharMatcher matcher : components) { 634 if (matcher.matches(c)) { 635 return true; 636 } 637 } 638 return false; 639 } 640 641 @Override public CharMatcher or(CharMatcher other) { 642 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 643 newComponents.add(checkNotNull(other)); 644 return new Or(newComponents); 645 } 646 647 @Override void setBits(LookupTable table) { 648 for (CharMatcher matcher : components) { 649 matcher.setBits(table); 650 } 651 } 652 } 653 654 /** 655 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 656 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 657 * worthwhile only if the precomputed matcher is queried many thousands of times. 658 * 659 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 660 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 661 * worthwhile tradeoff in a browser. 662 */ 663 public CharMatcher precomputed() { 664 return Platform.precomputeCharMatcher(this); 665 } 666 667 /** 668 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 669 * on {@link Platform} so that we can have different behavior in GWT. 670 * 671 * <p>The default precomputation is to cache the configuration of the original matcher in an 672 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query 673 * than the original. 674 * 675 * <p>The default implementation creates a new bit array and passes it to {@link 676 * #setBits(LookupTable)}. 677 */ 678 CharMatcher precomputedInternal() { 679 final LookupTable table = new LookupTable(); 680 setBits(table); 681 682 return new CharMatcher() { 683 @Override public boolean matches(char c) { 684 return table.get(c); 685 } 686 687 // TODO(kevinb): make methods like negate() smart? 688 689 @Override public CharMatcher precomputed() { 690 return this; 691 } 692 }; 693 } 694 695 /** 696 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal 697 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched. 698 * 699 * <p>The default implementation loops over every possible character value, invoking {@link 700 * #matches} for each one. 701 */ 702 void setBits(LookupTable table) { 703 char c = Character.MIN_VALUE; 704 while (true) { 705 if (matches(c)) { 706 table.set(c); 707 } 708 if (c++ == Character.MAX_VALUE) { 709 break; 710 } 711 } 712 } 713 714 /** 715 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}. 716 * 717 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a 718 * simpler java.util.BitSet. 719 */ 720 private static final class LookupTable { 721 int[] data = new int[2048]; 722 723 void set(char index) { 724 data[index >> 5] |= (1 << index); 725 } 726 727 boolean get(char index) { 728 return (data[index >> 5] & (1 << index)) != 0; 729 } 730 } 731 732 // Text processing routines 733 734 /** 735 * Returns {@code true} if a character sequence contains at least one matching character. 736 * Equivalent to {@code !matchesNoneOf(sequence)}. 737 * 738 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 739 * character, until this returns {@code true} or the end is reached. 740 * 741 * @param sequence the character sequence to examine, possibly empty 742 * @return {@code true} if this matcher matches at least one character in the sequence 743 * @since 8.0 744 */ 745 public boolean matchesAnyOf(CharSequence sequence) { 746 return !matchesNoneOf(sequence); 747 } 748 749 /** 750 * Returns {@code true} if a character sequence contains only matching characters. 751 * 752 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 753 * character, until this returns {@code false} or the end is reached. 754 * 755 * @param sequence the character sequence to examine, possibly empty 756 * @return {@code true} if this matcher matches every character in the sequence, including when 757 * the sequence is empty 758 */ 759 public boolean matchesAllOf(CharSequence sequence) { 760 for (int i = sequence.length() - 1; i >= 0; i--) { 761 if (!matches(sequence.charAt(i))) { 762 return false; 763 } 764 } 765 return true; 766 } 767 768 /** 769 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 770 * {@code !matchesAnyOf(sequence)}. 771 * 772 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 773 * character, until this returns {@code false} or the end is reached. 774 * 775 * @param sequence the character sequence to examine, possibly empty 776 * @return {@code true} if this matcher matches every character in the sequence, including when 777 * the sequence is empty 778 */ 779 public boolean matchesNoneOf(CharSequence sequence) { 780 return indexIn(sequence) == -1; 781 } 782 783 // TODO(kevinb): add matchesAnyOf() 784 785 /** 786 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 787 * matching character is present. 788 * 789 * <p>The default implementation iterates over the sequence in forward order calling {@link 790 * #matches} for each character. 791 * 792 * @param sequence the character sequence to examine from the beginning 793 * @return an index, or {@code -1} if no character matches 794 */ 795 public int indexIn(CharSequence sequence) { 796 int length = sequence.length(); 797 for (int i = 0; i < length; i++) { 798 if (matches(sequence.charAt(i))) { 799 return i; 800 } 801 } 802 return -1; 803 } 804 805 /** 806 * Returns the index of the first matching character in a character sequence, starting from a 807 * given position, or {@code -1} if no character matches after that position. 808 * 809 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 810 * start}, calling {@link #matches} for each character. 811 * 812 * @param sequence the character sequence to examine 813 * @param start the first index to examine; must be nonnegative and no greater than {@code 814 * sequence.length()} 815 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 816 * or {@code -1} if no character matches 817 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 818 * sequence.length()} 819 */ 820 public int indexIn(CharSequence sequence, int start) { 821 int length = sequence.length(); 822 Preconditions.checkPositionIndex(start, length); 823 for (int i = start; i < length; i++) { 824 if (matches(sequence.charAt(i))) { 825 return i; 826 } 827 } 828 return -1; 829 } 830 831 /** 832 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 833 * matching character is present. 834 * 835 * <p>The default implementation iterates over the sequence in reverse order calling {@link 836 * #matches} for each character. 837 * 838 * @param sequence the character sequence to examine from the end 839 * @return an index, or {@code -1} if no character matches 840 */ 841 public int lastIndexIn(CharSequence sequence) { 842 for (int i = sequence.length() - 1; i >= 0; i--) { 843 if (matches(sequence.charAt(i))) { 844 return i; 845 } 846 } 847 return -1; 848 } 849 850 /** 851 * Returns the number of matching characters found in a character sequence. 852 */ 853 public int countIn(CharSequence sequence) { 854 int count = 0; 855 for (int i = 0; i < sequence.length(); i++) { 856 if (matches(sequence.charAt(i))) { 857 count++; 858 } 859 } 860 return count; 861 } 862 863 /** 864 * Returns a string containing all non-matching characters of a character sequence, in order. For 865 * example: <pre> {@code 866 * 867 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 868 * 869 * ... returns {@code "bzr"}. 870 */ 871 @CheckReturnValue 872 public String removeFrom(CharSequence sequence) { 873 String string = sequence.toString(); 874 int pos = indexIn(string); 875 if (pos == -1) { 876 return string; 877 } 878 879 char[] chars = string.toCharArray(); 880 int spread = 1; 881 882 // This unusual loop comes from extensive benchmarking 883 OUT: while (true) { 884 pos++; 885 while (true) { 886 if (pos == chars.length) { 887 break OUT; 888 } 889 if (matches(chars[pos])) { 890 break; 891 } 892 chars[pos - spread] = chars[pos]; 893 pos++; 894 } 895 spread++; 896 } 897 return new String(chars, 0, pos - spread); 898 } 899 900 /** 901 * Returns a string containing all matching characters of a character sequence, in order. For 902 * example: <pre> {@code 903 * 904 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 905 * 906 * ... returns {@code "aaa"}. 907 */ 908 @CheckReturnValue 909 public String retainFrom(CharSequence sequence) { 910 return negate().removeFrom(sequence); 911 } 912 913 /** 914 * Returns a string copy of the input character sequence, with each character that matches this 915 * matcher replaced by a given replacement character. For example: <pre> {@code 916 * 917 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 918 * 919 * ... returns {@code "rodor"}. 920 * 921 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 922 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 923 * character. 924 * 925 * @param sequence the character sequence to replace matching characters in 926 * @param replacement the character to append to the result string in place of each matching 927 * character in {@code sequence} 928 * @return the new string 929 */ 930 @CheckReturnValue 931 public String replaceFrom(CharSequence sequence, char replacement) { 932 String string = sequence.toString(); 933 int pos = indexIn(string); 934 if (pos == -1) { 935 return string; 936 } 937 char[] chars = string.toCharArray(); 938 chars[pos] = replacement; 939 for (int i = pos + 1; i < chars.length; i++) { 940 if (matches(chars[i])) { 941 chars[i] = replacement; 942 } 943 } 944 return new String(chars); 945 } 946 947 /** 948 * Returns a string copy of the input character sequence, with each character that matches this 949 * matcher replaced by a given replacement sequence. For example: <pre> {@code 950 * 951 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 952 * 953 * ... returns {@code "yoohoo"}. 954 * 955 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 956 * off calling {@link #replaceFrom(CharSequence, char)} directly. 957 * 958 * @param sequence the character sequence to replace matching characters in 959 * @param replacement the characters to append to the result string in place of each matching 960 * character in {@code sequence} 961 * @return the new string 962 */ 963 @CheckReturnValue 964 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 965 int replacementLen = replacement.length(); 966 if (replacementLen == 0) { 967 return removeFrom(sequence); 968 } 969 if (replacementLen == 1) { 970 return replaceFrom(sequence, replacement.charAt(0)); 971 } 972 973 String string = sequence.toString(); 974 int pos = indexIn(string); 975 if (pos == -1) { 976 return string; 977 } 978 979 int len = string.length(); 980 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 981 982 int oldpos = 0; 983 do { 984 buf.append(string, oldpos, pos); 985 buf.append(replacement); 986 oldpos = pos + 1; 987 pos = indexIn(string, oldpos); 988 } while (pos != -1); 989 990 buf.append(string, oldpos, len); 991 return buf.toString(); 992 } 993 994 /** 995 * Returns a substring of the input character sequence that omits all characters this matcher 996 * matches from the beginning and from the end of the string. For example: <pre> {@code 997 * 998 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 999 * 1000 * ... returns {@code "cat"}. 1001 * 1002 * <p>Note that: <pre> {@code 1003 * 1004 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1005 * 1006 * ... is equivalent to {@link String#trim()}. 1007 */ 1008 @CheckReturnValue 1009 public String trimFrom(CharSequence sequence) { 1010 int len = sequence.length(); 1011 int first; 1012 int last; 1013 1014 for (first = 0; first < len; first++) { 1015 if (!matches(sequence.charAt(first))) { 1016 break; 1017 } 1018 } 1019 for (last = len - 1; last > first; last--) { 1020 if (!matches(sequence.charAt(last))) { 1021 break; 1022 } 1023 } 1024 1025 return sequence.subSequence(first, last + 1).toString(); 1026 } 1027 1028 /** 1029 * Returns a substring of the input character sequence that omits all characters this matcher 1030 * matches from the beginning of the string. For example: <pre> {@code 1031 * 1032 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1033 * 1034 * ... returns {@code "catbab"}. 1035 */ 1036 @CheckReturnValue 1037 public String trimLeadingFrom(CharSequence sequence) { 1038 int len = sequence.length(); 1039 int first; 1040 1041 for (first = 0; first < len; first++) { 1042 if (!matches(sequence.charAt(first))) { 1043 break; 1044 } 1045 } 1046 1047 return sequence.subSequence(first, len).toString(); 1048 } 1049 1050 /** 1051 * Returns a substring of the input character sequence that omits all characters this matcher 1052 * matches from the end of the string. For example: <pre> {@code 1053 * 1054 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1055 * 1056 * ... returns {@code "abacat"}. 1057 */ 1058 @CheckReturnValue 1059 public String trimTrailingFrom(CharSequence sequence) { 1060 int len = sequence.length(); 1061 int last; 1062 1063 for (last = len - 1; last >= 0; last--) { 1064 if (!matches(sequence.charAt(last))) { 1065 break; 1066 } 1067 } 1068 1069 return sequence.subSequence(0, last + 1).toString(); 1070 } 1071 1072 /** 1073 * Returns a string copy of the input character sequence, with each group of consecutive 1074 * characters that match this matcher replaced by a single replacement character. For example: 1075 * <pre> {@code 1076 * 1077 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1078 * 1079 * ... returns {@code "b-p-r"}. 1080 * 1081 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1082 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1083 * character. 1084 * 1085 * @param sequence the character sequence to replace matching groups of characters in 1086 * @param replacement the character to append to the result string in place of each group of 1087 * matching characters in {@code sequence} 1088 * @return the new string 1089 */ 1090 @CheckReturnValue 1091 public String collapseFrom(CharSequence sequence, char replacement) { 1092 int first = indexIn(sequence); 1093 if (first == -1) { 1094 return sequence.toString(); 1095 } 1096 1097 // TODO(kevinb): see if this implementation can be made faster 1098 StringBuilder builder = new StringBuilder(sequence.length()) 1099 .append(sequence.subSequence(0, first)) 1100 .append(replacement); 1101 boolean in = true; 1102 for (int i = first + 1; i < sequence.length(); i++) { 1103 char c = sequence.charAt(i); 1104 if (apply(c)) { 1105 if (!in) { 1106 builder.append(replacement); 1107 in = true; 1108 } 1109 } else { 1110 builder.append(c); 1111 in = false; 1112 } 1113 } 1114 return builder.toString(); 1115 } 1116 1117 /** 1118 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1119 * groups of matching characters at the start or end of the sequence are removed without 1120 * replacement. 1121 */ 1122 @CheckReturnValue 1123 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1124 int first = negate().indexIn(sequence); 1125 if (first == -1) { 1126 return ""; // everything matches. nothing's left. 1127 } 1128 StringBuilder builder = new StringBuilder(sequence.length()); 1129 boolean inMatchingGroup = false; 1130 for (int i = first; i < sequence.length(); i++) { 1131 char c = sequence.charAt(i); 1132 if (apply(c)) { 1133 inMatchingGroup = true; 1134 } else { 1135 if (inMatchingGroup) { 1136 builder.append(replacement); 1137 inMatchingGroup = false; 1138 } 1139 builder.append(c); 1140 } 1141 } 1142 return builder.toString(); 1143 } 1144 1145 // Predicate interface 1146 1147 /** 1148 * Returns {@code true} if this matcher matches the given character. 1149 * 1150 * @throws NullPointerException if {@code character} is null 1151 */ 1152 @Override public boolean apply(Character character) { 1153 return matches(character); 1154 } 1155 }