001 /*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021
022 import com.google.common.annotations.Beta;
023 import com.google.common.annotations.GwtCompatible;
024
025 import java.util.ArrayList;
026 import java.util.Arrays;
027 import java.util.List;
028
029 import javax.annotation.CheckReturnValue;
030
031 /**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * @author Kevin Bourrillion
045 * @since 1.0
046 */
047 @Beta // Possibly change from chars to code points; decide constants vs. methods
048 @GwtCompatible
049 public abstract class CharMatcher implements Predicate<Character> {
050 // Constants
051
// Excludes 2000-200a, which is handled as a range
053 private static final String BREAKING_WHITESPACE_CHARS =
054 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
055
056 // Excludes 2007, which is handled as a gap in a pair of ranges
057 private static final String NON_BREAKING_WHITESPACE_CHARS =
058 "\u00a0\u180e\u202f";
059
060 /**
061 * Determines whether a character is whitespace according to the latest Unicode standard, as
062 * illustrated
063 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
064 * This is not the same definition used by other Java APIs. (See a
065 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
066 * definitions of "whitespace"</a>.)
067 *
068 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
069 * to date.
070 */
071 public static final CharMatcher WHITESPACE =
072 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
073 .or(inRange('\u2000', '\u200a'))
074 .precomputed();
075
076 /**
077 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
078 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
079 * discussion of that term.
080 *
081 * @since 2.0
082 */
083 public static final CharMatcher BREAKING_WHITESPACE =
084 anyOf(BREAKING_WHITESPACE_CHARS)
085 .or(inRange('\u2000', '\u2006'))
086 .or(inRange('\u2008', '\u200a'))
087 .precomputed();
088
089 /**
090 * Determines whether a character is ASCII, meaning that its code point is less than 128.
091 */
092 public static final CharMatcher ASCII = inRange('\0', '\u007f');
093
094 /**
095 * Determines whether a character is a digit according to
096 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
097 */
public static final CharMatcher DIGIT;

static {
  // Each entry is the first character of a run of ten consecutive digit characters.
  final char[] zeroDigits =
      ("\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
              + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
              + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10")
          .toCharArray();

  // Start from the ASCII digits and union in one ten-character range per entry.
  CharMatcher accumulated = inRange('0', '9');
  for (int i = 0; i < zeroDigits.length; i++) {
    char zero = zeroDigits[i];
    char nine = (char) (zero + 9);
    accumulated = accumulated.or(inRange(zero, nine));
  }
  DIGIT = accumulated.precomputed();
}
111
112 /**
113 * Determines whether a character is whitespace according to {@link Character#isWhitespace(char)
114 * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a
115 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
116 * definitions of "whitespace"</a>.)
117 */
118 public static final CharMatcher JAVA_WHITESPACE =
119 inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal
120 .or(inRange('\u001c', '\u0020'))
121 .or(is('\u1680'))
122 .or(is('\u180e'))
123 .or(inRange('\u2000', '\u2006'))
124 .or(inRange('\u2008', '\u200b'))
125 .or(inRange('\u2028', '\u2029'))
126 .or(is('\u205f'))
127 .or(is('\u3000'))
128 .precomputed();
129
130 /**
131 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
132 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
133 */
134 public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
135 @Override public boolean matches(char c) {
136 return Character.isDigit(c);
137 }
138 };
139
140 /**
141 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
142 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
143 * inRange('a', 'z').or(inRange('A', 'Z'))}.
144 */
145 public static final CharMatcher JAVA_LETTER = new CharMatcher() {
146 @Override public boolean matches(char c) {
147 return Character.isLetter(c);
148 }
149 };
150
151 /**
152 * Determines whether a character is a letter or digit according to {@link
153 * Character#isLetterOrDigit(char) Java's definition}.
154 */
155 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
156 @Override public boolean matches(char c) {
157 return Character.isLetterOrDigit(c);
158 }
159 };
160
161 /**
162 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
163 * Java's definition}.
164 */
165 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
166 @Override public boolean matches(char c) {
167 return Character.isUpperCase(c);
168 }
169 };
170
171 /**
172 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
173 * Java's definition}.
174 */
175 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
176 @Override public boolean matches(char c) {
177 return Character.isLowerCase(c);
178 }
179 };
180
181 /**
182 * Determines whether a character is an ISO control character as specified by {@link
183 * Character#isISOControl(char)}.
184 */
185 public static final CharMatcher JAVA_ISO_CONTROL =
186 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'));
187
188 /**
189 * Determines whether a character is invisible; that is, if its Unicode category is any of
190 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
191 * PRIVATE_USE according to ICU4J.
192 */
193 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
194 .or(inRange('\u007f', '\u00a0'))
195 .or(is('\u00ad'))
196 .or(inRange('\u0600', '\u0603'))
197 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
198 .or(inRange('\u2000', '\u200f'))
199 .or(inRange('\u2028', '\u202f'))
200 .or(inRange('\u205f', '\u2064'))
201 .or(inRange('\u206a', '\u206f'))
202 .or(is('\u3000'))
203 .or(inRange('\ud800', '\uf8ff'))
204 .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
205 .precomputed();
206
207 /**
208 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
209 * errs on the side of returning {@code false} (that is, it tends to assume a character is
210 * double-width).
211 *
212 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
213 * date.
214 */
215 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
216 .or(is('\u05be'))
217 .or(inRange('\u05d0', '\u05ea'))
218 .or(is('\u05f3'))
219 .or(is('\u05f4'))
220 .or(inRange('\u0600', '\u06ff'))
221 .or(inRange('\u0750', '\u077f'))
222 .or(inRange('\u0e00', '\u0e7f'))
223 .or(inRange('\u1e00', '\u20af'))
224 .or(inRange('\u2100', '\u213a'))
225 .or(inRange('\ufb50', '\ufdff'))
226 .or(inRange('\ufe70', '\ufeff'))
227 .or(inRange('\uff61', '\uffdc'))
228 .precomputed();
229
230 /** Matches any character. */
231 public static final CharMatcher ANY =
232 new CharMatcher() {
233 @Override public boolean matches(char c) {
234 return true;
235 }
236
237 @Override public int indexIn(CharSequence sequence) {
238 return (sequence.length() == 0) ? -1 : 0;
239 }
240
241 @Override public int indexIn(CharSequence sequence, int start) {
242 int length = sequence.length();
243 Preconditions.checkPositionIndex(start, length);
244 return (start == length) ? -1 : start;
245 }
246
247 @Override public int lastIndexIn(CharSequence sequence) {
248 return sequence.length() - 1;
249 }
250
251 @Override public boolean matchesAllOf(CharSequence sequence) {
252 checkNotNull(sequence);
253 return true;
254 }
255
256 @Override public boolean matchesNoneOf(CharSequence sequence) {
257 return sequence.length() == 0;
258 }
259
260 @Override public String removeFrom(CharSequence sequence) {
261 checkNotNull(sequence);
262 return "";
263 }
264
265 @Override public String replaceFrom(CharSequence sequence, char replacement) {
266 char[] array = new char[sequence.length()];
267 Arrays.fill(array, replacement);
268 return new String(array);
269 }
270
271 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
272 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
273 for (int i = 0; i < sequence.length(); i++) {
274 retval.append(replacement);
275 }
276 return retval.toString();
277 }
278
279 @Override public String collapseFrom(CharSequence sequence, char replacement) {
280 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
281 }
282
283 @Override public String trimFrom(CharSequence sequence) {
284 checkNotNull(sequence);
285 return "";
286 }
287
288 @Override public int countIn(CharSequence sequence) {
289 return sequence.length();
290 }
291
292 @Override public CharMatcher and(CharMatcher other) {
293 return checkNotNull(other);
294 }
295
296 @Override public CharMatcher or(CharMatcher other) {
297 checkNotNull(other);
298 return this;
299 }
300
301 @Override public CharMatcher negate() {
302 return NONE;
303 }
304
305 @Override public CharMatcher precomputed() {
306 return this;
307 }
308 };
309
310 /** Matches no characters. */
311 public static final CharMatcher NONE =
312 new CharMatcher() {
313 @Override public boolean matches(char c) {
314 return false;
315 }
316
317 @Override public int indexIn(CharSequence sequence) {
318 checkNotNull(sequence);
319 return -1;
320 }
321
322 @Override public int indexIn(CharSequence sequence, int start) {
323 int length = sequence.length();
324 Preconditions.checkPositionIndex(start, length);
325 return -1;
326 }
327
328 @Override public int lastIndexIn(CharSequence sequence) {
329 checkNotNull(sequence);
330 return -1;
331 }
332
333 @Override public boolean matchesAllOf(CharSequence sequence) {
334 return sequence.length() == 0;
335 }
336
337 @Override public boolean matchesNoneOf(CharSequence sequence) {
338 checkNotNull(sequence);
339 return true;
340 }
341
342 @Override public String removeFrom(CharSequence sequence) {
343 return sequence.toString();
344 }
345
346 @Override public String replaceFrom(CharSequence sequence, char replacement) {
347 return sequence.toString();
348 }
349
350 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
351 checkNotNull(replacement);
352 return sequence.toString();
353 }
354
355 @Override public String collapseFrom(CharSequence sequence, char replacement) {
356 return sequence.toString();
357 }
358
359 @Override public String trimFrom(CharSequence sequence) {
360 return sequence.toString();
361 }
362
363 @Override public int countIn(CharSequence sequence) {
364 checkNotNull(sequence);
365 return 0;
366 }
367
368 @Override public CharMatcher and(CharMatcher other) {
369 checkNotNull(other);
370 return this;
371 }
372
373 @Override public CharMatcher or(CharMatcher other) {
374 return checkNotNull(other);
375 }
376
377 @Override public CharMatcher negate() {
378 return ANY;
379 }
380
381 @Override void setBits(LookupTable table) {}
382
383 @Override public CharMatcher precomputed() {
384 return this;
385 }
386 };
387
388 // Static factories
389
390 /**
391 * Returns a {@code char} matcher that matches only one specified character.
392 */
393 public static CharMatcher is(final char match) {
394 return new CharMatcher() {
395 @Override public boolean matches(char c) {
396 return c == match;
397 }
398
399 @Override public String replaceFrom(CharSequence sequence, char replacement) {
400 return sequence.toString().replace(match, replacement);
401 }
402
403 @Override public CharMatcher and(CharMatcher other) {
404 return other.matches(match) ? this : NONE;
405 }
406
407 @Override public CharMatcher or(CharMatcher other) {
408 return other.matches(match) ? other : super.or(other);
409 }
410
411 @Override public CharMatcher negate() {
412 return isNot(match);
413 }
414
415 @Override void setBits(LookupTable table) {
416 table.set(match);
417 }
418
419 @Override public CharMatcher precomputed() {
420 return this;
421 }
422 };
423 }
424
425 /**
426 * Returns a {@code char} matcher that matches any character except the one specified.
427 *
428 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
429 */
430 public static CharMatcher isNot(final char match) {
431 return new CharMatcher() {
432 @Override public boolean matches(char c) {
433 return c != match;
434 }
435
436 @Override public CharMatcher and(CharMatcher other) {
437 return other.matches(match) ? super.and(other) : other;
438 }
439
440 @Override public CharMatcher or(CharMatcher other) {
441 return other.matches(match) ? ANY : this;
442 }
443
444 @Override public CharMatcher negate() {
445 return is(match);
446 }
447 };
448 }
449
450 /**
451 * Returns a {@code char} matcher that matches any character present in the given character
452 * sequence.
453 */
454 public static CharMatcher anyOf(final CharSequence sequence) {
455 switch (sequence.length()) {
456 case 0:
457 return NONE;
458 case 1:
459 return is(sequence.charAt(0));
460 case 2:
461 final char match1 = sequence.charAt(0);
462 final char match2 = sequence.charAt(1);
463 return new CharMatcher() {
464 @Override public boolean matches(char c) {
465 return c == match1 || c == match2;
466 }
467
468 @Override void setBits(LookupTable table) {
469 table.set(match1);
470 table.set(match2);
471 }
472
473 @Override public CharMatcher precomputed() {
474 return this;
475 }
476 };
477 }
478
479 final char[] chars = sequence.toString().toCharArray();
480 Arrays.sort(chars); // not worth collapsing duplicates
481
482 return new CharMatcher() {
483 @Override public boolean matches(char c) {
484 return Arrays.binarySearch(chars, c) >= 0;
485 }
486
487 @Override void setBits(LookupTable table) {
488 for (char c : chars) {
489 table.set(c);
490 }
491 }
492 };
493 }
494
495 /**
496 * Returns a {@code char} matcher that matches any character not present in the given character
497 * sequence.
498 */
499 public static CharMatcher noneOf(CharSequence sequence) {
500 return anyOf(sequence).negate();
501 }
502
503 /**
504 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
505 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
506 * CharMatcher.inRange('a', 'z')}.
507 *
508 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
509 */
510 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
511 checkArgument(endInclusive >= startInclusive);
512 return new CharMatcher() {
513 @Override public boolean matches(char c) {
514 return startInclusive <= c && c <= endInclusive;
515 }
516
517 @Override void setBits(LookupTable table) {
518 char c = startInclusive;
519 while (true) {
520 table.set(c);
521 if (c++ == endInclusive) {
522 break;
523 }
524 }
525 }
526
527 @Override public CharMatcher precomputed() {
528 return this;
529 }
530 };
531 }
532
533 /**
534 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
535 * which operates on primitive {@code char} instances instead.
536 */
537 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
538 checkNotNull(predicate);
539 if (predicate instanceof CharMatcher) {
540 return (CharMatcher) predicate;
541 }
542 return new CharMatcher() {
543 @Override public boolean matches(char c) {
544 return predicate.apply(c);
545 }
546
547 @Override public boolean apply(Character character) {
548 return predicate.apply(checkNotNull(character));
549 }
550 };
551 }
552
553 // Abstract methods
554
555 /** Determines a true or false value for the given character. */
// The default text-processing methods visible in this part of the class (for example indexIn,
// countIn and removeFrom) are implemented by calling this method once per examined character.
556 public abstract boolean matches(char c);
557
558 // Non-static factories
559
560 /**
561 * Returns a matcher that matches any character not matched by this matcher.
562 */
563 public CharMatcher negate() {
564 final CharMatcher original = this;
565 return new CharMatcher() {
566 @Override public boolean matches(char c) {
567 return !original.matches(c);
568 }
569
570 @Override public boolean matchesAllOf(CharSequence sequence) {
571 return original.matchesNoneOf(sequence);
572 }
573
574 @Override public boolean matchesNoneOf(CharSequence sequence) {
575 return original.matchesAllOf(sequence);
576 }
577
578 @Override public int countIn(CharSequence sequence) {
579 return sequence.length() - original.countIn(sequence);
580 }
581
582 @Override public CharMatcher negate() {
583 return original;
584 }
585 };
586 }
587
588 /**
589 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
590 */
591 public CharMatcher and(CharMatcher other) {
592 return new And(Arrays.asList(this, checkNotNull(other)));
593 }
594
595 private static class And extends CharMatcher {
596 List<CharMatcher> components;
597
598 And(List<CharMatcher> components) {
599 this.components = components; // Skip defensive copy (private)
600 }
601
602 @Override public boolean matches(char c) {
603 for (CharMatcher matcher : components) {
604 if (!matcher.matches(c)) {
605 return false;
606 }
607 }
608 return true;
609 }
610
611 @Override public CharMatcher and(CharMatcher other) {
612 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
613 newComponents.add(checkNotNull(other));
614 return new And(newComponents);
615 }
616 }
617
618 /**
619 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
620 */
621 public CharMatcher or(CharMatcher other) {
622 return new Or(Arrays.asList(this, checkNotNull(other)));
623 }
624
625 private static class Or extends CharMatcher {
626 List<CharMatcher> components;
627
628 Or(List<CharMatcher> components) {
629 this.components = components; // Skip defensive copy (private)
630 }
631
632 @Override public boolean matches(char c) {
633 for (CharMatcher matcher : components) {
634 if (matcher.matches(c)) {
635 return true;
636 }
637 }
638 return false;
639 }
640
641 @Override public CharMatcher or(CharMatcher other) {
642 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
643 newComponents.add(checkNotNull(other));
644 return new Or(newComponents);
645 }
646
647 @Override void setBits(LookupTable table) {
648 for (CharMatcher matcher : components) {
649 matcher.setBits(table);
650 }
651 }
652 }
653
654 /**
655 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
656 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
657 * worthwhile only if the precomputed matcher is queried many thousands of times.
658 *
659 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
660 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
661 * worthwhile tradeoff in a browser.
662 */
663 public CharMatcher precomputed() {
664 return Platform.precomputeCharMatcher(this);
665 }
666
667 /**
668 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
669 * on {@link Platform} so that we can have different behavior in GWT.
670 *
671 * <p>The default precomputation is to cache the configuration of the original matcher in an
672 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
673 * than the original.
674 *
675 * <p>The default implementation creates a new bit array and passes it to {@link
676 * #setBits(LookupTable)}.
677 */
678 CharMatcher precomputedInternal() {
679 final LookupTable table = new LookupTable();
680 setBits(table);
681
682 return new CharMatcher() {
683 @Override public boolean matches(char c) {
684 return table.get(c);
685 }
686
687 // TODO(kevinb): make methods like negate() smart?
688
689 @Override public CharMatcher precomputed() {
690 return this;
691 }
692 };
693 }
694
695 /**
696 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
697 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
698 *
699 * <p>The default implementation loops over every possible character value, invoking {@link
700 * #matches} for each one.
701 */
702 void setBits(LookupTable table) {
703 char c = Character.MIN_VALUE;
704 while (true) {
705 if (matches(c)) {
706 table.set(c);
707 }
708 if (c++ == Character.MAX_VALUE) {
709 break;
710 }
711 }
712 }
713
714 /**
715 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
716 *
717 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
718 * simpler java.util.BitSet.
719 */
720 private static final class LookupTable {
721 int[] data = new int[2048];
722
723 void set(char index) {
724 data[index >> 5] |= (1 << index);
725 }
726
727 boolean get(char index) {
728 return (data[index >> 5] & (1 << index)) != 0;
729 }
730 }
731
732 // Text processing routines
733
734 /**
735 * Returns {@code true} if a character sequence contains at least one matching character.
736 * Equivalent to {@code !matchesNoneOf(sequence)}.
737 *
738 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
739 * character, until this returns {@code true} or the end is reached.
740 *
741 * @param sequence the character sequence to examine, possibly empty
742 * @return {@code true} if this matcher matches at least one character in the sequence
743 * @since 8.0
744 */
745 public boolean matchesAnyOf(CharSequence sequence) {
746 return !matchesNoneOf(sequence);
747 }
748
749 /**
750 * Returns {@code true} if a character sequence contains only matching characters.
751 *
752 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
753 * character, until this returns {@code false} or the end is reached.
754 *
755 * @param sequence the character sequence to examine, possibly empty
756 * @return {@code true} if this matcher matches every character in the sequence, including when
757 * the sequence is empty
758 */
759 public boolean matchesAllOf(CharSequence sequence) {
760 for (int i = sequence.length() - 1; i >= 0; i--) {
761 if (!matches(sequence.charAt(i))) {
762 return false;
763 }
764 }
765 return true;
766 }
767
/**
 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
 * {@code !matchesAnyOf(sequence)}.
 *
 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
 * character, until this returns {@code true} or the end is reached.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches no characters in the sequence, including when
 *     the sequence is empty
 */
779 public boolean matchesNoneOf(CharSequence sequence) {
780 return indexIn(sequence) == -1;
781 }
782
783 // TODO(kevinb): add matchesAnyOf()
784
785 /**
786 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
787 * matching character is present.
788 *
789 * <p>The default implementation iterates over the sequence in forward order calling {@link
790 * #matches} for each character.
791 *
792 * @param sequence the character sequence to examine from the beginning
793 * @return an index, or {@code -1} if no character matches
794 */
795 public int indexIn(CharSequence sequence) {
796 int length = sequence.length();
797 for (int i = 0; i < length; i++) {
798 if (matches(sequence.charAt(i))) {
799 return i;
800 }
801 }
802 return -1;
803 }
804
805 /**
806 * Returns the index of the first matching character in a character sequence, starting from a
807 * given position, or {@code -1} if no character matches after that position.
808 *
809 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
810 * start}, calling {@link #matches} for each character.
811 *
812 * @param sequence the character sequence to examine
813 * @param start the first index to examine; must be nonnegative and no greater than {@code
814 * sequence.length()}
815 * @return the index of the first matching character, guaranteed to be no less than {@code start},
816 * or {@code -1} if no character matches
817 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
818 * sequence.length()}
819 */
820 public int indexIn(CharSequence sequence, int start) {
821 int length = sequence.length();
822 Preconditions.checkPositionIndex(start, length);
823 for (int i = start; i < length; i++) {
824 if (matches(sequence.charAt(i))) {
825 return i;
826 }
827 }
828 return -1;
829 }
830
831 /**
832 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
833 * matching character is present.
834 *
835 * <p>The default implementation iterates over the sequence in reverse order calling {@link
836 * #matches} for each character.
837 *
838 * @param sequence the character sequence to examine from the end
839 * @return an index, or {@code -1} if no character matches
840 */
841 public int lastIndexIn(CharSequence sequence) {
842 for (int i = sequence.length() - 1; i >= 0; i--) {
843 if (matches(sequence.charAt(i))) {
844 return i;
845 }
846 }
847 return -1;
848 }
849
850 /**
851 * Returns the number of matching characters found in a character sequence.
852 */
853 public int countIn(CharSequence sequence) {
854 int count = 0;
855 for (int i = 0; i < sequence.length(); i++) {
856 if (matches(sequence.charAt(i))) {
857 count++;
858 }
859 }
860 return count;
861 }
862
863 /**
864 * Returns a string containing all non-matching characters of a character sequence, in order. For
865 * example: <pre> {@code
866 *
867 * CharMatcher.is('a').removeFrom("bazaar")}</pre>
868 *
869 * ... returns {@code "bzr"}.
870 */
871 @CheckReturnValue
872 public String removeFrom(CharSequence sequence) {
873 String string = sequence.toString();
874 int pos = indexIn(string);
875 if (pos == -1) {
876 return string;
877 }
878
879 char[] chars = string.toCharArray();
880 int spread = 1;
881
882 // This unusual loop comes from extensive benchmarking
883 OUT: while (true) {
884 pos++;
885 while (true) {
886 if (pos == chars.length) {
887 break OUT;
888 }
889 if (matches(chars[pos])) {
890 break;
891 }
892 chars[pos - spread] = chars[pos];
893 pos++;
894 }
895 spread++;
896 }
897 return new String(chars, 0, pos - spread);
898 }
899
900 /**
901 * Returns a string containing all matching characters of a character sequence, in order. For
902 * example: <pre> {@code
903 *
904 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
905 *
906 * ... returns {@code "aaa"}.
907 */
908 @CheckReturnValue
909 public String retainFrom(CharSequence sequence) {
910 return negate().removeFrom(sequence);
911 }
912
913 /**
914 * Returns a string copy of the input character sequence, with each character that matches this
915 * matcher replaced by a given replacement character. For example: <pre> {@code
916 *
917 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
918 *
919 * ... returns {@code "rodor"}.
920 *
921 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
922 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
923 * character.
924 *
925 * @param sequence the character sequence to replace matching characters in
926 * @param replacement the character to append to the result string in place of each matching
927 * character in {@code sequence}
928 * @return the new string
929 */
930 @CheckReturnValue
931 public String replaceFrom(CharSequence sequence, char replacement) {
932 String string = sequence.toString();
933 int pos = indexIn(string);
934 if (pos == -1) {
935 return string;
936 }
937 char[] chars = string.toCharArray();
938 chars[pos] = replacement;
939 for (int i = pos + 1; i < chars.length; i++) {
940 if (matches(chars[i])) {
941 chars[i] = replacement;
942 }
943 }
944 return new String(chars);
945 }
946
947 /**
948 * Returns a string copy of the input character sequence, with each character that matches this
949 * matcher replaced by a given replacement sequence. For example: <pre> {@code
950 *
951 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
952 *
953 * ... returns {@code "yoohoo"}.
954 *
955 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
956 * off calling {@link #replaceFrom(CharSequence, char)} directly.
957 *
958 * @param sequence the character sequence to replace matching characters in
959 * @param replacement the characters to append to the result string in place of each matching
960 * character in {@code sequence}
961 * @return the new string
962 */
963 @CheckReturnValue
964 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
965 int replacementLen = replacement.length();
966 if (replacementLen == 0) {
967 return removeFrom(sequence);
968 }
969 if (replacementLen == 1) {
970 return replaceFrom(sequence, replacement.charAt(0));
971 }
972
973 String string = sequence.toString();
974 int pos = indexIn(string);
975 if (pos == -1) {
976 return string;
977 }
978
979 int len = string.length();
980 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
981
982 int oldpos = 0;
983 do {
984 buf.append(string, oldpos, pos);
985 buf.append(replacement);
986 oldpos = pos + 1;
987 pos = indexIn(string, oldpos);
988 } while (pos != -1);
989
990 buf.append(string, oldpos, len);
991 return buf.toString();
992 }
993
994 /**
995 * Returns a substring of the input character sequence that omits all characters this matcher
996 * matches from the beginning and from the end of the string. For example: <pre> {@code
997 *
998 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
999 *
1000 * ... returns {@code "cat"}.
1001 *
1002 * <p>Note that: <pre> {@code
1003 *
1004 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1005 *
1006 * ... is equivalent to {@link String#trim()}.
1007 */
1008 @CheckReturnValue
1009 public String trimFrom(CharSequence sequence) {
1010 int len = sequence.length();
1011 int first;
1012 int last;
1013
1014 for (first = 0; first < len; first++) {
1015 if (!matches(sequence.charAt(first))) {
1016 break;
1017 }
1018 }
1019 for (last = len - 1; last > first; last--) {
1020 if (!matches(sequence.charAt(last))) {
1021 break;
1022 }
1023 }
1024
1025 return sequence.subSequence(first, last + 1).toString();
1026 }
1027
1028 /**
1029 * Returns a substring of the input character sequence that omits all characters this matcher
1030 * matches from the beginning of the string. For example: <pre> {@code
1031 *
1032 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1033 *
1034 * ... returns {@code "catbab"}.
1035 */
1036 @CheckReturnValue
1037 public String trimLeadingFrom(CharSequence sequence) {
1038 int len = sequence.length();
1039 int first;
1040
1041 for (first = 0; first < len; first++) {
1042 if (!matches(sequence.charAt(first))) {
1043 break;
1044 }
1045 }
1046
1047 return sequence.subSequence(first, len).toString();
1048 }
1049
1050 /**
1051 * Returns a substring of the input character sequence that omits all characters this matcher
1052 * matches from the end of the string. For example: <pre> {@code
1053 *
1054 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1055 *
1056 * ... returns {@code "abacat"}.
1057 */
1058 @CheckReturnValue
1059 public String trimTrailingFrom(CharSequence sequence) {
1060 int len = sequence.length();
1061 int last;
1062
1063 for (last = len - 1; last >= 0; last--) {
1064 if (!matches(sequence.charAt(last))) {
1065 break;
1066 }
1067 }
1068
1069 return sequence.subSequence(0, last + 1).toString();
1070 }
1071
1072 /**
1073 * Returns a string copy of the input character sequence, with each group of consecutive
1074 * characters that match this matcher replaced by a single replacement character. For example:
1075 * <pre> {@code
1076 *
1077 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1078 *
1079 * ... returns {@code "b-p-r"}.
1080 *
1081 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1082 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1083 * character.
1084 *
1085 * @param sequence the character sequence to replace matching groups of characters in
1086 * @param replacement the character to append to the result string in place of each group of
1087 * matching characters in {@code sequence}
1088 * @return the new string
1089 */
1090 @CheckReturnValue
1091 public String collapseFrom(CharSequence sequence, char replacement) {
1092 int first = indexIn(sequence);
1093 if (first == -1) {
1094 return sequence.toString();
1095 }
1096
1097 // TODO(kevinb): see if this implementation can be made faster
1098 StringBuilder builder = new StringBuilder(sequence.length())
1099 .append(sequence.subSequence(0, first))
1100 .append(replacement);
1101 boolean in = true;
1102 for (int i = first + 1; i < sequence.length(); i++) {
1103 char c = sequence.charAt(i);
1104 if (apply(c)) {
1105 if (!in) {
1106 builder.append(replacement);
1107 in = true;
1108 }
1109 } else {
1110 builder.append(c);
1111 in = false;
1112 }
1113 }
1114 return builder.toString();
1115 }
1116
1117 /**
1118 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1119 * groups of matching characters at the start or end of the sequence are removed without
1120 * replacement.
1121 */
1122 @CheckReturnValue
1123 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1124 int first = negate().indexIn(sequence);
1125 if (first == -1) {
1126 return ""; // everything matches. nothing's left.
1127 }
1128 StringBuilder builder = new StringBuilder(sequence.length());
1129 boolean inMatchingGroup = false;
1130 for (int i = first; i < sequence.length(); i++) {
1131 char c = sequence.charAt(i);
1132 if (apply(c)) {
1133 inMatchingGroup = true;
1134 } else {
1135 if (inMatchingGroup) {
1136 builder.append(replacement);
1137 inMatchingGroup = false;
1138 }
1139 builder.append(c);
1140 }
1141 }
1142 return builder.toString();
1143 }
1144
1145 // Predicate interface
1146
1147 /**
1148 * Returns {@code true} if this matcher matches the given character.
1149 *
1150 * @throws NullPointerException if {@code character} is null
1151 */
1152 @Override public boolean apply(Character character) {
1153 return matches(character);
1154 }
1155 }