001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language;
019
020 import org.apache.commons.codec.EncoderException;
021 import org.apache.commons.codec.StringEncoder;
022
023 /**
024 * Encodes a string into a double metaphone value.
025 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
026 * <ul>
027 * <li>Original Article: <a
028 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
029 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
030 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
031 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
032 * </ul>
033 *
034 * @author Apache Software Foundation
035 * @version $Id: DoubleMetaphone.java 800153 2009-08-02 22:45:30Z ggregory $
036 */
037 public class DoubleMetaphone implements StringEncoder {
038
039 /**
040 * "Vowels" to test for
041 */
042 private static final String VOWELS = "AEIOUY";
043
044 /**
045 * Prefixes when present which are not pronounced
046 */
047 private static final String[] SILENT_START =
048 { "GN", "KN", "PN", "WR", "PS" };
049 private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
050 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
051 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
052 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
053 private static final String[] L_T_K_S_N_M_B_Z =
054 { "L", "T", "K", "S", "N", "M", "B", "Z" };
055
056 /**
057 * Maximum length of an encoding, default is 4
058 */
059 protected int maxCodeLen = 4;
060
061 /**
062 * Creates an instance of this DoubleMetaphone encoder
063 */
064 public DoubleMetaphone() {
065 super();
066 }
067
068 /**
069 * Encode a value with Double Metaphone
070 *
071 * @param value String to encode
072 * @return an encoded string
073 */
074 public String doubleMetaphone(String value) {
075 return doubleMetaphone(value, false);
076 }
077
078 /**
079 * Encode a value with Double Metaphone, optionally using the alternate
080 * encoding.
081 *
082 * @param value String to encode
083 * @param alternate use alternate encode
084 * @return an encoded string
085 */
086 public String doubleMetaphone(String value, boolean alternate) {
087 value = cleanInput(value);
088 if (value == null) {
089 return null;
090 }
091
092 boolean slavoGermanic = isSlavoGermanic(value);
093 int index = isSilentStart(value) ? 1 : 0;
094
095 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
096
097 while (!result.isComplete() && index <= value.length() - 1) {
098 switch (value.charAt(index)) {
099 case 'A':
100 case 'E':
101 case 'I':
102 case 'O':
103 case 'U':
104 case 'Y':
105 index = handleAEIOUY(value, result, index);
106 break;
107 case 'B':
108 result.append('P');
109 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
110 break;
111 case '\u00C7':
112 // A C with a Cedilla
113 result.append('S');
114 index++;
115 break;
116 case 'C':
117 index = handleC(value, result, index);
118 break;
119 case 'D':
120 index = handleD(value, result, index);
121 break;
122 case 'F':
123 result.append('F');
124 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
125 break;
126 case 'G':
127 index = handleG(value, result, index, slavoGermanic);
128 break;
129 case 'H':
130 index = handleH(value, result, index);
131 break;
132 case 'J':
133 index = handleJ(value, result, index, slavoGermanic);
134 break;
135 case 'K':
136 result.append('K');
137 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
138 break;
139 case 'L':
140 index = handleL(value, result, index);
141 break;
142 case 'M':
143 result.append('M');
144 index = conditionM0(value, index) ? index + 2 : index + 1;
145 break;
146 case 'N':
147 result.append('N');
148 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
149 break;
150 case '\u00D1':
151 // N with a tilde (spanish ene)
152 result.append('N');
153 index++;
154 break;
155 case 'P':
156 index = handleP(value, result, index);
157 break;
158 case 'Q':
159 result.append('K');
160 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
161 break;
162 case 'R':
163 index = handleR(value, result, index, slavoGermanic);
164 break;
165 case 'S':
166 index = handleS(value, result, index, slavoGermanic);
167 break;
168 case 'T':
169 index = handleT(value, result, index);
170 break;
171 case 'V':
172 result.append('F');
173 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
174 break;
175 case 'W':
176 index = handleW(value, result, index);
177 break;
178 case 'X':
179 index = handleX(value, result, index);
180 break;
181 case 'Z':
182 index = handleZ(value, result, index, slavoGermanic);
183 break;
184 default:
185 index++;
186 break;
187 }
188 }
189
190 return alternate ? result.getAlternate() : result.getPrimary();
191 }
192
193 /**
194 * Encode the value using DoubleMetaphone. It will only work if
195 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
196 *
197 * @param obj Object to encode (should be of type String)
198 * @return An encoded Object (will be of type String)
199 * @throws EncoderException encode parameter is not of type String
200 */
201 public Object encode(Object obj) throws EncoderException {
202 if (!(obj instanceof String)) {
203 throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
204 }
205 return doubleMetaphone((String) obj);
206 }
207
208 /**
209 * Encode the value using DoubleMetaphone.
210 *
211 * @param value String to encode
212 * @return An encoded String
213 */
214 public String encode(String value) {
215 return doubleMetaphone(value);
216 }
217
218 /**
219 * Check if the Double Metaphone values of two <code>String</code> values
220 * are equal.
221 *
222 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
223 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
224 * @return <code>true</code> if the encoded <code>String</code>s are equal;
225 * <code>false</code> otherwise.
226 * @see #isDoubleMetaphoneEqual(String,String,boolean)
227 */
228 public boolean isDoubleMetaphoneEqual(String value1, String value2) {
229 return isDoubleMetaphoneEqual(value1, value2, false);
230 }
231
232 /**
233 * Check if the Double Metaphone values of two <code>String</code> values
234 * are equal, optionally using the alternate value.
235 *
236 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
237 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
238 * @param alternate use the alternate value if <code>true</code>.
239 * @return <code>true</code> if the encoded <code>String</code>s are equal;
240 * <code>false</code> otherwise.
241 */
242 public boolean isDoubleMetaphoneEqual(String value1,
243 String value2,
244 boolean alternate) {
245 return doubleMetaphone(value1, alternate).equals(doubleMetaphone
246 (value2, alternate));
247 }
248
249 /**
250 * Returns the maxCodeLen.
251 * @return int
252 */
253 public int getMaxCodeLen() {
254 return this.maxCodeLen;
255 }
256
257 /**
258 * Sets the maxCodeLen.
259 * @param maxCodeLen The maxCodeLen to set
260 */
261 public void setMaxCodeLen(int maxCodeLen) {
262 this.maxCodeLen = maxCodeLen;
263 }
264
265 //-- BEGIN HANDLERS --//
266
267 /**
268 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
269 */
270 private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
271 index) {
272 if (index == 0) {
273 result.append('A');
274 }
275 return index + 1;
276 }
277
278 /**
279 * Handles 'C' cases
280 */
281 private int handleC(String value,
282 DoubleMetaphoneResult result,
283 int index) {
284 if (conditionC0(value, index)) { // very confusing, moved out
285 result.append('K');
286 index += 2;
287 } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
288 result.append('S');
289 index += 2;
290 } else if (contains(value, index, 2, "CH")) {
291 index = handleCH(value, result, index);
292 } else if (contains(value, index, 2, "CZ") &&
293 !contains(value, index - 2, 4, "WICZ")) {
294 //-- "Czerny" --//
295 result.append('S', 'X');
296 index += 2;
297 } else if (contains(value, index + 1, 3, "CIA")) {
298 //-- "focaccia" --//
299 result.append('X');
300 index += 3;
301 } else if (contains(value, index, 2, "CC") &&
302 !(index == 1 && charAt(value, 0) == 'M')) {
303 //-- double "cc" but not "McClelland" --//
304 return handleCC(value, result, index);
305 } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
306 result.append('K');
307 index += 2;
308 } else if (contains(value, index, 2, "CI", "CE", "CY")) {
309 //-- Italian vs. English --//
310 if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
311 result.append('S', 'X');
312 } else {
313 result.append('S');
314 }
315 index += 2;
316 } else {
317 result.append('K');
318 if (contains(value, index + 1, 2, " C", " Q", " G")) {
319 //-- Mac Caffrey, Mac Gregor --//
320 index += 3;
321 } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
322 !contains(value, index + 1, 2, "CE", "CI")) {
323 index += 2;
324 } else {
325 index++;
326 }
327 }
328
329 return index;
330 }
331
332 /**
333 * Handles 'CC' cases
334 */
335 private int handleCC(String value,
336 DoubleMetaphoneResult result,
337 int index) {
338 if (contains(value, index + 2, 1, "I", "E", "H") &&
339 !contains(value, index + 2, 2, "HU")) {
340 //-- "bellocchio" but not "bacchus" --//
341 if ((index == 1 && charAt(value, index - 1) == 'A') ||
342 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
343 //-- "accident", "accede", "succeed" --//
344 result.append("KS");
345 } else {
346 //-- "bacci", "bertucci", other Italian --//
347 result.append('X');
348 }
349 index += 3;
350 } else { // Pierce's rule
351 result.append('K');
352 index += 2;
353 }
354
355 return index;
356 }
357
358 /**
359 * Handles 'CH' cases
360 */
361 private int handleCH(String value,
362 DoubleMetaphoneResult result,
363 int index) {
364 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
365 result.append('K', 'X');
366 return index + 2;
367 } else if (conditionCH0(value, index)) {
368 //-- Greek roots ("chemistry", "chorus", etc.) --//
369 result.append('K');
370 return index + 2;
371 } else if (conditionCH1(value, index)) {
372 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
373 result.append('K');
374 return index + 2;
375 } else {
376 if (index > 0) {
377 if (contains(value, 0, 2, "MC")) {
378 result.append('K');
379 } else {
380 result.append('X', 'K');
381 }
382 } else {
383 result.append('X');
384 }
385 return index + 2;
386 }
387 }
388
389 /**
390 * Handles 'D' cases
391 */
392 private int handleD(String value,
393 DoubleMetaphoneResult result,
394 int index) {
395 if (contains(value, index, 2, "DG")) {
396 //-- "Edge" --//
397 if (contains(value, index + 2, 1, "I", "E", "Y")) {
398 result.append('J');
399 index += 3;
400 //-- "Edgar" --//
401 } else {
402 result.append("TK");
403 index += 2;
404 }
405 } else if (contains(value, index, 2, "DT", "DD")) {
406 result.append('T');
407 index += 2;
408 } else {
409 result.append('T');
410 index++;
411 }
412 return index;
413 }
414
415 /**
416 * Handles 'G' cases
417 */
418 private int handleG(String value,
419 DoubleMetaphoneResult result,
420 int index,
421 boolean slavoGermanic) {
422 if (charAt(value, index + 1) == 'H') {
423 index = handleGH(value, result, index);
424 } else if (charAt(value, index + 1) == 'N') {
425 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
426 result.append("KN", "N");
427 } else if (!contains(value, index + 2, 2, "EY") &&
428 charAt(value, index + 1) != 'Y' && !slavoGermanic) {
429 result.append("N", "KN");
430 } else {
431 result.append("KN");
432 }
433 index = index + 2;
434 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
435 result.append("KL", "L");
436 index += 2;
437 } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
438 //-- -ges-, -gep-, -gel-, -gie- at beginning --//
439 result.append('K', 'J');
440 index += 2;
441 } else if ((contains(value, index + 1, 2, "ER") ||
442 charAt(value, index + 1) == 'Y') &&
443 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
444 !contains(value, index - 1, 1, "E", "I") &&
445 !contains(value, index - 1, 3, "RGY", "OGY")) {
446 //-- -ger-, -gy- --//
447 result.append('K', 'J');
448 index += 2;
449 } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
450 contains(value, index - 1, 4, "AGGI", "OGGI")) {
451 //-- Italian "biaggi" --//
452 if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
453 //-- obvious germanic --//
454 result.append('K');
455 } else if (contains(value, index + 1, 3, "IER")) {
456 result.append('J');
457 } else {
458 result.append('J', 'K');
459 }
460 index += 2;
461 } else if (charAt(value, index + 1) == 'G') {
462 index += 2;
463 result.append('K');
464 } else {
465 index++;
466 result.append('K');
467 }
468 return index;
469 }
470
471 /**
472 * Handles 'GH' cases
473 */
474 private int handleGH(String value,
475 DoubleMetaphoneResult result,
476 int index) {
477 if (index > 0 && !isVowel(charAt(value, index - 1))) {
478 result.append('K');
479 index += 2;
480 } else if (index == 0) {
481 if (charAt(value, index + 2) == 'I') {
482 result.append('J');
483 } else {
484 result.append('K');
485 }
486 index += 2;
487 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
488 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
489 (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
490 //-- Parker's rule (with some further refinements) - "hugh"
491 index += 2;
492 } else {
493 if (index > 2 && charAt(value, index - 1) == 'U' &&
494 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
495 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
496 result.append('F');
497 } else if (index > 0 && charAt(value, index - 1) != 'I') {
498 result.append('K');
499 }
500 index += 2;
501 }
502 return index;
503 }
504
505 /**
506 * Handles 'H' cases
507 */
508 private int handleH(String value,
509 DoubleMetaphoneResult result,
510 int index) {
511 //-- only keep if first & before vowel or between 2 vowels --//
512 if ((index == 0 || isVowel(charAt(value, index - 1))) &&
513 isVowel(charAt(value, index + 1))) {
514 result.append('H');
515 index += 2;
516 //-- also takes car of "HH" --//
517 } else {
518 index++;
519 }
520 return index;
521 }
522
523 /**
524 * Handles 'J' cases
525 */
526 private int handleJ(String value, DoubleMetaphoneResult result, int index,
527 boolean slavoGermanic) {
528 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
529 //-- obvious Spanish, "Jose", "San Jacinto" --//
530 if ((index == 0 && (charAt(value, index + 4) == ' ') ||
531 value.length() == 4) || contains(value, 0, 4, "SAN ")) {
532 result.append('H');
533 } else {
534 result.append('J', 'H');
535 }
536 index++;
537 } else {
538 if (index == 0 && !contains(value, index, 4, "JOSE")) {
539 result.append('J', 'A');
540 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
541 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
542 result.append('J', 'H');
543 } else if (index == value.length() - 1) {
544 result.append('J', ' ');
545 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
546 result.append('J');
547 }
548
549 if (charAt(value, index + 1) == 'J') {
550 index += 2;
551 } else {
552 index++;
553 }
554 }
555 return index;
556 }
557
558 /**
559 * Handles 'L' cases
560 */
561 private int handleL(String value,
562 DoubleMetaphoneResult result,
563 int index) {
564 if (charAt(value, index + 1) == 'L') {
565 if (conditionL0(value, index)) {
566 result.appendPrimary('L');
567 } else {
568 result.append('L');
569 }
570 index += 2;
571 } else {
572 index++;
573 result.append('L');
574 }
575 return index;
576 }
577
578 /**
579 * Handles 'P' cases
580 */
581 private int handleP(String value,
582 DoubleMetaphoneResult result,
583 int index) {
584 if (charAt(value, index + 1) == 'H') {
585 result.append('F');
586 index += 2;
587 } else {
588 result.append('P');
589 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
590 }
591 return index;
592 }
593
594 /**
595 * Handles 'R' cases
596 */
597 private int handleR(String value,
598 DoubleMetaphoneResult result,
599 int index,
600 boolean slavoGermanic) {
601 if (index == value.length() - 1 && !slavoGermanic &&
602 contains(value, index - 2, 2, "IE") &&
603 !contains(value, index - 4, 2, "ME", "MA")) {
604 result.appendAlternate('R');
605 } else {
606 result.append('R');
607 }
608 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
609 }
610
611 /**
612 * Handles 'S' cases
613 */
614 private int handleS(String value,
615 DoubleMetaphoneResult result,
616 int index,
617 boolean slavoGermanic) {
618 if (contains(value, index - 1, 3, "ISL", "YSL")) {
619 //-- special cases "island", "isle", "carlisle", "carlysle" --//
620 index++;
621 } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
622 //-- special case "sugar-" --//
623 result.append('X', 'S');
624 index++;
625 } else if (contains(value, index, 2, "SH")) {
626 if (contains(value, index + 1, 4,
627 "HEIM", "HOEK", "HOLM", "HOLZ")) {
628 //-- germanic --//
629 result.append('S');
630 } else {
631 result.append('X');
632 }
633 index += 2;
634 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
635 //-- Italian and Armenian --//
636 if (slavoGermanic) {
637 result.append('S');
638 } else {
639 result.append('S', 'X');
640 }
641 index += 3;
642 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
643 //-- german & anglicisations, e.g. "smith" match "schmidt" //
644 // "snider" match "schneider" --//
645 //-- also, -sz- in slavic language altho in hungarian it //
646 // is pronounced "s" --//
647 result.append('S', 'X');
648 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
649 } else if (contains(value, index, 2, "SC")) {
650 index = handleSC(value, result, index);
651 } else {
652 if (index == value.length() - 1 && contains(value, index - 2,
653 2, "AI", "OI")){
654 //-- french e.g. "resnais", "artois" --//
655 result.appendAlternate('S');
656 } else {
657 result.append('S');
658 }
659 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
660 }
661 return index;
662 }
663
664 /**
665 * Handles 'SC' cases
666 */
667 private int handleSC(String value,
668 DoubleMetaphoneResult result,
669 int index) {
670 if (charAt(value, index + 2) == 'H') {
671 //-- Schlesinger's rule --//
672 if (contains(value, index + 3,
673 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
674 //-- Dutch origin, e.g. "school", "schooner" --//
675 if (contains(value, index + 3, 2, "ER", "EN")) {
676 //-- "schermerhorn", "schenker" --//
677 result.append("X", "SK");
678 } else {
679 result.append("SK");
680 }
681 } else {
682 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
683 result.append('X', 'S');
684 } else {
685 result.append('X');
686 }
687 }
688 } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
689 result.append('S');
690 } else {
691 result.append("SK");
692 }
693 return index + 3;
694 }
695
696 /**
697 * Handles 'T' cases
698 */
699 private int handleT(String value,
700 DoubleMetaphoneResult result,
701 int index) {
702 if (contains(value, index, 4, "TION")) {
703 result.append('X');
704 index += 3;
705 } else if (contains(value, index, 3, "TIA", "TCH")) {
706 result.append('X');
707 index += 3;
708 } else if (contains(value, index, 2, "TH") || contains(value, index,
709 3, "TTH")) {
710 if (contains(value, index + 2, 2, "OM", "AM") ||
711 //-- special case "thomas", "thames" or germanic --//
712 contains(value, 0, 4, "VAN ", "VON ") ||
713 contains(value, 0, 3, "SCH")) {
714 result.append('T');
715 } else {
716 result.append('0', 'T');
717 }
718 index += 2;
719 } else {
720 result.append('T');
721 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
722 }
723 return index;
724 }
725
726 /**
727 * Handles 'W' cases
728 */
729 private int handleW(String value,
730 DoubleMetaphoneResult result,
731 int index) {
732 if (contains(value, index, 2, "WR")) {
733 //-- can also be in middle of word --//
734 result.append('R');
735 index += 2;
736 } else {
737 if (index == 0 && (isVowel(charAt(value, index + 1)) ||
738 contains(value, index, 2, "WH"))) {
739 if (isVowel(charAt(value, index + 1))) {
740 //-- Wasserman should match Vasserman --//
741 result.append('A', 'F');
742 } else {
743 //-- need Uomo to match Womo --//
744 result.append('A');
745 }
746 index++;
747 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
748 contains(value, index - 1,
749 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
750 contains(value, 0, 3, "SCH")) {
751 //-- Arnow should match Arnoff --//
752 result.appendAlternate('F');
753 index++;
754 } else if (contains(value, index, 4, "WICZ", "WITZ")) {
755 //-- Polish e.g. "filipowicz" --//
756 result.append("TS", "FX");
757 index += 4;
758 } else {
759 index++;
760 }
761 }
762 return index;
763 }
764
765 /**
766 * Handles 'X' cases
767 */
768 private int handleX(String value,
769 DoubleMetaphoneResult result,
770 int index) {
771 if (index == 0) {
772 result.append('S');
773 index++;
774 } else {
775 if (!((index == value.length() - 1) &&
776 (contains(value, index - 3, 3, "IAU", "EAU") ||
777 contains(value, index - 2, 2, "AU", "OU")))) {
778 //-- French e.g. breaux --//
779 result.append("KS");
780 }
781 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
782 }
783 return index;
784 }
785
786 /**
787 * Handles 'Z' cases
788 */
789 private int handleZ(String value, DoubleMetaphoneResult result, int index,
790 boolean slavoGermanic) {
791 if (charAt(value, index + 1) == 'H') {
792 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
793 result.append('J');
794 index += 2;
795 } else {
796 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
797 result.append("S", "TS");
798 } else {
799 result.append('S');
800 }
801 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
802 }
803 return index;
804 }
805
806 //-- BEGIN CONDITIONS --//
807
808 /**
809 * Complex condition 0 for 'C'
810 */
811 private boolean conditionC0(String value, int index) {
812 if (contains(value, index, 4, "CHIA")) {
813 return true;
814 } else if (index <= 1) {
815 return false;
816 } else if (isVowel(charAt(value, index - 2))) {
817 return false;
818 } else if (!contains(value, index - 1, 3, "ACH")) {
819 return false;
820 } else {
821 char c = charAt(value, index + 2);
822 return (c != 'I' && c != 'E') ||
823 contains(value, index - 2, 6, "BACHER", "MACHER");
824 }
825 }
826
827 /**
828 * Complex condition 0 for 'CH'
829 */
830 private boolean conditionCH0(String value, int index) {
831 if (index != 0) {
832 return false;
833 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
834 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
835 return false;
836 } else if (contains(value, 0, 5, "CHORE")) {
837 return false;
838 } else {
839 return true;
840 }
841 }
842
843 /**
844 * Complex condition 1 for 'CH'
845 */
846 private boolean conditionCH1(String value, int index) {
847 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
848 3, "SCH")) ||
849 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
850 contains(value, index + 2, 1, "T", "S") ||
851 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
852 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
853 }
854
855 /**
856 * Complex condition 0 for 'L'
857 */
858 private boolean conditionL0(String value, int index) {
859 if (index == value.length() - 3 &&
860 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
861 return true;
862 } else if ((contains(value, value.length() - 2, 2, "AS", "OS") ||
863 contains(value, value.length() - 1, 1, "A", "O")) &&
864 contains(value, index - 1, 4, "ALLE")) {
865 return true;
866 } else {
867 return false;
868 }
869 }
870
871 /**
872 * Complex condition 0 for 'M'
873 */
874 private boolean conditionM0(String value, int index) {
875 if (charAt(value, index + 1) == 'M') {
876 return true;
877 }
878 return contains(value, index - 1, 3, "UMB") &&
879 ((index + 1) == value.length() - 1 || contains(value,
880 index + 2, 2, "ER"));
881 }
882
883 //-- BEGIN HELPER FUNCTIONS --//
884
885 /**
886 * Determines whether or not a value is of slavo-germanic orgin. A value is
887 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
888 */
889 private boolean isSlavoGermanic(String value) {
890 return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
891 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
892 }
893
894 /**
895 * Determines whether or not a character is a vowel or not
896 */
897 private boolean isVowel(char ch) {
898 return VOWELS.indexOf(ch) != -1;
899 }
900
901 /**
902 * Determines whether or not the value starts with a silent letter. It will
903 * return <code>true</code> if the value starts with any of 'GN', 'KN',
904 * 'PN', 'WR' or 'PS'.
905 */
906 private boolean isSilentStart(String value) {
907 boolean result = false;
908 for (int i = 0; i < SILENT_START.length; i++) {
909 if (value.startsWith(SILENT_START[i])) {
910 result = true;
911 break;
912 }
913 }
914 return result;
915 }
916
917 /**
918 * Cleans the input
919 */
920 private String cleanInput(String input) {
921 if (input == null) {
922 return null;
923 }
924 input = input.trim();
925 if (input.length() == 0) {
926 return null;
927 }
928 return input.toUpperCase(java.util.Locale.ENGLISH);
929 }
930
931 /**
932 * Gets the character at index <code>index</code> if available, otherwise
933 * it returns <code>Character.MIN_VALUE</code> so that there is some sort
934 * of a default
935 */
936 protected char charAt(String value, int index) {
937 if (index < 0 || index >= value.length()) {
938 return Character.MIN_VALUE;
939 }
940 return value.charAt(index);
941 }
942
943 /**
944 * Shortcut method with 1 criteria
945 */
946 private static boolean contains(String value, int start, int length,
947 String criteria) {
948 return contains(value, start, length,
949 new String[] { criteria });
950 }
951
952 /**
953 * Shortcut method with 2 criteria
954 */
955 private static boolean contains(String value, int start, int length,
956 String criteria1, String criteria2) {
957 return contains(value, start, length,
958 new String[] { criteria1, criteria2 });
959 }
960
961 /**
962 * Shortcut method with 3 criteria
963 */
964 private static boolean contains(String value, int start, int length,
965 String criteria1, String criteria2,
966 String criteria3) {
967 return contains(value, start, length,
968 new String[] { criteria1, criteria2, criteria3 });
969 }
970
971 /**
972 * Shortcut method with 4 criteria
973 */
974 private static boolean contains(String value, int start, int length,
975 String criteria1, String criteria2,
976 String criteria3, String criteria4) {
977 return contains(value, start, length,
978 new String[] { criteria1, criteria2, criteria3,
979 criteria4 });
980 }
981
982 /**
983 * Shortcut method with 5 criteria
984 */
985 private static boolean contains(String value, int start, int length,
986 String criteria1, String criteria2,
987 String criteria3, String criteria4,
988 String criteria5) {
989 return contains(value, start, length,
990 new String[] { criteria1, criteria2, criteria3,
991 criteria4, criteria5 });
992 }
993
994 /**
995 * Shortcut method with 6 criteria
996 */
997 private static boolean contains(String value, int start, int length,
998 String criteria1, String criteria2,
999 String criteria3, String criteria4,
1000 String criteria5, String criteria6) {
1001 return contains(value, start, length,
1002 new String[] { criteria1, criteria2, criteria3,
1003 criteria4, criteria5, criteria6 });
1004 }
1005
1006 /**
1007 * Determines whether <code>value</code> contains any of the criteria starting at index <code>start</code> and
1008 * matching up to length <code>length</code>
1009 */
1010 protected static boolean contains(String value, int start, int length,
1011 String[] criteria) {
1012 boolean result = false;
1013 if (start >= 0 && start + length <= value.length()) {
1014 String target = value.substring(start, start + length);
1015
1016 for (int i = 0; i < criteria.length; i++) {
1017 if (target.equals(criteria[i])) {
1018 result = true;
1019 break;
1020 }
1021 }
1022 }
1023 return result;
1024 }
1025
1026 //-- BEGIN INNER CLASSES --//
1027
/**
 * Inner class for storing results, since there is the optional alternate
 * encoding. Both codes are capped at the <code>maxLength</code> given to the
 * constructor; characters appended beyond that limit are silently dropped.
 * A zero or negative limit yields empty codes that are immediately complete.
 *
 * The class stays a non-static inner class so that existing code creating
 * it through an encoder instance keeps compiling.
 */
public class DoubleMetaphoneResult {

    private final StringBuffer primary;
    private final StringBuffer alternate;
    private final int maxLength;

    /**
     * Creates an empty result.
     *
     * @param maxLength maximum number of characters kept in each code
     */
    public DoubleMetaphoneResult(int maxLength) {
        this.maxLength = maxLength;
        // Size the buffers from this result's own limit, and never pass a
        // negative capacity to StringBuffer (it would throw).
        final int capacity = Math.max(maxLength, 0);
        this.primary = new StringBuffer(capacity);
        this.alternate = new StringBuffer(capacity);
    }

    /** Appends the same character to the primary and the alternate code. */
    public void append(char value) {
        appendPrimary(value);
        appendAlternate(value);
    }

    /** Appends one character to the primary code and another to the alternate code. */
    public void append(char primary, char alternate) {
        appendPrimary(primary);
        appendAlternate(alternate);
    }

    /** Appends a character to the primary code if it is not yet full. */
    public void appendPrimary(char value) {
        if (this.primary.length() < this.maxLength) {
            this.primary.append(value);
        }
    }

    /** Appends a character to the alternate code if it is not yet full. */
    public void appendAlternate(char value) {
        if (this.alternate.length() < this.maxLength) {
            this.alternate.append(value);
        }
    }

    /** Appends the same string to the primary and the alternate code. */
    public void append(String value) {
        appendPrimary(value);
        appendAlternate(value);
    }

    /** Appends one string to the primary code and another to the alternate code. */
    public void append(String primary, String alternate) {
        appendPrimary(primary);
        appendAlternate(alternate);
    }

    /** Appends as much of the string as still fits into the primary code. */
    public void appendPrimary(String value) {
        appendBounded(this.primary, value);
    }

    /** Appends as much of the string as still fits into the alternate code. */
    public void appendAlternate(String value) {
        appendBounded(this.alternate, value);
    }

    /** Returns the primary code built so far. */
    public String getPrimary() {
        return this.primary.toString();
    }

    /** Returns the alternate code built so far. */
    public String getAlternate() {
        return this.alternate.toString();
    }

    /** Returns <code>true</code> once both codes have reached the maximum length. */
    public boolean isComplete() {
        return this.primary.length() >= this.maxLength &&
               this.alternate.length() >= this.maxLength;
    }

    /**
     * Appends <code>value</code> to <code>target</code>, truncated to the
     * room left under <code>maxLength</code>; does nothing when no room is left.
     */
    private void appendBounded(StringBuffer target, String value) {
        final int room = this.maxLength - target.length();
        if (room <= 0) {
            // Also covers a negative limit, which previously made
            // substring(0, room) throw.
            return;
        }
        if (value.length() <= room) {
            target.append(value);
        } else {
            target.append(value.substring(0, room));
        }
    }
}
1105 }