001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language; 019 020 import org.apache.commons.codec.EncoderException; 021 import org.apache.commons.codec.StringEncoder; 022 023 /** 024 * Encodes a string into a double metaphone value. 025 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>. 026 * <ul> 027 * <li>Original Article: <a 028 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/"> 029 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li> 030 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip"> 031 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li> 032 * </ul> 033 * 034 * @author Apache Software Foundation 035 * @version $Id: DoubleMetaphone.java 800153 2009-08-02 22:45:30Z ggregory $ 036 */ 037 public class DoubleMetaphone implements StringEncoder { 038 039 /** 040 * "Vowels" to test for 041 */ 042 private static final String VOWELS = "AEIOUY"; 043 044 /** 045 * Prefixes when present which are not pronounced 046 */ 047 private static final String[] SILENT_START = 048 { "GN", "KN", "PN", "WR", "PS" }; 049 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 050 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 051 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 052 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 053 private static final String[] L_T_K_S_N_M_B_Z = 054 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 055 056 /** 057 * Maximum length of an encoding, default is 4 058 */ 059 protected int maxCodeLen = 4; 060 061 /** 062 * Creates an instance of this DoubleMetaphone encoder 063 */ 064 public DoubleMetaphone() { 065 super(); 066 } 067 068 /** 069 * Encode a value with Double Metaphone 070 * 071 * @param value String to encode 072 * @return an encoded string 073 */ 074 public String doubleMetaphone(String value) { 075 return doubleMetaphone(value, false); 076 } 077 078 /** 079 * Encode a value with Double Metaphone, optionally using the alternate 080 * encoding. 081 * 082 * @param value String to encode 083 * @param alternate use alternate encode 084 * @return an encoded string 085 */ 086 public String doubleMetaphone(String value, boolean alternate) { 087 value = cleanInput(value); 088 if (value == null) { 089 return null; 090 } 091 092 boolean slavoGermanic = isSlavoGermanic(value); 093 int index = isSilentStart(value) ? 1 : 0; 094 095 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 096 097 while (!result.isComplete() && index <= value.length() - 1) { 098 switch (value.charAt(index)) { 099 case 'A': 100 case 'E': 101 case 'I': 102 case 'O': 103 case 'U': 104 case 'Y': 105 index = handleAEIOUY(value, result, index); 106 break; 107 case 'B': 108 result.append('P'); 109 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 110 break; 111 case '\u00C7': 112 // A C with a Cedilla 113 result.append('S'); 114 index++; 115 break; 116 case 'C': 117 index = handleC(value, result, index); 118 break; 119 case 'D': 120 index = handleD(value, result, index); 121 break; 122 case 'F': 123 result.append('F'); 124 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 125 break; 126 case 'G': 127 index = handleG(value, result, index, slavoGermanic); 128 break; 129 case 'H': 130 index = handleH(value, result, index); 131 break; 132 case 'J': 133 index = handleJ(value, result, index, slavoGermanic); 134 break; 135 case 'K': 136 result.append('K'); 137 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 138 break; 139 case 'L': 140 index = handleL(value, result, index); 141 break; 142 case 'M': 143 result.append('M'); 144 index = conditionM0(value, index) ? index + 2 : index + 1; 145 break; 146 case 'N': 147 result.append('N'); 148 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 149 break; 150 case '\u00D1': 151 // N with a tilde (spanish ene) 152 result.append('N'); 153 index++; 154 break; 155 case 'P': 156 index = handleP(value, result, index); 157 break; 158 case 'Q': 159 result.append('K'); 160 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 161 break; 162 case 'R': 163 index = handleR(value, result, index, slavoGermanic); 164 break; 165 case 'S': 166 index = handleS(value, result, index, slavoGermanic); 167 break; 168 case 'T': 169 index = handleT(value, result, index); 170 break; 171 case 'V': 172 result.append('F'); 173 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 174 break; 175 case 'W': 176 index = handleW(value, result, index); 177 break; 178 case 'X': 179 index = handleX(value, result, index); 180 break; 181 case 'Z': 182 index = handleZ(value, result, index, slavoGermanic); 183 break; 184 default: 185 index++; 186 break; 187 } 188 } 189 190 return alternate ? result.getAlternate() : result.getPrimary(); 191 } 192 193 /** 194 * Encode the value using DoubleMetaphone. It will only work if 195 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). 196 * 197 * @param obj Object to encode (should be of type String) 198 * @return An encoded Object (will be of type String) 199 * @throws EncoderException encode parameter is not of type String 200 */ 201 public Object encode(Object obj) throws EncoderException { 202 if (!(obj instanceof String)) { 203 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 204 } 205 return doubleMetaphone((String) obj); 206 } 207 208 /** 209 * Encode the value using DoubleMetaphone. 210 * 211 * @param value String to encode 212 * @return An encoded String 213 */ 214 public String encode(String value) { 215 return doubleMetaphone(value); 216 } 217 218 /** 219 * Check if the Double Metaphone values of two <code>String</code> values 220 * are equal. 221 * 222 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 223 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 224 * @return <code>true</code> if the encoded <code>String</code>s are equal; 225 * <code>false</code> otherwise. 226 * @see #isDoubleMetaphoneEqual(String,String,boolean) 227 */ 228 public boolean isDoubleMetaphoneEqual(String value1, String value2) { 229 return isDoubleMetaphoneEqual(value1, value2, false); 230 } 231 232 /** 233 * Check if the Double Metaphone values of two <code>String</code> values 234 * are equal, optionally using the alternate value. 235 * 236 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 237 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 238 * @param alternate use the alternate value if <code>true</code>. 239 * @return <code>true</code> if the encoded <code>String</code>s are equal; 240 * <code>false</code> otherwise. 241 */ 242 public boolean isDoubleMetaphoneEqual(String value1, 243 String value2, 244 boolean alternate) { 245 return doubleMetaphone(value1, alternate).equals(doubleMetaphone 246 (value2, alternate)); 247 } 248 249 /** 250 * Returns the maxCodeLen. 251 * @return int 252 */ 253 public int getMaxCodeLen() { 254 return this.maxCodeLen; 255 } 256 257 /** 258 * Sets the maxCodeLen. 259 * @param maxCodeLen The maxCodeLen to set 260 */ 261 public void setMaxCodeLen(int maxCodeLen) { 262 this.maxCodeLen = maxCodeLen; 263 } 264 265 //-- BEGIN HANDLERS --// 266 267 /** 268 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases 269 */ 270 private int handleAEIOUY(String value, DoubleMetaphoneResult result, int 271 index) { 272 if (index == 0) { 273 result.append('A'); 274 } 275 return index + 1; 276 } 277 278 /** 279 * Handles 'C' cases 280 */ 281 private int handleC(String value, 282 DoubleMetaphoneResult result, 283 int index) { 284 if (conditionC0(value, index)) { // very confusing, moved out 285 result.append('K'); 286 index += 2; 287 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 288 result.append('S'); 289 index += 2; 290 } else if (contains(value, index, 2, "CH")) { 291 index = handleCH(value, result, index); 292 } else if (contains(value, index, 2, "CZ") && 293 !contains(value, index - 2, 4, "WICZ")) { 294 //-- "Czerny" --// 295 result.append('S', 'X'); 296 index += 2; 297 } else if (contains(value, index + 1, 3, "CIA")) { 298 //-- "focaccia" --// 299 result.append('X'); 300 index += 3; 301 } else if (contains(value, index, 2, "CC") && 302 !(index == 1 && charAt(value, 0) == 'M')) { 303 //-- double "cc" but not "McClelland" --// 304 return handleCC(value, result, index); 305 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 306 result.append('K'); 307 index += 2; 308 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 309 //-- Italian vs. English --// 310 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 311 result.append('S', 'X'); 312 } else { 313 result.append('S'); 314 } 315 index += 2; 316 } else { 317 result.append('K'); 318 if (contains(value, index + 1, 2, " C", " Q", " G")) { 319 //-- Mac Caffrey, Mac Gregor --// 320 index += 3; 321 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 322 !contains(value, index + 1, 2, "CE", "CI")) { 323 index += 2; 324 } else { 325 index++; 326 } 327 } 328 329 return index; 330 } 331 332 /** 333 * Handles 'CC' cases 334 */ 335 private int handleCC(String value, 336 DoubleMetaphoneResult result, 337 int index) { 338 if (contains(value, index + 2, 1, "I", "E", "H") && 339 !contains(value, index + 2, 2, "HU")) { 340 //-- "bellocchio" but not "bacchus" --// 341 if ((index == 1 && charAt(value, index - 1) == 'A') || 342 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 343 //-- "accident", "accede", "succeed" --// 344 result.append("KS"); 345 } else { 346 //-- "bacci", "bertucci", other Italian --// 347 result.append('X'); 348 } 349 index += 3; 350 } else { // Pierce's rule 351 result.append('K'); 352 index += 2; 353 } 354 355 return index; 356 } 357 358 /** 359 * Handles 'CH' cases 360 */ 361 private int handleCH(String value, 362 DoubleMetaphoneResult result, 363 int index) { 364 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 365 result.append('K', 'X'); 366 return index + 2; 367 } else if (conditionCH0(value, index)) { 368 //-- Greek roots ("chemistry", "chorus", etc.) --// 369 result.append('K'); 370 return index + 2; 371 } else if (conditionCH1(value, index)) { 372 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 373 result.append('K'); 374 return index + 2; 375 } else { 376 if (index > 0) { 377 if (contains(value, 0, 2, "MC")) { 378 result.append('K'); 379 } else { 380 result.append('X', 'K'); 381 } 382 } else { 383 result.append('X'); 384 } 385 return index + 2; 386 } 387 } 388 389 /** 390 * Handles 'D' cases 391 */ 392 private int handleD(String value, 393 DoubleMetaphoneResult result, 394 int index) { 395 if (contains(value, index, 2, "DG")) { 396 //-- "Edge" --// 397 if (contains(value, index + 2, 1, "I", "E", "Y")) { 398 result.append('J'); 399 index += 3; 400 //-- "Edgar" --// 401 } else { 402 result.append("TK"); 403 index += 2; 404 } 405 } else if (contains(value, index, 2, "DT", "DD")) { 406 result.append('T'); 407 index += 2; 408 } else { 409 result.append('T'); 410 index++; 411 } 412 return index; 413 } 414 415 /** 416 * Handles 'G' cases 417 */ 418 private int handleG(String value, 419 DoubleMetaphoneResult result, 420 int index, 421 boolean slavoGermanic) { 422 if (charAt(value, index + 1) == 'H') { 423 index = handleGH(value, result, index); 424 } else if (charAt(value, index + 1) == 'N') { 425 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 426 result.append("KN", "N"); 427 } else if (!contains(value, index + 2, 2, "EY") && 428 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 429 result.append("N", "KN"); 430 } else { 431 result.append("KN"); 432 } 433 index = index + 2; 434 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 435 result.append("KL", "L"); 436 index += 2; 437 } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 438 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 439 result.append('K', 'J'); 440 index += 2; 441 } else if ((contains(value, index + 1, 2, "ER") || 442 charAt(value, index + 1) == 'Y') && 443 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 444 !contains(value, index - 1, 1, "E", "I") && 445 !contains(value, index - 1, 3, "RGY", "OGY")) { 446 //-- -ger-, -gy- --// 447 result.append('K', 'J'); 448 index += 2; 449 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 450 contains(value, index - 1, 4, "AGGI", "OGGI")) { 451 //-- Italian "biaggi" --// 452 if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) { 453 //-- obvious germanic --// 454 result.append('K'); 455 } else if (contains(value, index + 1, 3, "IER")) { 456 result.append('J'); 457 } else { 458 result.append('J', 'K'); 459 } 460 index += 2; 461 } else if (charAt(value, index + 1) == 'G') { 462 index += 2; 463 result.append('K'); 464 } else { 465 index++; 466 result.append('K'); 467 } 468 return index; 469 } 470 471 /** 472 * Handles 'GH' cases 473 */ 474 private int handleGH(String value, 475 DoubleMetaphoneResult result, 476 int index) { 477 if (index > 0 && !isVowel(charAt(value, index - 1))) { 478 result.append('K'); 479 index += 2; 480 } else if (index == 0) { 481 if (charAt(value, index + 2) == 'I') { 482 result.append('J'); 483 } else { 484 result.append('K'); 485 } 486 index += 2; 487 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 488 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 489 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 490 //-- Parker's rule (with some further refinements) - "hugh" 491 index += 2; 492 } else { 493 if (index > 2 && charAt(value, index - 1) == 'U' && 494 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 495 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 496 result.append('F'); 497 } else if (index > 0 && charAt(value, index - 1) != 'I') { 498 result.append('K'); 499 } 500 index += 2; 501 } 502 return index; 503 } 504 505 /** 506 * Handles 'H' cases 507 */ 508 private int handleH(String value, 509 DoubleMetaphoneResult result, 510 int index) { 511 //-- only keep if first & before vowel or between 2 vowels --// 512 if ((index == 0 || isVowel(charAt(value, index - 1))) && 513 isVowel(charAt(value, index + 1))) { 514 result.append('H'); 515 index += 2; 516 //-- also takes car of "HH" --// 517 } else { 518 index++; 519 } 520 return index; 521 } 522 523 /** 524 * Handles 'J' cases 525 */ 526 private int handleJ(String value, DoubleMetaphoneResult result, int index, 527 boolean slavoGermanic) { 528 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 529 //-- obvious Spanish, "Jose", "San Jacinto" --// 530 if ((index == 0 && (charAt(value, index + 4) == ' ') || 531 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 532 result.append('H'); 533 } else { 534 result.append('J', 'H'); 535 } 536 index++; 537 } else { 538 if (index == 0 && !contains(value, index, 4, "JOSE")) { 539 result.append('J', 'A'); 540 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 541 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 542 result.append('J', 'H'); 543 } else if (index == value.length() - 1) { 544 result.append('J', ' '); 545 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) { 546 result.append('J'); 547 } 548 549 if (charAt(value, index + 1) == 'J') { 550 index += 2; 551 } else { 552 index++; 553 } 554 } 555 return index; 556 } 557 558 /** 559 * Handles 'L' cases 560 */ 561 private int handleL(String value, 562 DoubleMetaphoneResult result, 563 int index) { 564 if (charAt(value, index + 1) == 'L') { 565 if (conditionL0(value, index)) { 566 result.appendPrimary('L'); 567 } else { 568 result.append('L'); 569 } 570 index += 2; 571 } else { 572 index++; 573 result.append('L'); 574 } 575 return index; 576 } 577 578 /** 579 * Handles 'P' cases 580 */ 581 private int handleP(String value, 582 DoubleMetaphoneResult result, 583 int index) { 584 if (charAt(value, index + 1) == 'H') { 585 result.append('F'); 586 index += 2; 587 } else { 588 result.append('P'); 589 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 590 } 591 return index; 592 } 593 594 /** 595 * Handles 'R' cases 596 */ 597 private int handleR(String value, 598 DoubleMetaphoneResult result, 599 int index, 600 boolean slavoGermanic) { 601 if (index == value.length() - 1 && !slavoGermanic && 602 contains(value, index - 2, 2, "IE") && 603 !contains(value, index - 4, 2, "ME", "MA")) { 604 result.appendAlternate('R'); 605 } else { 606 result.append('R'); 607 } 608 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 609 } 610 611 /** 612 * Handles 'S' cases 613 */ 614 private int handleS(String value, 615 DoubleMetaphoneResult result, 616 int index, 617 boolean slavoGermanic) { 618 if (contains(value, index - 1, 3, "ISL", "YSL")) { 619 //-- special cases "island", "isle", "carlisle", "carlysle" --// 620 index++; 621 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 622 //-- special case "sugar-" --// 623 result.append('X', 'S'); 624 index++; 625 } else if (contains(value, index, 2, "SH")) { 626 if (contains(value, index + 1, 4, 627 "HEIM", "HOEK", "HOLM", "HOLZ")) { 628 //-- germanic --// 629 result.append('S'); 630 } else { 631 result.append('X'); 632 } 633 index += 2; 634 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 635 //-- Italian and Armenian --// 636 if (slavoGermanic) { 637 result.append('S'); 638 } else { 639 result.append('S', 'X'); 640 } 641 index += 3; 642 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) { 643 //-- german & anglicisations, e.g. "smith" match "schmidt" // 644 // "snider" match "schneider" --// 645 //-- also, -sz- in slavic language altho in hungarian it // 646 // is pronounced "s" --// 647 result.append('S', 'X'); 648 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 649 } else if (contains(value, index, 2, "SC")) { 650 index = handleSC(value, result, index); 651 } else { 652 if (index == value.length() - 1 && contains(value, index - 2, 653 2, "AI", "OI")){ 654 //-- french e.g. "resnais", "artois" --// 655 result.appendAlternate('S'); 656 } else { 657 result.append('S'); 658 } 659 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 660 } 661 return index; 662 } 663 664 /** 665 * Handles 'SC' cases 666 */ 667 private int handleSC(String value, 668 DoubleMetaphoneResult result, 669 int index) { 670 if (charAt(value, index + 2) == 'H') { 671 //-- Schlesinger's rule --// 672 if (contains(value, index + 3, 673 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 674 //-- Dutch origin, e.g. "school", "schooner" --// 675 if (contains(value, index + 3, 2, "ER", "EN")) { 676 //-- "schermerhorn", "schenker" --// 677 result.append("X", "SK"); 678 } else { 679 result.append("SK"); 680 } 681 } else { 682 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 683 result.append('X', 'S'); 684 } else { 685 result.append('X'); 686 } 687 } 688 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 689 result.append('S'); 690 } else { 691 result.append("SK"); 692 } 693 return index + 3; 694 } 695 696 /** 697 * Handles 'T' cases 698 */ 699 private int handleT(String value, 700 DoubleMetaphoneResult result, 701 int index) { 702 if (contains(value, index, 4, "TION")) { 703 result.append('X'); 704 index += 3; 705 } else if (contains(value, index, 3, "TIA", "TCH")) { 706 result.append('X'); 707 index += 3; 708 } else if (contains(value, index, 2, "TH") || contains(value, index, 709 3, "TTH")) { 710 if (contains(value, index + 2, 2, "OM", "AM") || 711 //-- special case "thomas", "thames" or germanic --// 712 contains(value, 0, 4, "VAN ", "VON ") || 713 contains(value, 0, 3, "SCH")) { 714 result.append('T'); 715 } else { 716 result.append('0', 'T'); 717 } 718 index += 2; 719 } else { 720 result.append('T'); 721 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 722 } 723 return index; 724 } 725 726 /** 727 * Handles 'W' cases 728 */ 729 private int handleW(String value, 730 DoubleMetaphoneResult result, 731 int index) { 732 if (contains(value, index, 2, "WR")) { 733 //-- can also be in middle of word --// 734 result.append('R'); 735 index += 2; 736 } else { 737 if (index == 0 && (isVowel(charAt(value, index + 1)) || 738 contains(value, index, 2, "WH"))) { 739 if (isVowel(charAt(value, index + 1))) { 740 //-- Wasserman should match Vasserman --// 741 result.append('A', 'F'); 742 } else { 743 //-- need Uomo to match Womo --// 744 result.append('A'); 745 } 746 index++; 747 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 748 contains(value, index - 1, 749 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 750 contains(value, 0, 3, "SCH")) { 751 //-- Arnow should match Arnoff --// 752 result.appendAlternate('F'); 753 index++; 754 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 755 //-- Polish e.g. "filipowicz" --// 756 result.append("TS", "FX"); 757 index += 4; 758 } else { 759 index++; 760 } 761 } 762 return index; 763 } 764 765 /** 766 * Handles 'X' cases 767 */ 768 private int handleX(String value, 769 DoubleMetaphoneResult result, 770 int index) { 771 if (index == 0) { 772 result.append('S'); 773 index++; 774 } else { 775 if (!((index == value.length() - 1) && 776 (contains(value, index - 3, 3, "IAU", "EAU") || 777 contains(value, index - 2, 2, "AU", "OU")))) { 778 //-- French e.g. breaux --// 779 result.append("KS"); 780 } 781 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 782 } 783 return index; 784 } 785 786 /** 787 * Handles 'Z' cases 788 */ 789 private int handleZ(String value, DoubleMetaphoneResult result, int index, 790 boolean slavoGermanic) { 791 if (charAt(value, index + 1) == 'H') { 792 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 793 result.append('J'); 794 index += 2; 795 } else { 796 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 797 result.append("S", "TS"); 798 } else { 799 result.append('S'); 800 } 801 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 802 } 803 return index; 804 } 805 806 //-- BEGIN CONDITIONS --// 807 808 /** 809 * Complex condition 0 for 'C' 810 */ 811 private boolean conditionC0(String value, int index) { 812 if (contains(value, index, 4, "CHIA")) { 813 return true; 814 } else if (index <= 1) { 815 return false; 816 } else if (isVowel(charAt(value, index - 2))) { 817 return false; 818 } else if (!contains(value, index - 1, 3, "ACH")) { 819 return false; 820 } else { 821 char c = charAt(value, index + 2); 822 return (c != 'I' && c != 'E') || 823 contains(value, index - 2, 6, "BACHER", "MACHER"); 824 } 825 } 826 827 /** 828 * Complex condition 0 for 'CH' 829 */ 830 private boolean conditionCH0(String value, int index) { 831 if (index != 0) { 832 return false; 833 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 834 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 835 return false; 836 } else if (contains(value, 0, 5, "CHORE")) { 837 return false; 838 } else { 839 return true; 840 } 841 } 842 843 /** 844 * Complex condition 1 for 'CH' 845 */ 846 private boolean conditionCH1(String value, int index) { 847 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 848 3, "SCH")) || 849 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 850 contains(value, index + 2, 1, "T", "S") || 851 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 852 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 853 } 854 855 /** 856 * Complex condition 0 for 'L' 857 */ 858 private boolean conditionL0(String value, int index) { 859 if (index == value.length() - 3 && 860 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 861 return true; 862 } else if ((contains(value, value.length() - 2, 2, "AS", "OS") || 863 contains(value, value.length() - 1, 1, "A", "O")) && 864 contains(value, index - 1, 4, "ALLE")) { 865 return true; 866 } else { 867 return false; 868 } 869 } 870 871 /** 872 * Complex condition 0 for 'M' 873 */ 874 private boolean conditionM0(String value, int index) { 875 if (charAt(value, index + 1) == 'M') { 876 return true; 877 } 878 return contains(value, index - 1, 3, "UMB") && 879 ((index + 1) == value.length() - 1 || contains(value, 880 index + 2, 2, "ER")); 881 } 882 883 //-- BEGIN HELPER FUNCTIONS --// 884 885 /** 886 * Determines whether or not a value is of slavo-germanic orgin. A value is 887 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 888 */ 889 private boolean isSlavoGermanic(String value) { 890 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 891 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; 892 } 893 894 /** 895 * Determines whether or not a character is a vowel or not 896 */ 897 private boolean isVowel(char ch) { 898 return VOWELS.indexOf(ch) != -1; 899 } 900 901 /** 902 * Determines whether or not the value starts with a silent letter. It will 903 * return <code>true</code> if the value starts with any of 'GN', 'KN', 904 * 'PN', 'WR' or 'PS'. 905 */ 906 private boolean isSilentStart(String value) { 907 boolean result = false; 908 for (int i = 0; i < SILENT_START.length; i++) { 909 if (value.startsWith(SILENT_START[i])) { 910 result = true; 911 break; 912 } 913 } 914 return result; 915 } 916 917 /** 918 * Cleans the input 919 */ 920 private String cleanInput(String input) { 921 if (input == null) { 922 return null; 923 } 924 input = input.trim(); 925 if (input.length() == 0) { 926 return null; 927 } 928 return input.toUpperCase(java.util.Locale.ENGLISH); 929 } 930 931 /** 932 * Gets the character at index <code>index</code> if available, otherwise 933 * it returns <code>Character.MIN_VALUE</code> so that there is some sort 934 * of a default 935 */ 936 protected char charAt(String value, int index) { 937 if (index < 0 || index >= value.length()) { 938 return Character.MIN_VALUE; 939 } 940 return value.charAt(index); 941 } 942 943 /** 944 * Shortcut method with 1 criteria 945 */ 946 private static boolean contains(String value, int start, int length, 947 String criteria) { 948 return contains(value, start, length, 949 new String[] { criteria }); 950 } 951 952 /** 953 * Shortcut method with 2 criteria 954 */ 955 private static boolean contains(String value, int start, int length, 956 String criteria1, String criteria2) { 957 return contains(value, start, length, 958 new String[] { criteria1, criteria2 }); 959 } 960 961 /** 962 * Shortcut method with 3 criteria 963 */ 964 private static boolean contains(String value, int start, int length, 965 String criteria1, String criteria2, 966 String criteria3) { 967 return contains(value, start, length, 968 new String[] { criteria1, criteria2, criteria3 }); 969 } 970 971 /** 972 * Shortcut method with 4 criteria 973 */ 974 private static boolean contains(String value, int start, int length, 975 String criteria1, String criteria2, 976 String criteria3, String criteria4) { 977 return contains(value, start, length, 978 new String[] { criteria1, criteria2, criteria3, 979 criteria4 }); 980 } 981 982 /** 983 * Shortcut method with 5 criteria 984 */ 985 private static boolean contains(String value, int start, int length, 986 String criteria1, String criteria2, 987 String criteria3, String criteria4, 988 String criteria5) { 989 return contains(value, start, length, 990 new String[] { criteria1, criteria2, criteria3, 991 criteria4, criteria5 }); 992 } 993 994 /** 995 * Shortcut method with 6 criteria 996 */ 997 private static boolean contains(String value, int start, int length, 998 String criteria1, String criteria2, 999 String criteria3, String criteria4, 1000 String criteria5, String criteria6) { 1001 return contains(value, start, length, 1002 new String[] { criteria1, criteria2, criteria3, 1003 criteria4, criteria5, criteria6 }); 1004 } 1005 1006 /** 1007 * Determines whether <code>value</code> contains any of the criteria starting at index <code>start</code> and 1008 * matching up to length <code>length</code> 1009 */ 1010 protected static boolean contains(String value, int start, int length, 1011 String[] criteria) { 1012 boolean result = false; 1013 if (start >= 0 && start + length <= value.length()) { 1014 String target = value.substring(start, start + length); 1015 1016 for (int i = 0; i < criteria.length; i++) { 1017 if (target.equals(criteria[i])) { 1018 result = true; 1019 break; 1020 } 1021 } 1022 } 1023 return result; 1024 } 1025 1026 //-- BEGIN INNER CLASSES --// 1027 1028 /** 1029 * Inner class for storing results, since there is the optional alternate 1030 * encoding. 1031 */ 1032 public class DoubleMetaphoneResult { 1033 1034 private StringBuffer primary = new StringBuffer(getMaxCodeLen()); 1035 private StringBuffer alternate = new StringBuffer(getMaxCodeLen()); 1036 private int maxLength; 1037 1038 public DoubleMetaphoneResult(int maxLength) { 1039 this.maxLength = maxLength; 1040 } 1041 1042 public void append(char value) { 1043 appendPrimary(value); 1044 appendAlternate(value); 1045 } 1046 1047 public void append(char primary, char alternate) { 1048 appendPrimary(primary); 1049 appendAlternate(alternate); 1050 } 1051 1052 public void appendPrimary(char value) { 1053 if (this.primary.length() < this.maxLength) { 1054 this.primary.append(value); 1055 } 1056 } 1057 1058 public void appendAlternate(char value) { 1059 if (this.alternate.length() < this.maxLength) { 1060 this.alternate.append(value); 1061 } 1062 } 1063 1064 public void append(String value) { 1065 appendPrimary(value); 1066 appendAlternate(value); 1067 } 1068 1069 public void append(String primary, String alternate) { 1070 appendPrimary(primary); 1071 appendAlternate(alternate); 1072 } 1073 1074 public void appendPrimary(String value) { 1075 int addChars = this.maxLength - this.primary.length(); 1076 if (value.length() <= addChars) { 1077 this.primary.append(value); 1078 } else { 1079 this.primary.append(value.substring(0, addChars)); 1080 } 1081 } 1082 1083 public void appendAlternate(String value) { 1084 int addChars = this.maxLength - this.alternate.length(); 1085 if (value.length() <= addChars) { 1086 this.alternate.append(value); 1087 } else { 1088 this.alternate.append(value.substring(0, addChars)); 1089 } 1090 } 1091 1092 public String getPrimary() { 1093 return this.primary.toString(); 1094 } 1095 1096 public String getAlternate() { 1097 return this.alternate.toString(); 1098 } 1099 1100 public boolean isComplete() { 1101 return this.primary.length() >= this.maxLength && 1102 this.alternate.length() >= this.maxLength; 1103 } 1104 } 1105 }