001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language; 019 020 import org.apache.commons.codec.EncoderException; 021 import org.apache.commons.codec.StringEncoder; 022 023 /** 024 * Encodes a string into a metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * </p> 029 * <p> 030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p 031 * 39.</CITE> 032 * </p> 033 * <p> 034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm 035 * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>. 036 * They have had undocumented changes from the originally published algorithm. 037 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 038 * </p> 039 * 040 * @author Apache Software Foundation 041 * @version $Id: Metaphone.java 797690 2009-07-24 23:28:35Z ggregory $ 042 */ 043 public class Metaphone implements StringEncoder { 044 045 /** 046 * Five values in the English language 047 */ 048 private static final String VOWELS = "AEIOU" ; 049 050 /** 051 * Variable used in Metaphone algorithm 052 */ 053 private static final String FRONTV = "EIY" ; 054 055 /** 056 * Variable used in Metaphone algorithm 057 */ 058 private static final String VARSON = "CSPTG" ; 059 060 /** 061 * The max code length for metaphone is 4 062 */ 063 private int maxCodeLen = 4 ; 064 065 /** 066 * Creates an instance of the Metaphone encoder 067 */ 068 public Metaphone() { 069 super(); 070 } 071 072 /** 073 * Find the metaphone value of a String. This is similar to the 074 * soundex algorithm, but better at finding similar sounding words. 075 * All input is converted to upper case. 076 * Limitations: Input format is expected to be a single ASCII word 077 * with only characters in the A - Z range, no punctuation or numbers. 078 * 079 * @param txt String to find the metaphone code for 080 * @return A metaphone code corresponding to the String supplied 081 */ 082 public String metaphone(String txt) { 083 boolean hard = false ; 084 if ((txt == null) || (txt.length() == 0)) { 085 return "" ; 086 } 087 // single character is itself 088 if (txt.length() == 1) { 089 return txt.toUpperCase(java.util.Locale.ENGLISH) ; 090 } 091 092 char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ; 093 094 StringBuffer local = new StringBuffer(40); // manipulate 095 StringBuffer code = new StringBuffer(10) ; // output 096 // handle initial 2 characters exceptions 097 switch(inwd[0]) { 098 case 'K' : 099 case 'G' : 100 case 'P' : /* looking for KN, etc*/ 101 if (inwd[1] == 'N') { 102 local.append(inwd, 1, inwd.length - 1); 103 } else { 104 local.append(inwd); 105 } 106 break; 107 case 'A': /* looking for AE */ 108 if (inwd[1] == 'E') { 109 local.append(inwd, 1, inwd.length - 1); 110 } else { 111 local.append(inwd); 112 } 113 break; 114 case 'W' : /* looking for WR or WH */ 115 if (inwd[1] == 'R') { // WR -> R 116 local.append(inwd, 1, inwd.length - 1); 117 break ; 118 } 119 if (inwd[1] == 'H') { 120 local.append(inwd, 1, inwd.length - 1); 121 local.setCharAt(0, 'W'); // WH -> W 122 } else { 123 local.append(inwd); 124 } 125 break; 126 case 'X' : /* initial X becomes S */ 127 inwd[0] = 'S'; 128 local.append(inwd); 129 break ; 130 default : 131 local.append(inwd); 132 } // now local has working string with initials fixed 133 134 int wdsz = local.length(); 135 int n = 0 ; 136 137 while ((code.length() < this.getMaxCodeLen()) && 138 (n < wdsz) ) { // max code size of 4 works well 139 char symb = local.charAt(n) ; 140 // remove duplicate letters except C 141 if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { 142 n++ ; 143 } else { // not dup 144 switch(symb) { 145 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : 146 if (n == 0) { 147 code.append(symb); 148 } 149 break ; // only use vowel if leading char 150 case 'B' : 151 if ( isPreviousChar(local, n, 'M') && 152 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 153 break; 154 } 155 code.append(symb); 156 break; 157 case 'C' : // lots of C special cases 158 /* discard if SCI, SCE or SCY */ 159 if ( isPreviousChar(local, n, 'S') && 160 !isLastChar(wdsz, n) && 161 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { 162 break; 163 } 164 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 165 code.append('X'); 166 break; 167 } 168 if (!isLastChar(wdsz, n) && 169 (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) { 170 code.append('S'); 171 break; // CI,CE,CY -> S 172 } 173 if (isPreviousChar(local, n, 'S') && 174 isNextChar(local, n, 'H') ) { // SCH->sk 175 code.append('K') ; 176 break ; 177 } 178 if (isNextChar(local, n, 'H')) { // detect CH 179 if ((n == 0) && 180 (wdsz >= 3) && 181 isVowel(local,2) ) { // CH consonant -> K consonant 182 code.append('K'); 183 } else { 184 code.append('X'); // CHvowel -> X 185 } 186 } else { 187 code.append('K'); 188 } 189 break ; 190 case 'D' : 191 if (!isLastChar(wdsz, n + 1) && 192 isNextChar(local, n, 'G') && 193 (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 194 code.append('J'); n += 2 ; 195 } else { 196 code.append('T'); 197 } 198 break ; 199 case 'G' : // GH silent at end or before consonant 200 if (isLastChar(wdsz, n + 1) && 201 isNextChar(local, n, 'H')) { 202 break; 203 } 204 if (!isLastChar(wdsz, n + 1) && 205 isNextChar(local,n,'H') && 206 !isVowel(local,n+2)) { 207 break; 208 } 209 if ((n > 0) && 210 ( regionMatch(local, n, "GN") || 211 regionMatch(local, n, "GNED") ) ) { 212 break; // silent G 213 } 214 if (isPreviousChar(local, n, 'G')) { 215 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 216 hard = true ; 217 } else { 218 hard = false ; 219 } 220 if (!isLastChar(wdsz, n) && 221 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && 222 (!hard)) { 223 code.append('J'); 224 } else { 225 code.append('K'); 226 } 227 break ; 228 case 'H': 229 if (isLastChar(wdsz, n)) { 230 break ; // terminal H 231 } 232 if ((n > 0) && 233 (VARSON.indexOf(local.charAt(n - 1)) >= 0)) { 234 break; 235 } 236 if (isVowel(local,n+1)) { 237 code.append('H'); // Hvowel 238 } 239 break; 240 case 'F': 241 case 'J' : 242 case 'L' : 243 case 'M': 244 case 'N' : 245 case 'R' : 246 code.append(symb); 247 break; 248 case 'K' : 249 if (n > 0) { // not initial 250 if (!isPreviousChar(local, n, 'C')) { 251 code.append(symb); 252 } 253 } else { 254 code.append(symb); // initial K 255 } 256 break ; 257 case 'P' : 258 if (isNextChar(local,n,'H')) { 259 // PH -> F 260 code.append('F'); 261 } else { 262 code.append(symb); 263 } 264 break ; 265 case 'Q' : 266 code.append('K'); 267 break; 268 case 'S' : 269 if (regionMatch(local,n,"SH") || 270 regionMatch(local,n,"SIO") || 271 regionMatch(local,n,"SIA")) { 272 code.append('X'); 273 } else { 274 code.append('S'); 275 } 276 break; 277 case 'T' : 278 if (regionMatch(local,n,"TIA") || 279 regionMatch(local,n,"TIO")) { 280 code.append('X'); 281 break; 282 } 283 if (regionMatch(local,n,"TCH")) { 284 // Silent if in "TCH" 285 break; 286 } 287 // substitute numeral 0 for TH (resembles theta after all) 288 if (regionMatch(local,n,"TH")) { 289 code.append('0'); 290 } else { 291 code.append('T'); 292 } 293 break ; 294 case 'V' : 295 code.append('F'); break ; 296 case 'W' : case 'Y' : // silent if not followed by vowel 297 if (!isLastChar(wdsz,n) && 298 isVowel(local,n+1)) { 299 code.append(symb); 300 } 301 break ; 302 case 'X' : 303 code.append('K'); code.append('S'); 304 break ; 305 case 'Z' : 306 code.append('S'); break ; 307 } // end switch 308 n++ ; 309 } // end else from symb != 'C' 310 if (code.length() > this.getMaxCodeLen()) { 311 code.setLength(this.getMaxCodeLen()); 312 } 313 } 314 return code.toString(); 315 } 316 317 private boolean isVowel(StringBuffer string, int index) { 318 return VOWELS.indexOf(string.charAt(index)) >= 0; 319 } 320 321 private boolean isPreviousChar(StringBuffer string, int index, char c) { 322 boolean matches = false; 323 if( index > 0 && 324 index < string.length() ) { 325 matches = string.charAt(index - 1) == c; 326 } 327 return matches; 328 } 329 330 private boolean isNextChar(StringBuffer string, int index, char c) { 331 boolean matches = false; 332 if( index >= 0 && 333 index < string.length() - 1 ) { 334 matches = string.charAt(index + 1) == c; 335 } 336 return matches; 337 } 338 339 private boolean regionMatch(StringBuffer string, int index, String test) { 340 boolean matches = false; 341 if( index >= 0 && 342 (index + test.length() - 1) < string.length() ) { 343 String substring = string.substring( index, index + test.length()); 344 matches = substring.equals( test ); 345 } 346 return matches; 347 } 348 349 private boolean isLastChar(int wdsz, int n) { 350 return n + 1 == wdsz; 351 } 352 353 354 /** 355 * Encodes an Object using the metaphone algorithm. This method 356 * is provided in order to satisfy the requirements of the 357 * Encoder interface, and will throw an EncoderException if the 358 * supplied object is not of type java.lang.String. 359 * 360 * @param pObject Object to encode 361 * @return An object (or type java.lang.String) containing the 362 * metaphone code which corresponds to the String supplied. 363 * @throws EncoderException if the parameter supplied is not 364 * of type java.lang.String 365 */ 366 public Object encode(Object pObject) throws EncoderException { 367 if (!(pObject instanceof String)) { 368 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 369 } 370 return metaphone((String) pObject); 371 } 372 373 /** 374 * Encodes a String using the Metaphone algorithm. 375 * 376 * @param pString String object to encode 377 * @return The metaphone code corresponding to the String supplied 378 */ 379 public String encode(String pString) { 380 return metaphone(pString); 381 } 382 383 /** 384 * Tests is the metaphones of two strings are identical. 385 * 386 * @param str1 First of two strings to compare 387 * @param str2 Second of two strings to compare 388 * @return <code>true</code> if the metaphones of these strings are identical, 389 * <code>false</code> otherwise. 390 */ 391 public boolean isMetaphoneEqual(String str1, String str2) { 392 return metaphone(str1).equals(metaphone(str2)); 393 } 394 395 /** 396 * Returns the maxCodeLen. 397 * @return int 398 */ 399 public int getMaxCodeLen() { return this.maxCodeLen; } 400 401 /** 402 * Sets the maxCodeLen. 403 * @param maxCodeLen The maxCodeLen to set 404 */ 405 public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 406 407 }