001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language;
019
020 import org.apache.commons.codec.EncoderException;
021 import org.apache.commons.codec.StringEncoder;
022
023 /**
024 * Encodes a string into a metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * </p>
029 * <p>
030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
031 * 39.</CITE>
032 * </p>
033 * <p>
034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm
035 * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
036 * They have had undocumented changes from the originally published algorithm.
037 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
038 * </p>
039 *
040 * @author Apache Software Foundation
041 * @version $Id: Metaphone.java 797690 2009-07-24 23:28:35Z ggregory $
042 */
043 public class Metaphone implements StringEncoder {
044
045 /**
046 * Five values in the English language
047 */
048 private static final String VOWELS = "AEIOU" ;
049
050 /**
051 * Variable used in Metaphone algorithm
052 */
053 private static final String FRONTV = "EIY" ;
054
055 /**
056 * Variable used in Metaphone algorithm
057 */
058 private static final String VARSON = "CSPTG" ;
059
060 /**
061 * The max code length for metaphone is 4
062 */
063 private int maxCodeLen = 4 ;
064
065 /**
066 * Creates an instance of the Metaphone encoder
067 */
068 public Metaphone() {
069 super();
070 }
071
072 /**
073 * Find the metaphone value of a String. This is similar to the
074 * soundex algorithm, but better at finding similar sounding words.
075 * All input is converted to upper case.
076 * Limitations: Input format is expected to be a single ASCII word
077 * with only characters in the A - Z range, no punctuation or numbers.
078 *
079 * @param txt String to find the metaphone code for
080 * @return A metaphone code corresponding to the String supplied
081 */
082 public String metaphone(String txt) {
083 boolean hard = false ;
084 if ((txt == null) || (txt.length() == 0)) {
085 return "" ;
086 }
087 // single character is itself
088 if (txt.length() == 1) {
089 return txt.toUpperCase(java.util.Locale.ENGLISH) ;
090 }
091
092 char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ;
093
094 StringBuffer local = new StringBuffer(40); // manipulate
095 StringBuffer code = new StringBuffer(10) ; // output
096 // handle initial 2 characters exceptions
097 switch(inwd[0]) {
098 case 'K' :
099 case 'G' :
100 case 'P' : /* looking for KN, etc*/
101 if (inwd[1] == 'N') {
102 local.append(inwd, 1, inwd.length - 1);
103 } else {
104 local.append(inwd);
105 }
106 break;
107 case 'A': /* looking for AE */
108 if (inwd[1] == 'E') {
109 local.append(inwd, 1, inwd.length - 1);
110 } else {
111 local.append(inwd);
112 }
113 break;
114 case 'W' : /* looking for WR or WH */
115 if (inwd[1] == 'R') { // WR -> R
116 local.append(inwd, 1, inwd.length - 1);
117 break ;
118 }
119 if (inwd[1] == 'H') {
120 local.append(inwd, 1, inwd.length - 1);
121 local.setCharAt(0, 'W'); // WH -> W
122 } else {
123 local.append(inwd);
124 }
125 break;
126 case 'X' : /* initial X becomes S */
127 inwd[0] = 'S';
128 local.append(inwd);
129 break ;
130 default :
131 local.append(inwd);
132 } // now local has working string with initials fixed
133
134 int wdsz = local.length();
135 int n = 0 ;
136
137 while ((code.length() < this.getMaxCodeLen()) &&
138 (n < wdsz) ) { // max code size of 4 works well
139 char symb = local.charAt(n) ;
140 // remove duplicate letters except C
141 if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
142 n++ ;
143 } else { // not dup
144 switch(symb) {
145 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
146 if (n == 0) {
147 code.append(symb);
148 }
149 break ; // only use vowel if leading char
150 case 'B' :
151 if ( isPreviousChar(local, n, 'M') &&
152 isLastChar(wdsz, n) ) { // B is silent if word ends in MB
153 break;
154 }
155 code.append(symb);
156 break;
157 case 'C' : // lots of C special cases
158 /* discard if SCI, SCE or SCY */
159 if ( isPreviousChar(local, n, 'S') &&
160 !isLastChar(wdsz, n) &&
161 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) {
162 break;
163 }
164 if (regionMatch(local, n, "CIA")) { // "CIA" -> X
165 code.append('X');
166 break;
167 }
168 if (!isLastChar(wdsz, n) &&
169 (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
170 code.append('S');
171 break; // CI,CE,CY -> S
172 }
173 if (isPreviousChar(local, n, 'S') &&
174 isNextChar(local, n, 'H') ) { // SCH->sk
175 code.append('K') ;
176 break ;
177 }
178 if (isNextChar(local, n, 'H')) { // detect CH
179 if ((n == 0) &&
180 (wdsz >= 3) &&
181 isVowel(local,2) ) { // CH consonant -> K consonant
182 code.append('K');
183 } else {
184 code.append('X'); // CHvowel -> X
185 }
186 } else {
187 code.append('K');
188 }
189 break ;
190 case 'D' :
191 if (!isLastChar(wdsz, n + 1) &&
192 isNextChar(local, n, 'G') &&
193 (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
194 code.append('J'); n += 2 ;
195 } else {
196 code.append('T');
197 }
198 break ;
199 case 'G' : // GH silent at end or before consonant
200 if (isLastChar(wdsz, n + 1) &&
201 isNextChar(local, n, 'H')) {
202 break;
203 }
204 if (!isLastChar(wdsz, n + 1) &&
205 isNextChar(local,n,'H') &&
206 !isVowel(local,n+2)) {
207 break;
208 }
209 if ((n > 0) &&
210 ( regionMatch(local, n, "GN") ||
211 regionMatch(local, n, "GNED") ) ) {
212 break; // silent G
213 }
214 if (isPreviousChar(local, n, 'G')) {
215 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
216 hard = true ;
217 } else {
218 hard = false ;
219 }
220 if (!isLastChar(wdsz, n) &&
221 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) &&
222 (!hard)) {
223 code.append('J');
224 } else {
225 code.append('K');
226 }
227 break ;
228 case 'H':
229 if (isLastChar(wdsz, n)) {
230 break ; // terminal H
231 }
232 if ((n > 0) &&
233 (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
234 break;
235 }
236 if (isVowel(local,n+1)) {
237 code.append('H'); // Hvowel
238 }
239 break;
240 case 'F':
241 case 'J' :
242 case 'L' :
243 case 'M':
244 case 'N' :
245 case 'R' :
246 code.append(symb);
247 break;
248 case 'K' :
249 if (n > 0) { // not initial
250 if (!isPreviousChar(local, n, 'C')) {
251 code.append(symb);
252 }
253 } else {
254 code.append(symb); // initial K
255 }
256 break ;
257 case 'P' :
258 if (isNextChar(local,n,'H')) {
259 // PH -> F
260 code.append('F');
261 } else {
262 code.append(symb);
263 }
264 break ;
265 case 'Q' :
266 code.append('K');
267 break;
268 case 'S' :
269 if (regionMatch(local,n,"SH") ||
270 regionMatch(local,n,"SIO") ||
271 regionMatch(local,n,"SIA")) {
272 code.append('X');
273 } else {
274 code.append('S');
275 }
276 break;
277 case 'T' :
278 if (regionMatch(local,n,"TIA") ||
279 regionMatch(local,n,"TIO")) {
280 code.append('X');
281 break;
282 }
283 if (regionMatch(local,n,"TCH")) {
284 // Silent if in "TCH"
285 break;
286 }
287 // substitute numeral 0 for TH (resembles theta after all)
288 if (regionMatch(local,n,"TH")) {
289 code.append('0');
290 } else {
291 code.append('T');
292 }
293 break ;
294 case 'V' :
295 code.append('F'); break ;
296 case 'W' : case 'Y' : // silent if not followed by vowel
297 if (!isLastChar(wdsz,n) &&
298 isVowel(local,n+1)) {
299 code.append(symb);
300 }
301 break ;
302 case 'X' :
303 code.append('K'); code.append('S');
304 break ;
305 case 'Z' :
306 code.append('S'); break ;
307 } // end switch
308 n++ ;
309 } // end else from symb != 'C'
310 if (code.length() > this.getMaxCodeLen()) {
311 code.setLength(this.getMaxCodeLen());
312 }
313 }
314 return code.toString();
315 }
316
317 private boolean isVowel(StringBuffer string, int index) {
318 return VOWELS.indexOf(string.charAt(index)) >= 0;
319 }
320
321 private boolean isPreviousChar(StringBuffer string, int index, char c) {
322 boolean matches = false;
323 if( index > 0 &&
324 index < string.length() ) {
325 matches = string.charAt(index - 1) == c;
326 }
327 return matches;
328 }
329
330 private boolean isNextChar(StringBuffer string, int index, char c) {
331 boolean matches = false;
332 if( index >= 0 &&
333 index < string.length() - 1 ) {
334 matches = string.charAt(index + 1) == c;
335 }
336 return matches;
337 }
338
339 private boolean regionMatch(StringBuffer string, int index, String test) {
340 boolean matches = false;
341 if( index >= 0 &&
342 (index + test.length() - 1) < string.length() ) {
343 String substring = string.substring( index, index + test.length());
344 matches = substring.equals( test );
345 }
346 return matches;
347 }
348
349 private boolean isLastChar(int wdsz, int n) {
350 return n + 1 == wdsz;
351 }
352
353
354 /**
355 * Encodes an Object using the metaphone algorithm. This method
356 * is provided in order to satisfy the requirements of the
357 * Encoder interface, and will throw an EncoderException if the
358 * supplied object is not of type java.lang.String.
359 *
360 * @param pObject Object to encode
361 * @return An object (or type java.lang.String) containing the
362 * metaphone code which corresponds to the String supplied.
363 * @throws EncoderException if the parameter supplied is not
364 * of type java.lang.String
365 */
366 public Object encode(Object pObject) throws EncoderException {
367 if (!(pObject instanceof String)) {
368 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
369 }
370 return metaphone((String) pObject);
371 }
372
373 /**
374 * Encodes a String using the Metaphone algorithm.
375 *
376 * @param pString String object to encode
377 * @return The metaphone code corresponding to the String supplied
378 */
379 public String encode(String pString) {
380 return metaphone(pString);
381 }
382
383 /**
384 * Tests is the metaphones of two strings are identical.
385 *
386 * @param str1 First of two strings to compare
387 * @param str2 Second of two strings to compare
388 * @return <code>true</code> if the metaphones of these strings are identical,
389 * <code>false</code> otherwise.
390 */
391 public boolean isMetaphoneEqual(String str1, String str2) {
392 return metaphone(str1).equals(metaphone(str2));
393 }
394
395 /**
396 * Returns the maxCodeLen.
397 * @return int
398 */
399 public int getMaxCodeLen() { return this.maxCodeLen; }
400
401 /**
402 * Sets the maxCodeLen.
403 * @param maxCodeLen The maxCodeLen to set
404 */
405 public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
406
407 }