001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
024     * Encodes a string into a metaphone value. 
025     * <p>
026     * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 
027     * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028     * </p>
029     * <p>
030     * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
031     * 39.</CITE>
032     * </p>
033     * <p>
034     * Note, that this does not match the algorithm that ships with PHP, or the algorithm 
035     * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
036     * They have had undocumented changes from the originally published algorithm. 
037     * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
038     * </p>
039     * 
040     * @author Apache Software Foundation
041     * @version $Id: Metaphone.java 797690 2009-07-24 23:28:35Z ggregory $
042     */
043    public class Metaphone implements StringEncoder {
044    
045        /**
046         * Five values in the English language 
047         */
048        private static final String VOWELS = "AEIOU" ;
049    
050        /**
051         * Variable used in Metaphone algorithm
052         */
053        private static final String FRONTV = "EIY"   ;
054    
055        /**
056         * Variable used in Metaphone algorithm
057         */
058        private static final String VARSON = "CSPTG" ;
059    
060        /**
061         * The max code length for metaphone is 4
062         */
063        private int maxCodeLen = 4 ;
064    
065        /**
066         * Creates an instance of the Metaphone encoder
067         */
068        public Metaphone() {
069            super();
070        }
071    
072        /**
073         * Find the metaphone value of a String. This is similar to the
074         * soundex algorithm, but better at finding similar sounding words.
075         * All input is converted to upper case.
076         * Limitations: Input format is expected to be a single ASCII word
077         * with only characters in the A - Z range, no punctuation or numbers.
078         *
079         * @param txt String to find the metaphone code for
080         * @return A metaphone code corresponding to the String supplied
081         */
082        public String metaphone(String txt) {
083            boolean hard = false ;
084            if ((txt == null) || (txt.length() == 0)) {
085                return "" ;
086            }
087            // single character is itself
088            if (txt.length() == 1) {
089                return txt.toUpperCase(java.util.Locale.ENGLISH) ;
090            }
091          
092            char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ;
093          
094            StringBuffer local = new StringBuffer(40); // manipulate
095            StringBuffer code = new StringBuffer(10) ; //   output
096            // handle initial 2 characters exceptions
097            switch(inwd[0]) {
098            case 'K' : 
099            case 'G' : 
100            case 'P' : /* looking for KN, etc*/
101                if (inwd[1] == 'N') {
102                    local.append(inwd, 1, inwd.length - 1);
103                } else {
104                    local.append(inwd);
105                }
106                break;
107            case 'A': /* looking for AE */
108                if (inwd[1] == 'E') {
109                    local.append(inwd, 1, inwd.length - 1);
110                } else {
111                    local.append(inwd);
112                }
113                break;
114            case 'W' : /* looking for WR or WH */
115                if (inwd[1] == 'R') {   // WR -> R
116                    local.append(inwd, 1, inwd.length - 1); 
117                    break ;
118                }
119                if (inwd[1] == 'H') {
120                    local.append(inwd, 1, inwd.length - 1);
121                    local.setCharAt(0, 'W'); // WH -> W
122                } else {
123                    local.append(inwd);
124                }
125                break;
126            case 'X' : /* initial X becomes S */
127                inwd[0] = 'S';
128                local.append(inwd);
129                break ;
130            default :
131                local.append(inwd);
132            } // now local has working string with initials fixed
133    
134            int wdsz = local.length();
135            int n = 0 ;
136    
137            while ((code.length() < this.getMaxCodeLen()) && 
138                   (n < wdsz) ) { // max code size of 4 works well
139                char symb = local.charAt(n) ;
140                // remove duplicate letters except C
141                if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
142                    n++ ;
143                } else { // not dup
144                    switch(symb) {
145                    case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
146                        if (n == 0) { 
147                            code.append(symb);
148                        }
149                        break ; // only use vowel if leading char
150                    case 'B' :
151                        if ( isPreviousChar(local, n, 'M') && 
152                             isLastChar(wdsz, n) ) { // B is silent if word ends in MB
153                            break;
154                        }
155                        code.append(symb);
156                        break;
157                    case 'C' : // lots of C special cases
158                        /* discard if SCI, SCE or SCY */
159                        if ( isPreviousChar(local, n, 'S') && 
160                             !isLastChar(wdsz, n) && 
161                             (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { 
162                            break;
163                        }
164                        if (regionMatch(local, n, "CIA")) { // "CIA" -> X
165                            code.append('X'); 
166                            break;
167                        }
168                        if (!isLastChar(wdsz, n) && 
169                            (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
170                            code.append('S');
171                            break; // CI,CE,CY -> S
172                        }
173                        if (isPreviousChar(local, n, 'S') &&
174                            isNextChar(local, n, 'H') ) { // SCH->sk
175                            code.append('K') ; 
176                            break ;
177                        }
178                        if (isNextChar(local, n, 'H')) { // detect CH
179                            if ((n == 0) && 
180                                (wdsz >= 3) && 
181                                isVowel(local,2) ) { // CH consonant -> K consonant
182                                code.append('K');
183                            } else { 
184                                code.append('X'); // CHvowel -> X
185                            }
186                        } else { 
187                            code.append('K');
188                        }
189                        break ;
190                    case 'D' :
191                        if (!isLastChar(wdsz, n + 1) && 
192                            isNextChar(local, n, 'G') && 
193                            (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
194                            code.append('J'); n += 2 ;
195                        } else { 
196                            code.append('T');
197                        }
198                        break ;
199                    case 'G' : // GH silent at end or before consonant
200                        if (isLastChar(wdsz, n + 1) && 
201                            isNextChar(local, n, 'H')) {
202                            break;
203                        }
204                        if (!isLastChar(wdsz, n + 1) &&  
205                            isNextChar(local,n,'H') && 
206                            !isVowel(local,n+2)) {
207                            break;
208                        }
209                        if ((n > 0) && 
210                            ( regionMatch(local, n, "GN") ||
211                              regionMatch(local, n, "GNED") ) ) {
212                            break; // silent G
213                        }
214                        if (isPreviousChar(local, n, 'G')) {
215                            // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
216                            hard = true ;
217                        } else {
218                            hard = false ;
219                        }
220                        if (!isLastChar(wdsz, n) && 
221                            (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && 
222                            (!hard)) {
223                            code.append('J');
224                        } else {
225                            code.append('K');
226                        }
227                        break ;
228                    case 'H':
229                        if (isLastChar(wdsz, n)) {
230                            break ; // terminal H
231                        }
232                        if ((n > 0) && 
233                            (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
234                            break;
235                        }
236                        if (isVowel(local,n+1)) {
237                            code.append('H'); // Hvowel
238                        }
239                        break;
240                    case 'F': 
241                    case 'J' : 
242                    case 'L' :
243                    case 'M': 
244                    case 'N' : 
245                    case 'R' :
246                        code.append(symb); 
247                        break;
248                    case 'K' :
249                        if (n > 0) { // not initial
250                            if (!isPreviousChar(local, n, 'C')) {
251                                code.append(symb);
252                            }
253                        } else {
254                            code.append(symb); // initial K
255                        }
256                        break ;
257                    case 'P' :
258                        if (isNextChar(local,n,'H')) {
259                            // PH -> F
260                            code.append('F');
261                        } else {
262                            code.append(symb);
263                        }
264                        break ;
265                    case 'Q' :
266                        code.append('K');
267                        break;
268                    case 'S' :
269                        if (regionMatch(local,n,"SH") || 
270                            regionMatch(local,n,"SIO") || 
271                            regionMatch(local,n,"SIA")) {
272                            code.append('X');
273                        } else {
274                            code.append('S');
275                        }
276                        break;
277                    case 'T' :
278                        if (regionMatch(local,n,"TIA") || 
279                            regionMatch(local,n,"TIO")) {
280                            code.append('X'); 
281                            break;
282                        }
283                        if (regionMatch(local,n,"TCH")) {
284                            // Silent if in "TCH"
285                            break;
286                        }
287                        // substitute numeral 0 for TH (resembles theta after all)
288                        if (regionMatch(local,n,"TH")) {
289                            code.append('0');
290                        } else {
291                            code.append('T');
292                        }
293                        break ;
294                    case 'V' :
295                        code.append('F'); break ;
296                    case 'W' : case 'Y' : // silent if not followed by vowel
297                        if (!isLastChar(wdsz,n) && 
298                            isVowel(local,n+1)) {
299                            code.append(symb);
300                        }
301                        break ;
302                    case 'X' :
303                        code.append('K'); code.append('S');
304                        break ;
305                    case 'Z' :
306                        code.append('S'); break ;
307                    } // end switch
308                    n++ ;
309                } // end else from symb != 'C'
310                if (code.length() > this.getMaxCodeLen()) { 
311                    code.setLength(this.getMaxCodeLen()); 
312                }
313            }
314            return code.toString();
315        }
316    
317        private boolean isVowel(StringBuffer string, int index) {
318            return VOWELS.indexOf(string.charAt(index)) >= 0;
319        }
320    
321        private boolean isPreviousChar(StringBuffer string, int index, char c) {
322            boolean matches = false;
323            if( index > 0 &&
324                index < string.length() ) {
325                matches = string.charAt(index - 1) == c;
326            }
327            return matches;
328        }
329    
330        private boolean isNextChar(StringBuffer string, int index, char c) {
331            boolean matches = false;
332            if( index >= 0 &&
333                index < string.length() - 1 ) {
334                matches = string.charAt(index + 1) == c;
335            }
336            return matches;
337        }
338    
339        private boolean regionMatch(StringBuffer string, int index, String test) {
340            boolean matches = false;
341            if( index >= 0 &&
342                (index + test.length() - 1) < string.length() ) {
343                String substring = string.substring( index, index + test.length());
344                matches = substring.equals( test );
345            }
346            return matches;
347        }
348    
349        private boolean isLastChar(int wdsz, int n) {
350            return n + 1 == wdsz;
351        } 
352        
353        
354        /**
355         * Encodes an Object using the metaphone algorithm.  This method
356         * is provided in order to satisfy the requirements of the
357         * Encoder interface, and will throw an EncoderException if the
358         * supplied object is not of type java.lang.String.
359         *
360         * @param pObject Object to encode
361         * @return An object (or type java.lang.String) containing the 
362         *         metaphone code which corresponds to the String supplied.
363         * @throws EncoderException if the parameter supplied is not
364         *                          of type java.lang.String
365         */
366        public Object encode(Object pObject) throws EncoderException {
367            if (!(pObject instanceof String)) {
368                throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 
369            }
370            return metaphone((String) pObject);
371        }
372    
373        /**
374         * Encodes a String using the Metaphone algorithm. 
375         *
376         * @param pString String object to encode
377         * @return The metaphone code corresponding to the String supplied
378         */
379        public String encode(String pString) {
380            return metaphone(pString);   
381        }
382    
383        /**
384         * Tests is the metaphones of two strings are identical.
385         *
386         * @param str1 First of two strings to compare
387         * @param str2 Second of two strings to compare
388         * @return <code>true</code> if the metaphones of these strings are identical, 
389         *        <code>false</code> otherwise.
390         */
391        public boolean isMetaphoneEqual(String str1, String str2) {
392            return metaphone(str1).equals(metaphone(str2));
393        }
394    
395        /**
396         * Returns the maxCodeLen.
397         * @return int
398         */
399        public int getMaxCodeLen() { return this.maxCodeLen; }
400    
401        /**
402         * Sets the maxCodeLen.
403         * @param maxCodeLen The maxCodeLen to set
404         */
405        public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
406    
407    }