001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
024     * Encodes a string into a Refined Soundex value. A refined soundex code is
025     * optimized for spell checking words. Soundex method originally developed by
026     * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
027     * 
028     * @author Apache Software Foundation
029     * @version $Id: RefinedSoundex.java 797690 2009-07-24 23:28:35Z ggregory $
030     */
031    public class RefinedSoundex implements StringEncoder {
032    
033        /**
034         * @since 1.4
035         */
036        public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
037    
038       /**
039         * RefinedSoundex is *refined* for a number of reasons one being that the
040         * mappings have been altered. This implementation contains default
041         * mappings for US English.
042         */
043        public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
044    
045        /**
046         * Every letter of the alphabet is "mapped" to a numerical value. This char
047         * array holds the values to which each letter is mapped. This
048         * implementation contains a default map for US_ENGLISH
049         */
050        private final char[] soundexMapping;
051    
052        /**
053         * This static variable contains an instance of the RefinedSoundex using
054         * the US_ENGLISH mapping.
055         */
056        public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
057    
058         /**
059         * Creates an instance of the RefinedSoundex object using the default US
060         * English mapping.
061         */
062        public RefinedSoundex() {
063            this.soundexMapping = US_ENGLISH_MAPPING;
064        }
065    
066        /**
067         * Creates a refined soundex instance using a custom mapping. This
068         * constructor can be used to customize the mapping, and/or possibly
069         * provide an internationalized mapping for a non-Western character set.
070         * 
071         * @param mapping
072         *                  Mapping array to use when finding the corresponding code for
073         *                  a given character
074         */
075        public RefinedSoundex(char[] mapping) {
076            this.soundexMapping = new char[mapping.length];
077            System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
078        }
079    
080        /**
081         * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
082         * and/or possibly provide an internationalized mapping for a non-Western character set.
083         * 
084         * @param mapping
085         *            Mapping string to use when finding the corresponding code for a given character
086         * @since 1.4
087         */
088        public RefinedSoundex(String mapping) {
089            this.soundexMapping = mapping.toCharArray();
090        }
091    
092        /**
093         * Returns the number of characters in the two encoded Strings that are the
094         * same. This return value ranges from 0 to the length of the shortest
095         * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
096         * example) indicates strong similarity or identical values. For refined
097         * Soundex, the return value can be greater than 4.
098         * 
099         * @param s1
100         *                  A String that will be encoded and compared.
101         * @param s2
102         *                  A String that will be encoded and compared.
103         * @return The number of characters in the two encoded Strings that are the
104         *             same from 0 to to the length of the shortest encoded String.
105         * 
106         * @see SoundexUtils#difference(StringEncoder,String,String)
107         * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108         *          MS T-SQL DIFFERENCE</a>
109         * 
110         * @throws EncoderException
111         *                  if an error occurs encoding one of the strings
112         * @since 1.3
113         */
114        public int difference(String s1, String s2) throws EncoderException {
115            return SoundexUtils.difference(this, s1, s2);
116        }
117    
118        /**
119         * Encodes an Object using the refined soundex algorithm. This method is
120         * provided in order to satisfy the requirements of the Encoder interface,
121         * and will throw an EncoderException if the supplied object is not of type
122         * java.lang.String.
123         * 
124         * @param pObject
125         *                  Object to encode
126         * @return An object (or type java.lang.String) containing the refined
127         *             soundex code which corresponds to the String supplied.
128         * @throws EncoderException
129         *                  if the parameter supplied is not of type java.lang.String
130         */
131        public Object encode(Object pObject) throws EncoderException {
132            if (!(pObject instanceof String)) {
133                throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
134            }
135            return soundex((String) pObject);
136        }
137    
138        /**
139         * Encodes a String using the refined soundex algorithm.
140         * 
141         * @param pString
142         *                  A String object to encode
143         * @return A Soundex code corresponding to the String supplied
144         */
145        public String encode(String pString) {
146            return soundex(pString);
147        }
148    
149        /**
150         * Returns the mapping code for a given character. The mapping codes are
151         * maintained in an internal char array named soundexMapping, and the
152         * default values of these mappings are US English.
153         * 
154         * @param c
155         *                  char to get mapping for
156         * @return A character (really a numeral) to return for the given char
157         */
158        char getMappingCode(char c) {
159            if (!Character.isLetter(c)) {
160                return 0;
161            }
162            return this.soundexMapping[Character.toUpperCase(c) - 'A'];
163        }
164    
165        /**
166         * Retreives the Refined Soundex code for a given String object.
167         * 
168         * @param str
169         *                  String to encode using the Refined Soundex algorithm
170         * @return A soundex code for the String supplied
171         */
172        public String soundex(String str) {
173            if (str == null) {
174                return null;
175            }
176            str = SoundexUtils.clean(str);
177            if (str.length() == 0) {
178                return str;
179            }
180    
181            StringBuffer sBuf = new StringBuffer();
182            sBuf.append(str.charAt(0));
183    
184            char last, current;
185            last = '*';
186    
187            for (int i = 0; i < str.length(); i++) {
188    
189                current = getMappingCode(str.charAt(i));
190                if (current == last) {
191                    continue;
192                } else if (current != 0) {
193                    sBuf.append(current);
194                }
195    
196                last = current;
197    
198            }
199    
200            return sBuf.toString();
201        }
202    }