001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
024     * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025     * general purpose scheme to find word with similar phonemes.
026     * 
027     * @author Apache Software Foundation
028     * @version $Id: Soundex.java 794026 2009-07-14 19:21:23Z ggregory $
029     */
030    public class Soundex implements StringEncoder {
031    
032        /**
033         * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
034         * means do not encode.
035         * <p>
036         * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
037         * up the value for the constant values page.)
038         * </p>
039         * 
040         * @see #US_ENGLISH_MAPPING
041         */
042        public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
043    
044        /**
045         * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
046         * means do not encode.
047         * 
048         * @see Soundex#Soundex(char[])
049         */
050        public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
051    
052        /**
053         * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
054         * 
055         * @see #US_ENGLISH_MAPPING
056         */
057        public static final Soundex US_ENGLISH = new Soundex();
058    
059    
060        /**
061         * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
062         * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
063         * identical values.
064         * 
065         * @param s1
066         *                  A String that will be encoded and compared.
067         * @param s2
068         *                  A String that will be encoded and compared.
069         * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
070         * 
071         * @see SoundexUtils#difference(StringEncoder,String,String)
072         * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
073         *          T-SQL DIFFERENCE </a>
074         * 
075         * @throws EncoderException
076         *                  if an error occurs encoding one of the strings
077         * @since 1.3
078         */
079        public int difference(String s1, String s2) throws EncoderException {
080            return SoundexUtils.difference(this, s1, s2);
081        }
082    
083        /**
084         * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
085         * 
086         * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
087         */
088        private int maxLength = 4;
089    
090        /**
091         * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
092         * letter is mapped. This implementation contains a default map for US_ENGLISH
093         */
094        private final char[] soundexMapping;
095    
096        /**
097         * Creates an instance using US_ENGLISH_MAPPING
098         * 
099         * @see Soundex#Soundex(char[])
100         * @see Soundex#US_ENGLISH_MAPPING
101         */
102        public Soundex() {
103            this.soundexMapping = US_ENGLISH_MAPPING;
104        }
105    
106        /**
107         * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
108         * mapping for a non-Western character set.
109         * 
110         * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
111         * letter is mapped. This implementation contains a default map for US_ENGLISH
112         * 
113         * @param mapping
114         *                  Mapping array to use when finding the corresponding code for a given character
115         */
116        public Soundex(char[] mapping) {
117            this.soundexMapping = new char[mapping.length];
118            System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
119        }
120    
121        /**
122         * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
123         * and/or possibly provide an internationalized mapping for a non-Western character set.
124         * 
125         * @param mapping
126         *            Mapping string to use when finding the corresponding code for a given character
127         * @since 1.4
128         */
129        public Soundex(String mapping) {
130            this.soundexMapping = mapping.toCharArray();
131        }
132    
133        /**
134         * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
135         * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
136         * 
137         * @param pObject
138         *                  Object to encode
139         * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
140         *             supplied.
141         * @throws EncoderException
142         *                  if the parameter supplied is not of type java.lang.String
143         * @throws IllegalArgumentException
144         *                  if a character is not mapped
145         */
146        public Object encode(Object pObject) throws EncoderException {
147            if (!(pObject instanceof String)) {
148                throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
149            }
150            return soundex((String) pObject);
151        }
152    
153        /**
154         * Encodes a String using the soundex algorithm.
155         * 
156         * @param pString
157         *                  A String object to encode
158         * @return A Soundex code corresponding to the String supplied
159         * @throws IllegalArgumentException
160         *                  if a character is not mapped
161         */
162        public String encode(String pString) {
163            return soundex(pString);
164        }
165    
166        /**
167         * Used internally by the SoundEx algorithm.
168         * 
169         * Consonants from the same code group separated by W or H are treated as one.
170         * 
171         * @param str
172         *                  the cleaned working string to encode (in upper case).
173         * @param index
174         *                  the character position to encode
175         * @return Mapping code for a particular character
176         * @throws IllegalArgumentException
177         *                  if the character is not mapped
178         */
179        private char getMappingCode(String str, int index) {
180            // map() throws IllegalArgumentException
181            char mappedChar = this.map(str.charAt(index));
182            // HW rule check
183            if (index > 1 && mappedChar != '0') {
184                char hwChar = str.charAt(index - 1);
185                if ('H' == hwChar || 'W' == hwChar) {
186                    char preHWChar = str.charAt(index - 2);
187                    char firstCode = this.map(preHWChar);
188                    if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
189                        return 0;
190                    }
191                }
192            }
193            return mappedChar;
194        }
195    
196        /**
197         * Returns the maxLength. Standard Soundex
198         * 
199         * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
200         * @return int
201         */
202        public int getMaxLength() {
203            return this.maxLength;
204        }
205    
206        /**
207         * Returns the soundex mapping.
208         * 
209         * @return soundexMapping.
210         */
211        private char[] getSoundexMapping() {
212            return this.soundexMapping;
213        }
214    
215        /**
216         * Maps the given upper-case character to its Soundex code.
217         * 
218         * @param ch
219         *                  An upper-case character.
220         * @return A Soundex code.
221         * @throws IllegalArgumentException
222         *                  Thrown if <code>ch</code> is not mapped.
223         */
224        private char map(char ch) {
225            int index = ch - 'A';
226            if (index < 0 || index >= this.getSoundexMapping().length) {
227                throw new IllegalArgumentException("The character is not mapped: " + ch);
228            }
229            return this.getSoundexMapping()[index];
230        }
231    
232        /**
233         * Sets the maxLength.
234         * 
235         * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
236         * @param maxLength
237         *                  The maxLength to set
238         */
239        public void setMaxLength(int maxLength) {
240            this.maxLength = maxLength;
241        }
242    
243        /**
244         * Retrieves the Soundex code for a given String object.
245         * 
246         * @param str
247         *                  String to encode using the Soundex algorithm
248         * @return A soundex code for the String supplied
249         * @throws IllegalArgumentException
250         *                  if a character is not mapped
251         */
252        public String soundex(String str) {
253            if (str == null) {
254                return null;
255            }
256            str = SoundexUtils.clean(str);
257            if (str.length() == 0) {
258                return str;
259            }
260            char out[] = {'0', '0', '0', '0'};
261            char last, mapped;
262            int incount = 1, count = 1;
263            out[0] = str.charAt(0);
264            // getMappingCode() throws IllegalArgumentException
265            last = getMappingCode(str, 0);
266            while ((incount < str.length()) && (count < out.length)) {
267                mapped = getMappingCode(str, incount++);
268                if (mapped != 0) {
269                    if ((mapped != '0') && (mapped != last)) {
270                        out[count++] = mapped;
271                    }
272                    last = mapped;
273                }
274            }
275            return new String(out);
276        }
277    
278    }