001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language;
019
020 import org.apache.commons.codec.EncoderException;
021 import org.apache.commons.codec.StringEncoder;
022
023 /**
024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025 * general purpose scheme to find word with similar phonemes.
026 *
027 * @author Apache Software Foundation
028 * @version $Id: Soundex.java 794026 2009-07-14 19:21:23Z ggregory $
029 */
030 public class Soundex implements StringEncoder {
031
032 /**
033 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
034 * means do not encode.
035 * <p>
036 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
037 * up the value for the constant values page.)
038 * </p>
039 *
040 * @see #US_ENGLISH_MAPPING
041 */
042 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
043
044 /**
045 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
046 * means do not encode.
047 *
048 * @see Soundex#Soundex(char[])
049 */
050 public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
051
052 /**
053 * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
054 *
055 * @see #US_ENGLISH_MAPPING
056 */
057 public static final Soundex US_ENGLISH = new Soundex();
058
059
060 /**
061 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
062 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
063 * identical values.
064 *
065 * @param s1
066 * A String that will be encoded and compared.
067 * @param s2
068 * A String that will be encoded and compared.
069 * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
070 *
071 * @see SoundexUtils#difference(StringEncoder,String,String)
072 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
073 * T-SQL DIFFERENCE </a>
074 *
075 * @throws EncoderException
076 * if an error occurs encoding one of the strings
077 * @since 1.3
078 */
079 public int difference(String s1, String s2) throws EncoderException {
080 return SoundexUtils.difference(this, s1, s2);
081 }
082
083 /**
084 * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
085 *
086 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
087 */
088 private int maxLength = 4;
089
090 /**
091 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
092 * letter is mapped. This implementation contains a default map for US_ENGLISH
093 */
094 private final char[] soundexMapping;
095
096 /**
097 * Creates an instance using US_ENGLISH_MAPPING
098 *
099 * @see Soundex#Soundex(char[])
100 * @see Soundex#US_ENGLISH_MAPPING
101 */
102 public Soundex() {
103 this.soundexMapping = US_ENGLISH_MAPPING;
104 }
105
106 /**
107 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
108 * mapping for a non-Western character set.
109 *
110 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
111 * letter is mapped. This implementation contains a default map for US_ENGLISH
112 *
113 * @param mapping
114 * Mapping array to use when finding the corresponding code for a given character
115 */
116 public Soundex(char[] mapping) {
117 this.soundexMapping = new char[mapping.length];
118 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
119 }
120
121 /**
122 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
123 * and/or possibly provide an internationalized mapping for a non-Western character set.
124 *
125 * @param mapping
126 * Mapping string to use when finding the corresponding code for a given character
127 * @since 1.4
128 */
129 public Soundex(String mapping) {
130 this.soundexMapping = mapping.toCharArray();
131 }
132
133 /**
134 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
135 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
136 *
137 * @param pObject
138 * Object to encode
139 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
140 * supplied.
141 * @throws EncoderException
142 * if the parameter supplied is not of type java.lang.String
143 * @throws IllegalArgumentException
144 * if a character is not mapped
145 */
146 public Object encode(Object pObject) throws EncoderException {
147 if (!(pObject instanceof String)) {
148 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
149 }
150 return soundex((String) pObject);
151 }
152
153 /**
154 * Encodes a String using the soundex algorithm.
155 *
156 * @param pString
157 * A String object to encode
158 * @return A Soundex code corresponding to the String supplied
159 * @throws IllegalArgumentException
160 * if a character is not mapped
161 */
162 public String encode(String pString) {
163 return soundex(pString);
164 }
165
166 /**
167 * Used internally by the SoundEx algorithm.
168 *
169 * Consonants from the same code group separated by W or H are treated as one.
170 *
171 * @param str
172 * the cleaned working string to encode (in upper case).
173 * @param index
174 * the character position to encode
175 * @return Mapping code for a particular character
176 * @throws IllegalArgumentException
177 * if the character is not mapped
178 */
179 private char getMappingCode(String str, int index) {
180 // map() throws IllegalArgumentException
181 char mappedChar = this.map(str.charAt(index));
182 // HW rule check
183 if (index > 1 && mappedChar != '0') {
184 char hwChar = str.charAt(index - 1);
185 if ('H' == hwChar || 'W' == hwChar) {
186 char preHWChar = str.charAt(index - 2);
187 char firstCode = this.map(preHWChar);
188 if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
189 return 0;
190 }
191 }
192 }
193 return mappedChar;
194 }
195
196 /**
197 * Returns the maxLength. Standard Soundex
198 *
199 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
200 * @return int
201 */
202 public int getMaxLength() {
203 return this.maxLength;
204 }
205
206 /**
207 * Returns the soundex mapping.
208 *
209 * @return soundexMapping.
210 */
211 private char[] getSoundexMapping() {
212 return this.soundexMapping;
213 }
214
215 /**
216 * Maps the given upper-case character to its Soundex code.
217 *
218 * @param ch
219 * An upper-case character.
220 * @return A Soundex code.
221 * @throws IllegalArgumentException
222 * Thrown if <code>ch</code> is not mapped.
223 */
224 private char map(char ch) {
225 int index = ch - 'A';
226 if (index < 0 || index >= this.getSoundexMapping().length) {
227 throw new IllegalArgumentException("The character is not mapped: " + ch);
228 }
229 return this.getSoundexMapping()[index];
230 }
231
232 /**
233 * Sets the maxLength.
234 *
235 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
236 * @param maxLength
237 * The maxLength to set
238 */
239 public void setMaxLength(int maxLength) {
240 this.maxLength = maxLength;
241 }
242
243 /**
244 * Retrieves the Soundex code for a given String object.
245 *
246 * @param str
247 * String to encode using the Soundex algorithm
248 * @return A soundex code for the String supplied
249 * @throws IllegalArgumentException
250 * if a character is not mapped
251 */
252 public String soundex(String str) {
253 if (str == null) {
254 return null;
255 }
256 str = SoundexUtils.clean(str);
257 if (str.length() == 0) {
258 return str;
259 }
260 char out[] = {'0', '0', '0', '0'};
261 char last, mapped;
262 int incount = 1, count = 1;
263 out[0] = str.charAt(0);
264 // getMappingCode() throws IllegalArgumentException
265 last = getMappingCode(str, 0);
266 while ((incount < str.length()) && (count < out.length)) {
267 mapped = getMappingCode(str, incount++);
268 if (mapped != 0) {
269 if ((mapped != '0') && (mapped != last)) {
270 out[count++] = mapped;
271 }
272 last = mapped;
273 }
274 }
275 return new String(out);
276 }
277
278 }