001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language;
019
020 import org.apache.commons.codec.EncoderException;
021 import org.apache.commons.codec.StringEncoder;
022
023 /**
024 * Encodes a string into a Refined Soundex value. A refined soundex code is
025 * optimized for spell checking words. Soundex method originally developed by
026 * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
027 *
028 * @author Apache Software Foundation
029 * @version $Id: RefinedSoundex.java 797690 2009-07-24 23:28:35Z ggregory $
030 */
031 public class RefinedSoundex implements StringEncoder {
032
033 /**
034 * @since 1.4
035 */
036 public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
037
038 /**
039 * RefinedSoundex is *refined* for a number of reasons one being that the
040 * mappings have been altered. This implementation contains default
041 * mappings for US English.
042 */
043 public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
044
045 /**
046 * Every letter of the alphabet is "mapped" to a numerical value. This char
047 * array holds the values to which each letter is mapped. This
048 * implementation contains a default map for US_ENGLISH
049 */
050 private final char[] soundexMapping;
051
052 /**
053 * This static variable contains an instance of the RefinedSoundex using
054 * the US_ENGLISH mapping.
055 */
056 public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
057
058 /**
059 * Creates an instance of the RefinedSoundex object using the default US
060 * English mapping.
061 */
062 public RefinedSoundex() {
063 this.soundexMapping = US_ENGLISH_MAPPING;
064 }
065
066 /**
067 * Creates a refined soundex instance using a custom mapping. This
068 * constructor can be used to customize the mapping, and/or possibly
069 * provide an internationalized mapping for a non-Western character set.
070 *
071 * @param mapping
072 * Mapping array to use when finding the corresponding code for
073 * a given character
074 */
075 public RefinedSoundex(char[] mapping) {
076 this.soundexMapping = new char[mapping.length];
077 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
078 }
079
080 /**
081 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
082 * and/or possibly provide an internationalized mapping for a non-Western character set.
083 *
084 * @param mapping
085 * Mapping string to use when finding the corresponding code for a given character
086 * @since 1.4
087 */
088 public RefinedSoundex(String mapping) {
089 this.soundexMapping = mapping.toCharArray();
090 }
091
092 /**
093 * Returns the number of characters in the two encoded Strings that are the
094 * same. This return value ranges from 0 to the length of the shortest
095 * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
096 * example) indicates strong similarity or identical values. For refined
097 * Soundex, the return value can be greater than 4.
098 *
099 * @param s1
100 * A String that will be encoded and compared.
101 * @param s2
102 * A String that will be encoded and compared.
103 * @return The number of characters in the two encoded Strings that are the
104 * same from 0 to to the length of the shortest encoded String.
105 *
106 * @see SoundexUtils#difference(StringEncoder,String,String)
107 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108 * MS T-SQL DIFFERENCE</a>
109 *
110 * @throws EncoderException
111 * if an error occurs encoding one of the strings
112 * @since 1.3
113 */
114 public int difference(String s1, String s2) throws EncoderException {
115 return SoundexUtils.difference(this, s1, s2);
116 }
117
118 /**
119 * Encodes an Object using the refined soundex algorithm. This method is
120 * provided in order to satisfy the requirements of the Encoder interface,
121 * and will throw an EncoderException if the supplied object is not of type
122 * java.lang.String.
123 *
124 * @param pObject
125 * Object to encode
126 * @return An object (or type java.lang.String) containing the refined
127 * soundex code which corresponds to the String supplied.
128 * @throws EncoderException
129 * if the parameter supplied is not of type java.lang.String
130 */
131 public Object encode(Object pObject) throws EncoderException {
132 if (!(pObject instanceof String)) {
133 throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
134 }
135 return soundex((String) pObject);
136 }
137
138 /**
139 * Encodes a String using the refined soundex algorithm.
140 *
141 * @param pString
142 * A String object to encode
143 * @return A Soundex code corresponding to the String supplied
144 */
145 public String encode(String pString) {
146 return soundex(pString);
147 }
148
149 /**
150 * Returns the mapping code for a given character. The mapping codes are
151 * maintained in an internal char array named soundexMapping, and the
152 * default values of these mappings are US English.
153 *
154 * @param c
155 * char to get mapping for
156 * @return A character (really a numeral) to return for the given char
157 */
158 char getMappingCode(char c) {
159 if (!Character.isLetter(c)) {
160 return 0;
161 }
162 return this.soundexMapping[Character.toUpperCase(c) - 'A'];
163 }
164
165 /**
166 * Retreives the Refined Soundex code for a given String object.
167 *
168 * @param str
169 * String to encode using the Refined Soundex algorithm
170 * @return A soundex code for the String supplied
171 */
172 public String soundex(String str) {
173 if (str == null) {
174 return null;
175 }
176 str = SoundexUtils.clean(str);
177 if (str.length() == 0) {
178 return str;
179 }
180
181 StringBuffer sBuf = new StringBuffer();
182 sBuf.append(str.charAt(0));
183
184 char last, current;
185 last = '*';
186
187 for (int i = 0; i < str.length(); i++) {
188
189 current = getMappingCode(str.charAt(i));
190 if (current == last) {
191 continue;
192 } else if (current != 0) {
193 sBuf.append(current);
194 }
195
196 last = current;
197
198 }
199
200 return sBuf.toString();
201 }
202 }