001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language; 019 020 import org.apache.commons.codec.EncoderException; 021 import org.apache.commons.codec.StringEncoder; 022 023 /** 024 * Encodes a string into a Refined Soundex value. A refined soundex code is 025 * optimized for spell checking words. Soundex method originally developed by 026 * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>. 027 * 028 * @author Apache Software Foundation 029 * @version $Id: RefinedSoundex.java 797690 2009-07-24 23:28:35Z ggregory $ 030 */ 031 public class RefinedSoundex implements StringEncoder { 032 033 /** 034 * @since 1.4 035 */ 036 public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505"; 037 038 /** 039 * RefinedSoundex is *refined* for a number of reasons one being that the 040 * mappings have been altered. This implementation contains default 041 * mappings for US English. 042 */ 043 public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 044 045 /** 046 * Every letter of the alphabet is "mapped" to a numerical value. This char 047 * array holds the values to which each letter is mapped. This 048 * implementation contains a default map for US_ENGLISH 049 */ 050 private final char[] soundexMapping; 051 052 /** 053 * This static variable contains an instance of the RefinedSoundex using 054 * the US_ENGLISH mapping. 055 */ 056 public static final RefinedSoundex US_ENGLISH = new RefinedSoundex(); 057 058 /** 059 * Creates an instance of the RefinedSoundex object using the default US 060 * English mapping. 061 */ 062 public RefinedSoundex() { 063 this.soundexMapping = US_ENGLISH_MAPPING; 064 } 065 066 /** 067 * Creates a refined soundex instance using a custom mapping. This 068 * constructor can be used to customize the mapping, and/or possibly 069 * provide an internationalized mapping for a non-Western character set. 070 * 071 * @param mapping 072 * Mapping array to use when finding the corresponding code for 073 * a given character 074 */ 075 public RefinedSoundex(char[] mapping) { 076 this.soundexMapping = new char[mapping.length]; 077 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); 078 } 079 080 /** 081 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, 082 * and/or possibly provide an internationalized mapping for a non-Western character set. 083 * 084 * @param mapping 085 * Mapping string to use when finding the corresponding code for a given character 086 * @since 1.4 087 */ 088 public RefinedSoundex(String mapping) { 089 this.soundexMapping = mapping.toCharArray(); 090 } 091 092 /** 093 * Returns the number of characters in the two encoded Strings that are the 094 * same. This return value ranges from 0 to the length of the shortest 095 * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for 096 * example) indicates strong similarity or identical values. For refined 097 * Soundex, the return value can be greater than 4. 098 * 099 * @param s1 100 * A String that will be encoded and compared. 101 * @param s2 102 * A String that will be encoded and compared. 103 * @return The number of characters in the two encoded Strings that are the 104 * same from 0 to to the length of the shortest encoded String. 105 * 106 * @see SoundexUtils#difference(StringEncoder,String,String) 107 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 108 * MS T-SQL DIFFERENCE</a> 109 * 110 * @throws EncoderException 111 * if an error occurs encoding one of the strings 112 * @since 1.3 113 */ 114 public int difference(String s1, String s2) throws EncoderException { 115 return SoundexUtils.difference(this, s1, s2); 116 } 117 118 /** 119 * Encodes an Object using the refined soundex algorithm. This method is 120 * provided in order to satisfy the requirements of the Encoder interface, 121 * and will throw an EncoderException if the supplied object is not of type 122 * java.lang.String. 123 * 124 * @param pObject 125 * Object to encode 126 * @return An object (or type java.lang.String) containing the refined 127 * soundex code which corresponds to the String supplied. 128 * @throws EncoderException 129 * if the parameter supplied is not of type java.lang.String 130 */ 131 public Object encode(Object pObject) throws EncoderException { 132 if (!(pObject instanceof String)) { 133 throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String"); 134 } 135 return soundex((String) pObject); 136 } 137 138 /** 139 * Encodes a String using the refined soundex algorithm. 140 * 141 * @param pString 142 * A String object to encode 143 * @return A Soundex code corresponding to the String supplied 144 */ 145 public String encode(String pString) { 146 return soundex(pString); 147 } 148 149 /** 150 * Returns the mapping code for a given character. The mapping codes are 151 * maintained in an internal char array named soundexMapping, and the 152 * default values of these mappings are US English. 153 * 154 * @param c 155 * char to get mapping for 156 * @return A character (really a numeral) to return for the given char 157 */ 158 char getMappingCode(char c) { 159 if (!Character.isLetter(c)) { 160 return 0; 161 } 162 return this.soundexMapping[Character.toUpperCase(c) - 'A']; 163 } 164 165 /** 166 * Retreives the Refined Soundex code for a given String object. 167 * 168 * @param str 169 * String to encode using the Refined Soundex algorithm 170 * @return A soundex code for the String supplied 171 */ 172 public String soundex(String str) { 173 if (str == null) { 174 return null; 175 } 176 str = SoundexUtils.clean(str); 177 if (str.length() == 0) { 178 return str; 179 } 180 181 StringBuffer sBuf = new StringBuffer(); 182 sBuf.append(str.charAt(0)); 183 184 char last, current; 185 last = '*'; 186 187 for (int i = 0; i < str.length(); i++) { 188 189 current = getMappingCode(str.charAt(i)); 190 if (current == last) { 191 continue; 192 } else if (current != 0) { 193 sBuf.append(current); 194 } 195 196 last = current; 197 198 } 199 200 return sBuf.toString(); 201 } 202 }