001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.net;
019    
020    import java.io.ByteArrayOutputStream;
021    import java.io.UnsupportedEncodingException;
022    import java.util.BitSet;
023    
024    import org.apache.commons.codec.BinaryDecoder;
025    import org.apache.commons.codec.BinaryEncoder;
026    import org.apache.commons.codec.DecoderException;
027    import org.apache.commons.codec.EncoderException;
028    import org.apache.commons.codec.CharEncoding;
029    import org.apache.commons.codec.StringDecoder;
030    import org.apache.commons.codec.StringEncoder;
031    import org.apache.commons.codec.binary.StringUtils;
032    
033    /**
034     * <p>Implements the 'www-form-urlencoded' encoding scheme, 
035     * also misleadingly known as URL encoding.</p>
036     *  
037     * <p>For more detailed information please refer to 
038     * <a href="http://www.w3.org/TR/html4/interact/forms.html#h-17.13.4.1">
039     * Chapter 17.13.4 'Form content types'</a> of the 
040     * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification<a></p>
041     * 
042     * <p> 
043     * This codec is meant to be a replacement for standard Java classes
044     * {@link java.net.URLEncoder} and {@link java.net.URLDecoder} 
045     * on older Java platforms, as these classes in Java versions below 
046     * 1.4 rely on the platform's default charset encoding.
047     * </p>
048     * 
049     * @author Apache Software Foundation
050     * @since 1.2
051     * @version $Id: URLCodec.java 798416 2009-07-28 06:35:58Z ggregory $
052     */
053    public class URLCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
054        
055        /**
056         * Radix used in encoding and decoding.
057         */
058        static final int RADIX = 16;
059        
060        /**
061         * The default charset used for string decoding and encoding. Consider this field final. The next major release may
062         * break compatibility and make this field be final.
063         */
064        protected String charset;
065        
066        /**
067         * Consider this field final. The next major release may break compatibility and make this field be final.
068         */
069        protected static byte ESCAPE_CHAR = '%';
070        /**
071         * BitSet of www-form-url safe characters.
072         */
073        protected static final BitSet WWW_FORM_URL = new BitSet(256);
074        
075        // Static initializer for www_form_url
076        static {
077            // alpha characters
078            for (int i = 'a'; i <= 'z'; i++) {
079                WWW_FORM_URL.set(i);
080            }
081            for (int i = 'A'; i <= 'Z'; i++) {
082                WWW_FORM_URL.set(i);
083            }
084            // numeric characters
085            for (int i = '0'; i <= '9'; i++) {
086                WWW_FORM_URL.set(i);
087            }
088            // special chars
089            WWW_FORM_URL.set('-');
090            WWW_FORM_URL.set('_');
091            WWW_FORM_URL.set('.');
092            WWW_FORM_URL.set('*');
093            // blank to be replaced with +
094            WWW_FORM_URL.set(' ');
095        }
096    
097    
098        /**
099         * Default constructor.
100         */
101        public URLCodec() {
102            this(CharEncoding.UTF_8);
103        }
104    
105        /**
106         * Constructor which allows for the selection of a default charset
107         * 
108         * @param charset the default string charset to use.
109         */
110        public URLCodec(String charset) {
111            super();
112            this.charset = charset;
113        }
114    
115        /**
116         * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
117         * 
118         * @param urlsafe
119         *            bitset of characters deemed URL safe
120         * @param bytes
121         *            array of bytes to convert to URL safe characters
122         * @return array of bytes containing URL safe characters
123         */
124        public static final byte[] encodeUrl(BitSet urlsafe, byte[] bytes) {
125            if (bytes == null) {
126                return null;
127            }
128            if (urlsafe == null) {
129                urlsafe = WWW_FORM_URL;
130            }
131    
132            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
133            for (int i = 0; i < bytes.length; i++) {
134                int b = bytes[i];
135                if (b < 0) {
136                    b = 256 + b;
137                }
138                if (urlsafe.get(b)) {
139                    if (b == ' ') {
140                        b = '+';
141                    }
142                    buffer.write(b);
143                } else {
144                    buffer.write(ESCAPE_CHAR);
145                    char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, RADIX));
146                    char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, RADIX));
147                    buffer.write(hex1);
148                    buffer.write(hex2);
149                }
150            }
151            return buffer.toByteArray();
152        }
153    
154        /**
155         * Decodes an array of URL safe 7-bit characters into an array of 
156         * original bytes. Escaped characters are converted back to their 
157         * original representation.
158         *
159         * @param bytes array of URL safe characters
160         * @return array of original bytes 
161         * @throws DecoderException Thrown if URL decoding is unsuccessful
162         */
163        public static final byte[] decodeUrl(byte[] bytes) throws DecoderException {
164            if (bytes == null) {
165                return null;
166            }
167            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
168            for (int i = 0; i < bytes.length; i++) {
169                int b = bytes[i];
170                if (b == '+') {
171                    buffer.write(' ');
172                } else if (b == ESCAPE_CHAR) {
173                    try {
174                        int u = Utils.digit16(bytes[++i]);
175                        int l = Utils.digit16(bytes[++i]);
176                        buffer.write((char) ((u << 4) + l));
177                    } catch (ArrayIndexOutOfBoundsException e) {
178                        throw new DecoderException("Invalid URL encoding: ", e);
179                    }
180                } else {
181                    buffer.write(b);
182                }
183            }
184            return buffer.toByteArray();
185        }
186    
187        /**
188         * Encodes an array of bytes into an array of URL safe 7-bit 
189         * characters. Unsafe characters are escaped.
190         *
191         * @param bytes array of bytes to convert to URL safe characters
192         * @return array of bytes containing URL safe characters
193         */
194        public byte[] encode(byte[] bytes) {
195            return encodeUrl(WWW_FORM_URL, bytes);
196        }
197    
198    
199        /**
200         * Decodes an array of URL safe 7-bit characters into an array of 
201         * original bytes. Escaped characters are converted back to their 
202         * original representation.
203         *
204         * @param bytes array of URL safe characters
205         * @return array of original bytes 
206         * @throws DecoderException Thrown if URL decoding is unsuccessful
207         */
208        public byte[] decode(byte[] bytes) throws DecoderException {
209            return decodeUrl(bytes);
210        }
211    
212        /**
213         * Encodes a string into its URL safe form using the specified string charset. Unsafe characters are escaped.
214         * 
215         * @param pString
216         *            string to convert to a URL safe form
217         * @param charset
218         *            the charset for pString
219         * @return URL safe string
220         * @throws UnsupportedEncodingException
221         *             Thrown if charset is not supported
222         */
223        public String encode(String pString, String charset) throws UnsupportedEncodingException {
224            if (pString == null) {
225                return null;
226            }
227            return StringUtils.newStringUsAscii(encode(pString.getBytes(charset)));
228        }
229    
230        /**
231         * Encodes a string into its URL safe form using the default string 
232         * charset. Unsafe characters are escaped.
233         *
234         * @param pString string to convert to a URL safe form
235         * @return URL safe string
236         * @throws EncoderException Thrown if URL encoding is unsuccessful
237         * 
238         * @see #getDefaultCharset()
239         */
240        public String encode(String pString) throws EncoderException {
241            if (pString == null) {
242                return null;
243            }
244            try {
245                return encode(pString, getDefaultCharset());
246            } catch (UnsupportedEncodingException e) {
247                throw new EncoderException(e.getMessage(), e);
248            }
249        }
250    
251    
252        /**
253         * Decodes a URL safe string into its original form using the 
254         * specified encoding. Escaped characters are converted back 
255         * to their original representation.
256         *
257         * @param pString URL safe string to convert into its original form
258         * @param charset the original string charset
259         * @return original string 
260         * @throws DecoderException Thrown if URL decoding is unsuccessful
261         * @throws UnsupportedEncodingException Thrown if charset is not
262         *                                      supported 
263         */
264        public String decode(String pString, String charset) throws DecoderException, UnsupportedEncodingException {
265            if (pString == null) {
266                return null;
267            }
268            return new String(decode(StringUtils.getBytesUsAscii(pString)), charset);
269        }
270    
271        /**
272         * Decodes a URL safe string into its original form using the default
273         * string charset. Escaped characters are converted back to their 
274         * original representation.
275         *
276         * @param pString URL safe string to convert into its original form
277         * @return original string 
278         * @throws DecoderException Thrown if URL decoding is unsuccessful
279         * 
280         * @see #getDefaultCharset()
281         */
282        public String decode(String pString) throws DecoderException {
283            if (pString == null) {
284                return null;
285            }
286            try {
287                return decode(pString, getDefaultCharset());
288            } catch (UnsupportedEncodingException e) {
289                throw new DecoderException(e.getMessage(), e);
290            }
291        }
292    
293        /**
294         * Encodes an object into its URL safe form. Unsafe characters are 
295         * escaped.
296         *
297         * @param pObject string to convert to a URL safe form
298         * @return URL safe object
299         * @throws EncoderException Thrown if URL encoding is not 
300         *                          applicable to objects of this type or
301         *                          if encoding is unsuccessful
302         */
303        public Object encode(Object pObject) throws EncoderException {
304            if (pObject == null) {
305                return null;
306            } else if (pObject instanceof byte[]) {
307                return encode((byte[])pObject);
308            } else if (pObject instanceof String) {
309                return encode((String)pObject);
310            } else {
311                throw new EncoderException("Objects of type " +
312                    pObject.getClass().getName() + " cannot be URL encoded"); 
313                  
314            }
315        }
316    
317        /**
318         * Decodes a URL safe object into its original form. Escaped characters are converted back to their original
319         * representation.
320         * 
321         * @param pObject
322         *                  URL safe object to convert into its original form
323         * @return original object
324         * @throws DecoderException
325         *                  Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure condition is
326         *                  encountered during the decode process.
327         */
328        public Object decode(Object pObject) throws DecoderException {
329            if (pObject == null) {
330                return null;
331            } else if (pObject instanceof byte[]) {
332                return decode((byte[]) pObject);
333            } else if (pObject instanceof String) {
334                return decode((String) pObject);
335            } else {
336                throw new DecoderException("Objects of type " + pObject.getClass().getName() + " cannot be URL decoded");
337    
338            }
339        }
340    
341        /**
342         * The <code>String</code> encoding used for decoding and encoding.
343         * 
344         * @return Returns the encoding.
345         * 
346         * @deprecated use #getDefaultCharset()
347         */
348        public String getEncoding() {
349            return this.charset;
350        }
351    
352        /**
353         * The default charset used for string decoding and encoding.
354         *
355         * @return the default string charset.
356         */
357        public String getDefaultCharset() {
358            return this.charset;
359        }
360    
361    }