View Javadoc

1   /*
2    * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/java/org/apache/commons/httpclient/util/URIUtil.java,v 1.27 2004/05/05 20:34:01 olegk Exp $
3    * $Revision: 155418 $
4    * $Date: 2005-02-26 08:01:52 -0500 (Sat, 26 Feb 2005) $
5    *
6    * ====================================================================
7    *
8    *  Copyright 2002-2004 The Apache Software Foundation
9    *
10   *  Licensed under the Apache License, Version 2.0 (the "License");
11   *  you may not use this file except in compliance with the License.
12   *  You may obtain a copy of the License at
13   *
14   *      http://www.apache.org/licenses/LICENSE-2.0
15   *
16   *  Unless required by applicable law or agreed to in writing, software
17   *  distributed under the License is distributed on an "AS IS" BASIS,
18   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19   *  See the License for the specific language governing permissions and
20   *  limitations under the License.
21   * ====================================================================
22   *
23   * This software consists of voluntary contributions made by many
24   * individuals on behalf of the Apache Software Foundation.  For more
25   * information on the Apache Software Foundation, please see
26   * <http://www.apache.org/>.
27   *
28   */
29  
30  package org.apache.commons.httpclient.util;
31  
32  import java.util.BitSet;
33  
34  import org.apache.commons.codec.DecoderException;
35  import org.apache.commons.codec.net.URLCodec;
36  import org.apache.commons.httpclient.URI;
37  import org.apache.commons.httpclient.URIException;
38  
39  /***
40   * The URI escape and character encoding and decoding utility.
41   * It's compatible with {@link org.apache.commons.httpclient.HttpURL} rather
42   * than {@link org.apache.commons.httpclient.URI}.
43   *
44   * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
45   * @version $Revision: 155418 $ $Date: 2002/03/14 15:14:01 
46   */
47  public class URIUtil {
48  
49      // ----------------------------------------------------- Instance variables
50  
51      protected static final BitSet empty = new BitSet(1);
52  
53      // ---------------------------------------------------------- URI utilities
54  
55      /***
56       * Get the basename of an URI.   It's possibly an empty string.
57       *
58       * @param uri a string regarded an URI
59       * @return the basename string; an empty string if the path ends with slash
60       */
61      public static String getName(String uri) {
62          if (uri == null || uri.length() == 0) { return uri; } 
63          String path = URIUtil.getPath(uri);
64          int at = path.lastIndexOf("/");
65          int to = path.length();
66          return (at >= 0) ? path.substring(at + 1, to) : path;
67      }
68  
69  
70      /***
71       * Get the query of an URI.
72       *
73       * @param uri a string regarded an URI
74       * @return the query string; <code>null</code> if empty or undefined
75       */
76      public static String getQuery(String uri) {
77          if (uri == null || uri.length() == 0) { return null; } 
78          // consider of net_path
79          int at = uri.indexOf("//");
80          int from = uri.indexOf(
81              "/", 
82              at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
83          );
84          // the authority part of URI ignored
85          int to = uri.length();
86          // reuse the at and from variables to consider the query
87          at = uri.indexOf("?", from);
88          if (at >= 0) {
89              from = at + 1;
90          } else {
91              return null;
92          }
93          // check the fragment
94          if (uri.lastIndexOf("#") > from) {
95              to = uri.lastIndexOf("#");
96          }
97          // get the path and query.
98          return (from < 0 || from == to) ? null : uri.substring(from, to);
99      }
100 
101 
102     /***
103      * Get the path of an URI.
104      *
105      * @param uri a string regarded an URI
106      * @return the path string
107      */
108     public static String getPath(String uri) {
109         if (uri == null) {
110             return null;
111         } 
112         // consider of net_path
113         int at = uri.indexOf("//");
114         int from = uri.indexOf(
115             "/", 
116             at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
117         );
118         // the authority part of URI ignored 
119         int to = uri.length();
120         // check the query
121         if (uri.indexOf('?', from) != -1) {
122             to = uri.indexOf('?', from);
123         }
124         // check the fragment
125         if (uri.lastIndexOf("#") > from && uri.lastIndexOf("#") < to) {
126             to = uri.lastIndexOf("#");
127         }
128         // get only the path.
129         return (from < 0) ? (at >= 0 ? "/" : uri) : uri.substring(from, to);
130     }
131 
132 
133     /***
134      * Get the path and query of an URI.
135      *
136      * @param uri a string regarded an URI
137      * @return the path and query string
138      */
139     public static String getPathQuery(String uri) {
140         if (uri == null) {
141             return null;
142         } 
143         // consider of net_path
144         int at = uri.indexOf("//");
145         int from = uri.indexOf(
146             "/", 
147             at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
148         );
149         // the authority part of URI ignored
150         int to = uri.length();
151         // Ignore the '?' mark so to ignore the query.
152         // check the fragment
153         if (uri.lastIndexOf("#") > from) {
154             to = uri.lastIndexOf("#");
155         }
156         // get the path and query.
157         return (from < 0) ? (at >= 0 ? "/" : uri) : uri.substring(from, to);
158     }
159 
160 
161     /***
162      * Get the path of an URI and its rest part.
163      *
164      * @param uri a string regarded an URI
165      * @return the string from the path part
166      */
167     public static String getFromPath(String uri) {
168         if (uri == null) {
169             return null;
170         } 
171         // consider of net_path
172         int at = uri.indexOf("//");
173         int from = uri.indexOf(
174             "/", 
175             at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
176         );
177         // get the path and its rest.
178         return (from < 0) ? (at >= 0 ? "/" : uri) : uri.substring(from);
179     }
180 
181     // ----------------------------------------------------- Encoding utilities
182 
183     /***
184      * Get the all escaped and encoded string with the default protocl charset.
185      * It's the same function to use <code>encode(String unescaped, Bitset
186      * empty, URI.getDefaultProtocolCharset())</code>.
187      *
188      * @param unescaped an unescaped string
189      * @return the escaped string
190      * 
191      * @throws URIException if the default protocol charset is not supported
192      *
193      * @see URI#getDefaultProtocolCharset
194      * @see #encode
195      */
196     public static String encodeAll(String unescaped) throws URIException {
197         return encodeAll(unescaped, URI.getDefaultProtocolCharset());
198     }
199  
200 
201     /***
202      * Get the all escaped and encoded string with a given charset.
203      * It's the same function to use <code>encode(String unescaped, Bitset
204      * empty, String charset)</code>.
205      *
206      * @param unescaped an unescaped string
207      * @param charset the charset
208      * @return the escaped string
209      * 
210      * @throws URIException if the charset is not supported
211      * 
212      * @see #encode
213      */
214     public static String encodeAll(String unescaped, String charset)
215         throws URIException {
216 
217         return encode(unescaped, empty, charset);
218     }
219   
220 
221     /***
222      * Escape and encode a string regarded as within the authority component of
223      * an URI with the default protocol charset.
224      * Within the authority component, the characters ";", ":", "@", "?", and
225      * "/" are reserved.
226      *
227      * @param unescaped an unescaped string
228      * @return the escaped string
229      * 
230      * @throws URIException if the default protocol charset is not supported
231      * 
232      * @see URI#getDefaultProtocolCharset
233      * @see #encode
234      */
235     public static String encodeWithinAuthority(String unescaped)
236         throws URIException {
237 
238         return encodeWithinAuthority(unescaped, URI.getDefaultProtocolCharset());
239     }
240 
241 
242     /***
243      * Escape and encode a string regarded as within the authority component of
244      * an URI with a given charset.
245      * Within the authority component, the characters ";", ":", "@", "?", and
246      * "/" are reserved.
247      *
248      * @param unescaped an unescaped string
249      * @param charset the charset
250      * @return the escaped string
251      * 
252      * @throws URIException if the charset is not supported
253      * 
254      * @see #encode
255      */
256     public static String encodeWithinAuthority(String unescaped, String charset)
257         throws URIException {
258 
259         return encode(unescaped, URI.allowed_within_authority, charset);
260     }
261 
262 
263     /***
264      * Escape and encode a string regarded as the path and query components of
265      * an URI with the default protocol charset.
266      *
267      * @param unescaped an unescaped string
268      * @return the escaped string
269      * 
270      * @throws URIException if the default protocol charset is not supported
271      * 
272      * @see URI#getDefaultProtocolCharset
273      * @see #encode
274      */
275     public static String encodePathQuery(String unescaped) throws URIException {
276         return encodePathQuery(unescaped, URI.getDefaultProtocolCharset());
277     }
278 
279 
280     /***
281      * Escape and encode a string regarded as the path and query components of
282      * an URI with a given charset.
283      *
284      * @param unescaped an unescaped string
285      * @param charset the charset
286      * @return the escaped string
287      * 
288      * @throws URIException if the charset is not supported
289      * 
290      * @see #encode
291      */
292     public static String encodePathQuery(String unescaped, String charset)
293         throws URIException {
294 
295         int at = unescaped.indexOf('?');
296         if (at < 0) {
297             return encode(unescaped, URI.allowed_abs_path, charset);
298         }
299         // else
300         return  encode(unescaped.substring(0, at), URI.allowed_abs_path, charset)
301             + '?' + encode(unescaped.substring(at + 1), URI.allowed_query, charset);
302     }
303 
304 
305     /***
306      * Escape and encode a string regarded as within the path component of an
307      * URI with the default protocol charset.
308      * The path may consist of a sequence of path segments separated by a
309      * single slash "/" character.  Within a path segment, the characters
310      * "/", ";", "=", and "?" are reserved.
311      *
312      * @param unescaped an unescaped string
313      * @return the escaped string
314      * 
315      * @throws URIException if the default protocol charset is not supported
316      * 
317      * @see URI#getDefaultProtocolCharset
318      * @see #encode
319      */
320     public static String encodeWithinPath(String unescaped)
321         throws URIException {
322 
323         return encodeWithinPath(unescaped, URI.getDefaultProtocolCharset());
324     }
325 
326 
327     /***
328      * Escape and encode a string regarded as within the path component of an
329      * URI with a given charset.
330      * The path may consist of a sequence of path segments separated by a
331      * single slash "/" character.  Within a path segment, the characters
332      * "/", ";", "=", and "?" are reserved.
333      *
334      * @param unescaped an unescaped string
335      * @param charset the charset
336      * @return the escaped string
337      * 
338      * @throws URIException if the charset is not supported
339      * 
340      * @see #encode
341      */
342     public static String encodeWithinPath(String unescaped, String charset)
343         throws URIException {
344 
345         return encode(unescaped, URI.allowed_within_path, charset);
346     }
347 
348 
349     /***
350      * Escape and encode a string regarded as the path component of an URI with
351      * the default protocol charset.
352      *
353      * @param unescaped an unescaped string
354      * @return the escaped string
355      * 
356      * @throws URIException if the default protocol charset is not supported
357      * 
358      * @see URI#getDefaultProtocolCharset
359      * @see #encode
360      */
361     public static String encodePath(String unescaped) throws URIException {
362         return encodePath(unescaped, URI.getDefaultProtocolCharset());
363     }
364 
365 
366     /***
367      * Escape and encode a string regarded as the path component of an URI with
368      * a given charset.
369      *
370      * @param unescaped an unescaped string
371      * @param charset the charset
372      * @return the escaped string
373      * 
374      * @throws URIException if the charset is not supported
375      * 
376      * @see #encode
377      */
378     public static String encodePath(String unescaped, String charset)
379         throws URIException {
380 
381         return encode(unescaped, URI.allowed_abs_path, charset);
382     }
383 
384 
385     /***
386      * Escape and encode a string regarded as within the query component of an
387      * URI with the default protocol charset.
388      * When a query comprise the name and value pairs, it is used in order
389      * to encode each name and value string.  The reserved special characters
390      * within a query component are being included in encoding the query.
391      *
392      * @param unescaped an unescaped string
393      * @return the escaped string
394      * 
395      * @throws URIException if the default protocol charset is not supported
396      * 
397      * @see URI#getDefaultProtocolCharset
398      * @see #encode
399      */
400     public static String encodeWithinQuery(String unescaped)
401         throws URIException {
402 
403         return encodeWithinQuery(unescaped, URI.getDefaultProtocolCharset());
404     }
405 
406 
407     /***
408      * Escape and encode a string regarded as within the query component of an
409      * URI with a given charset.
410      * When a query comprise the name and value pairs, it is used in order
411      * to encode each name and value string.  The reserved special characters
412      * within a query component are being included in encoding the query.
413      *
414      * @param unescaped an unescaped string
415      * @param charset the charset
416      * @return the escaped string
417      * 
418      * @throws URIException if the charset is not supported
419      * 
420      * @see #encode
421      */
422     public static String encodeWithinQuery(String unescaped, String charset)
423         throws URIException {
424 
425         return encode(unescaped, URI.allowed_within_query, charset);
426     }
427 
428 
429     /***
430      * Escape and encode a string regarded as the query component of an URI with
431      * the default protocol charset.
432      * When a query string is not misunderstood the reserved special characters
433      * ("&amp;", "=", "+", ",", and "$") within a query component, this method
434      * is recommended to use in encoding the whole query.
435      *
436      * @param unescaped an unescaped string
437      * @return the escaped string
438      * 
439      * @throws URIException if the default protocol charset is not supported
440      * 
441      * @see URI#getDefaultProtocolCharset
442      * @see #encode
443      */
444     public static String encodeQuery(String unescaped) throws URIException {
445         return encodeQuery(unescaped, URI.getDefaultProtocolCharset());
446     }
447 
448 
449     /***
450      * Escape and encode a string regarded as the query component of an URI with
451      * a given charset.
452      * When a query string is not misunderstood the reserved special characters
453      * ("&amp;", "=", "+", ",", and "$") within a query component, this method
454      * is recommended to use in encoding the whole query.
455      *
456      * @param unescaped an unescaped string
457      * @param charset the charset
458      * @return the escaped string
459      * 
460      * @throws URIException if the charset is not supported
461      * 
462      * @see #encode
463      */
464     public static String encodeQuery(String unescaped, String charset)
465         throws URIException {
466 
467         return encode(unescaped, URI.allowed_query, charset);
468     }
469 
470 
471     /***
472      * Escape and encode a given string with allowed characters not to be
473      * escaped and the default protocol charset.
474      *
475      * @param unescaped a string
476      * @param allowed allowed characters not to be escaped
477      * @return the escaped string
478      * 
479      * @throws URIException if the default protocol charset is not supported
480      * 
481      * @see URI#getDefaultProtocolCharset
482      */
483     public static String encode(String unescaped, BitSet allowed)
484         throws URIException {
485 
486         return encode(unescaped, allowed, URI.getDefaultProtocolCharset());
487     }
488 
489 
490     /***
491      * Escape and encode a given string with allowed characters not to be
492      * escaped and a given charset.
493      *
494      * @param unescaped a string
495      * @param allowed allowed characters not to be escaped
496      * @param charset the charset
497      * @return the escaped string
498      */
499     public static String encode(String unescaped, BitSet allowed,
500             String charset) throws URIException {
501         byte[] rawdata = URLCodec.encodeUrl(allowed, 
502             EncodingUtil.getBytes(unescaped, charset));
503         return EncodingUtil.getAsciiString(rawdata);
504     }
505 
506 
507     /***
508      * Unescape and decode a given string regarded as an escaped string with the
509      * default protocol charset.
510      *
511      * @param escaped a string
512      * @return the unescaped string
513      * 
514      * @throws URIException if the string cannot be decoded (invalid)
515      * 
516      * @see URI#getDefaultProtocolCharset
517      */
518     public static String decode(String escaped) throws URIException {
519         try {
520             byte[] rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(escaped));
521             return EncodingUtil.getString(rawdata, URI.getDefaultProtocolCharset());
522         } catch (DecoderException e) {
523             throw new URIException(e.getMessage());
524         }
525     }
526 
527     /***
528      * Unescape and decode a given string regarded as an escaped string.
529      *
530      * @param escaped a string
531      * @param charset the charset
532      * @return the unescaped string
533      * 
534      * @throws URIException if the charset is not supported
535      * 
536      * @see Coder#decode
537      */
538     public static String decode(String escaped, String charset)
539         throws URIException {
540 
541         return Coder.decode(escaped.toCharArray(), charset);
542     }
543 
544     // ---------------------------------------------------------- Inner classes
545 
546     /***
547      * The basic and internal utility for URI escape and character encoding and
548      * decoding.
549      * 
550      * @deprecated use org.apache.commons.codec.net.URLCodec
551      */
552     protected static class Coder extends URI {
553 
554         /***
555          * Escape and encode a given string with allowed characters not to be
556          * escaped.
557          *
558          * @param unescapedComponent an unescaped component
559          * @param allowed allowed characters not to be escaped
560          * @param charset the charset to encode
561          * @return the escaped and encoded string
562          * 
563          * @throws URIException if the charset is not supported
564          * 
565          * @deprecated use org.apache.commons.codec.net.URLCodec
566          */
567         public static char[] encode(String unescapedComponent, BitSet allowed, String charset) 
568             throws URIException {
569 
570             return URI.encode(unescapedComponent, allowed, charset);
571         }
572 
573 
574         /***
575          * Unescape and decode a given string.
576          *
577          * @param escapedComponent an being-unescaped component
578          * @param charset the charset to decode
579          * @return the escaped and encoded string
580          * 
581          * @throws URIException if the charset is not supported
582          * 
583          * @deprecated use org.apache.commons.codec.net.URLCodec
584          */
585         public static String decode(char[] escapedComponent, String charset)
586             throws URIException {
587 
588             return URI.decode(escapedComponent, charset);
589         }
590 
591 
592         /***
593          * Verify whether a given string is escaped or not
594          *
595          * @param original given characters
596          * @return true if the given character array is 7 bit ASCII-compatible.
597          */
598         public static boolean verifyEscaped(char[] original) {
599             for (int i = 0; i < original.length; i++) {
600                 int c = original[i];
601                 if (c > 128) {
602                     return false;
603                 } else if (c == '%') {
604                     if (Character.digit(original[++i], 16) == -1 
605                         || Character.digit(original[++i], 16) == -1) {
606                         return false;
607                     }
608                 }
609             }
610             return true;
611         }
612 
613 
614         /***
615          * Replace from a given character to given character in an array order
616          * for a given string.
617          *
618          * @param original a given string
619          * @param from a replacing character array
620          * @param to a replaced character array
621          * @return the replaced string
622          */
623         public static String replace(String original, char[] from, char[] to) {
624             for (int i = from.length; i > 0; --i) {
625                 original = replace(original, from[i], to[i]);
626             }
627             return original.toString();
628         }
629 
630 
631         /***
632          * Replace from a given character to given character for a given string.
633          *
634          * @param original a given string
635          * @param from a replacing character array
636          * @param to a replaced character array
637          * @return the replaced string
638          */
639         public static String replace(String original, char from, char to) {
640             StringBuffer result = new StringBuffer(original.length());
641             int at, saved = 0;
642             do {
643                 at = original.indexOf(from);
644                 if (at >= 0) {
645                     result.append(original.substring(0, at));
646                     result.append(to);
647                 } else {
648                     result.append(original.substring(saved));
649                 }
650                 saved = at;
651             } while (at >= 0);
652             return result.toString();
653         }
654     }
655 
656 }
657