1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 package org.apache.commons.httpclient;
31
32 import java.io.IOException;
33 import java.io.ObjectInputStream;
34 import java.io.ObjectOutputStream;
35 import java.io.Serializable;
36 import java.util.Locale;
37 import java.util.BitSet;
38 import java.util.Hashtable;
39
40 import org.apache.commons.codec.DecoderException;
41 import org.apache.commons.codec.net.URLCodec;
42 import org.apache.commons.httpclient.util.EncodingUtil;
43
44 /***
 * The interface for the URI (Uniform Resource Identifiers) version of RFC 2396.
 * This class supports parsing a URI reference so that it can be extended for
 * any specific protocol, taking into account the character encoding of the
 * protocol to be transported and the charset of the document.
49 * <p>
50 * A URI is always in an "escaped" form, since escaping or unescaping a
51 * completed URI might change its semantics.
52 * <p>
53 * Implementers should be careful not to escape or unescape the same string
54 * more than once, since unescaping an already unescaped string might lead to
55 * misinterpreting a percent data character as another escaped character,
56 * or vice versa in the case of escaping an already escaped string.
57 * <p>
58 * In order to avoid these problems, data types used as follows:
59 * <p><blockquote><pre>
60 * URI character sequence: char
61 * octet sequence: byte
62 * original character sequence: String
63 * </pre></blockquote><p>
64 *
65 * So, a URI is a sequence of characters as an array of a char type, which
66 * is not always represented as a sequence of octets as an array of byte.
67 * <p>
68 *
69 * URI Syntactic Components
70 * <p><blockquote><pre>
71 * - In general, written as follows:
72 * Absolute URI = <scheme>:<scheme-specific-part>
73 * Generic URI = <scheme>://<authority><path>?<query>
74 *
75 * - Syntax
76 * absoluteURI = scheme ":" ( hier_part | opaque_part )
77 * hier_part = ( net_path | abs_path ) [ "?" query ]
78 * net_path = "//" authority [ abs_path ]
79 * abs_path = "/" path_segments
80 * </pre></blockquote><p>
81 *
82 * The following examples illustrate URI that are in common use.
83 * <pre>
84 * ftp://ftp.is.co.za/rfc/rfc1808.txt
85 * -- ftp scheme for File Transfer Protocol services
86 * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
87 * -- gopher scheme for Gopher and Gopher+ Protocol services
88 * http://www.math.uio.no/faq/compression-faq/part1.html
89 * -- http scheme for Hypertext Transfer Protocol services
90 * mailto:mduerst@ifi.unizh.ch
91 * -- mailto scheme for electronic mail addresses
92 * news:comp.infosystems.www.servers.unix
93 * -- news scheme for USENET news groups and articles
94 * telnet://melvyl.ucop.edu/
95 * -- telnet scheme for interactive services via the TELNET Protocol
96 * </pre>
97 * Please, notice that there are many modifications from URL(RFC 1738) and
98 * relative URL(RFC 1808).
99 * <p>
100 * <b>The expressions for a URI</b>
101 * <p><pre>
102 * For escaped URI forms
103 * - URI(char[]) // constructor
104 * - char[] getRawXxx() // method
105 * - String getEscapedXxx() // method
106 * - String toString() // method
107 * <p>
108 * For unescaped URI forms
109 * - URI(String) // constructor
110 * - String getXXX() // method
111 * </pre><p>
112 *
113 * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
114 * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
115 * @version $Revision: 372560 $ $Date: 2002/03/14 15:14:01
116 */
117 public class URI implements Cloneable, Comparable, Serializable {
118
119
120
121
/**
 * Creates an instance without parsing anything. The constructor is
 * <code>protected</code> and meant for internal use.
 */
protected URI() {
    // nothing to initialise here
}
125
/**
 * Builds a URI by parsing the given string, using <code>charset</code> as
 * the protocol charset of this instance. The string may be supplied in
 * either escaped or unescaped form.
 *
 * @param s the URI character sequence to parse
 * @param escaped <tt>true</tt> when <code>s</code> is already escaped,
 *                <tt>false</tt> otherwise
 * @param charset the charset name used for escape encoding, if required
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped, String charset)
    throws URIException, NullPointerException {
    // store the charset first so it is already in place when parsing runs
    this.protocolCharset = charset;
    this.parseUriReference(s, escaped);
}
147
/**
 * Builds a URI by parsing the given string, which may be supplied in
 * either escaped or unescaped form. No protocol charset is assigned by
 * this constructor.
 *
 * @param s the URI character sequence to parse
 * @param escaped <tt>true</tt> when <code>s</code> is already escaped,
 *                <tt>false</tt> otherwise
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped)
    throws URIException, NullPointerException {
    this.parseUriReference(s, escaped);
}
167
168 /***
169 * Construct a URI as an escaped form of a character array with the given
170 * charset.
171 *
172 * @param escaped the URI character sequence
173 * @param charset the charset string to do escape encoding
174 * @throws URIException If the URI cannot be created.
175 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
176 * @see #getProtocolCharset
177 *
178 * @deprecated Use #URI(String, boolean, String)
179 */
180 public URI(char[] escaped, String charset)
181 throws URIException, NullPointerException {
182 protocolCharset = charset;
183 parseUriReference(new String(escaped), true);
184 }
185
186
187 /***
188 * Construct a URI as an escaped form of a character array.
189 * An URI can be placed within double-quotes or angle brackets like
190 * "http://test.com/" and <http://test.com/>
191 *
192 * @param escaped the URI character sequence
193 * @throws URIException If the URI cannot be created.
194 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
195 * @see #getDefaultProtocolCharset
196 *
197 * @deprecated Use #URI(String, boolean)
198 */
199 public URI(char[] escaped)
200 throws URIException, NullPointerException {
201 parseUriReference(new String(escaped), true);
202 }
203
204
/**
 * Builds a URI from an unescaped string, using <code>charset</code> as
 * the protocol charset of this instance.
 *
 * @param original the string to turn into a URI character sequence;
 *                 either an absoluteURI or a relativeURI
 * @param charset the charset name used for escape encoding
 * @throws URIException if the URI cannot be created
 * @see #getProtocolCharset
 *
 * @deprecated Use #URI(String, boolean, String)
 */
public URI(String original, String charset) throws URIException {
    this.protocolCharset = charset;
    // "false": the input is treated as not yet escaped
    this.parseUriReference(original, false);
}
220
221
/**
 * Builds a URI from an unescaped string.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 * A URI may be wrapped in double-quotes or angle brackets, as in
 * "http://test.com/" and &lt;http://test.com/&gt;.
 *
 * @param original the string to turn into a URI character sequence;
 *                 either an absoluteURI or a relativeURI
 * @throws URIException if the URI cannot be created
 * @see #getDefaultProtocolCharset
 *
 * @deprecated Use #URI(String, boolean)
 */
public URI(String original) throws URIException {
    // "false": the input is treated as not yet escaped
    this.parseUriReference(original, false);
}
240
241
/**
 * Builds an opaque absolute URI from its components.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
 *   opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 * The result has the shape
 * <code>&lt;scheme&gt;:&lt;scheme-specific-part&gt;#&lt;fragment&gt;</code>.
 * <p>
 * The scheme is lower-cased with a fixed locale so that the outcome does
 * not depend on the JVM default locale. With a Turkish default locale a
 * scheme such as "FILE" would otherwise be mapped to a string containing a
 * dotless i and then be rejected as an incorrect scheme.
 * <p>
 * The scheme-specific part is encoded with the protocol charset; the
 * fragment is stored exactly as given, without any encoding.
 *
 * @param scheme the scheme string; must not be <code>null</code>
 * @param schemeSpecificPart scheme_specific_part
 * @param fragment the fragment string, may be <code>null</code>
 * @throws URIException if the scheme is <code>null</code> or fails
 *                      validation, or if the URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String schemeSpecificPart, String fragment)
    throws URIException {

    if (scheme == null) {
        throw new URIException(URIException.PARSING, "scheme required");
    }
    // locale-independent lower-casing (see the note in the Javadoc)
    char[] s = scheme.toLowerCase(Locale.ENGLISH).toCharArray();
    // URI.scheme is the BitSet; the bare name is shadowed by the parameter
    if (!validate(s, URI.scheme)) {
        throw new URIException(URIException.PARSING, "incorrect scheme");
    }
    _scheme = s;
    _opaque = encode(schemeSpecificPart, allowed_opaque_part,
        getProtocolCharset());
    _is_opaque_part = true;
    _fragment = (fragment == null) ? null : fragment.toCharArray();
    setURI();
}
278
279
280 /***
281 * Construct a general URI from the given components.
282 * <p><blockquote><pre>
283 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
284 * absoluteURI = scheme ":" ( hier_part | opaque_part )
285 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
286 * hier_part = ( net_path | abs_path ) [ "?" query ]
287 * </pre></blockquote><p>
288 * It's for absolute URI = <scheme>:<path>?<query>#<
289 * fragment> and relative URI = <path>?<query>#<fragment
290 * >.
291 *
292 * @param scheme the scheme string
293 * @param authority the authority string
294 * @param path the path string
295 * @param query the query string
296 * @param fragment the fragment string
297 * @throws URIException If the new URI cannot be created.
298 * @see #getDefaultProtocolCharset
299 */
300 public URI(String scheme, String authority, String path, String query,
301 String fragment) throws URIException {
302
303
304 StringBuffer buff = new StringBuffer();
305 if (scheme != null) {
306 buff.append(scheme);
307 buff.append(':');
308 }
309 if (authority != null) {
310 buff.append("//");
311 buff.append(authority);
312 }
313 if (path != null) {
314 if ((scheme != null || authority != null)
315 && !path.startsWith("/")) {
316 throw new URIException(URIException.PARSING,
317 "abs_path requested");
318 }
319 buff.append(path);
320 }
321 if (query != null) {
322 buff.append('?');
323 buff.append(query);
324 }
325 if (fragment != null) {
326 buff.append('#');
327 buff.append(fragment);
328 }
329 parseUriReference(buff.toString(), false);
330 }
331
332
/**
 * Builds a general URI from a scheme and the server components only.
 * Delegates to the seven-argument constructor with <code>null</code> for
 * path, query and fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port)
    throws URIException {
    this(scheme, userinfo, host, port, null, null, null);
}
348
349
/**
 * Builds a general URI from a scheme, the server components and a path.
 * Delegates to the seven-argument constructor with <code>null</code> for
 * query and fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
    String path) throws URIException {
    this(scheme, userinfo, host, port, path, null, null);
}
366
367
/**
 * Builds a general URI from a scheme, the server components, a path and
 * a query. Delegates to the seven-argument constructor with
 * <code>null</code> for the fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @param query the query string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
    String path, String query) throws URIException {
    this(scheme, userinfo, host, port, path, query, null);
}
385
386
/**
 * Builds a general URI from all of its individual components.
 * <p>
 * The authority handed on is <code>[userinfo@]host[:port]</code>: the
 * userinfo part is left out when <code>userinfo</code> is
 * <code>null</code> and the port part is left out when <code>port</code>
 * is <code>-1</code>. When <code>host</code> is <code>null</code> no
 * authority is passed at all, whatever the userinfo and port are.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @param query the query string
 * @param fragment the fragment string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
    String path, String query, String fragment) throws URIException {
    this(scheme,
        (host != null)
            ? ((userinfo == null) ? "" : userinfo + '@')
                + host
                + ((port == -1) ? "" : ":" + port)
            : null,
        path, query, fragment);
}
407
408
/**
 * Builds a general URI from a scheme, host, path and fragment.
 * Delegates to the five-argument constructor, passing <code>host</code>
 * as the authority and <code>null</code> as the query.
 *
 * @param scheme the scheme string
 * @param host the host string
 * @param path the path string
 * @param fragment the fragment string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String host, String path, String fragment)
    throws URIException {
    this(scheme, host, path, null, fragment);
}
424
425
/**
 * Builds a URI by resolving a relative URI string against a base URI.
 * The string is first turned into a URI with the single-string
 * constructor and then resolved against <code>base</code>.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @throws URIException if the new URI cannot be created
 *
 * @deprecated Use #URI(URI, String, boolean)
 */
public URI(URI base, String relative) throws URIException {
    this(base, new URI(relative));
}
438
439
/**
 * Builds a URI by resolving a relative URI string against a base URI.
 * The string is first parsed into a URI of its own, honouring the
 * <code>escaped</code> flag, and then resolved against <code>base</code>.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @param escaped <tt>true</tt> when <code>relative</code> is already
 *                escaped, <tt>false</tt> otherwise
 *
 * @throws URIException if the new URI cannot be created
 *
 * @since 3.0
 */
public URI(URI base, String relative, boolean escaped) throws URIException {
    this(base, new URI(relative, escaped));
}
455
456
457 /***
458 * Construct a general URI with the given relative URI.
459 * <p><blockquote><pre>
460 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
461 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
462 * </pre></blockquote><p>
463 * Resolving Relative References to Absolute Form.
464 *
465 * <strong>Examples of Resolving Relative URI References</strong>
466 *
467 * Within an object with a well-defined base URI of
468 * <p><blockquote><pre>
469 * http://a/b/c/d;p?q
470 * </pre></blockquote><p>
471 * the relative URI would be resolved as follows:
472 *
473 * Normal Examples
474 *
475 * <p><blockquote><pre>
476 * g:h = g:h
477 * g = http://a/b/c/g
478 * ./g = http://a/b/c/g
479 * g/ = http://a/b/c/g/
480 * /g = http://a/g
481 * //g = http://g
482 * ?y = http://a/b/c/?y
483 * g?y = http://a/b/c/g?y
484 * #s = (current document)#s
485 * g#s = http://a/b/c/g#s
486 * g?y#s = http://a/b/c/g?y#s
487 * ;x = http://a/b/c/;x
488 * g;x = http://a/b/c/g;x
489 * g;x?y#s = http://a/b/c/g;x?y#s
490 * . = http://a/b/c/
491 * ./ = http://a/b/c/
492 * .. = http://a/b/
493 * ../ = http://a/b/
494 * ../g = http://a/b/g
495 * ../.. = http://a/
496 * ../../ = http://a/
497 * ../../g = http://a/g
498 * </pre></blockquote><p>
499 *
500 * Some URI schemes do not allow a hierarchical syntax matching the
501 * <hier_part> syntax, and thus cannot use relative references.
502 *
503 * @param base the base URI
504 * @param relative the relative URI
505 * @throws URIException If the new URI cannot be created.
506 */
507 public URI(URI base, URI relative) throws URIException {
508
509 if (base._scheme == null) {
510 throw new URIException(URIException.PARSING, "base URI required");
511 }
512 if (base._scheme != null) {
513 this._scheme = base._scheme;
514 this._authority = base._authority;
515 }
516 if (base._is_opaque_part || relative._is_opaque_part) {
517 this._scheme = base._scheme;
518 this._is_opaque_part = base._is_opaque_part
519 || relative._is_opaque_part;
520 this._opaque = relative._opaque;
521 this._fragment = relative._fragment;
522 this.setURI();
523 return;
524 }
525 if (relative._scheme != null) {
526 this._scheme = relative._scheme;
527 this._is_net_path = relative._is_net_path;
528 this._authority = relative._authority;
529 if (relative._is_server) {
530 this._is_server = relative._is_server;
531 this._userinfo = relative._userinfo;
532 this._host = relative._host;
533 this._port = relative._port;
534 } else if (relative._is_reg_name) {
535 this._is_reg_name = relative._is_reg_name;
536 }
537 this._is_abs_path = relative._is_abs_path;
538 this._is_rel_path = relative._is_rel_path;
539 this._path = relative._path;
540 } else if (base._authority != null && relative._scheme == null) {
541 this._is_net_path = base._is_net_path;
542 this._authority = base._authority;
543 if (base._is_server) {
544 this._is_server = base._is_server;
545 this._userinfo = base._userinfo;
546 this._host = base._host;
547 this._port = base._port;
548 } else if (base._is_reg_name) {
549 this._is_reg_name = base._is_reg_name;
550 }
551 }
552 if (relative._authority != null) {
553 this._is_net_path = relative._is_net_path;
554 this._authority = relative._authority;
555 if (relative._is_server) {
556 this._is_server = relative._is_server;
557 this._userinfo = relative._userinfo;
558 this._host = relative._host;
559 this._port = relative._port;
560 } else if (relative._is_reg_name) {
561 this._is_reg_name = relative._is_reg_name;
562 }
563 this._is_abs_path = relative._is_abs_path;
564 this._is_rel_path = relative._is_rel_path;
565 this._path = relative._path;
566 }
567
568 if (relative._scheme == null && relative._authority == null) {
569 if ((relative._path == null || relative._path.length == 0)
570 && relative._query == null) {
571
572
573 this._path = base._path;
574 this._query = base._query;
575 } else {
576 this._path = resolvePath(base._path, relative._path);
577 }
578 }
579
580 if (relative._query != null) {
581 this._query = relative._query;
582 }
583
584 if (relative._fragment != null) {
585 this._fragment = relative._fragment;
586 }
587 this.setURI();
588
589
590 parseUriReference(new String(_uri), true);
591 }
592
593
594
595 /*** Version ID for serialization */
596 static final long serialVersionUID = 604752400577948726L;
597
598
599 /***
600 * Cache the hash code for this URI.
601 */
602 protected int hash = 0;
603
604
605 /***
606 * This Uniform Resource Identifier (URI).
607 * The URI is always in an "escaped" form, since escaping or unescaping
608 * a completed URI might change its semantics.
609 */
610 protected char[] _uri = null;
611
612
613 /***
614 * The charset of the protocol used by this URI instance.
615 */
616 protected String protocolCharset = null;
617
618
619 /***
620 * The default charset of the protocol. RFC 2277, 2396
621 */
622 protected static String defaultProtocolCharset = "UTF-8";
623
624
625 /***
626 * The default charset of the document. RFC 2277, 2396
627 * The platform's charset is used for the document by default.
628 */
629 protected static String defaultDocumentCharset = null;
630 protected static String defaultDocumentCharsetByLocale = null;
631 protected static String defaultDocumentCharsetByPlatform = null;
632
633 static {
634 Locale locale = Locale.getDefault();
635
636 if (locale != null) {
637 defaultDocumentCharsetByLocale =
638 LocaleToCharsetMap.getCharset(locale);
639
640 defaultDocumentCharset = defaultDocumentCharsetByLocale;
641 }
642
643 try {
644 defaultDocumentCharsetByPlatform = System.getProperty("file.encoding");
645 } catch (SecurityException ignore) {
646 }
647 if (defaultDocumentCharset == null) {
648
649 defaultDocumentCharset = defaultDocumentCharsetByPlatform;
650 }
651 }
652
653
654 /***
655 * The scheme.
656 */
657 protected char[] _scheme = null;
658
659
660 /***
661 * The opaque.
662 */
663 protected char[] _opaque = null;
664
665
666 /***
667 * The authority.
668 */
669 protected char[] _authority = null;
670
671
672 /***
673 * The userinfo.
674 */
675 protected char[] _userinfo = null;
676
677
678 /***
679 * The host.
680 */
681 protected char[] _host = null;
682
683
684 /***
685 * The port.
686 */
687 protected int _port = -1;
688
689
690 /***
691 * The path.
692 */
693 protected char[] _path = null;
694
695
696 /***
697 * The query.
698 */
699 protected char[] _query = null;
700
701
702 /***
703 * The fragment.
704 */
705 protected char[] _fragment = null;
706
707
708 /***
709 * The root path.
710 */
711 protected static char[] rootPath = { '/' };
712
713
714
/**
 * BitSet holding only the percent character. "%" is always reserved as the
 * escape indicator, so it has to be written as "%25" whenever it is meant
 * as data within a URI.
 */
protected static final BitSet percent = new BitSet(256);

static { percent.set('%'); }
725
726
/**
 * BitSet for the decimal digits.
 * <p><blockquote><pre>
 *   digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
 * </pre></blockquote><p>
 */
protected static final BitSet digit = new BitSet(256);

static {
    // BitSet.set(from, to) excludes the upper bound, hence the "+ 1"
    digit.set('0', '9' + 1);
}
741
742
/**
 * BitSet for the ASCII letters, lower and upper case.
 * <p><blockquote><pre>
 *   alpha = lowalpha | upalpha
 * </pre></blockquote><p>
 */
protected static final BitSet alpha = new BitSet(256);

static {
    // BitSet.set(from, to) excludes the upper bound, hence the "+ 1"
    alpha.set('A', 'Z' + 1);
    alpha.set('a', 'z' + 1);
}
759
760
/**
 * BitSet for alphanumeric characters, the union of alpha and digit.
 * <p><blockquote><pre>
 *   alphanum = alpha | digit
 * </pre></blockquote><p>
 */
protected static final BitSet alphanum = new BitSet(256);

static {
    alphanum.or(digit);
    alphanum.or(alpha);
}
773
774
/**
 * BitSet for hexadecimal digits, in either letter case.
 * <p><blockquote><pre>
 *   hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
 *                 "a" | "b" | "c" | "d" | "e" | "f"
 * </pre></blockquote><p>
 */
protected static final BitSet hex = new BitSet(256);

static {
    hex.or(digit);
    // BitSet.set(from, to) excludes the upper bound, hence the "+ 1"
    hex.set('A', 'F' + 1);
    hex.set('a', 'f' + 1);
}
793
794
/**
 * BitSet for the characters that can occur in an escape sequence: the
 * percent sign and the hexadecimal digits.
 * <p><blockquote><pre>
 *   escaped = "%" hex hex
 * </pre></blockquote><p>
 */
protected static final BitSet escaped = new BitSet(256);

static {
    escaped.or(hex);
    escaped.or(percent);
}
807
808
/**
 * BitSet for the mark characters.
 * <p><blockquote><pre>
 *   mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
 * </pre></blockquote><p>
 */
protected static final BitSet mark = new BitSet(256);

static {
    final String members = "-_.!~*'()";
    for (int i = 0; i < members.length(); i++) {
        mark.set(members.charAt(i));
    }
}
829
830
/**
 * BitSet for unreserved characters: data characters that are allowed in a
 * URI but have no reserved purpose.
 * <p><blockquote><pre>
 *   unreserved = alphanum | mark
 * </pre></blockquote><p>
 */
protected static final BitSet unreserved = new BitSet(256);

static {
    unreserved.or(mark);
    unreserved.or(alphanum);
}
844
845
/**
 * BitSet for the reserved characters.
 * <p><blockquote><pre>
 *   reserved = ";" | "/" | "?" | ":" | "@" | "&amp;" | "=" | "+" |
 *              "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet reserved = new BitSet(256);

static {
    final String members = ";/?:@&=+$,";
    for (int i = 0; i < members.length(); i++) {
        reserved.set(members.charAt(i));
    }
}
867
868
/**
 * BitSet for uric, every character that may appear in a URI.
 * <p><blockquote><pre>
 *   uric = reserved | unreserved | escaped
 * </pre></blockquote><p>
 */
protected static final BitSet uric = new BitSet(256);

static {
    uric.or(escaped);
    uric.or(unreserved);
    uric.or(reserved);
}


/**
 * BitSet for fragment; refers to the very same BitSet object as uric.
 * <p><blockquote><pre>
 *   fragment = *uric
 * </pre></blockquote><p>
 */
protected static final BitSet fragment = uric;


/**
 * BitSet for query; refers to the very same BitSet object as uric.
 * <p><blockquote><pre>
 *   query = *uric
 * </pre></blockquote><p>
 */
protected static final BitSet query = uric;
900
901
/**
 * BitSet for pchar, the characters of a path segment.
 * <p><blockquote><pre>
 *   pchar = unreserved | escaped |
 *           ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet pchar = new BitSet(256);

static {
    pchar.or(unreserved);
    pchar.or(escaped);
    final String extras = ":@&=+$,";
    for (int i = 0; i < extras.length(); i++) {
        pchar.set(extras.charAt(i));
    }
}


/**
 * BitSet for param; refers to the very same BitSet object as pchar.
 * <p><blockquote><pre>
 *   param = *pchar
 * </pre></blockquote><p>
 */
protected static final BitSet param = pchar;
931
932
/**
 * BitSet for a path segment including its parameters.
 * <p><blockquote><pre>
 *   segment = *pchar *( ";" param )
 * </pre></blockquote><p>
 */
protected static final BitSet segment = new BitSet(256);

static {
    segment.set(';');
    segment.or(pchar);
    segment.or(param);
}
946
947
/**
 * BitSet for a sequence of path segments separated by slashes.
 * <p><blockquote><pre>
 *   path_segments = segment *( "/" segment )
 * </pre></blockquote><p>
 */
protected static final BitSet path_segments = new BitSet(256);

static {
    path_segments.or(segment);
    path_segments.set('/');
}
960
961
/**
 * BitSet for an absolute URI path.
 * <p><blockquote><pre>
 *   abs_path = "/" path_segments
 * </pre></blockquote><p>
 */
protected static final BitSet abs_path = new BitSet(256);

static {
    abs_path.or(path_segments);
    abs_path.set('/');
}
974
975
/**
 * BitSet for uric_no_slash, i.e. the URI characters except the slash.
 * <p><blockquote><pre>
 *   uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
 *                   "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 * The colon is part of this set, as the grammar above requires.
 */
protected static final BitSet uric_no_slash = new BitSet(256);

static {
    uric_no_slash.or(unreserved);
    uric_no_slash.or(escaped);
    uric_no_slash.set(';');
    uric_no_slash.set('?');
    // was a second set(';'), which left ':' out of the set
    uric_no_slash.set(':');
    uric_no_slash.set('@');
    uric_no_slash.set('&');
    uric_no_slash.set('=');
    uric_no_slash.set('+');
    uric_no_slash.set('$');
    uric_no_slash.set(',');
}
998
999
/**
 * BitSet for the opaque part, the union of uric_no_slash and uric.
 * <p><blockquote><pre>
 *   opaque_part = uric_no_slash *uric
 * </pre></blockquote><p>
 */
protected static final BitSet opaque_part = new BitSet(256);

static {
    opaque_part.or(uric);
    opaque_part.or(uric_no_slash);
}
1013
1014
/**
 * BitSet for path, the union of the absolute path and the opaque part.
 * <p><blockquote><pre>
 *   path = [ abs_path | opaque_part ]
 * </pre></blockquote><p>
 */
protected static final BitSet path = new BitSet(256);

static {
    path.or(opaque_part);
    path.or(abs_path);
}
1027
1028
/**
 * BitSet for port; refers to the very same BitSet object as digit.
 */
protected static final BitSet port = digit;


/**
 * BitSet for an IPv4 address: the digits plus the dot.
 * <p><blockquote><pre>
 *   IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
 * </pre></blockquote><p>
 */
protected static final BitSet IPv4address = new BitSet(256);

static {
    IPv4address.set('.');
    IPv4address.or(digit);
}
1047
1048
/**
 * BitSet for an IPv6 address (RFC 2373): hexadecimal digits, the colon
 * and the characters of an IPv4 address.
 * <p><blockquote><pre>
 *   IPv6address = hexpart [ ":" IPv4address ]
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6address = new BitSet(256);

static {
    IPv6address.or(IPv4address);
    IPv6address.or(hex);
    IPv6address.set(':');
}
1062
1063
/**
 * BitSet for a bracketed IPv6 reference (RFC 2732, RFC 2373).
 * <p><blockquote><pre>
 *   IPv6reference = "[" IPv6address "]"
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6reference = new BitSet(256);

static {
    IPv6reference.or(IPv6address);
    IPv6reference.set('[');
    IPv6reference.set(']');
}
1077
1078
/**
 * BitSet for toplabel: alphanumeric characters plus the hyphen.
 * <p><blockquote><pre>
 *   toplabel = alpha | alpha *( alphanum | "-" ) alphanum
 * </pre></blockquote><p>
 */
protected static final BitSet toplabel = new BitSet(256);

static {
    toplabel.set('-');
    toplabel.or(alphanum);
}


/**
 * BitSet for domainlabel; refers to the very same BitSet object as
 * toplabel.
 * <p><blockquote><pre>
 *   domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
 * </pre></blockquote><p>
 */
protected static final BitSet domainlabel = toplabel;
1100
1101
/**
 * BitSet for hostname: the toplabel characters plus the dot.
 * <p><blockquote><pre>
 *   hostname = *( domainlabel "." ) toplabel [ "." ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostname = new BitSet(256);

static {
    hostname.set('.');
    hostname.or(toplabel);
}
1115
1116
/**
 * BitSet for host.
 * <p><blockquote><pre>
 *   host = hostname | IPv4address | IPv6reference
 * </pre></blockquote><p>
 */
protected static final BitSet host = new BitSet(256);

static {
    // IPv4address is not merged in separately: its characters (digits and
    // the dot) are already contained in hostname
    host.or(IPv6reference);
    host.or(hostname);
}
1130
1131
/**
 * BitSet for hostport: host characters, the colon and the port digits.
 * <p><blockquote><pre>
 *   hostport = host [ ":" port ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostport = new BitSet(256);

static {
    hostport.or(port);
    hostport.or(host);
    hostport.set(':');
}
1145
1146
/**
 * BitSet for userinfo.
 * <p><blockquote><pre>
 *   userinfo = *( unreserved | escaped |
 *                 ";" | ":" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet userinfo = new BitSet(256);

static {
    userinfo.or(unreserved);
    userinfo.or(escaped);
    final String extras = ";:&=+$,";
    for (int i = 0; i < extras.length(); i++) {
        userinfo.set(extras.charAt(i));
    }
}
1167
1168
/**
 * BitSet for the pieces inside the userinfo component, such as the user
 * name and the password: the userinfo set with ";", ":", "@", "?" and "/"
 * cleared.
 */
public static final BitSet within_userinfo = new BitSet(256);

static {
    // the union has to come first; the delimiters are removed afterwards
    within_userinfo.or(userinfo);
    final String excluded = ";:@?/";
    for (int i = 0; i < excluded.length(); i++) {
        within_userinfo.clear(excluded.charAt(i));
    }
}
1182
1183
/**
 * BitSet for server: userinfo characters, the at sign and the hostport
 * characters.
 * <p><blockquote><pre>
 *   server = [ [ userinfo "@" ] hostport ]
 * </pre></blockquote><p>
 */
protected static final BitSet server = new BitSet(256);

static {
    server.or(hostport);
    server.or(userinfo);
    server.set('@');
}
1197
1198
/**
 * BitSet for reg_name, a registry-based naming authority.
 * <p><blockquote><pre>
 *   reg_name = 1*( unreserved | escaped | "$" | "," |
 *                  ";" | ":" | "@" | "&amp;" | "=" | "+" )
 * </pre></blockquote><p>
 */
protected static final BitSet reg_name = new BitSet(256);

static {
    reg_name.or(unreserved);
    reg_name.or(escaped);
    final String extras = "$,;:@&=+";
    for (int i = 0; i < extras.length(); i++) {
        reg_name.set(extras.charAt(i));
    }
}
1220
1221
/**
 * BitSet for authority, the union of server and reg_name.
 * <p><blockquote><pre>
 *   authority = server | reg_name
 * </pre></blockquote><p>
 */
protected static final BitSet authority = new BitSet(256);

static {
    authority.or(reg_name);
    authority.or(server);
}
1234
1235
/**
 * BitSet of the characters accepted in a scheme name.
 * <p><blockquote><pre>
 * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 * </pre></blockquote><p>
 * The set only records which characters may occur; it does not express
 * the rule that the first character must be an alpha.
 */
protected static final BitSet scheme = new BitSet(256);

static {
    final String marks = "+-.";
    for (int i = 0; i < marks.length(); i++) {
        scheme.set(marks.charAt(i));
    }
    scheme.or(digit);
    scheme.or(alpha);
}
1251
1252
/**
 * BitSet of the characters accepted in the first segment of a relative
 * path.
 * <p><blockquote><pre>
 * rel_segment   = 1*( unreserved | escaped |
 *                     ";" | "@" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet rel_segment = new BitSet(256);

static {
    // Punctuation accepted on top of the unreserved and escaped sets.
    final String marks = ";@&=+$,";
    for (int i = 0; i < marks.length(); i++) {
        rel_segment.set(marks.charAt(i));
    }
    rel_segment.or(escaped);
    rel_segment.or(unreserved);
}
1273
1274
/**
 * BitSet of the characters accepted in a relative path.
 * <p><blockquote><pre>
 * rel_path      = rel_segment [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet rel_path = new BitSet(256);

static {
    // Union of the leading segment set and the absolute-path set.
    rel_path.or(abs_path);
    rel_path.or(rel_segment);
}
1287
1288
/**
 * BitSet of the characters accepted in a network path.
 * <p><blockquote><pre>
 * net_path      = "//" authority [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet net_path = new BitSet(256);

static {
    // Union of authority and abs_path, plus the '/' of the "//" prefix.
    net_path.or(abs_path);
    net_path.or(authority);
    net_path.set('/');
}
1302
1303
/**
 * BitSet of the characters accepted in the hierarchical part.
 * <p><blockquote><pre>
 * hier_part     = ( net_path | abs_path ) [ "?" query ]
 * </pre></blockquote><p>
 * The '?' separator is not set explicitly by this initializer.
 */
protected static final BitSet hier_part = new BitSet(256);

static {
    // Union of both path alternatives and the query set.
    hier_part.or(query);
    hier_part.or(abs_path);
    hier_part.or(net_path);
}
1318
1319
/**
 * BitSet of the characters accepted in a relative URI.
 * <p><blockquote><pre>
 * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 * </pre></blockquote><p>
 * The '?' separator is not set explicitly by this initializer.
 */
protected static final BitSet relativeURI = new BitSet(256);

static {
    // Union of the three path alternatives and the query set.
    relativeURI.or(query);
    relativeURI.or(rel_path);
    relativeURI.or(abs_path);
    relativeURI.or(net_path);
}
1335
1336
/**
 * BitSet of the characters accepted in an absolute URI.
 * <p><blockquote><pre>
 * absoluteURI   = scheme ":" ( hier_part | opaque_part )
 * </pre></blockquote><p>
 */
protected static final BitSet absoluteURI = new BitSet(256);

static {
    // Union of scheme, the ':' separator and both scheme-specific parts.
    absoluteURI.set(':');
    absoluteURI.or(opaque_part);
    absoluteURI.or(hier_part);
    absoluteURI.or(scheme);
}
1351
1352
/**
 * BitSet of the characters accepted in a URI reference.
 * <p><blockquote><pre>
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 */
protected static final BitSet URI_reference = new BitSet(256);

static {
    // Union of both URI forms, the '#' separator and the fragment set.
    URI_reference.set('#');
    URI_reference.or(fragment);
    URI_reference.or(relativeURI);
    URI_reference.or(absoluteURI);
}
1367
1368
1369
1370
/**
 * BitSet for control: the codes 0x00 through 0x1F and the code 0x7F.
 */
public static final BitSet control = new BitSet(256);

static {
    control.set(0x7F);
    // Mark every code from 0x1F down to 0x00.
    int code = 0x1F;
    while (code >= 0) {
        control.set(code);
        code--;
    }
}
1382
/**
 * BitSet for space: contains only the code 0x20.
 */
public static final BitSet space = new BitSet(256);

static {
    // ' ' is the character with code 0x20.
    space.set(' ');
}
1391
1392
/**
 * BitSet for delims: the characters '&lt;', '&gt;', '#', '%' and '"'.
 */
public static final BitSet delims = new BitSet(256);

static {
    final String marks = "<>#%\"";
    for (int i = 0; i < marks.length(); i++) {
        delims.set(marks.charAt(i));
    }
}
1405
1406
/**
 * BitSet for unwise: the characters RFC 2396 lists as unwise to use
 * unescaped in a URI.
 * <p><blockquote><pre>
 * unwise        = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
 * </pre></blockquote><p>
 */
public static final BitSet unwise = new BitSet(256);

static {
    unwise.set('{');
    unwise.set('}');
    unwise.set('|');
    // The backslash has to be written as an escape sequence inside a
    // character literal; a two-character literal does not compile.
    unwise.set('\\');
    unwise.set('^');
    unwise.set('[');
    unwise.set(']');
    unwise.set('`');
}
1422
1423
/**
 * Characters disallowed in a rel_path before escaping: every member of
 * uric that is not a member of {@link #rel_path}.
 */
public static final BitSet disallowed_rel_path = new BitSet(256);

static {
    // Set difference uric \ rel_path, computed bit by bit.
    final int limit = uric.length();
    for (int i = 0; i < limit; i++) {
        if (uric.get(i) && !rel_path.get(i)) {
            disallowed_rel_path.set(i);
        }
    }
}
1433
1434
/**
 * Characters disallowed in an opaque_part before escaping: every member
 * of uric that is not a member of opaque_part.
 */
public static final BitSet disallowed_opaque_part = new BitSet(256);

static {
    // Set difference uric \ opaque_part, computed bit by bit.
    final int limit = uric.length();
    for (int i = 0; i < limit; i++) {
        if (uric.get(i) && !opaque_part.get(i)) {
            disallowed_opaque_part.set(i);
        }
    }
}
1444
1445
1446
/**
 * Characters allowed for the authority component: the
 * {@link #authority} set without the percent sign.
 */
public static final BitSet allowed_authority = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_authority.or(authority);
    allowed_authority.clear(escapeMark);
}
1456
1457
/**
 * Characters allowed for the opaque_part: the opaque_part set without
 * the percent sign.
 */
public static final BitSet allowed_opaque_part = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_opaque_part.or(opaque_part);
    allowed_opaque_part.clear(escapeMark);
}
1467
1468
/**
 * Characters allowed for the reg_name: the {@link #reg_name} set
 * without the percent sign.
 */
public static final BitSet allowed_reg_name = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_reg_name.or(reg_name);
    allowed_reg_name.clear(escapeMark);
}
1479
1480
/**
 * Characters allowed for the userinfo component: the {@link #userinfo}
 * set without the percent sign.
 */
public static final BitSet allowed_userinfo = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_userinfo.or(userinfo);
    allowed_userinfo.clear(escapeMark);
}
1491
1492
/**
 * Characters allowed within the userinfo component: the
 * {@link #within_userinfo} set without the percent sign.
 */
public static final BitSet allowed_within_userinfo = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_within_userinfo.or(within_userinfo);
    allowed_within_userinfo.clear(escapeMark);
}
1502
1503
/**
 * Characters allowed for the IPv6reference component: the IPv6reference
 * set with the enclosing brackets '[' and ']' excluded.
 */
public static final BitSet allowed_IPv6reference = new BitSet(256);

static {
    allowed_IPv6reference.or(IPv6reference);
    // Drop both bracket characters.
    allowed_IPv6reference.clear(']');
    allowed_IPv6reference.clear('[');
}
1516
1517
/**
 * Characters allowed for the host component: the union of the hostname
 * set and {@link #allowed_IPv6reference} (which already excludes the
 * brackets '[' and ']').
 */
public static final BitSet allowed_host = new BitSet(256);

static {
    allowed_host.or(allowed_IPv6reference);
    allowed_host.or(hostname);
}
1528
1529
/**
 * Characters allowed within the authority component: the union of
 * {@link #server} and {@link #reg_name} with the characters
 * ';', ':', '@', '?' and '/' removed.
 */
public static final BitSet allowed_within_authority = new BitSet(256);

static {
    allowed_within_authority.or(reg_name);
    allowed_within_authority.or(server);
    // Characters removed after the union has been built.
    final String removed = ";:@?/";
    for (int i = 0; i < removed.length(); i++) {
        allowed_within_authority.clear(removed.charAt(i));
    }
}
1544
1545
/**
 * Characters allowed for the abs_path: the abs_path set with every
 * member of the percent set removed.
 */
public static final BitSet allowed_abs_path = new BitSet(256);

static {
    // Start from abs_path, then subtract the percent set.
    allowed_abs_path.or(abs_path);
    allowed_abs_path.andNot(percent);
}
1556
1557
/**
 * Characters allowed for the rel_path: the {@link #rel_path} set
 * without the percent sign.
 */
public static final BitSet allowed_rel_path = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_rel_path.or(rel_path);
    allowed_rel_path.clear(escapeMark);
}
1567
1568
/**
 * Characters allowed within the path: the abs_path set with the
 * characters '/', ';', '=' and '?' removed.
 */
public static final BitSet allowed_within_path = new BitSet(256);

static {
    allowed_within_path.or(abs_path);
    final String removed = "/;=?";
    for (int i = 0; i < removed.length(); i++) {
        allowed_within_path.clear(removed.charAt(i));
    }
}
1581
1582
/**
 * Characters allowed for the query component: the uric set without the
 * percent sign.
 */
public static final BitSet allowed_query = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_query.or(uric);
    allowed_query.clear(escapeMark);
}
1592
1593
/**
 * Characters allowed within the query component: the
 * {@link #allowed_query} set with every member of the reserved set
 * removed.
 */
public static final BitSet allowed_within_query = new BitSet(256);

static {
    // Start from allowed_query, then subtract the reserved set.
    allowed_within_query.or(allowed_query);
    allowed_within_query.andNot(reserved);
}
1603
1604
/**
 * Characters allowed for the fragment component: the uric set without
 * the percent sign.
 */
public static final BitSet allowed_fragment = new BitSet(256);

static {
    final char escapeMark = '%';
    allowed_fragment.or(uric);
    allowed_fragment.clear(escapeMark);
}
1614
1615
1616
1617
1618
1619
1620
1621 protected boolean _is_hier_part;
1622 protected boolean _is_opaque_part;
1623
1624
1625 protected boolean _is_net_path;
1626 protected boolean _is_abs_path;
1627 protected boolean _is_rel_path;
1628
1629
1630 protected boolean _is_reg_name;
1631 protected boolean _is_server;
1632
1633
1634 protected boolean _is_hostname;
1635 protected boolean _is_IPv4address;
1636 protected boolean _is_IPv6reference;
1637
1638
1639
1640 /***
1641 * Encodes URI string.
1642 *
1643 * This is a two mapping, one from original characters to octets, and
1644 * subsequently a second from octets to URI characters:
1645 * <p><blockquote><pre>
1646 * original character sequence->octet sequence->URI character sequence
1647 * </pre></blockquote><p>
1648 *
1649 * An escaped octet is encoded as a character triplet, consisting of the
1650 * percent character "%" followed by the two hexadecimal digits
1651 * representing the octet code. For example, "%20" is the escaped
1652 * encoding for the US-ASCII space character.
1653 * <p>
1654 * Conversion from the local filesystem character set to UTF-8 will
1655 * normally involve a two step process. First convert the local character
1656 * set to the UCS; then convert the UCS to UTF-8.
1657 * The first step in the process can be performed by maintaining a mapping
1658 * table that includes the local character set code and the corresponding
1659 * UCS code.
1660 * The next step is to convert the UCS character code to the UTF-8 encoding.
1661 * <p>
1662 * Mapping between vendor codepages can be done in a very similar manner
1663 * as described above.
1664 * <p>
1665 * The only time escape encodings can allowedly be made is when a URI is
1666 * being created from its component parts. The escape and validate methods
1667 * are internally performed within this method.
1668 *
1669 * @param original the original character sequence
1670 * @param allowed those characters that are allowed within a component
1671 * @param charset the protocol charset
1672 * @return URI character sequence
1673 * @throws URIException null component or unsupported character encoding
1674 */
1675
1676 protected static char[] encode(String original, BitSet allowed,
1677 String charset) throws URIException {
1678 if (original == null) {
1679 throw new IllegalArgumentException("Original string may not be null");
1680 }
1681 if (allowed == null) {
1682 throw new IllegalArgumentException("Allowed bitset may not be null");
1683 }
1684 byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset));
1685 return EncodingUtil.getAsciiString(rawdata).toCharArray();
1686 }
1687
1688 /***
1689 * Decodes URI encoded string.
1690 *
1691 * This is a two mapping, one from URI characters to octets, and
1692 * subsequently a second from octets to original characters:
1693 * <p><blockquote><pre>
1694 * URI character sequence->octet sequence->original character sequence
1695 * </pre></blockquote><p>
1696 *
1697 * A URI must be separated into its components before the escaped
1698 * characters within those components can be allowedly decoded.
1699 * <p>
1700 * Notice that there is a chance that URI characters that are non UTF-8
1701 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1702 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1703 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1704 * false reading.
1705 * <p>
1706 * The percent "%" character always has the reserved purpose of being
1707 * the escape indicator, it must be escaped as "%25" in order to be used
1708 * as data within a URI.
1709 * <p>
1710 * The unescape method is internally performed within this method.
1711 *
1712 * @param component the URI character sequence
1713 * @param charset the protocol charset
1714 * @return original character sequence
1715 * @throws URIException incomplete trailing escape pattern or unsupported
1716 * character encoding
1717 */
1718 protected static String decode(char[] component, String charset)
1719 throws URIException {
1720 if (component == null) {
1721 throw new IllegalArgumentException("Component array of chars may not be null");
1722 }
1723 return decode(new String(component), charset);
1724 }
1725
1726 /***
1727 * Decodes URI encoded string.
1728 *
1729 * This is a two mapping, one from URI characters to octets, and
1730 * subsequently a second from octets to original characters:
1731 * <p><blockquote><pre>
1732 * URI character sequence->octet sequence->original character sequence
1733 * </pre></blockquote><p>
1734 *
1735 * A URI must be separated into its components before the escaped
1736 * characters within those components can be allowedly decoded.
1737 * <p>
1738 * Notice that there is a chance that URI characters that are non UTF-8
1739 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1740 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1741 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1742 * false reading.
1743 * <p>
1744 * The percent "%" character always has the reserved purpose of being
1745 * the escape indicator, it must be escaped as "%25" in order to be used
1746 * as data within a URI.
1747 * <p>
1748 * The unescape method is internally performed within this method.
1749 *
1750 * @param component the URI character sequence
1751 * @param charset the protocol charset
1752 * @return original character sequence
1753 * @throws URIException incomplete trailing escape pattern or unsupported
1754 * character encoding
1755 *
1756 * @since 3.0
1757 */
1758 protected static String decode(String component, String charset)
1759 throws URIException {
1760 if (component == null) {
1761 throw new IllegalArgumentException("Component array of chars may not be null");
1762 }
1763 byte[] rawdata = null;
1764 try {
1765 rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component));
1766 } catch (DecoderException e) {
1767 throw new URIException(e.getMessage());
1768 }
1769 return EncodingUtil.getString(rawdata, charset);
1770 }
1771 /***
1772 * Pre-validate the unescaped URI string within a specific component.
1773 *
1774 * @param component the component string within the component
1775 * @param disallowed those characters disallowed within the component
1776 * @return if true, it doesn't have the disallowed characters
1777 * if false, the component is undefined or an incorrect one
1778 */
1779 protected boolean prevalidate(String component, BitSet disallowed) {
1780
1781 if (component == null) {
1782 return false;
1783 }
1784 char[] target = component.toCharArray();
1785 for (int i = 0; i < target.length; i++) {
1786 if (disallowed.get(target[i])) {
1787 return false;
1788 }
1789 }
1790 return true;
1791 }
1792
1793
1794 /***
1795 * Validate the URI characters within a specific component.
1796 * The component must be performed after escape encoding. Or it doesn't
1797 * include escaped characters.
1798 *
1799 * @param component the characters sequence within the component
1800 * @param generous those characters that are allowed within a component
1801 * @return if true, it's the correct URI character sequence
1802 */
1803 protected boolean validate(char[] component, BitSet generous) {
1804
1805 return validate(component, 0, -1, generous);
1806 }
1807
1808
1809 /***
1810 * Validate the URI characters within a specific component.
1811 * The component must be performed after escape encoding. Or it doesn't
1812 * include escaped characters.
1813 * <p>
1814 * It's not that much strict, generous. The strict validation might be
1815 * performed before being called this method.
1816 *
1817 * @param component the characters sequence within the component
1818 * @param soffset the starting offset of the given component
1819 * @param eoffset the ending offset of the given component
1820 * if -1, it means the length of the component
1821 * @param generous those characters that are allowed within a component
1822 * @return if true, it's the correct URI character sequence
1823 */
1824 protected boolean validate(char[] component, int soffset, int eoffset,
1825 BitSet generous) {
1826
1827 if (eoffset == -1) {
1828 eoffset = component.length - 1;
1829 }
1830 for (int i = soffset; i <= eoffset; i++) {
1831 if (!generous.get(component[i])) {
1832 return false;
1833 }
1834 }
1835 return true;
1836 }
1837
1838
1839 /***
1840 * In order to avoid any possilbity of conflict with non-ASCII characters,
1841 * Parse a URI reference as a <code>String</code> with the character
1842 * encoding of the local system or the document.
1843 * <p>
1844 * The following line is the regular expression for breaking-down a URI
1845 * reference into its components.
1846 * <p><blockquote><pre>
1847 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1848 * 12 3 4 5 6 7 8 9
1849 * </pre></blockquote><p>
1850 * For example, matching the above expression to
1851 * http://jakarta.apache.org/ietf/uri/#Related
1852 * results in the following subexpression matches:
1853 * <p><blockquote><pre>
1854 * $1 = http:
1855 * scheme = $2 = http
1856 * $3 = //jakarta.apache.org
1857 * authority = $4 = jakarta.apache.org
1858 * path = $5 = /ietf/uri/
1859 * $6 = <undefined>
1860 * query = $7 = <undefined>
1861 * $8 = #Related
1862 * fragment = $9 = Related
1863 * </pre></blockquote><p>
1864 *
1865 * @param original the original character sequence
1866 * @param escaped <code>true</code> if <code>original</code> is escaped
1867 * @throws URIException If an error occurs.
1868 */
1869 protected void parseUriReference(String original, boolean escaped)
1870 throws URIException {
1871
1872
1873 if (original == null) {
1874 throw new URIException("URI-Reference required");
1875 }
1876
1877
1878
1879
1880 String tmp = original.trim();
1881
1882
1883
1884
1885
1886 int length = tmp.length();
1887
1888
1889
1890
1891 if (length > 0) {
1892 char[] firstDelimiter = { tmp.charAt(0) };
1893 if (validate(firstDelimiter, delims)) {
1894 if (length >= 2) {
1895 char[] lastDelimiter = { tmp.charAt(length - 1) };
1896 if (validate(lastDelimiter, delims)) {
1897 tmp = tmp.substring(1, length - 1);
1898 length = length - 2;
1899 }
1900 }
1901 }
1902 }
1903
1904
1905
1906
1907 int from = 0;
1908
1909
1910
1911
1912 boolean isStartedFromPath = false;
1913 int atColon = tmp.indexOf(':');
1914 int atSlash = tmp.indexOf('/');
1915 if (atColon <= 0 || (atSlash >= 0 && atSlash < atColon)) {
1916 isStartedFromPath = true;
1917 }
1918
1919
1920
1921
1922
1923
1924
1925 int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
1926 if (at == -1) {
1927 at = 0;
1928 }
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938 if (at > 0 && at < length && tmp.charAt(at) == ':') {
1939 char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
1940 if (validate(target, scheme)) {
1941 _scheme = target;
1942 } else {
1943 throw new URIException("incorrect scheme");
1944 }
1945 from = ++at;
1946 }
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957 _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
1958 if (0 <= at && at < length && tmp.charAt(at) == '/') {
1959
1960 _is_hier_part = true;
1961 if (at + 2 < length && tmp.charAt(at + 1) == '/') {
1962
1963 int next = indexFirstOf(tmp, "/?#", at + 2);
1964 if (next == -1) {
1965 next = (tmp.substring(at + 2).length() == 0) ? at + 2
1966 : tmp.length();
1967 }
1968 parseAuthority(tmp.substring(at + 2, next), escaped);
1969 from = at = next;
1970
1971 _is_net_path = true;
1972 }
1973 if (from == at) {
1974
1975 _is_abs_path = true;
1976 }
1977 }
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987 if (from < length) {
1988
1989 int next = indexFirstOf(tmp, "?#", from);
1990 if (next == -1) {
1991 next = tmp.length();
1992 }
1993 if (!_is_abs_path) {
1994 if (!escaped
1995 && prevalidate(tmp.substring(from, next), disallowed_rel_path)
1996 || escaped
1997 && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
1998
1999 _is_rel_path = true;
2000 } else if (!escaped
2001 && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
2002 || escaped
2003 && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
2004
2005 _is_opaque_part = true;
2006 } else {
2007
2008 _path = null;
2009 }
2010 }
2011 if (escaped) {
2012 setRawPath(tmp.substring(from, next).toCharArray());
2013 } else {
2014 setPath(tmp.substring(from, next));
2015 }
2016 at = next;
2017 }
2018
2019
2020 String charset = getProtocolCharset();
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030 if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
2031 int next = tmp.indexOf('#', at + 1);
2032 if (next == -1) {
2033 next = tmp.length();
2034 }
2035 _query = (escaped) ? tmp.substring(at + 1, next).toCharArray()
2036 : encode(tmp.substring(at + 1, next), allowed_query, charset);
2037 at = next;
2038 }
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048 if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
2049 if (at + 1 == length) {
2050 _fragment = "".toCharArray();
2051 } else {
2052 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
2053 : encode(tmp.substring(at + 1), allowed_fragment, charset);
2054 }
2055 }
2056
2057
2058 setURI();
2059 }
2060
2061
2062 /***
2063 * Get the earlier index that to be searched for the first occurrance in
2064 * one of any of the given string.
2065 *
2066 * @param s the string to be indexed
2067 * @param delims the delimiters used to index
2068 * @return the earlier index if there are delimiters
2069 */
2070 protected int indexFirstOf(String s, String delims) {
2071 return indexFirstOf(s, delims, -1);
2072 }
2073
2074
2075 /***
2076 * Get the earlier index that to be searched for the first occurrance in
2077 * one of any of the given string.
2078 *
2079 * @param s the string to be indexed
2080 * @param delims the delimiters used to index
2081 * @param offset the from index
2082 * @return the earlier index if there are delimiters
2083 */
2084 protected int indexFirstOf(String s, String delims, int offset) {
2085 if (s == null || s.length() == 0) {
2086 return -1;
2087 }
2088 if (delims == null || delims.length() == 0) {
2089 return -1;
2090 }
2091
2092 if (offset < 0) {
2093 offset = 0;
2094 } else if (offset > s.length()) {
2095 return -1;
2096 }
2097
2098 int min = s.length();
2099 char[] delim = delims.toCharArray();
2100 for (int i = 0; i < delim.length; i++) {
2101 int at = s.indexOf(delim[i], offset);
2102 if (at >= 0 && at < min) {
2103 min = at;
2104 }
2105 }
2106 return (min == s.length()) ? -1 : min;
2107 }
2108
2109
2110 /***
2111 * Get the earlier index that to be searched for the first occurrance in
2112 * one of any of the given array.
2113 *
2114 * @param s the character array to be indexed
2115 * @param delim the delimiter used to index
2116 * @return the ealier index if there are a delimiter
2117 */
2118 protected int indexFirstOf(char[] s, char delim) {
2119 return indexFirstOf(s, delim, 0);
2120 }
2121
2122
2123 /***
2124 * Get the earlier index that to be searched for the first occurrance in
2125 * one of any of the given array.
2126 *
2127 * @param s the character array to be indexed
2128 * @param delim the delimiter used to index
2129 * @param offset The offset.
2130 * @return the ealier index if there is a delimiter
2131 */
2132 protected int indexFirstOf(char[] s, char delim, int offset) {
2133 if (s == null || s.length == 0) {
2134 return -1;
2135 }
2136
2137 if (offset < 0) {
2138 offset = 0;
2139 } else if (offset > s.length) {
2140 return -1;
2141 }
2142 for (int i = offset; i < s.length; i++) {
2143 if (s[i] == delim) {
2144 return i;
2145 }
2146 }
2147 return -1;
2148 }
2149
2150
2151 /***
2152 * Parse the authority component.
2153 *
2154 * @param original the original character sequence of authority component
2155 * @param escaped <code>true</code> if <code>original</code> is escaped
2156 * @throws URIException If an error occurs.
2157 */
2158 protected void parseAuthority(String original, boolean escaped)
2159 throws URIException {
2160
2161
2162 _is_reg_name = _is_server =
2163 _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2164
2165
2166 String charset = getProtocolCharset();
2167
2168 boolean hasPort = true;
2169 int from = 0;
2170 int next = original.indexOf('@');
2171 if (next != -1) {
2172
2173 _userinfo = (escaped) ? original.substring(0, next).toCharArray()
2174 : encode(original.substring(0, next), allowed_userinfo,
2175 charset);
2176 from = next + 1;
2177 }
2178 next = original.indexOf('[', from);
2179 if (next >= from) {
2180 next = original.indexOf(']', from);
2181 if (next == -1) {
2182 throw new URIException(URIException.PARSING, "IPv6reference");
2183 } else {
2184 next++;
2185 }
2186
2187 _host = (escaped) ? original.substring(from, next).toCharArray()
2188 : encode(original.substring(from, next), allowed_IPv6reference,
2189 charset);
2190
2191 _is_IPv6reference = true;
2192 } else {
2193 next = original.indexOf(':', from);
2194 if (next == -1) {
2195 next = original.length();
2196 hasPort = false;
2197 }
2198
2199 _host = original.substring(from, next).toCharArray();
2200 if (validate(_host, IPv4address)) {
2201
2202 _is_IPv4address = true;
2203 } else if (validate(_host, hostname)) {
2204
2205 _is_hostname = true;
2206 } else {
2207
2208 _is_reg_name = true;
2209 }
2210 }
2211 if (_is_reg_name) {
2212
2213 _is_server = _is_hostname = _is_IPv4address =
2214 _is_IPv6reference = false;
2215
2216 _authority = (escaped) ? original.toString().toCharArray()
2217 : encode(original.toString(), allowed_reg_name, charset);
2218 } else {
2219 if (original.length() - 1 > next && hasPort
2220 && original.charAt(next) == ':') {
2221 from = next + 1;
2222 try {
2223 _port = Integer.parseInt(original.substring(from));
2224 } catch (NumberFormatException error) {
2225 throw new URIException(URIException.PARSING,
2226 "invalid port number");
2227 }
2228 }
2229
2230 StringBuffer buf = new StringBuffer();
2231 if (_userinfo != null) {
2232 buf.append(_userinfo);
2233 buf.append('@');
2234 }
2235 if (_host != null) {
2236 buf.append(_host);
2237 if (_port != -1) {
2238 buf.append(':');
2239 buf.append(_port);
2240 }
2241 }
2242 _authority = buf.toString().toCharArray();
2243
2244 _is_server = true;
2245 }
2246 }
2247
2248
2249 /***
2250 * Once it's parsed successfully, set this URI.
2251 *
2252 * @see #getRawURI
2253 */
2254 protected void setURI() {
2255
2256 StringBuffer buf = new StringBuffer();
2257
2258 if (_scheme != null) {
2259 buf.append(_scheme);
2260 buf.append(':');
2261 }
2262 if (_is_net_path) {
2263 buf.append("//");
2264 if (_authority != null) {
2265 buf.append(_authority);
2266 }
2267 }
2268 if (_opaque != null && _is_opaque_part) {
2269 buf.append(_opaque);
2270 } else if (_path != null) {
2271
2272 if (_path.length != 0) {
2273 buf.append(_path);
2274 }
2275 }
2276 if (_query != null) {
2277 buf.append('?');
2278 buf.append(_query);
2279 }
2280
2281 _uri = buf.toString().toCharArray();
2282 hash = 0;
2283 }
2284
2285
2286
2287
2288 /***
2289 * Tell whether or not this URI is absolute.
2290 *
2291 * @return true iif this URI is absoluteURI
2292 */
2293 public boolean isAbsoluteURI() {
2294 return (_scheme != null);
2295 }
2296
2297
2298 /***
2299 * Tell whether or not this URI is relative.
2300 *
2301 * @return true iif this URI is relativeURI
2302 */
2303 public boolean isRelativeURI() {
2304 return (_scheme == null);
2305 }
2306
2307
2308 /***
2309 * Tell whether or not the absoluteURI of this URI is hier_part.
2310 *
2311 * @return true iif the absoluteURI is hier_part
2312 */
2313 public boolean isHierPart() {
2314 return _is_hier_part;
2315 }
2316
2317
2318 /***
2319 * Tell whether or not the absoluteURI of this URI is opaque_part.
2320 *
2321 * @return true iif the absoluteURI is opaque_part
2322 */
2323 public boolean isOpaquePart() {
2324 return _is_opaque_part;
2325 }
2326
2327
2328 /***
2329 * Tell whether or not the relativeURI or heir_part of this URI is net_path.
2330 * It's the same function as the has_authority() method.
2331 *
2332 * @return true iif the relativeURI or heir_part is net_path
2333 * @see #hasAuthority
2334 */
2335 public boolean isNetPath() {
2336 return _is_net_path || (_authority != null);
2337 }
2338
2339
2340 /***
2341 * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
2342 *
2343 * @return true iif the relativeURI or hier_part is abs_path
2344 */
2345 public boolean isAbsPath() {
2346 return _is_abs_path;
2347 }
2348
2349
2350 /***
2351 * Tell whether or not the relativeURI of this URI is rel_path.
2352 *
2353 * @return true iif the relativeURI is rel_path
2354 */
2355 public boolean isRelPath() {
2356 return _is_rel_path;
2357 }
2358
2359
2360 /***
2361 * Tell whether or not this URI has authority.
2362 * It's the same function as the is_net_path() method.
2363 *
2364 * @return true iif this URI has authority
2365 * @see #isNetPath
2366 */
2367 public boolean hasAuthority() {
2368 return (_authority != null) || _is_net_path;
2369 }
2370
2371 /***
2372 * Tell whether or not the authority component of this URI is reg_name.
2373 *
2374 * @return true iif the authority component is reg_name
2375 */
2376 public boolean isRegName() {
2377 return _is_reg_name;
2378 }
2379
2380
2381 /***
2382 * Tell whether or not the authority component of this URI is server.
2383 *
2384 * @return true iif the authority component is server
2385 */
2386 public boolean isServer() {
2387 return _is_server;
2388 }
2389
2390
2391 /***
2392 * Tell whether or not this URI has userinfo.
2393 *
2394 * @return true iif this URI has userinfo
2395 */
2396 public boolean hasUserinfo() {
2397 return (_userinfo != null);
2398 }
2399
2400
2401 /***
2402 * Tell whether or not the host part of this URI is hostname.
2403 *
2404 * @return true iif the host part is hostname
2405 */
2406 public boolean isHostname() {
2407 return _is_hostname;
2408 }
2409
2410
2411 /***
2412 * Tell whether or not the host part of this URI is IPv4address.
2413 *
2414 * @return true iif the host part is IPv4address
2415 */
2416 public boolean isIPv4address() {
2417 return _is_IPv4address;
2418 }
2419
2420
2421 /***
2422 * Tell whether or not the host part of this URI is IPv6reference.
2423 *
2424 * @return true iif the host part is IPv6reference
2425 */
2426 public boolean isIPv6reference() {
2427 return _is_IPv6reference;
2428 }
2429
2430
2431 /***
2432 * Tell whether or not this URI has query.
2433 *
2434 * @return true iif this URI has query
2435 */
2436 public boolean hasQuery() {
2437 return (_query != null);
2438 }
2439
2440
2441 /***
2442 * Tell whether or not this URI has fragment.
2443 *
2444 * @return true iif this URI has fragment
2445 */
2446 public boolean hasFragment() {
2447 return (_fragment != null);
2448 }
2449
2450
2451
2452
2453
2454 /***
2455 * Set the default charset of the protocol.
2456 * <p>
2457 * The character set used to store files SHALL remain a local decision and
2458 * MAY depend on the capability of local operating systems. Prior to the
2459 * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
2460 * and UTF-8 encoded. This approach, while allowing international exchange
2461 * of URIs, will still allow backward compatibility with older systems
2462 * because the code set positions for ASCII characters are identical to the
2463 * one byte sequence in UTF-8.
2464 * <p>
2465 * An individual URI scheme may require a single charset, define a default
2466 * charset, or provide a way to indicate the charset used.
2467 *
2468 * <p>
2469 * Always all the time, the setter method is always succeeded and throws
2470 * <code>DefaultCharsetChanged</code> exception.
2471 *
2472 * So API programmer must follow the following way:
2473 * <code><pre>
2474 * import org.apache.util.URI$DefaultCharsetChanged;
2475 * .
2476 * .
2477 * .
2478 * try {
2479 * URI.setDefaultProtocolCharset("UTF-8");
2480 * } catch (DefaultCharsetChanged cc) {
2481 * // CASE 1: the exception could be ignored, when it is set by user
2482 * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
2483 * // CASE 2: let user know the default protocol charset changed
2484 * } else {
2485 * // CASE 2: let user know the default document charset changed
2486 * }
2487 * }
2488 * </pre></code>
2489 *
2490 * The API programmer is responsible to set the correct charset.
2491 * And each application should remember its own charset to support.
2492 *
2493 * @param charset the default charset for each protocol
2494 * @throws DefaultCharsetChanged default charset changed
2495 */
2496 public static void setDefaultProtocolCharset(String charset)
2497 throws DefaultCharsetChanged {
2498
2499 defaultProtocolCharset = charset;
2500 throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET,
2501 "the default protocol charset changed");
2502 }
2503
2504
2505 /***
2506 * Get the default charset of the protocol.
2507 * <p>
2508 * An individual URI scheme may require a single charset, define a default
2509 * charset, or provide a way to indicate the charset used.
2510 * <p>
2511 * To work globally either requires support of a number of character sets
2512 * and to be able to convert between them, or the use of a single preferred
2513 * character set.
2514 * For support of global compatibility it is STRONGLY RECOMMENDED that
2515 * clients and servers use UTF-8 encoding when exchanging URIs.
2516 *
2517 * @return the default charset string
2518 */
2519 public static String getDefaultProtocolCharset() {
2520 return defaultProtocolCharset;
2521 }
2522
2523
2524 /***
2525 * Get the protocol charset used by this current URI instance.
2526 * It was set by the constructor for this instance. If it was not set by
2527 * contructor, it will return the default protocol charset.
2528 *
2529 * @return the protocol charset string
2530 * @see #getDefaultProtocolCharset
2531 */
2532 public String getProtocolCharset() {
2533 return (protocolCharset != null)
2534 ? protocolCharset
2535 : defaultProtocolCharset;
2536 }
2537
2538
2539 /***
2540 * Set the default charset of the document.
2541 * <p>
2542 * Notice that it will be possible to contain mixed characters (e.g.
2543 * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
2544 * display of these character sets, the protocol charset could be simply
2545 * used again. Because it's not yet implemented that the insertion of BIDI
2546 * control characters at different points during composition is extracted.
2547 * <p>
2548 *
2549 * Always all the time, the setter method is always succeeded and throws
2550 * <code>DefaultCharsetChanged</code> exception.
2551 *
2552 * So API programmer must follow the following way:
2553 * <code><pre>
2554 * import org.apache.util.URI$DefaultCharsetChanged;
2555 * .
2556 * .
2557 * .
2558 * try {
2559 * URI.setDefaultDocumentCharset("EUC-KR");
2560 * } catch (DefaultCharsetChanged cc) {
2561 * // CASE 1: the exception could be ignored, when it is set by user
2562 * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
2563 * // CASE 2: let user know the default document charset changed
2564 * } else {
2565 * // CASE 2: let user know the default protocol charset changed
2566 * }
2567 * }
2568 * </pre></code>
2569 *
2570 * The API programmer is responsible to set the correct charset.
2571 * And each application should remember its own charset to support.
2572 *
2573 * @param charset the default charset for the document
2574 * @throws DefaultCharsetChanged default charset changed
2575 */
2576 public static void setDefaultDocumentCharset(String charset)
2577 throws DefaultCharsetChanged {
2578
2579 defaultDocumentCharset = charset;
2580 throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET,
2581 "the default document charset changed");
2582 }
2583
2584
2585 /***
2586 * Get the recommended default charset of the document.
2587 *
2588 * @return the default charset string
2589 */
2590 public static String getDefaultDocumentCharset() {
2591 return defaultDocumentCharset;
2592 }
2593
2594
2595 /***
2596 * Get the default charset of the document by locale.
2597 *
2598 * @return the default charset string by locale
2599 */
2600 public static String getDefaultDocumentCharsetByLocale() {
2601 return defaultDocumentCharsetByLocale;
2602 }
2603
2604
2605 /***
2606 * Get the default charset of the document by platform.
2607 *
2608 * @return the default charset string by platform
2609 */
2610 public static String getDefaultDocumentCharsetByPlatform() {
2611 return defaultDocumentCharsetByPlatform;
2612 }
2613
2614
2615
2616 /***
2617 * Get the scheme.
2618 *
2619 * @return the scheme
2620 */
2621 public char[] getRawScheme() {
2622 return _scheme;
2623 }
2624
2625
2626 /***
2627 * Get the scheme.
2628 *
2629 * @return the scheme
2630 * null if undefined scheme
2631 */
2632 public String getScheme() {
2633 return (_scheme == null) ? null : new String(_scheme);
2634 }
2635
2636
2637
2638 /***
2639 * Set the authority. It can be one type of server, hostport, hostname,
2640 * IPv4address, IPv6reference and reg_name.
2641 * <p><blockquote><pre>
2642 * authority = server | reg_name
2643 * </pre></blockquote><p>
2644 *
2645 * @param escapedAuthority the raw escaped authority
2646 * @throws URIException If {@link
2647 * #parseAuthority(java.lang.String,boolean)} fails
2648 * @throws NullPointerException null authority
2649 */
2650 public void setRawAuthority(char[] escapedAuthority)
2651 throws URIException, NullPointerException {
2652
2653 parseAuthority(new String(escapedAuthority), true);
2654 setURI();
2655 }
2656
2657
2658 /***
2659 * Set the authority. It can be one type of server, hostport, hostname,
2660 * IPv4address, IPv6reference and reg_name.
2661 * Note that there is no setAuthority method by the escape encoding reason.
2662 *
2663 * @param escapedAuthority the escaped authority string
2664 * @throws URIException If {@link
2665 * #parseAuthority(java.lang.String,boolean)} fails
2666 */
2667 public void setEscapedAuthority(String escapedAuthority)
2668 throws URIException {
2669
2670 parseAuthority(escapedAuthority, true);
2671 setURI();
2672 }
2673
2674
2675 /***
2676 * Get the raw-escaped authority.
2677 *
2678 * @return the raw-escaped authority
2679 */
2680 public char[] getRawAuthority() {
2681 return _authority;
2682 }
2683
2684
2685 /***
2686 * Get the escaped authority.
2687 *
2688 * @return the escaped authority
2689 */
2690 public String getEscapedAuthority() {
2691 return (_authority == null) ? null : new String(_authority);
2692 }
2693
2694
2695 /***
2696 * Get the authority.
2697 *
2698 * @return the authority
2699 * @throws URIException If {@link #decode} fails
2700 */
2701 public String getAuthority() throws URIException {
2702 return (_authority == null) ? null : decode(_authority,
2703 getProtocolCharset());
2704 }
2705
2706
2707
2708 /***
2709 * Get the raw-escaped userinfo.
2710 *
2711 * @return the raw-escaped userinfo
2712 * @see #getAuthority
2713 */
2714 public char[] getRawUserinfo() {
2715 return _userinfo;
2716 }
2717
2718
2719 /***
2720 * Get the escaped userinfo.
2721 *
2722 * @return the escaped userinfo
2723 * @see #getAuthority
2724 */
2725 public String getEscapedUserinfo() {
2726 return (_userinfo == null) ? null : new String(_userinfo);
2727 }
2728
2729
2730 /***
2731 * Get the userinfo.
2732 *
2733 * @return the userinfo
2734 * @throws URIException If {@link #decode} fails
2735 * @see #getAuthority
2736 */
2737 public String getUserinfo() throws URIException {
2738 return (_userinfo == null) ? null : decode(_userinfo,
2739 getProtocolCharset());
2740 }
2741
2742
2743
2744 /***
2745 * Get the host.
2746 * <p><blockquote><pre>
2747 * host = hostname | IPv4address | IPv6reference
2748 * </pre></blockquote><p>
2749 *
2750 * @return the host
2751 * @see #getAuthority
2752 */
2753 public char[] getRawHost() {
2754 return _host;
2755 }
2756
2757
2758 /***
2759 * Get the host.
2760 * <p><blockquote><pre>
2761 * host = hostname | IPv4address | IPv6reference
2762 * </pre></blockquote><p>
2763 *
2764 * @return the host
2765 * @throws URIException If {@link #decode} fails
2766 * @see #getAuthority
2767 */
2768 public String getHost() throws URIException {
2769 if (_host != null) {
2770 return decode(_host, getProtocolCharset());
2771 } else {
2772 return null;
2773 }
2774 }
2775
2776
2777
2778 /***
2779 * Get the port. In order to get the specfic default port, the specific
2780 * protocol-supported class extended from the URI class should be used.
2781 * It has the server-based naming authority.
2782 *
2783 * @return the port
2784 * if -1, it has the default port for the scheme or the server-based
2785 * naming authority is not supported in the specific URI.
2786 */
2787 public int getPort() {
2788 return _port;
2789 }
2790
2791
2792
2793 /***
2794 * Set the raw-escaped path.
2795 *
2796 * @param escapedPath the path character sequence
2797 * @throws URIException encoding error or not proper for initial instance
2798 * @see #encode
2799 */
2800 public void setRawPath(char[] escapedPath) throws URIException {
2801 if (escapedPath == null || escapedPath.length == 0) {
2802 _path = _opaque = escapedPath;
2803 setURI();
2804 return;
2805 }
2806
2807 escapedPath = removeFragmentIdentifier(escapedPath);
2808 if (_is_net_path || _is_abs_path) {
2809 if (escapedPath[0] != '/') {
2810 throw new URIException(URIException.PARSING,
2811 "not absolute path");
2812 }
2813 if (!validate(escapedPath, abs_path)) {
2814 throw new URIException(URIException.ESCAPING,
2815 "escaped absolute path not valid");
2816 }
2817 _path = escapedPath;
2818 } else if (_is_rel_path) {
2819 int at = indexFirstOf(escapedPath, '/');
2820 if (at == 0) {
2821 throw new URIException(URIException.PARSING, "incorrect path");
2822 }
2823 if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment)
2824 && !validate(escapedPath, at, -1, abs_path)
2825 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) {
2826
2827 throw new URIException(URIException.ESCAPING,
2828 "escaped relative path not valid");
2829 }
2830 _path = escapedPath;
2831 } else if (_is_opaque_part) {
2832 if (!uric_no_slash.get(escapedPath[0])
2833 && !validate(escapedPath, 1, -1, uric)) {
2834 throw new URIException(URIException.ESCAPING,
2835 "escaped opaque part not valid");
2836 }
2837 _opaque = escapedPath;
2838 } else {
2839 throw new URIException(URIException.PARSING, "incorrect path");
2840 }
2841 setURI();
2842 }
2843
2844
2845 /***
2846 * Set the escaped path.
2847 *
2848 * @param escapedPath the escaped path string
2849 * @throws URIException encoding error or not proper for initial instance
2850 * @see #encode
2851 */
2852 public void setEscapedPath(String escapedPath) throws URIException {
2853 if (escapedPath == null) {
2854 _path = _opaque = null;
2855 setURI();
2856 return;
2857 }
2858 setRawPath(escapedPath.toCharArray());
2859 }
2860
2861
2862 /***
2863 * Set the path.
2864 *
2865 * @param path the path string
2866 * @throws URIException set incorrectly or fragment only
2867 * @see #encode
2868 */
2869 public void setPath(String path) throws URIException {
2870
2871 if (path == null || path.length() == 0) {
2872 _path = _opaque = (path == null) ? null : path.toCharArray();
2873 setURI();
2874 return;
2875 }
2876
2877 String charset = getProtocolCharset();
2878
2879 if (_is_net_path || _is_abs_path) {
2880 _path = encode(path, allowed_abs_path, charset);
2881 } else if (_is_rel_path) {
2882 StringBuffer buff = new StringBuffer(path.length());
2883 int at = path.indexOf('/');
2884 if (at == 0) {
2885 throw new URIException(URIException.PARSING,
2886 "incorrect relative path");
2887 }
2888 if (at > 0) {
2889 buff.append(encode(path.substring(0, at), allowed_rel_path,
2890 charset));
2891 buff.append(encode(path.substring(at), allowed_abs_path,
2892 charset));
2893 } else {
2894 buff.append(encode(path, allowed_rel_path, charset));
2895 }
2896 _path = buff.toString().toCharArray();
2897 } else if (_is_opaque_part) {
2898 StringBuffer buf = new StringBuffer();
2899 buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset));
2900 buf.insert(1, encode(path.substring(1), uric, charset));
2901 _opaque = buf.toString().toCharArray();
2902 } else {
2903 throw new URIException(URIException.PARSING, "incorrect path");
2904 }
2905 setURI();
2906 }
2907
2908
2909 /***
2910 * Resolve the base and relative path.
2911 *
2912 * @param basePath a character array of the basePath
2913 * @param relPath a character array of the relPath
2914 * @return the resolved path
2915 * @throws URIException no more higher path level to be resolved
2916 */
2917 protected char[] resolvePath(char[] basePath, char[] relPath)
2918 throws URIException {
2919
2920
2921 String base = (basePath == null) ? "" : new String(basePath);
2922 int at = base.lastIndexOf('/');
2923 if (at != -1) {
2924 basePath = base.substring(0, at + 1).toCharArray();
2925 }
2926
2927 if (relPath == null || relPath.length == 0) {
2928 return normalize(basePath);
2929 } else if (relPath[0] == '/') {
2930 return normalize(relPath);
2931 } else {
2932 StringBuffer buff = new StringBuffer(base.length()
2933 + relPath.length);
2934 buff.append((at != -1) ? base.substring(0, at + 1) : "/");
2935 buff.append(relPath);
2936 return normalize(buff.toString().toCharArray());
2937 }
2938 }
2939
2940
2941 /***
2942 * Get the raw-escaped current hierarchy level in the given path.
2943 * If the last namespace is a collection, the slash mark ('/') should be
2944 * ended with at the last character of the path string.
2945 *
2946 * @param path the path
2947 * @return the current hierarchy level
2948 * @throws URIException no hierarchy level
2949 */
2950 protected char[] getRawCurrentHierPath(char[] path) throws URIException {
2951
2952 if (_is_opaque_part) {
2953 throw new URIException(URIException.PARSING, "no hierarchy level");
2954 }
2955 if (path == null) {
2956 throw new URIException(URIException.PARSING, "empty path");
2957 }
2958 String buff = new String(path);
2959 int first = buff.indexOf('/');
2960 int last = buff.lastIndexOf('/');
2961 if (last == 0) {
2962 return rootPath;
2963 } else if (first != last && last != -1) {
2964 return buff.substring(0, last).toCharArray();
2965 }
2966
2967 return path;
2968 }
2969
2970
2971 /***
2972 * Get the raw-escaped current hierarchy level.
2973 *
2974 * @return the raw-escaped current hierarchy level
2975 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2976 */
2977 public char[] getRawCurrentHierPath() throws URIException {
2978 return (_path == null) ? null : getRawCurrentHierPath(_path);
2979 }
2980
2981
2982 /***
2983 * Get the escaped current hierarchy level.
2984 *
2985 * @return the escaped current hierarchy level
2986 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2987 */
2988 public String getEscapedCurrentHierPath() throws URIException {
2989 char[] path = getRawCurrentHierPath();
2990 return (path == null) ? null : new String(path);
2991 }
2992
2993
2994 /***
2995 * Get the current hierarchy level.
2996 *
2997 * @return the current hierarchy level
2998 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2999 * @see #decode
3000 */
3001 public String getCurrentHierPath() throws URIException {
3002 char[] path = getRawCurrentHierPath();
3003 return (path == null) ? null : decode(path, getProtocolCharset());
3004 }
3005
3006
3007 /***
3008 * Get the level above the this hierarchy level.
3009 *
3010 * @return the raw above hierarchy level
3011 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3012 */
3013 public char[] getRawAboveHierPath() throws URIException {
3014 char[] path = getRawCurrentHierPath();
3015 return (path == null) ? null : getRawCurrentHierPath(path);
3016 }
3017
3018
3019 /***
3020 * Get the level above the this hierarchy level.
3021 *
3022 * @return the raw above hierarchy level
3023 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3024 */
3025 public String getEscapedAboveHierPath() throws URIException {
3026 char[] path = getRawAboveHierPath();
3027 return (path == null) ? null : new String(path);
3028 }
3029
3030
3031 /***
3032 * Get the level above the this hierarchy level.
3033 *
3034 * @return the above hierarchy level
3035 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3036 * @see #decode
3037 */
3038 public String getAboveHierPath() throws URIException {
3039 char[] path = getRawAboveHierPath();
3040 return (path == null) ? null : decode(path, getProtocolCharset());
3041 }
3042
3043
3044 /***
3045 * Get the raw-escaped path.
3046 * <p><blockquote><pre>
3047 * path = [ abs_path | opaque_part ]
3048 * </pre></blockquote><p>
3049 *
3050 * @return the raw-escaped path
3051 */
3052 public char[] getRawPath() {
3053 return _is_opaque_part ? _opaque : _path;
3054 }
3055
3056
3057 /***
3058 * Get the escaped path.
3059 * <p><blockquote><pre>
3060 * path = [ abs_path | opaque_part ]
3061 * abs_path = "/" path_segments
3062 * opaque_part = uric_no_slash *uric
3063 * </pre></blockquote><p>
3064 *
3065 * @return the escaped path string
3066 */
3067 public String getEscapedPath() {
3068 char[] path = getRawPath();
3069 return (path == null) ? null : new String(path);
3070 }
3071
3072
3073 /***
3074 * Get the path.
3075 * <p><blockquote><pre>
3076 * path = [ abs_path | opaque_part ]
3077 * </pre></blockquote><p>
3078 * @return the path string
3079 * @throws URIException If {@link #decode} fails.
3080 * @see #decode
3081 */
3082 public String getPath() throws URIException {
3083 char[] path = getRawPath();
3084 return (path == null) ? null : decode(path, getProtocolCharset());
3085 }
3086
3087
3088 /***
3089 * Get the raw-escaped basename of the path.
3090 *
3091 * @return the raw-escaped basename
3092 */
3093 public char[] getRawName() {
3094 if (_path == null) {
3095 return null;
3096 }
3097
3098 int at = 0;
3099 for (int i = _path.length - 1; i >= 0; i--) {
3100 if (_path[i] == '/') {
3101 at = i + 1;
3102 break;
3103 }
3104 }
3105 int len = _path.length - at;
3106 char[] basename = new char[len];
3107 System.arraycopy(_path, at, basename, 0, len);
3108 return basename;
3109 }
3110
3111
3112 /***
3113 * Get the escaped basename of the path.
3114 *
3115 * @return the escaped basename string
3116 */
3117 public String getEscapedName() {
3118 char[] basename = getRawName();
3119 return (basename == null) ? null : new String(basename);
3120 }
3121
3122
3123 /***
3124 * Get the basename of the path.
3125 *
3126 * @return the basename string
3127 * @throws URIException incomplete trailing escape pattern or unsupported
3128 * character encoding
3129 * @see #decode
3130 */
3131 public String getName() throws URIException {
3132 char[] basename = getRawName();
3133 return (basename == null) ? null : decode(getRawName(),
3134 getProtocolCharset());
3135 }
3136
3137
3138
3139 /***
3140 * Get the raw-escaped path and query.
3141 *
3142 * @return the raw-escaped path and query
3143 */
3144 public char[] getRawPathQuery() {
3145
3146 if (_path == null && _query == null) {
3147 return null;
3148 }
3149 StringBuffer buff = new StringBuffer();
3150 if (_path != null) {
3151 buff.append(_path);
3152 }
3153 if (_query != null) {
3154 buff.append('?');
3155 buff.append(_query);
3156 }
3157 return buff.toString().toCharArray();
3158 }
3159
3160
3161 /***
3162 * Get the escaped query.
3163 *
3164 * @return the escaped path and query string
3165 */
3166 public String getEscapedPathQuery() {
3167 char[] rawPathQuery = getRawPathQuery();
3168 return (rawPathQuery == null) ? null : new String(rawPathQuery);
3169 }
3170
3171
3172 /***
3173 * Get the path and query.
3174 *
3175 * @return the path and query string.
3176 * @throws URIException incomplete trailing escape pattern or unsupported
3177 * character encoding
3178 * @see #decode
3179 */
3180 public String getPathQuery() throws URIException {
3181 char[] rawPathQuery = getRawPathQuery();
3182 return (rawPathQuery == null) ? null : decode(rawPathQuery,
3183 getProtocolCharset());
3184 }
3185
3186
3187
3188 /***
3189 * Set the raw-escaped query.
3190 *
3191 * @param escapedQuery the raw-escaped query
3192 * @throws URIException escaped query not valid
3193 */
3194 public void setRawQuery(char[] escapedQuery) throws URIException {
3195 if (escapedQuery == null || escapedQuery.length == 0) {
3196 _query = escapedQuery;
3197 setURI();
3198 return;
3199 }
3200
3201 escapedQuery = removeFragmentIdentifier(escapedQuery);
3202 if (!validate(escapedQuery, query)) {
3203 throw new URIException(URIException.ESCAPING,
3204 "escaped query not valid");
3205 }
3206 _query = escapedQuery;
3207 setURI();
3208 }
3209
3210
3211 /***
3212 * Set the escaped query string.
3213 *
3214 * @param escapedQuery the escaped query string
3215 * @throws URIException escaped query not valid
3216 */
3217 public void setEscapedQuery(String escapedQuery) throws URIException {
3218 if (escapedQuery == null) {
3219 _query = null;
3220 setURI();
3221 return;
3222 }
3223 setRawQuery(escapedQuery.toCharArray());
3224 }
3225
3226
3227 /***
3228 * Set the query.
3229 * <p>
3230 * When a query string is not misunderstood the reserved special characters
3231 * ("&", "=", "+", ",", and "$") within a query component, it is
3232 * recommended to use in encoding the whole query with this method.
3233 * <p>
3234 * The additional APIs for the special purpose using by the reserved
3235 * special characters used in each protocol are implemented in each protocol
3236 * classes inherited from <code>URI</code>. So refer to the same-named APIs
3237 * implemented in each specific protocol instance.
3238 *
3239 * @param query the query string.
3240 * @throws URIException incomplete trailing escape pattern or unsupported
3241 * character encoding
3242 * @see #encode
3243 */
3244 public void setQuery(String query) throws URIException {
3245 if (query == null || query.length() == 0) {
3246 _query = (query == null) ? null : query.toCharArray();
3247 setURI();
3248 return;
3249 }
3250 setRawQuery(encode(query, allowed_query, getProtocolCharset()));
3251 }
3252
3253
3254 /***
3255 * Get the raw-escaped query.
3256 *
3257 * @return the raw-escaped query
3258 */
3259 public char[] getRawQuery() {
3260 return _query;
3261 }
3262
3263
3264 /***
3265 * Get the escaped query.
3266 *
3267 * @return the escaped query string
3268 */
3269 public String getEscapedQuery() {
3270 return (_query == null) ? null : new String(_query);
3271 }
3272
3273
3274 /***
3275 * Get the query.
3276 *
3277 * @return the query string.
3278 * @throws URIException incomplete trailing escape pattern or unsupported
3279 * character encoding
3280 * @see #decode
3281 */
3282 public String getQuery() throws URIException {
3283 return (_query == null) ? null : decode(_query, getProtocolCharset());
3284 }
3285
3286
3287
3288 /***
3289 * Set the raw-escaped fragment.
3290 *
3291 * @param escapedFragment the raw-escaped fragment
3292 * @throws URIException escaped fragment not valid
3293 */
3294 public void setRawFragment(char[] escapedFragment) throws URIException {
3295 if (escapedFragment == null || escapedFragment.length == 0) {
3296 _fragment = escapedFragment;
3297 hash = 0;
3298 return;
3299 }
3300 if (!validate(escapedFragment, fragment)) {
3301 throw new URIException(URIException.ESCAPING,
3302 "escaped fragment not valid");
3303 }
3304 _fragment = escapedFragment;
3305 hash = 0;
3306 }
3307
3308
3309 /***
3310 * Set the escaped fragment string.
3311 *
3312 * @param escapedFragment the escaped fragment string
3313 * @throws URIException escaped fragment not valid
3314 */
3315 public void setEscapedFragment(String escapedFragment) throws URIException {
3316 if (escapedFragment == null) {
3317 _fragment = null;
3318 hash = 0;
3319 return;
3320 }
3321 setRawFragment(escapedFragment.toCharArray());
3322 }
3323
3324
3325 /***
3326 * Set the fragment.
3327 *
3328 * @param fragment the fragment string.
3329 * @throws URIException If an error occurs.
3330 */
3331 public void setFragment(String fragment) throws URIException {
3332 if (fragment == null || fragment.length() == 0) {
3333 _fragment = (fragment == null) ? null : fragment.toCharArray();
3334 hash = 0;
3335 return;
3336 }
3337 _fragment = encode(fragment, allowed_fragment, getProtocolCharset());
3338 hash = 0;
3339 }
3340
3341
3342 /***
3343 * Get the raw-escaped fragment.
3344 * <p>
3345 * The optional fragment identifier is not part of a URI, but is often used
3346 * in conjunction with a URI.
3347 * <p>
3348 * The format and interpretation of fragment identifiers is dependent on
3349 * the media type [RFC2046] of the retrieval result.
3350 * <p>
3351 * A fragment identifier is only meaningful when a URI reference is
3352 * intended for retrieval and the result of that retrieval is a document
3353 * for which the identified fragment is consistently defined.
3354 *
3355 * @return the raw-escaped fragment
3356 */
3357 public char[] getRawFragment() {
3358 return _fragment;
3359 }
3360
3361
3362 /***
3363 * Get the escaped fragment.
3364 *
3365 * @return the escaped fragment string
3366 */
3367 public String getEscapedFragment() {
3368 return (_fragment == null) ? null : new String(_fragment);
3369 }
3370
3371
3372 /***
3373 * Get the fragment.
3374 *
3375 * @return the fragment string
3376 * @throws URIException incomplete trailing escape pattern or unsupported
3377 * character encoding
3378 * @see #decode
3379 */
3380 public String getFragment() throws URIException {
3381 return (_fragment == null) ? null : decode(_fragment,
3382 getProtocolCharset());
3383 }
3384
3385
3386
3387 /***
3388 * Remove the fragment identifier of the given component.
3389 *
3390 * @param component the component that a fragment may be included
3391 * @return the component that the fragment identifier is removed
3392 */
3393 protected char[] removeFragmentIdentifier(char[] component) {
3394 if (component == null) {
3395 return null;
3396 }
3397 int lastIndex = new String(component).indexOf('#');
3398 if (lastIndex != -1) {
3399 component = new String(component).substring(0,
3400 lastIndex).toCharArray();
3401 }
3402 return component;
3403 }
3404
3405
3406 /***
3407 * Normalize the given hier path part.
3408 *
3409 * <p>Algorithm taken from URI reference parser at
3410 * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
3411 *
3412 * @param path the path to normalize
3413 * @return the normalized path
3414 * @throws URIException no more higher path level to be normalized
3415 */
3416 protected char[] normalize(char[] path) throws URIException {
3417
3418 if (path == null) {
3419 return null;
3420 }
3421
3422 String normalized = new String(path);
3423
3424
3425 if (normalized.startsWith("./")) {
3426 normalized = normalized.substring(1);
3427 } else if (normalized.startsWith("../")) {
3428 normalized = normalized.substring(2);
3429 } else if (normalized.startsWith("..")) {
3430 normalized = normalized.substring(2);
3431 }
3432
3433
3434 int index = -1;
3435 while ((index = normalized.indexOf("/./")) != -1) {
3436 normalized = normalized.substring(0, index) + normalized.substring(index + 2);
3437 }
3438
3439
3440 if (normalized.endsWith("/.")) {
3441 normalized = normalized.substring(0, normalized.length() - 1);
3442 }
3443
3444 int startIndex = 0;
3445
3446
3447
3448
3449
3450
3451 while ((index = normalized.indexOf("/../", startIndex)) != -1) {
3452 int slashIndex = normalized.lastIndexOf('/', index - 1);
3453 if (slashIndex >= 0) {
3454 normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3);
3455 } else {
3456 startIndex = index + 3;
3457 }
3458 }
3459 if (normalized.endsWith("/..")) {
3460 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3461 if (slashIndex >= 0) {
3462 normalized = normalized.substring(0, slashIndex + 1);
3463 }
3464 }
3465
3466
3467
3468
3469
3470
3471 while ((index = normalized.indexOf("/../")) != -1) {
3472 int slashIndex = normalized.lastIndexOf('/', index - 1);
3473 if (slashIndex >= 0) {
3474 break;
3475 } else {
3476 normalized = normalized.substring(index + 3);
3477 }
3478 }
3479 if (normalized.endsWith("/..")) {
3480 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3481 if (slashIndex < 0) {
3482 normalized = "/";
3483 }
3484 }
3485
3486 return normalized.toCharArray();
3487 }
3488
3489
3490 /***
3491 * Normalizes the path part of this URI. Normalization is only meant to be performed on
3492 * URIs with an absolute path. Calling this method on a relative path URI will have no
3493 * effect.
3494 *
3495 * @throws URIException no more higher path level to be normalized
3496 *
3497 * @see #isAbsPath()
3498 */
3499 public void normalize() throws URIException {
3500 if (isAbsPath()) {
3501 _path = normalize(_path);
3502 setURI();
3503 }
3504 }
3505
3506
3507 /***
3508 * Test if the first array is equal to the second array.
3509 *
3510 * @param first the first character array
3511 * @param second the second character array
3512 * @return true if they're equal
3513 */
3514 protected boolean equals(char[] first, char[] second) {
3515
3516 if (first == null && second == null) {
3517 return true;
3518 }
3519 if (first == null || second == null) {
3520 return false;
3521 }
3522 if (first.length != second.length) {
3523 return false;
3524 }
3525 for (int i = 0; i < first.length; i++) {
3526 if (first[i] != second[i]) {
3527 return false;
3528 }
3529 }
3530 return true;
3531 }
3532
3533
3534 /***
3535 * Test an object if this URI is equal to another.
3536 *
3537 * @param obj an object to compare
3538 * @return true if two URI objects are equal
3539 */
3540 public boolean equals(Object obj) {
3541
3542
3543 if (obj == this) {
3544 return true;
3545 }
3546 if (!(obj instanceof URI)) {
3547 return false;
3548 }
3549 URI another = (URI) obj;
3550
3551 if (!equals(_scheme, another._scheme)) {
3552 return false;
3553 }
3554
3555 if (!equals(_opaque, another._opaque)) {
3556 return false;
3557 }
3558
3559
3560 if (!equals(_authority, another._authority)) {
3561 return false;
3562 }
3563
3564 if (!equals(_path, another._path)) {
3565 return false;
3566 }
3567
3568 if (!equals(_query, another._query)) {
3569 return false;
3570 }
3571
3572 if (!equals(_fragment, another._fragment)) {
3573 return false;
3574 }
3575 return true;
3576 }
3577
3578
3579
3580 /***
3581 * Write the content of this URI.
3582 *
3583 * @param oos the object-output stream
3584 * @throws IOException If an IO problem occurs.
3585 */
3586 protected void writeObject(ObjectOutputStream oos)
3587 throws IOException {
3588
3589 oos.defaultWriteObject();
3590 }
3591
3592
3593 /***
3594 * Read a URI.
3595 *
3596 * @param ois the object-input stream
3597 * @throws ClassNotFoundException If one of the classes specified in the
3598 * input stream cannot be found.
3599 * @throws IOException If an IO problem occurs.
3600 */
3601 protected void readObject(ObjectInputStream ois)
3602 throws ClassNotFoundException, IOException {
3603
3604 ois.defaultReadObject();
3605 }
3606
3607
3608
3609 /***
3610 * Return a hash code for this URI.
3611 *
3612 * @return a has code value for this URI
3613 */
3614 public int hashCode() {
3615 if (hash == 0) {
3616 char[] c = _uri;
3617 if (c != null) {
3618 for (int i = 0, len = c.length; i < len; i++) {
3619 hash = 31 * hash + c[i];
3620 }
3621 }
3622 c = _fragment;
3623 if (c != null) {
3624 for (int i = 0, len = c.length; i < len; i++) {
3625 hash = 31 * hash + c[i];
3626 }
3627 }
3628 }
3629 return hash;
3630 }
3631
3632
3633
3634 /***
3635 * Compare this URI to another object.
3636 *
3637 * @param obj the object to be compared.
3638 * @return 0, if it's same,
3639 * -1, if failed, first being compared with in the authority component
3640 * @throws ClassCastException not URI argument
3641 */
3642 public int compareTo(Object obj) throws ClassCastException {
3643
3644 URI another = (URI) obj;
3645 if (!equals(_authority, another.getRawAuthority())) {
3646 return -1;
3647 }
3648 return toString().compareTo(another.toString());
3649 }
3650
3651
3652
3653 /***
3654 * Create and return a copy of this object, the URI-reference containing
3655 * the userinfo component. Notice that the whole URI-reference including
3656 * the userinfo component counld not be gotten as a <code>String</code>.
3657 * <p>
3658 * To copy the identical <code>URI</code> object including the userinfo
3659 * component, it should be used.
3660 *
3661 * @return a clone of this instance
3662 */
3663 public synchronized Object clone() {
3664
3665 URI instance = new URI();
3666
3667 instance._uri = _uri;
3668 instance._scheme = _scheme;
3669 instance._opaque = _opaque;
3670 instance._authority = _authority;
3671 instance._userinfo = _userinfo;
3672 instance._host = _host;
3673 instance._port = _port;
3674 instance._path = _path;
3675 instance._query = _query;
3676 instance._fragment = _fragment;
3677
3678 instance.protocolCharset = protocolCharset;
3679
3680 instance._is_hier_part = _is_hier_part;
3681 instance._is_opaque_part = _is_opaque_part;
3682 instance._is_net_path = _is_net_path;
3683 instance._is_abs_path = _is_abs_path;
3684 instance._is_rel_path = _is_rel_path;
3685 instance._is_reg_name = _is_reg_name;
3686 instance._is_server = _is_server;
3687 instance._is_hostname = _is_hostname;
3688 instance._is_IPv4address = _is_IPv4address;
3689 instance._is_IPv6reference = _is_IPv6reference;
3690
3691 return instance;
3692 }
3693
3694
3695
3696 /***
3697 * It can be gotten the URI character sequence. It's raw-escaped.
3698 * For the purpose of the protocol to be transported, it will be useful.
3699 * <p>
3700 * It is clearly unwise to use a URL that contains a password which is
3701 * intended to be secret. In particular, the use of a password within
3702 * the 'userinfo' component of a URL is strongly disrecommended except
3703 * in those rare cases where the 'password' parameter is intended to be
3704 * public.
3705 * <p>
3706 * When you want to get each part of the userinfo, you need to use the
3707 * specific methods in the specific URL. It depends on the specific URL.
3708 *
3709 * @return the URI character sequence
3710 */
3711 public char[] getRawURI() {
3712 return _uri;
3713 }
3714
3715
3716 /***
3717 * It can be gotten the URI character sequence. It's escaped.
3718 * For the purpose of the protocol to be transported, it will be useful.
3719 *
3720 * @return the escaped URI string
3721 */
3722 public String getEscapedURI() {
3723 return (_uri == null) ? null : new String(_uri);
3724 }
3725
3726
3727 /***
3728 * It can be gotten the URI character sequence.
3729 *
3730 * @return the original URI string
3731 * @throws URIException incomplete trailing escape pattern or unsupported
3732 * character encoding
3733 * @see #decode
3734 */
3735 public String getURI() throws URIException {
3736 return (_uri == null) ? null : decode(_uri, getProtocolCharset());
3737 }
3738
3739
3740 /***
3741 * Get the URI reference character sequence.
3742 *
3743 * @return the URI reference character sequence
3744 */
3745 public char[] getRawURIReference() {
3746 if (_fragment == null) {
3747 return _uri;
3748 }
3749 if (_uri == null) {
3750 return _fragment;
3751 }
3752
3753 String uriReference = new String(_uri) + "#" + new String(_fragment);
3754 return uriReference.toCharArray();
3755 }
3756
3757
3758 /***
3759 * Get the escaped URI reference string.
3760 *
3761 * @return the escaped URI reference string
3762 */
3763 public String getEscapedURIReference() {
3764 char[] uriReference = getRawURIReference();
3765 return (uriReference == null) ? null : new String(uriReference);
3766 }
3767
3768
3769 /***
3770 * Get the original URI reference string.
3771 *
3772 * @return the original URI reference string
3773 * @throws URIException If {@link #decode} fails.
3774 */
3775 public String getURIReference() throws URIException {
3776 char[] uriReference = getRawURIReference();
3777 return (uriReference == null) ? null : decode(uriReference,
3778 getProtocolCharset());
3779 }
3780
3781
3782 /***
3783 * Get the escaped URI string.
3784 * <p>
3785 * On the document, the URI-reference form is only used without the userinfo
3786 * component like http://jakarta.apache.org/ by the security reason.
3787 * But the URI-reference form with the userinfo component could be parsed.
3788 * <p>
3789 * In other words, this URI and any its subclasses must not expose the
3790 * URI-reference expression with the userinfo component like
3791 * http://user:password@hostport/restricted_zone.<br>
3792 * It means that the API client programmer should extract each user and
3793 * password to access manually. Probably it will be supported in the each
3794 * subclass, however, not a whole URI-reference expression.
3795 *
3796 * @return the escaped URI string
3797 * @see #clone()
3798 */
3799 public String toString() {
3800 return getEscapedURI();
3801 }
3802
3803
3804
3805
3806 /***
3807 * The charset-changed normal operation to represent to be required to
3808 * alert to user the fact the default charset is changed.
3809 */
3810 public static class DefaultCharsetChanged extends RuntimeException {
3811
3812
3813
3814 /***
3815 * The constructor with a reason string and its code arguments.
3816 *
3817 * @param reasonCode the reason code
3818 * @param reason the reason
3819 */
3820 public DefaultCharsetChanged(int reasonCode, String reason) {
3821 super(reason);
3822 this.reason = reason;
3823 this.reasonCode = reasonCode;
3824 }
3825
3826
3827
3828 /*** No specified reason code. */
3829 public static final int UNKNOWN = 0;
3830
3831 /*** Protocol charset changed. */
3832 public static final int PROTOCOL_CHARSET = 1;
3833
3834 /*** Document charset changed. */
3835 public static final int DOCUMENT_CHARSET = 2;
3836
3837
3838
3839 /*** The reason code. */
3840 private int reasonCode;
3841
3842 /*** The reason message. */
3843 private String reason;
3844
3845
3846
3847 /***
3848 * Get the reason code.
3849 *
3850 * @return the reason code
3851 */
3852 public int getReasonCode() {
3853 return reasonCode;
3854 }
3855
3856 /***
3857 * Get the reason message.
3858 *
3859 * @return the reason message
3860 */
3861 public String getReason() {
3862 return reason;
3863 }
3864
3865 }
3866
3867
3868 /***
3869 * A mapping to determine the (somewhat arbitrarily) preferred charset for a
3870 * given locale. Supports all locales recognized in JDK 1.1.
3871 * <p>
3872 * The distribution of this class is Servlets.com. It was originally
3873 * written by Jason Hunter [jhunter at acm.org] and used by with permission.
3874 */
3875 public static class LocaleToCharsetMap {
3876
3877 /*** A mapping of language code to charset */
3878 private static final Hashtable LOCALE_TO_CHARSET_MAP;
3879 static {
3880 LOCALE_TO_CHARSET_MAP = new Hashtable();
3881 LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
3882 LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
3883 LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
3884 LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
3885 LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
3886 LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
3887 LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
3888 LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
3889 LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
3890 LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
3891 LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
3892 LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
3893 LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
3894 LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
3895 LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
3896 LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
3897 LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
3898 LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
3899 LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
3900 LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
3901 LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
3902 LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
3903 LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
3904 LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
3905 LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
3906 LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
3907 LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
3908 LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
3909 LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
3910 LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
3911 LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
3912 LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
3913 LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
3914 LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
3915 LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
3916 LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
3917 LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
3918 LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
3919 LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
3920 }
3921
3922 /***
3923 * Get the preferred charset for the given locale.
3924 *
3925 * @param locale the locale
3926 * @return the preferred charset or null if the locale is not
3927 * recognized.
3928 */
3929 public static String getCharset(Locale locale) {
3930
3931 String charset =
3932 (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
3933 if (charset != null) {
3934 return charset;
3935 }
3936
3937
3938 charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage());
3939 return charset;
3940 }
3941
3942 }
3943
3944 }
3945