Skip to content

Commit 1a6a529

Browse files
committed
HTTPCORE-637: RFC 3986 URI: parse/resolve/normalize.
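Uses table-driven ASCII classification instead of regular expressions, and corrects trailing-slash handling in RFC 3986 §5.2.4 dot-segment removal. Drops Rfc3986UriBuilder, marks the helper classes as @internal, and the RFC 3986 example tests pass.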
1 parent 1ee9e08 commit 1a6a529

File tree

15 files changed

+1720
-49
lines changed

15 files changed

+1720
-49
lines changed

httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java

Lines changed: 7 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@
3636
import java.util.Arrays;
3737
import java.util.BitSet;
3838
import java.util.Collections;
39-
import java.util.LinkedList;
4039
import java.util.List;
4140

4241
import org.apache.hc.core5.http.HttpHost;
4342
import org.apache.hc.core5.http.NameValuePair;
4443
import org.apache.hc.core5.http.URIScheme;
4544
import org.apache.hc.core5.http.message.BasicNameValuePair;
4645
import org.apache.hc.core5.http.message.ParserCursor;
46+
import org.apache.hc.core5.net.uri.Rfc3986Uri;
4747
import org.apache.hc.core5.util.Args;
4848
import org.apache.hc.core5.util.TextUtils;
4949
import org.apache.hc.core5.util.Tokenizer;
@@ -1118,58 +1118,16 @@ public URIBuilder normalizeSyntax() {
11181118
* @since 5.3
11191119
*/
11201120
public URIBuilder optimize() {
1121-
final String scheme = this.scheme;
1122-
if (scheme != null) {
1123-
this.scheme = TextUtils.toLowerCase(scheme);
1124-
}
1125-
1126-
if (this.pathRootless) {
// New implementation: serialize this builder, run the text through
// Rfc3986Uri.parse(...).optimize(), and build a URIBuilder from the result.
1121+
final String raw = this.toString();
1122+
try {
1123+
final Rfc3986Uri u = Rfc3986Uri.parse(raw).optimize();
// NOTE(review): the success path returns a NEW URIBuilder, whereas the removed
// implementation mutated this instance and returned `this`. Callers that ignore
// the return value will no longer observe the optimization - confirm intended.
// Builder state that is not part of toString() is presumably not carried over
// to the new instance - verify.
1124+
return new URIBuilder(u.toString());
1125+
} catch (final IllegalArgumentException | URISyntaxException ex) {
// Best-effort fallback: the exception is dropped and this builder is returned unchanged.
11271126
return this;
11281127
}
1129-
1130-
// Force Percent-Encoding re-encoding
1131-
this.encodedSchemeSpecificPart = null;
1132-
this.encodedAuthority = null;
1133-
this.encodedUserInfo = null;
1134-
this.encodedPath = null;
1135-
this.encodedQuery = null;
1136-
this.encodedFragment = null;
1137-
1138-
final String host = this.host;
1139-
if (host != null) {
1140-
this.host = TextUtils.toLowerCase(host);
1141-
}
1142-
1143-
if (this.pathSegments != null) {
1144-
final List<String> inputSegments = this.pathSegments;
1145-
if (!inputSegments.isEmpty()) {
1146-
final LinkedList<String> outputSegments = new LinkedList<>();
1147-
for (final String inputSegment : inputSegments) {
1148-
if (!inputSegment.isEmpty() && !".".equals(inputSegment)) {
1149-
if ("..".equals(inputSegment)) {
1150-
if (!outputSegments.isEmpty()) {
1151-
outputSegments.removeLast();
1152-
}
1153-
} else {
1154-
outputSegments.addLast(inputSegment);
1155-
}
1156-
}
1157-
}
1158-
if (!inputSegments.isEmpty()) {
1159-
final String lastSegment = inputSegments.get(inputSegments.size() - 1);
1160-
if (lastSegment.isEmpty()) {
1161-
outputSegments.addLast("");
1162-
}
1163-
}
1164-
this.pathSegments = outputSegments;
1165-
} else {
1166-
this.pathSegments = Collections.singletonList("");
1167-
}
1168-
}
1169-
1170-
return this;
11711128
}
11721129

1130+
11731131
/**
11741132
* Converts this instance to a URI string.
11751133
*
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
/*
2+
* ====================================================================
3+
* Licensed to the Apache Software Foundation (ASF) under one
4+
* or more contributor license agreements. See the NOTICE file
5+
* distributed with this work for additional information
6+
* regarding copyright ownership. The ASF licenses this file
7+
* to you under the Apache License, Version 2.0 (the
8+
* "License"); you may not use this file except in compliance
9+
* with the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing,
14+
* software distributed under the License is distributed on an
15+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
* KIND, either express or implied. See the License for the
17+
* specific language governing permissions and limitations
18+
* under the License.
19+
* ====================================================================
20+
*
21+
* This software consists of voluntary contributions made by many
22+
* individuals on behalf of the Apache Software Foundation. For more
23+
* information on the Apache Software Foundation, please see
24+
* <http://www.apache.org/>.
25+
*
26+
*/
27+
28+
package org.apache.hc.core5.net.uri;
29+
30+
import org.apache.hc.core5.annotation.Contract;
31+
import org.apache.hc.core5.annotation.ThreadingBehavior;
32+
import org.apache.hc.core5.net.uri.internal.uris.Rfc3986Equivalence;
33+
import org.apache.hc.core5.net.uri.internal.uris.Rfc3986Normalizer;
34+
import org.apache.hc.core5.net.uri.internal.uris.Rfc3986Parser;
35+
import org.apache.hc.core5.net.uri.internal.uris.Rfc3986Resolver;
36+
37+
/**
38+
* Immutable, RFC 3986-compliant URI value object.
39+
* <ul>
40+
* <li>Parsing preserves raw text (including percent-encodings).</li>
41+
* <li>Resolution &amp; dot-segment removal per RFC 3986 §5.2.</li>
42+
* <li>Scheme and reg-name host are stored in lower case.</li>
43+
* <li>No regex, no {@code Character} classes – pure ASCII tables.</li>
44+
* </ul>
45+
*
46+
* <p><strong>Round-trip:</strong> {@link #toRawString()} returns the exact input.
47+
* {@link #toString()} renders the canonical form held by this object.</p>
48+
*
* <p>NOTE(review): the public constructor stores {@code original} and the components
* exactly as given, without cross-checking them, so the round-trip statement above
* presumably applies only to instances produced by {@link #parse(String)} - confirm.</p>
*
49+
* @since 5.4
50+
*/
51+
@Contract(threading = ThreadingBehavior.IMMUTABLE)
52+
public final class Rfc3986Uri implements UriReference {
53+
54+
// Text handed back verbatim by toRawString().
final String original;
55+
56+
final String scheme; // lower-cased (ASCII) or null
57+
final String userInfo; // raw or null
58+
final String host; // reg-name lower-cased; IPv6 literal kept with brackets; or null
59+
final int port; // -1 if missing
60+
final String path; // raw, never null ("" allowed)
61+
final String query; // raw (no '?') or null
62+
final String fragment; // raw (no '#') or null
63+
/**
* Creates an instance from already-separated components.
* <p>The arguments are assigned to the fields as-is: this constructor performs no
* validation, case folding or normalization, so the invariants noted on the fields
* must be established by the caller.</p>
*
* @param original text returned by {@link #toRawString()}
* @param scheme scheme, or {@code null}
* @param userInfo user-info, or {@code null}
* @param host host, or {@code null}
* @param port port, or {@code -1} if missing
* @param path path ({@code ""} allowed)
* @param query query without the leading {@code '?'}, or {@code null}
* @param fragment fragment without the leading {@code '#'}, or {@code null}
*/
64+
public Rfc3986Uri(
65+
final String original,
66+
final String scheme,
67+
final String userInfo,
68+
final String host,
69+
final int port,
70+
final String path,
71+
final String query,
72+
final String fragment) {
73+
this.original = original;
74+
this.scheme = scheme;
75+
this.userInfo = userInfo;
76+
this.host = host;
77+
this.port = port;
78+
this.path = path;
79+
this.query = query;
80+
this.fragment = fragment;
81+
}
82+
83+
/**
84+
* Parse a URI reference per RFC 3986.
* <p>Delegates to {@code Rfc3986Parser.parse(String)}.</p>
*
* @param s the text to parse
* @return the parsed URI reference
85+
*/
86+
public static Rfc3986Uri parse(final String s) {
87+
return Rfc3986Parser.parse(s);
88+
}
89+
/** Returns the stored scheme, or {@code null}. */
90+
@Override
91+
public String getScheme() {
92+
return scheme;
93+
}
94+
/** Returns the stored user-info, or {@code null}. */
95+
@Override
96+
public String getUserInfo() {
97+
return userInfo;
98+
}
99+
/** Returns the stored host, or {@code null}. */
100+
@Override
101+
public String getHost() {
102+
return host;
103+
}
104+
/** Returns the stored port; {@code -1} if missing. */
105+
@Override
106+
public int getPort() {
107+
return port;
108+
}
109+
/** Returns the stored path. */
110+
@Override
111+
public String getPath() {
112+
return path;
113+
}
114+
/** Returns the stored query (no leading {@code '?'}), or {@code null}. */
115+
@Override
116+
public String getQuery() {
117+
return query;
118+
}
119+
/** Returns the stored fragment (no leading {@code '#'}), or {@code null}. */
120+
@Override
121+
public String getFragment() {
122+
return fragment;
123+
}
124+
/** Returns the {@code original} text passed to the constructor, unchanged. */
125+
@Override
126+
public String toRawString() {
127+
return original;
128+
}
129+
/**
* Assembles a URI string from the component fields, in the order
* scheme, authority, path, query, fragment.
* <p>The authority ({@code //}, optional {@code userInfo@}, host, optional
* {@code :port}) is written only when {@code host} is non-null; with a null host,
* {@code userInfo} and {@code port} are not rendered at all.</p>
*
* @return the assembled string; never {@code null}
*/
130+
@Override
131+
public String toString() {
132+
// Render canonical internal state (not the raw input).
// First pass: estimate the output length so the StringBuilder can be pre-sized.
133+
int cap = 0;
134+
if (scheme != null) {
// scheme plus ':'
135+
cap += scheme.length() + 1;
136+
}
137+
if (host != null) {
// "//" plus host
138+
cap += 2 + host.length();
139+
if (userInfo != null) {
// userInfo plus '@'
140+
cap += userInfo.length() + 1;
141+
}
142+
if (port >= 0) {
// Fixed allowance for ':' and the port digits (presumably sized for a
// five-digit port; a longer value would only make the builder grow).
143+
cap += 6;
144+
}
145+
}
// path is commented as never null, but the constructor does not enforce that,
// so it is guarded here like the optional components.
146+
if (path != null) {
147+
cap += path.length();
148+
}
149+
if (query != null) {
150+
cap += 1 + query.length();
151+
}
152+
if (fragment != null) {
153+
cap += 1 + fragment.length();
154+
}
155+
// Second pass: append the components with their delimiters.
156+
final StringBuilder sb = new StringBuilder(Math.max(16, cap));
157+
if (scheme != null) {
158+
sb.append(scheme).append(':');
159+
}
160+
if (host != null) {
161+
sb.append("//");
162+
if (userInfo != null) {
163+
sb.append(userInfo).append('@');
164+
}
165+
sb.append(host);
166+
if (port >= 0) {
167+
sb.append(':').append(port);
168+
}
169+
}
170+
if (path != null) {
171+
sb.append(path);
172+
}
173+
if (query != null) {
174+
sb.append('?').append(query);
175+
}
176+
if (fragment != null) {
177+
sb.append('#').append(fragment);
178+
}
179+
return sb.toString();
180+
}
181+
182+
/**
183+
* Dot-segment removal (RFC 3986 §5.2.4).
* <p>Delegates to {@code Rfc3986Normalizer.normalizePath(Rfc3986Uri)}.</p>
*
* @return the result produced by the normalizer
184+
*/
185+
public Rfc3986Uri normalizePath() {
186+
return Rfc3986Normalizer.normalizePath(this);
187+
}
188+
189+
/**
190+
* RFC equivalence (case-insensitive scheme/host; decode %XX for unreserved; uppercase hex).
* <p>Delegates to {@code Rfc3986Equivalence.equivalent(Rfc3986Uri, Rfc3986Uri)}.</p>
*
* @param other the URI to compare with
* @return the result of the equivalence check
191+
*/
192+
public boolean equivalentTo(final Rfc3986Uri other) {
193+
return Rfc3986Equivalence.equivalent(this, other);
194+
}
195+
196+
/**
197+
* Resolve against a base (RFC 3986 §5.2).
* <p>Delegates to {@code Rfc3986Resolver.resolve(Rfc3986Uri, Rfc3986Uri)}.</p>
*
* @param base the base URI
* @param ref the reference to resolve against {@code base}
* @return the result produced by the resolver
198+
*/
199+
public static Rfc3986Uri resolve(final Rfc3986Uri base, final Rfc3986Uri ref) {
200+
return Rfc3986Resolver.resolve(base, ref);
201+
}
202+
203+
/**
204+
* Canonicalization used by URIBuilder#optimize().
205+
* <p>Performs:
206+
* <ul>
207+
* <li>Lower-case of scheme and reg-name host (IPv6 literal preserved).</li>
208+
* <li>Dot-segment removal if the path is absolute or an authority is present.</li>
209+
* <li>Decoding of percent-escapes only for ASCII unreserved.</li>
210+
* <li>Uppercasing of hex digits in remaining percent-escapes.</li>
211+
* <li>Strict re-encoding of the path (preserve '/' and valid %HH; encode everything else via UTF-8).</li>
212+
* <li>Query and fragment normalized by decode-unreserved + uppercase-hex.</li>
213+
* </ul>
214+
* The operation may change the textual form; consumers should treat this as a canonicalization step,
215+
* not as a guaranteed identity-preserving transformation.
* <p>Delegates to {@code Rfc3986Normalizer.optimize(Rfc3986Uri)}.</p>
*
* @return the result produced by the normalizer
216+
*
217+
* @since 5.4
218+
*/
219+
public Rfc3986Uri optimize() {
220+
return Rfc3986Normalizer.optimize(this);
221+
}
222+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* ====================================================================
3+
* Licensed to the Apache Software Foundation (ASF) under one
4+
* or more contributor license agreements. See the NOTICE file
5+
* distributed with this work for additional information
6+
* regarding copyright ownership. The ASF licenses this file
7+
* to you under the Apache License, Version 2.0 (the
8+
* "License"); you may not use this file except in compliance
9+
* with the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing,
14+
* software distributed under the License is distributed on an
15+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
* KIND, either express or implied. See the License for the
17+
* specific language governing permissions and limitations
18+
* under the License.
19+
* ====================================================================
20+
*
21+
* This software consists of voluntary contributions made by many
22+
* individuals on behalf of the Apache Software Foundation. For more
23+
* information on the Apache Software Foundation, please see
24+
* <http://www.apache.org/>.
25+
*
26+
*/
27+
28+
package org.apache.hc.core5.net.uri;
29+
30+
/**
31+
* Minimal URI accessor interface for RFC 3986 components.
32+
*
33+
* @since 5.4
34+
*/
35+
public interface UriReference {
/**
* Scheme component.
* NOTE(review): the Rfc3986Uri implementation may return {@code null} when no scheme
* is present; whether that is the contract for all implementations - confirm.
*/
36+
String getScheme();
37+
/** User-info component; Rfc3986Uri may return {@code null} (contract for other implementations - confirm). */
38+
String getUserInfo();
39+
/** Host component; Rfc3986Uri may return {@code null} (contract for other implementations - confirm). */
40+
String getHost();
41+
/** Port number; Rfc3986Uri uses {@code -1} for a missing port (contract for other implementations - confirm). */
42+
int getPort();
43+
/** Path component. */
44+
String getPath();
45+
/** Query component; Rfc3986Uri stores it without the leading {@code '?'} and may return {@code null}. */
46+
String getQuery();
47+
/** Fragment component; Rfc3986Uri stores it without the leading {@code '#'} and may return {@code null}. */
48+
String getFragment();
49+
50+
/**
51+
* Raw original input text (no transformations).
52+
*/
53+
String toRawString();
54+
}

0 commit comments

Comments
 (0)