diff options
Diffstat (limited to 'guava/src/com/google/common/net/InternetDomainName.java')
-rw-r--r-- | guava/src/com/google/common/net/InternetDomainName.java | 580 |
1 files changed, 580 insertions, 0 deletions
diff --git a/guava/src/com/google/common/net/InternetDomainName.java b/guava/src/com/google/common/net/InternetDomainName.java new file mode 100644 index 0000000..ace7cf2 --- /dev/null +++ b/guava/src/com/google/common/net/InternetDomainName.java @@ -0,0 +1,580 @@ +/* + * Copyright (C) 2009 The Guava Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.common.net; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; + +import com.google.common.annotations.Beta; +import com.google.common.annotations.GwtCompatible; +import com.google.common.base.Ascii; +import com.google.common.base.CharMatcher; +import com.google.common.base.Joiner; +import com.google.common.base.Objects; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import javax.annotation.Nullable; + +/** + * An immutable well-formed internet domain name, such as {@code com} or {@code + * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other + * network interactions take place. Thus there is no guarantee that the domain + * actually exists on the internet. + * + * <p>One common use of this class is to determine whether a given string is + * likely to represent an addressable domain on the web -- that is, for a + * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"} + * result in a webpage being displayed? In the past, this test was frequently + * done by determining whether the domain ended with a {@linkplain + * #isPublicSuffix() public suffix} but was not itself a public suffix. However, + * this test is no longer accurate. There are many domains which are both public + * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a + * result, the only useful test to determine if a domain is a plausible web host + * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains + * which (currently) are not hosts, such as {@code "com"}), but given that any + * public suffix may become a host without warning, it is better to err on the + * side of permissiveness and thus avoid spurious rejection of valid sites. + * + * <p>During construction, names are normalized in two ways: + * <ol> + * <li>ASCII uppercase characters are converted to lowercase. + * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are + * converted to the ASCII period. + * </ol> + * The normalized values will be returned from {@link #name()} and + * {@link #parts()}, and will be reflected in the result of + * {@link #equals(Object)}. + * + * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name"> + * internationalized domain names</a> such as {@code 网络.cn} are supported, as + * are the equivalent <a + * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA + * Punycode-encoded</a> versions. + * + * @author Craig Berry + * @since 5.0 + */ +@Beta +@GwtCompatible(emulated = true) +public final class InternetDomainName { + + private static final CharMatcher DOTS_MATCHER = + CharMatcher.anyOf(".\u3002\uFF0E\uFF61"); + private static final Splitter DOT_SPLITTER = Splitter.on('.'); + private static final Joiner DOT_JOINER = Joiner.on('.'); + + /** + * Value of {@link #publicSuffixIndex} which indicates that no public suffix + * was found. + */ + private static final int NO_PUBLIC_SUFFIX_FOUND = -1; + + private static final String DOT_REGEX = "\\."; + + /** + * Maximum parts (labels) in a domain name. This value arises from + * the 255-octet limit described in + * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with + * the fact that the encoding of each part occupies at least two bytes + * (dot plus label externally, length byte plus label internally). Thus, if + * all labels have the minimum size of one byte, 127 of them will fit. + */ + private static final int MAX_PARTS = 127; + + /** + * Maximum length of a full domain name, including separators, and + * leaving room for the root label. See + * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. + */ + private static final int MAX_LENGTH = 253; + + /** + * Maximum size of a single part of a domain name. See + * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. + */ + private static final int MAX_DOMAIN_PART_LENGTH = 63; + + /** + * The full domain name, converted to lower case. + */ + private final String name; + + /** + * The parts of the domain name, converted to lower case. + */ + private final ImmutableList<String> parts; + + /** + * The index in the {@link #parts()} list at which the public suffix begins. + * For example, for the domain name {@code www.google.co.uk}, the value would + * be 2 (the index of the {@code co} part). The value is negative + * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was + * found. + */ + private final int publicSuffixIndex; + + /** + * Constructor used to implement {@link #from(String)}, and from subclasses. + */ + InternetDomainName(String name) { + // Normalize: + // * ASCII characters to lowercase + // * All dot-like characters to '.' + // * Strip trailing '.' + + name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.')); + + if (name.endsWith(".")) { + name = name.substring(0, name.length() - 1); + } + + checkArgument(name.length() <= MAX_LENGTH, + "Domain name too long: '%s':", name); + this.name = name; + + this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name)); + checkArgument(parts.size() <= MAX_PARTS, + "Domain has too many parts: '%s'", name); + checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name); + + this.publicSuffixIndex = findPublicSuffix(); + } + + /** + * Returns the index of the leftmost part of the public suffix, or -1 if not + * found. Note that the value defined as the "public suffix" may not be a + * public suffix according to {@link #isPublicSuffix()} if the domain ends + * with an excluded domain pattern such as {@code "nhs.uk"}. + */ + private int findPublicSuffix() { + final int partsSize = parts.size(); + + for (int i = 0; i < partsSize; i++) { + String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize)); + + if (TldPatterns.EXACT.contains(ancestorName)) { + return i; + } + + // Excluded domains (e.g. !nhs.uk) use the next highest + // domain as the effective public suffix (e.g. uk). + + if (TldPatterns.EXCLUDED.contains(ancestorName)) { + return i + 1; + } + + if (matchesWildcardPublicSuffix(ancestorName)) { + return i; + } + } + + return NO_PUBLIC_SUFFIX_FOUND; + } + + /** + * A deprecated synonym for {@link #from(String)}. + * + * @param domain A domain name (not IP address) + * @throws IllegalArgumentException if {@code name} is not syntactically valid + * according to {@link #isValidLenient} + * @since 8.0 (previously named {@code from}) + * @deprecated Use {@link #from(String)} + */ + @Deprecated + public static InternetDomainName fromLenient(String domain) { + return from(domain); + } + + /** + * Returns an instance of {@link InternetDomainName} after lenient + * validation. Specifically, validation against <a + * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> + * ("Internationalizing Domain Names in Applications") is skipped, while + * validation against <a + * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in + * the following ways: + * <ul> + * <li>Any part containing non-ASCII characters is considered valid. + * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted. + * <li>Parts other than the final part may start with a digit. + * </ul> + * + * + * @param domain A domain name (not IP address) + * @throws IllegalArgumentException if {@code name} is not syntactically valid + * according to {@link #isValid} + * @since 10.0 (previously named {@code fromLenient}) + */ + public static InternetDomainName from(String domain) { + return new InternetDomainName(checkNotNull(domain)); + } + + /** + * Validation method used by {@from} to ensure that the domain name is + * syntactically valid according to RFC 1035. + * + * @return Is the domain name syntactically valid? + */ + private static boolean validateSyntax(List<String> parts) { + final int lastIndex = parts.size() - 1; + + // Validate the last part specially, as it has different syntax rules. + + if (!validatePart(parts.get(lastIndex), true)) { + return false; + } + + for (int i = 0; i < lastIndex; i++) { + String part = parts.get(i); + if (!validatePart(part, false)) { + return false; + } + } + + return true; + } + + private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_"); + + private static final CharMatcher PART_CHAR_MATCHER = + CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER); + + /** + * Helper method for {@link #validateSyntax(List)}. Validates that one part of + * a domain name is valid. + * + * @param part The domain name part to be validated + * @param isFinalPart Is this the final (rightmost) domain part? + * @return Whether the part is valid + */ + private static boolean validatePart(String part, boolean isFinalPart) { + + // These tests could be collapsed into one big boolean expression, but + // they have been left as independent tests for clarity. + + if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) { + return false; + } + + /* + * GWT claims to support java.lang.Character's char-classification methods, + * but it actually only works for ASCII. So for now, assume any non-ASCII + * characters are valid. The only place this seems to be documented is here: + * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html + * + * <p>ASCII characters in the part are expected to be valid per RFC 1035, + * with underscore also being allowed due to widespread practice. + */ + + String asciiChars = CharMatcher.ASCII.retainFrom(part); + + if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) { + return false; + } + + // No initial or final dashes or underscores. + + if (DASH_MATCHER.matches(part.charAt(0)) + || DASH_MATCHER.matches(part.charAt(part.length() - 1))) { + return false; + } + + /* + * Note that we allow (in contravention of a strict interpretation of the + * relevant RFCs) domain parts other than the last may begin with a digit + * (for example, "3com.com"). It's important to disallow an initial digit in + * the last part; it's the only thing that stops an IPv4 numeric address + * like 127.0.0.1 from looking like a valid domain name. + */ + + if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) { + return false; + } + + return true; + } + + /** + * Returns the domain name, normalized to all lower case. + */ + public String name() { + return name; + } + + /** + * Returns the individual components of this domain name, normalized to all + * lower case. For example, for the domain name {@code mail.google.com}, this + * method returns the list {@code ["mail", "google", "com"]}. + */ + public ImmutableList<String> parts() { + return parts; + } + + /** + * Indicates whether this domain name represents a <i>public suffix</i>, as + * defined by the Mozilla Foundation's + * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public + * suffix is one under which Internet users can directly register names, such + * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain + * names that are <i>not</i> public suffixes include {@code google}, {@code + * google.com} and {@code foo.co.uk}. + * + * @return {@code true} if this domain name appears exactly on the public + * suffix list + * @since 6.0 + */ + public boolean isPublicSuffix() { + return publicSuffixIndex == 0; + } + + /** + * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() + * public suffix}, including if it is a public suffix itself. For example, + * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and + * {@code com}, but not for {@code google} or {@code google.foo}. This is + * the recommended method for determining whether a domain is potentially an + * addressable host. + * + * @since 6.0 + */ + public boolean hasPublicSuffix() { + return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND; + } + + /** + * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the + * domain name, or {@code null} if no public suffix is present. + * + * @since 6.0 + */ + public InternetDomainName publicSuffix() { + return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null; + } + + /** + * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() + * public suffix}, while not being a public suffix itself. For example, + * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and + * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code + * google.foo}. + * + * <p><b>Warning:</b> a {@code false} result from this method does not imply + * that the domain does not represent an addressable host, as many public + * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for + * that test. + * + * <p>This method can be used to determine whether it will probably be + * possible to set cookies on the domain, though even that depends on + * individual browsers' implementations of cookie controls. See + * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. + * + * @since 6.0 + */ + public boolean isUnderPublicSuffix() { + return publicSuffixIndex > 0; + } + + /** + * Indicates whether this domain name is composed of exactly one subdomain + * component followed by a {@linkplain #isPublicSuffix() public suffix}. For + * example, returns {@code true} for {@code google.com} and {@code foo.co.uk}, + * but not for {@code www.google.com} or {@code co.uk}. + * + * <p><b>Warning:</b> A {@code true} result from this method does not imply + * that the domain is at the highest level which is addressable as a host, as + * many public suffixes are also addressable hosts. For example, the domain + * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would + * return {@code true} from this method. But {@code uk.com} is itself an + * addressable host. + * + * <p>This method can be used to determine whether a domain is probably the + * highest level for which cookies may be set, though even that depends on + * individual browsers' implementations of cookie controls. See + * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. + * + * @since 6.0 + */ + public boolean isTopPrivateDomain() { + return publicSuffixIndex == 1; + } + + /** + * Returns the portion of this domain name that is one level beneath the + * public suffix. For example, for {@code x.adwords.google.co.uk} it returns + * {@code google.co.uk}, since {@code co.uk} is a public suffix. + * + * <p>If {@link #isTopPrivateDomain()} is true, the current domain name + * instance is returned. + * + * <p>This method should not be used to determine the topmost parent domain + * which is addressable as a host, as many public suffixes are also + * addressable hosts. For example, the domain {@code foo.bar.uk.com} has + * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com} + * from this method. But {@code uk.com} is itself an addressable host. + * + * <p>This method can be used to determine the probable highest level parent + * domain for which cookies may be set, though even that depends on individual + * browsers' implementations of cookie controls. + * + * @throws IllegalStateException if this domain does not end with a + * public suffix + * @since 6.0 + */ + public InternetDomainName topPrivateDomain() { + if (isTopPrivateDomain()) { + return this; + } + checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name); + return ancestor(publicSuffixIndex - 1); + } + + /** + * Indicates whether this domain is composed of two or more parts. + */ + public boolean hasParent() { + return parts.size() > 1; + } + + /** + * Returns an {@code InternetDomainName} that is the immediate ancestor of + * this one; that is, the current domain with the leftmost part removed. For + * example, the parent of {@code www.google.com} is {@code google.com}. + * + * @throws IllegalStateException if the domain has no parent, as determined + * by {@link #hasParent} + */ + public InternetDomainName parent() { + checkState(hasParent(), "Domain '%s' has no parent", name); + return ancestor(1); + } + + /** + * Returns the ancestor of the current domain at the given number of levels + * "higher" (rightward) in the subdomain list. The number of levels must be + * non-negative, and less than {@code N-1}, where {@code N} is the number of + * parts in the domain. + * + * <p>TODO: Reasonable candidate for addition to public API. + */ + private InternetDomainName ancestor(int levels) { + return from(DOT_JOINER.join(parts.subList(levels, parts.size()))); + } + + /** + * Creates and returns a new {@code InternetDomainName} by prepending the + * argument and a dot to the current name. For example, {@code + * InternetDomainName.from("foo.com").child("www.bar")} returns a new + * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only + * lenient validation is performed, as described {@link #from(String) here}. + * + * @throws NullPointerException if leftParts is null + * @throws IllegalArgumentException if the resulting name is not valid + */ + public InternetDomainName child(String leftParts) { + return from(checkNotNull(leftParts) + "." + name); + } + + /** + * A deprecated synonym for {@link #isValid(String)}. + * + * @since 8.0 (previously named {@code isValid}) + * @deprecated Use {@link #isValid(String)} instead + */ + @Deprecated + public static boolean isValidLenient(String name) { + return isValid(name); + } + + /** + * Indicates whether the argument is a syntactically valid domain name using + * lenient validation. Specifically, validation against <a + * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> + * ("Internationalizing Domain Names in Applications") is skipped. + * + * <p>The following two code snippets are equivalent: + * + * <pre> {@code + * + * domainName = InternetDomainName.isValid(name) + * ? InternetDomainName.from(name) + * : DEFAULT_DOMAIN; + * }</pre> + * + * <pre> {@code + * + * try { + * domainName = InternetDomainName.from(name); + * } catch (IllegalArgumentException e) { + * domainName = DEFAULT_DOMAIN; + * }}</pre> + * + * @since 8.0 (previously named {@code isValidLenient}) + */ + public static boolean isValid(String name) { + try { + from(name); + return true; + } catch (IllegalArgumentException e) { + return false; + } + } + + /** + * Does the domain name match one of the "wildcard" patterns (e.g. + * {@code "*.ar"})? + */ + private static boolean matchesWildcardPublicSuffix(String domain) { + final String[] pieces = domain.split(DOT_REGEX, 2); + return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]); + } + + // TODO: specify this to return the same as name(); remove name() + @Override + public String toString() { + return Objects.toStringHelper(this).add("name", name).toString(); + } + + /** + * Equality testing is based on the text supplied by the caller, + * after normalization as described in the class documentation. For + * example, a non-ASCII Unicode domain name and the Punycode version + * of the same domain name would not be considered equal. + * + */ + @Override + public boolean equals(@Nullable Object object) { + if (object == this) { + return true; + } + + if (object instanceof InternetDomainName) { + InternetDomainName that = (InternetDomainName) object; + return this.name.equals(that.name); + } + + return false; + } + + @Override + public int hashCode() { + return name.hashCode(); + } +} |