summaryrefslogtreecommitdiffstats
path: root/guava/src/com/google/common/net/InternetDomainName.java
diff options
context:
space:
mode:
Diffstat (limited to 'guava/src/com/google/common/net/InternetDomainName.java')
-rw-r--r--guava/src/com/google/common/net/InternetDomainName.java580
1 files changed, 580 insertions, 0 deletions
diff --git a/guava/src/com/google/common/net/InternetDomainName.java b/guava/src/com/google/common/net/InternetDomainName.java
new file mode 100644
index 0000000..ace7cf2
--- /dev/null
+++ b/guava/src/com/google/common/net/InternetDomainName.java
@@ -0,0 +1,580 @@
+/*
+ * Copyright (C) 2009 The Guava Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.common.net;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+
+import com.google.common.annotations.Beta;
+import com.google.common.annotations.GwtCompatible;
+import com.google.common.base.Ascii;
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Joiner;
+import com.google.common.base.Objects;
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+import javax.annotation.Nullable;
+
+/**
+ * An immutable well-formed internet domain name, such as {@code com} or {@code
+ * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
+ * network interactions take place. Thus there is no guarantee that the domain
+ * actually exists on the internet.
+ *
+ * <p>One common use of this class is to determine whether a given string is
+ * likely to represent an addressable domain on the web -- that is, for a
+ * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
+ * result in a webpage being displayed? In the past, this test was frequently
+ * done by determining whether the domain ended with a {@linkplain
+ * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
+ * this test is no longer accurate. There are many domains which are both public
+ * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
+ * result, the only useful test to determine if a domain is a plausible web host
+ * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
+ * which (currently) are not hosts, such as {@code "com"}, but given that any
+ * public suffix may become a host without warning, it is better to err on the
+ * side of permissiveness and thus avoid spurious rejection of valid sites.
+ *
+ * <p>During construction, names are normalized in two ways:
+ * <ol>
+ * <li>ASCII uppercase characters are converted to lowercase.
+ * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
+ * converted to the ASCII period.
+ * </ol>
+ * The normalized values will be returned from {@link #name()} and
+ * {@link #parts()}, and will be reflected in the result of
+ * {@link #equals(Object)}.
+ *
+ * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
+ * Internationalized domain names</a> such as {@code 网络.cn} are supported, as
+ * are the equivalent <a
+ * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
+ * Punycode-encoded</a> versions.
+ *
+ * @author Craig Berry
+ * @since 5.0
+ */
+@Beta
+@GwtCompatible(emulated = true)
+public final class InternetDomainName {
+
+  /**
+   * Matches every character accepted as a part separator: the ASCII period,
+   * the ideographic full stop (U+3002), the fullwidth full stop (U+FF0E) and
+   * the halfwidth ideographic full stop (U+FF61).
+   */
+  private static final CharMatcher DOTS_MATCHER =
+      CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
+
+  /** Splits a normalized name into its parts at each ASCII period. */
+  private static final Splitter DOT_SPLITTER = Splitter.on('.');
+
+  /** Joins a sequence of parts back into a name using ASCII periods. */
+  private static final Joiner DOT_JOINER = Joiner.on('.');
+
+  /**
+   * Sentinel stored in {@link #publicSuffixIndex} when the name does not end
+   * in any known public suffix.
+   */
+  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
+
+  /** Regular expression that matches one literal ASCII period. */
+  private static final String DOT_REGEX = "\\.";
+
+  /**
+   * Upper bound on the number of parts (labels) in a domain name. It follows
+   * from the 255-octet limit in
+   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11: every
+   * part costs at least two bytes (dot plus label externally, length byte
+   * plus label internally), so at most 127 one-byte labels fit.
+   */
+  private static final int MAX_PARTS = 127;
+
+  /**
+   * Upper bound on the length of a complete domain name, counting separators
+   * and leaving room for the root label. See
+   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
+   */
+  private static final int MAX_LENGTH = 253;
+
+  /**
+   * Upper bound on the length of one part of a domain name. See
+   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
+   */
+  private static final int MAX_DOMAIN_PART_LENGTH = 63;
+
+  /**
+   * The complete domain name after normalization (ASCII letters lower-cased,
+   * separators unified to the ASCII period).
+   */
+  private final String name;
+
+  /**
+   * The normalized name split into its parts, leftmost part first.
+   */
+  private final ImmutableList<String> parts;
+
+  /**
+   * Position within {@link #parts()} of the first part of the public suffix.
+   * For {@code www.google.co.uk} this is 2, the index of {@code co}. When no
+   * public suffix was found the value is negative, namely
+   * {@link #NO_PUBLIC_SUFFIX_FOUND}.
+   */
+  private final int publicSuffixIndex;
+
+  /**
+   * Builds an instance from raw text; this is the constructor behind
+   * {@link #from(String)}.
+   *
+   * <p>The text is normalized first (dot-like separators become {@code '.'},
+   * ASCII letters become lower case, one trailing {@code '.'} is dropped) and
+   * is then checked for overall length, number of parts and part syntax, in
+   * that order.
+   *
+   * @param name the raw domain name text
+   * @throws IllegalArgumentException if the normalized name is longer than
+   *     {@link #MAX_LENGTH}, has more than {@link #MAX_PARTS} parts, or is
+   *     rejected by {@link #validateSyntax(List)}
+   */
+  InternetDomainName(String name) {
+    // Unify all dot-like separators to '.', then lower-case ASCII letters.
+    String normalized = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
+
+    // Drop a trailing '.' if there is one; only a single dot is removed.
+    if (normalized.endsWith(".")) {
+      normalized = normalized.substring(0, normalized.length() - 1);
+    }
+
+    checkArgument(normalized.length() <= MAX_LENGTH,
+        "Domain name too long: '%s':", normalized);
+    this.name = normalized;
+
+    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(normalized));
+    checkArgument(this.parts.size() <= MAX_PARTS,
+        "Domain has too many parts: '%s'", normalized);
+    checkArgument(validateSyntax(this.parts),
+        "Not a valid domain name: '%s'", normalized);
+
+    // Needs this.parts to be set, so it has to run last.
+    this.publicSuffixIndex = findPublicSuffix();
+  }
+
+  /**
+   * Locates the public suffix of this name by testing each successively
+   * shorter ancestor, starting with the full name.
+   *
+   * <p>Note that the span reported here may not itself satisfy
+   * {@link #isPublicSuffix()} when the name ends with an excluded domain
+   * pattern such as {@code "nhs.uk"}.
+   *
+   * @return the index in {@link #parts()} of the leftmost part of the public
+   *     suffix, or {@link #NO_PUBLIC_SUFFIX_FOUND} (-1) if none matched
+   */
+  private int findPublicSuffix() {
+    final int count = parts.size();
+    int start = 0;
+
+    while (start < count) {
+      String candidate = DOT_JOINER.join(parts.subList(start, count));
+
+      if (TldPatterns.EXACT.contains(candidate)) {
+        return start;
+      }
+
+      // An excluded domain (e.g. !nhs.uk) is not a suffix itself; the next
+      // higher domain (e.g. uk) acts as the effective public suffix. This
+      // test must stay ahead of the wildcard test below.
+      if (TldPatterns.EXCLUDED.contains(candidate)) {
+        return start + 1;
+      }
+
+      if (matchesWildcardPublicSuffix(candidate)) {
+        return start;
+      }
+
+      start++;
+    }
+
+    return NO_PUBLIC_SUFFIX_FOUND;
+  }
+
+  /**
+   * Deprecated alias that simply forwards to {@link #from(String)}.
+   *
+   * @param domain A domain name (not IP address)
+   * @return the instance produced by {@link #from(String)}
+   * @throws IllegalArgumentException if {@code domain} is not syntactically
+   *     valid according to {@link #isValidLenient}
+   * @since 8.0 (previously named {@code from})
+   * @deprecated Use {@link #from(String)}
+   */
+  @Deprecated
+  public static InternetDomainName fromLenient(String domain) {
+    return InternetDomainName.from(domain);
+  }
+
+  /**
+   * Creates an {@link InternetDomainName} from the given text, applying
+   * lenient validation. Validation against <a
+   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
+   * ("Internationalizing Domain Names in Applications") is not performed, and
+   * validation against <a
+   * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is loosened as
+   * follows:
+   * <ul>
+   * <li>Any part containing non-ASCII characters is considered valid.
+   * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
+   * <li>Parts other than the final part may start with a digit.
+   * </ul>
+   *
+   * @param domain A domain name (not IP address)
+   * @return a new instance holding the normalized form of {@code domain}
+   * @throws NullPointerException if {@code domain} is null
+   * @throws IllegalArgumentException if {@code domain} is not syntactically
+   *     valid according to {@link #isValid}
+   * @since 10.0 (previously named {@code fromLenient})
+   */
+  public static InternetDomainName from(String domain) {
+    // Reject null up front so the constructor never sees it.
+    checkNotNull(domain);
+    return new InternetDomainName(domain);
+  }
+
+  /**
+   * Checks that every part of a split domain name is syntactically
+   * acceptable. Used during construction on behalf of {@link #from(String)}
+   * to enforce the (relaxed) RFC 1035 syntax.
+   *
+   * @param parts the parts of the name, leftmost first; must not be empty
+   * @return {@code true} only if all parts pass
+   *     {@link #validatePart(String, boolean)}
+   */
+  private static boolean validateSyntax(List<String> parts) {
+    final int finalIndex = parts.size() - 1;
+
+    // The rightmost part has stricter rules, so it is checked on its own.
+    if (!validatePart(parts.get(finalIndex), true)) {
+      return false;
+    }
+
+    // Every part to the left of the final one uses the ordinary rules.
+    for (String leading : parts.subList(0, finalIndex)) {
+      if (!validatePart(leading, false)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /** Matches the dash-like characters allowed inside a part: '-' and '_'. */
+  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
+
+  /** Matches any character a part may contain: letter, digit, '-' or '_'. */
+  private static final CharMatcher PART_CHAR_MATCHER =
+      CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
+
+  /**
+   * Decides whether a single part of a domain name is acceptable. This is the
+   * per-part check behind {@link #validateSyntax(List)}.
+   *
+   * @param part The domain name part to be validated
+   * @param isFinalPart Is this the final (rightmost) domain part?
+   * @return Whether the part is valid
+   */
+  private static boolean validatePart(String part, boolean isFinalPart) {
+    // Each rule is kept as its own early exit rather than being folded into
+    // one large boolean expression.
+
+    final int length = part.length();
+    if (length < 1 || length > MAX_DOMAIN_PART_LENGTH) {
+      return false;
+    }
+
+    /*
+     * GWT claims to support java.lang.Character's char-classification
+     * methods, but it actually only works for ASCII. So for now, any
+     * non-ASCII character is assumed to be valid. The only place this seems
+     * to be documented is here:
+     * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
+     *
+     * The ASCII characters of the part must be valid per RFC 1035, with
+     * underscore additionally allowed due to widespread practice.
+     */
+    String asciiOnly = CharMatcher.ASCII.retainFrom(part);
+    if (!PART_CHAR_MATCHER.matchesAllOf(asciiOnly)) {
+      return false;
+    }
+
+    // A part may neither start nor end with a dash or an underscore.
+    final char first = part.charAt(0);
+    final char last = part.charAt(length - 1);
+    if (DASH_MATCHER.matches(first) || DASH_MATCHER.matches(last)) {
+      return false;
+    }
+
+    /*
+     * In contravention of a strict reading of the relevant RFCs, parts other
+     * than the last may begin with a digit (for example, "3com.com"). The
+     * last part must not: that restriction is the only thing that stops an
+     * IPv4 numeric address like 127.0.0.1 from looking like a valid domain
+     * name.
+     */
+    if (isFinalPart && CharMatcher.DIGIT.matches(first)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Returns the complete domain name in normalized form, that is, with ASCII
+   * letters in lower case.
+   */
+  public String name() {
+    return this.name;
+  }
+
+  /**
+   * Returns the parts that make up this domain name, leftmost first, in
+   * normalized (lower case) form. For the domain name
+   * {@code mail.google.com} the result is the list
+   * {@code ["mail", "google", "com"]}.
+   */
+  public ImmutableList<String> parts() {
+    return this.parts;
+  }
+
+  /**
+   * Tells whether this domain name is a <i>public suffix</i> as defined by
+   * the Mozilla Foundation's
+   * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
+   * suffix is a name under which Internet users can directly register names;
+   * {@code com}, {@code co.uk} and {@code pvt.k12.wy.us} are examples. Names
+   * such as {@code google}, {@code google.com} and {@code foo.co.uk} are
+   * <i>not</i> public suffixes.
+   *
+   * @return {@code true} if this domain name appears exactly on the public
+   *     suffix list
+   * @since 6.0
+   */
+  public boolean isPublicSuffix() {
+    // The whole name is the suffix when the suffix starts at the first part.
+    final boolean suffixStartsAtFirstPart = (publicSuffixIndex == 0);
+    return suffixStartsAtFirstPart;
+  }
+
+  /**
+   * Tells whether this domain name ends in a {@linkplain #isPublicSuffix()
+   * public suffix}; a name that is itself a public suffix counts. The result
+   * is {@code true} for {@code www.google.com}, {@code foo.co.uk} and
+   * {@code com}, and {@code false} for {@code google} or {@code google.foo}.
+   * This is the recommended way to decide whether a domain is potentially an
+   * addressable host.
+   *
+   * @since 6.0
+   */
+  public boolean hasPublicSuffix() {
+    return !(publicSuffixIndex == NO_PUBLIC_SUFFIX_FOUND);
+  }
+
+  /**
+   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of this
+   * domain name.
+   *
+   * @return the public suffix, or {@code null} if no public suffix is present
+   * @since 6.0
+   */
+  public InternetDomainName publicSuffix() {
+    if (!hasPublicSuffix()) {
+      return null;
+    }
+    return ancestor(publicSuffixIndex);
+  }
+
+  /**
+   * Tells whether this domain name ends in a {@linkplain #isPublicSuffix()
+   * public suffix} without being a public suffix itself. The result is
+   * {@code true} for {@code www.google.com}, {@code foo.co.uk} and
+   * {@code bar.ca.us}, and {@code false} for {@code google}, {@code com}, or
+   * {@code google.foo}.
+   *
+   * <p><b>Warning:</b> a {@code false} result does not mean that the domain
+   * cannot be an addressable host, because many public suffixes are also
+   * addressable hosts. Use {@link #hasPublicSuffix()} for that test.
+   *
+   * <p>This method can be used to judge whether it will probably be possible
+   * to set cookies on the domain, though even that depends on individual
+   * browsers' implementations of cookie controls. See
+   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
+   *
+   * @since 6.0
+   */
+  public boolean isUnderPublicSuffix() {
+    // At least one part must sit to the left of the suffix.
+    return publicSuffixIndex >= 1;
+  }
+
+  /**
+   * Tells whether this domain name consists of exactly one subdomain
+   * component followed by a {@linkplain #isPublicSuffix() public suffix}.
+   * The result is {@code true} for {@code google.com} and {@code foo.co.uk},
+   * and {@code false} for {@code www.google.com} or {@code co.uk}.
+   *
+   * <p><b>Warning:</b> A {@code true} result does not mean that the domain is
+   * at the highest level which is addressable as a host, because many public
+   * suffixes are also addressable hosts. For example, {@code bar.uk.com} has
+   * a public suffix of {@code uk.com}, so this method returns {@code true}
+   * for it, yet {@code uk.com} is itself an addressable host.
+   *
+   * <p>This method can be used to judge whether a domain is probably the
+   * highest level for which cookies may be set, though even that depends on
+   * individual browsers' implementations of cookie controls. See
+   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
+   *
+   * @since 6.0
+   */
+  public boolean isTopPrivateDomain() {
+    // Exactly one part precedes the suffix when the suffix starts at index 1.
+    final boolean onePartBeforeSuffix = (publicSuffixIndex == 1);
+    return onePartBeforeSuffix;
+  }
+
+  /**
+   * Returns the part of this domain name that lies one level beneath the
+   * public suffix. For {@code x.adwords.google.co.uk} the result is
+   * {@code google.co.uk}, because {@code co.uk} is a public suffix.
+   *
+   * <p>When {@link #isTopPrivateDomain()} is true, this very instance is
+   * returned.
+   *
+   * <p>Do not use this method to find the topmost parent domain which is
+   * addressable as a host, because many public suffixes are also addressable
+   * hosts. For example, {@code foo.bar.uk.com} has a public suffix of
+   * {@code uk.com}, so this method returns {@code bar.uk.com} for it, yet
+   * {@code uk.com} is itself an addressable host.
+   *
+   * <p>This method can be used to find the probable highest level parent
+   * domain for which cookies may be set, though even that depends on
+   * individual browsers' implementations of cookie controls.
+   *
+   * @throws IllegalStateException if this domain does not end with a
+   *     public suffix
+   * @since 6.0
+   */
+  public InternetDomainName topPrivateDomain() {
+    if (!isTopPrivateDomain()) {
+      checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
+      // Keep exactly one part in front of the public suffix.
+      final int firstKeptPart = publicSuffixIndex - 1;
+      return ancestor(firstKeptPart);
+    }
+    return this;
+  }
+
+  /**
+   * Tells whether this domain name has at least two parts, and therefore a
+   * parent domain.
+   */
+  public boolean hasParent() {
+    return parts.size() >= 2;
+  }
+
+  /**
+   * Returns the immediate ancestor of this domain name, i.e. this name
+   * without its leftmost part. For example, the parent of
+   * {@code www.google.com} is {@code google.com}.
+   *
+   * @return the parent domain name
+   * @throws IllegalStateException if the domain has no parent, as determined
+   *     by {@link #hasParent}
+   */
+  public InternetDomainName parent() {
+    checkState(hasParent(), "Domain '%s' has no parent", name);
+    // Removing one level from the left yields the parent.
+    return ancestor(1);
+  }
+
+  /**
+   * Returns the ancestor of this domain that is the given number of levels
+   * "higher" (rightward) in the subdomain list, built by dropping that many
+   * parts from the left. The number of levels must be non-negative and at
+   * most {@code N-1}, where {@code N} is the number of parts in the domain,
+   * so that at least one part remains.
+   *
+   * <p>TODO: Reasonable candidate for addition to public API.
+   *
+   * @param levels how many leftmost parts to remove
+   */
+  private InternetDomainName ancestor(int levels) {
+    List<String> remaining = parts.subList(levels, parts.size());
+    String ancestorName = DOT_JOINER.join(remaining);
+    return from(ancestorName);
+  }
+
+  /**
+   * Builds a new {@code InternetDomainName} whose text is the argument, a
+   * dot, and then the current name. For example, {@code
+   * InternetDomainName.from("foo.com").child("www.bar")} yields a new
+   * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
+   * lenient validation is performed, as described {@link #from(String) here}.
+   *
+   * @param leftParts the part or parts to place in front of this name
+   * @throws NullPointerException if leftParts is null
+   * @throws IllegalArgumentException if the resulting name is not valid
+   */
+  public InternetDomainName child(String leftParts) {
+    String prefix = checkNotNull(leftParts);
+    String childName = prefix + "." + name;
+    return from(childName);
+  }
+
+  /**
+   * Deprecated alias that simply forwards to {@link #isValid(String)}.
+   *
+   * @param name the candidate domain name
+   * @return the result of {@link #isValid(String)} for {@code name}
+   * @since 8.0 (previously named {@code isValid})
+   * @deprecated Use {@link #isValid(String)} instead
+   */
+  @Deprecated
+  public static boolean isValidLenient(String name) {
+    return InternetDomainName.isValid(name);
+  }
+
+  /**
+   * Tells whether the argument is a syntactically valid domain name under
+   * lenient validation. In particular, validation against <a
+   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
+   * ("Internationalizing Domain Names in Applications") is skipped.
+   *
+   * <p>The following two code snippets are equivalent:
+   *
+   * <pre>   {@code
+   *
+   *   domainName = InternetDomainName.isValid(name)
+   *       ? InternetDomainName.from(name)
+   *       : DEFAULT_DOMAIN;
+   *   }</pre>
+   *
+   * <pre>   {@code
+   *
+   *   try {
+   *     domainName = InternetDomainName.from(name);
+   *   } catch (IllegalArgumentException e) {
+   *     domainName = DEFAULT_DOMAIN;
+   *   }}</pre>
+   *
+   * <p>Only {@link IllegalArgumentException} is translated into a
+   * {@code false} result; any other exception raised by
+   * {@link #from(String)} propagates to the caller.
+   *
+   * @param name the candidate domain name
+   * @return {@code true} if {@link #from(String)} accepts {@code name}
+   * @since 8.0 (previously named {@code isValidLenient})
+   */
+  public static boolean isValid(String name) {
+    boolean accepted;
+    try {
+      // The instance is discarded; only success or failure matters here.
+      from(name);
+      accepted = true;
+    } catch (IllegalArgumentException e) {
+      accepted = false;
+    }
+    return accepted;
+  }
+
+  /**
+   * Tells whether the domain name matches one of the "wildcard" patterns
+   * (e.g. {@code "*.ar"}): its leftmost part is ignored and whatever follows
+   * the first dot is looked up in {@code TldPatterns.UNDER}.
+   *
+   * @param domain the normalized name to test
+   * @return {@code true} if the name contains a dot and the text after its
+   *     first dot is a wildcard base
+   */
+  private static boolean matchesWildcardPublicSuffix(String domain) {
+    // A limit of 2 splits at the first dot only: [leftmost part, the rest].
+    final String[] headAndRest = domain.split(DOT_REGEX, 2);
+    if (headAndRest.length != 2) {
+      return false;
+    }
+    return TldPatterns.UNDER.contains(headAndRest[1]);
+  }
+
+  // TODO: specify this to return the same as name(); remove name()
+  /**
+   * Returns a description of this instance, produced by Guava's
+   * {@code toStringHelper}, that includes the normalized name.
+   */
+  @Override
+  public String toString() {
+    return Objects.toStringHelper(this)
+        .add("name", name)
+        .toString();
+  }
+
+  /**
+   * Compares two domain names by the text supplied by the caller, after the
+   * normalization described in the class documentation. As a consequence, a
+   * non-ASCII Unicode domain name and the Punycode version of the same
+   * domain name are not considered equal.
+   *
+   * @param object the object to compare with, possibly {@code null}
+   * @return {@code true} if {@code object} is an {@code InternetDomainName}
+   *     with the same normalized name
+   */
+  @Override
+  public boolean equals(@Nullable Object object) {
+    if (this == object) {
+      return true;
+    }
+
+    // instanceof is false for null, so no separate null check is needed.
+    if (!(object instanceof InternetDomainName)) {
+      return false;
+    }
+
+    InternetDomainName other = (InternetDomainName) object;
+    return name.equals(other.name);
+  }
+
+  /**
+   * Returns the hash code of the normalized name, which keeps this method
+   * consistent with {@link #equals(Object)}.
+   */
+  @Override
+  public int hashCode() {
+    return this.name.hashCode();
+  }
+}