summaryrefslogtreecommitdiffstats
path: root/common/tools/make-iana-tld-pattern.py
diff options
context:
space:
mode:
Diffstat (limited to 'common/tools/make-iana-tld-pattern.py')
-rwxr-xr-xcommon/tools/make-iana-tld-pattern.py32
1 files changed, 8 insertions, 24 deletions
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
index ece4dcf..de81c58 100755
--- a/common/tools/make-iana-tld-pattern.py
+++ b/common/tools/make-iana-tld-pattern.py
@@ -4,43 +4,27 @@ from urllib2 import urlopen
TLD_PREFIX = r"""
/**
- * Regular expression pattern to match all IANA top-level domains.
+ * Regular expression to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
- public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+ public static final String TOP_LEVEL_DOMAIN_STR =
"""
-TLD_SUFFIX = '");'
+TLD_SUFFIX = '";'
URL_PREFIX = r"""
/**
- * Regular expression pattern to match RFC 1738 URLs
+ * Regular expression to match all IANA top-level domains for WEB_URL.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
- * This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
- public static final Pattern WEB_URL = Pattern.compile(
- "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
- + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
- + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
- + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
- + "(?:" // plus top level domain
+ public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
+ "(?:"
"""
-URL_SUFFIX = r"""
- + "|(?:(?:25[0-5]|2[0-4]" // or ip address
- + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
- + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
- + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
- + "|[1-9][0-9]|[0-9])))"
- + "(?:\\:\\d{1,5})?)" // plus option port number
- + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
- + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
- + "(?:\\b|$)"); // and finally, a word boundary or end of
- // input. This is to stop foo.sure from
- // matching as foo.su
-"""
+URL_SUFFIX = ';'
class Bucket:
def __init__(self, baseLetter):