diff options
Diffstat (limited to 'common/tools/make-iana-tld-pattern.py')
-rwxr-xr-x | common/tools/make-iana-tld-pattern.py | 32 |
1 files changed, 8 insertions, 24 deletions
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py index ece4dcf..de81c58 100755 --- a/common/tools/make-iana-tld-pattern.py +++ b/common/tools/make-iana-tld-pattern.py @@ -4,43 +4,27 @@ from urllib2 import urlopen TLD_PREFIX = r""" /** - * Regular expression pattern to match all IANA top-level domains. + * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( + public static final String TOP_LEVEL_DOMAIN_STR = """ -TLD_SUFFIX = '");' +TLD_SUFFIX = '";' URL_PREFIX = r""" /** - * Regular expression pattern to match RFC 1738 URLs + * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern WEB_URL = Pattern.compile( - "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" - + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" - + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" - + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host - + "(?:" // plus top level domain + public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = + "(?:" """ -URL_SUFFIX = r""" - + "|(?:(?:25[0-5]|2[0-4]" // or ip address - + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" - + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" - + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" - + "|[1-9][0-9]|[0-9])))" - + "(?:\\:\\d{1,5})?)" // plus option port number - + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params - + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" - + "(?:\\b|$)"); // and finally, a word boundary or end of - // input. This is to stop foo.sure from - // matching as foo.su -""" +URL_SUFFIX = ';' class Bucket: def __init__(self, baseLetter): |