summaryrefslogtreecommitdiffstats
path: root/common/tools/make-iana-tld-pattern.py
diff options
context:
space:
mode:
Diffstat (limited to 'common/tools/make-iana-tld-pattern.py')
-rwxr-xr-xcommon/tools/make-iana-tld-pattern.py144
1 files changed, 0 insertions, 144 deletions
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
deleted file mode 100755
index de81c58..0000000
--- a/common/tools/make-iana-tld-pattern.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python
-
-from urllib2 import urlopen
-
# Java source text printed ahead of the generated TOP_LEVEL_DOMAIN_STR
# expression.  The leading and trailing empty entries give the text its
# opening and closing newline.
TLD_PREFIX = "\n".join((
    "",
    " /**",
    " * Regular expression to match all IANA top-level domains.",
    " * List accurate as of 2010/02/05. List taken from:",
    " * http://data.iana.org/TLD/tlds-alpha-by-domain.txt",
    " * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py",
    " */",
    " public static final String TOP_LEVEL_DOMAIN_STR =",
    "",
))
# Closes the Java string literal left open by the last bucket and ends the
# statement.
TLD_SUFFIX = "\";"

# Same as TLD_PREFIX, but for the WEB_URL variant; it additionally opens a
# non-capturing group as the first Java string fragment.
URL_PREFIX = "\n".join((
    "",
    " /**",
    " * Regular expression to match all IANA top-level domains for WEB_URL.",
    " * List accurate as of 2010/02/05. List taken from:",
    " * http://data.iana.org/TLD/tlds-alpha-by-domain.txt",
    " * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py",
    " */",
    " public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =",
    " \"(?:\"",
    "",
))

# makePattern already closes the string literal for the WEB_URL variant, so
# only the statement terminator remains.
URL_SUFFIX = ";"
-
class Bucket(object):
    """Collects the TLDs sharing one first letter and renders them as regex.

    Two-character entries are stored by their second character only (in
    ``letters``) so they can be emitted as a compact character class such as
    ``a[cd]``; every other entry is kept whole in ``words``.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for entries starting with ``baseLetter``."""
        self.base = baseLetter
        self.words = []    # entries whose length is not 2, stored whole
        self.letters = []  # second character of each two-character entry

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Args:
            isWebUrl: use a non-capturing ``(?:`` group instead of ``(`` and
                open the first line with ``+ "`` instead of ``"(``.
            isFirst: this is the first line emitted, so no leading ``|`` is
                written.
            isLast: leave the Java string literal open (no closing quote and
                no newline) so the caller can append the closing text.

        Returns:
            The formatted line, or ``''`` when the bucket holds nothing.

        Note:
            ``words`` and ``letters`` are sorted in place.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Each '-' is preceded by two backslash characters: the output is
        # Java source, where that pair denotes one regex backslash.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            alternatives.append('%s[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)

        # A group is only emitted when there are whole words; a lone
        # two-letter alternative or character class is written bare.
        if self.words:
            body = ('(?:' if isWebUrl else '(') + body + ')'

        output = ' ' + opener + body
        if not isLast:
            output += '"\n'
        return output

    def add(self, line):
        """Record one entry; empty lines and ``#`` comment lines are ignored."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1])
        else:
            self.words.append(line)
-
def getBucket(buckets, line):
    """Return the bucket for the first character of ``line``.

    Looks the character up in the ``buckets`` dict; when nothing (or None) is
    stored there, a new Bucket is created, registered under that character,
    and returned.
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
-
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print the Java declaration assembled from the buckets for 'a'..'z'.

    Args:
        prefix: text written before the first bucket line.
        suffix: text written after the closing parenthesis text.
        buckets: dict mapping a lower-case letter to its Bucket.
        isWebUrl: forwarded to every ``Bucket.dump`` call; also selects the
            closing text (``))"`` instead of ``)``).

    Only non-empty buckets are rendered.  The first-line and last-line
    markers go to the first and last non-empty bucket rather than being tied
    to 'a' and 'z', so a missing letter at either end cannot drop the opening
    text or leave an extra closed line before the closing text.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    pieces = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)

    # Call form with a single argument behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print(''.join(pieces))
-
if __name__ == "__main__":
    # Download the current IANA list, group the entries by first letter and
    # print both Java declarations.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even when the read fails.
        f.close()

    buckets = {}

    for domain in domains:
        domain = domain.strip().lower()

        # Skip blank lines and '#' comment lines up front so they do not
        # create buckets keyed by '#' or a newline character.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)