diff options
-rw-r--r-- | common/java/com/android/common/Patterns.java | 51 | ||||
-rw-r--r-- | common/tests/src/com/android/common/PatternsTest.java | 13 | ||||
-rwxr-xr-x | common/tools/make-iana-tld-pattern.py | 32 | ||||
-rw-r--r-- | core/java/android/net/WebAddress.java | 4 |
4 files changed, 61 insertions, 39 deletions
diff --git a/common/java/com/android/common/Patterns.java b/common/java/com/android/common/Patterns.java index 71c3a5e..3b3b038 100644 --- a/common/java/com/android/common/Patterns.java +++ b/common/java/com/android/common/Patterns.java @@ -24,12 +24,12 @@ import java.util.regex.Pattern; */ public class Patterns { /** - * Regular expression pattern to match all IANA top-level domains. + * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( + public static final String TOP_LEVEL_DOMAIN_STR = "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(biz|b[abdefghijmnorstvwyz])" + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" @@ -55,20 +55,22 @@ public class Patterns { + "|w[fs]" + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" - + "|z[amw])"); + + "|z[amw])"; + + /** + * Regular expression pattern to match all IANA top-level domains. + */ + public static final Pattern TOP_LEVEL_DOMAIN = + Pattern.compile(TOP_LEVEL_DOMAIN_STR); /** - * Regular expression pattern to match RFC 1738 URLs + * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern WEB_URL = Pattern.compile( - "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" - + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" - + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" - + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host - + "(?:" // plus top level domain + public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = + "(?:" + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(?:biz|b[abdefghijmnorstvwyz])" + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" @@ -94,7 +96,28 @@ public class Patterns { + "|w[fs]" + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" - + "|z[amw]))" + + "|z[amw]))"; + + /** + * Good characters for Internationalized Resource Identifiers (IRI). + * This comprises most common used Unicode characters allowed in IRI + * as detailed in RFC 3987. + * Specifically, those two byte Unicode characters are not included. + */ + public static final String GOOD_IRI_CHAR = + "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; + + /** + * Regular expression pattern to match most part of RFC 3987 + * Internationalized URLs, aka IRIs. Commonly used Unicode characters are + * added. + */ + public static final Pattern WEB_URL = Pattern.compile( + "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host + + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" @@ -116,7 +139,7 @@ public class Patterns { public static final Pattern DOMAIN_NAME = Pattern.compile( - "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" + "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" + TOP_LEVEL_DOMAIN + ")|" + IP_ADDRESS + ")"); diff --git a/common/tests/src/com/android/common/PatternsTest.java b/common/tests/src/com/android/common/PatternsTest.java index 635601e..9e2ad58 100644 --- a/common/tests/src/com/android/common/PatternsTest.java +++ b/common/tests/src/com/android/common/PatternsTest.java @@ -68,6 +68,12 @@ public class PatternsTest extends TestCase { t = Patterns.WEB_URL.matcher("xn--fsqu00a.xn--0zwm56d").matches(); assertTrue("Valid URL", t); + // Internationalized URL. + t = Patterns.WEB_URL.matcher("http://\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); + assertTrue("Valid URL", t); + t = Patterns.WEB_URL.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); + assertTrue("Valid URL", t); + t = Patterns.WEB_URL.matcher("ftp://www.example.com").matches(); assertFalse("Matched invalid protocol", t); @@ -99,6 +105,13 @@ public class PatternsTest extends TestCase { t = Patterns.DOMAIN_NAME.matcher("mail.example.com").matches(); assertTrue("Valid domain", t); + t = Patterns.WEB_URL.matcher("google.me").matches(); + assertTrue("Valid domain", t); + + // Internationalized domains. + t = Patterns.DOMAIN_NAME.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); + assertTrue("Valid domain", t); + t = Patterns.DOMAIN_NAME.matcher("__+&42.xer").matches(); assertFalse("Invalid domain", t); } diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py index ece4dcf..de81c58 100755 --- a/common/tools/make-iana-tld-pattern.py +++ b/common/tools/make-iana-tld-pattern.py @@ -4,43 +4,27 @@ from urllib2 import urlopen TLD_PREFIX = r""" /** - * Regular expression pattern to match all IANA top-level domains. + * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( + public static final String TOP_LEVEL_DOMAIN_STR = """ -TLD_SUFFIX = '");' +TLD_SUFFIX = '";' URL_PREFIX = r""" /** - * Regular expression pattern to match RFC 1738 URLs + * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern WEB_URL = Pattern.compile( - "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" - + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" - + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" - + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host - + "(?:" // plus top level domain + public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = + "(?:" """ -URL_SUFFIX = r""" - + "|(?:(?:25[0-5]|2[0-4]" // or ip address - + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" - + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" - + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" - + "|[1-9][0-9]|[0-9])))" - + "(?:\\:\\d{1,5})?)" // plus option port number - + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params - + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" - + "(?:\\b|$)"); // and finally, a word boundary or end of - // input. This is to stop foo.sure from - // matching as foo.su -""" +URL_SUFFIX = ';' class Bucket: def __init__(self, baseLetter): diff --git a/core/java/android/net/WebAddress.java b/core/java/android/net/WebAddress.java index f4ae66a..2d078c1 100644 --- a/core/java/android/net/WebAddress.java +++ b/core/java/android/net/WebAddress.java @@ -16,6 +16,8 @@ package android.net; +import static com.android.common.Patterns.GOOD_IRI_CHAR; + import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -54,7 +56,7 @@ public class WebAddress { static Pattern sAddressPattern = Pattern.compile( /* scheme */ "(?:(http|HTTP|https|HTTPS|file|FILE)\\:\\/\\/)?" + /* authority */ "(?:([-A-Za-z0-9$_.+!*'(),;?&=]+(?:\\:[-A-Za-z0-9$_.+!*'(),;?&=]+)?)@)?" + - /* host */ "([-A-Za-z0-9%_]+(?:\\.[-A-Za-z0-9%_]+)*|\\[[0-9a-fA-F:\\.]+\\])?" + + /* host */ "([-" + GOOD_IRI_CHAR + "%_]+(?:\\.[-" + GOOD_IRI_CHAR + "%_]+)*|\\[[0-9a-fA-F:\\.]+\\])?" + /* port */ "(?:\\:([0-9]+))?" + /* path */ "(\\/?.*)?"); |