summaryrefslogtreecommitdiffstats
path: root/common
diff options
context:
space:
mode:
authorShimeng (Simon) Wang <swang@google.com>2010-02-16 10:49:37 -0800
committerAndroid (Google) Code Review <android-gerrit@google.com>2010-02-16 10:49:37 -0800
commite8cb5ab7c0e9f22ffcaee8de718c83aa01765f55 (patch)
tree5238bdc4e612f37150c41745129504dec8cf5402 /common
parent3d0ff09e35521a97ec1cfe14d57c6868e271fe9a (diff)
parent51c02dbf75fede9b2829af5b821f10e0bc2af124 (diff)
downloadframeworks_base-e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55.zip
frameworks_base-e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55.tar.gz
frameworks_base-e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55.tar.bz2
Merge "Enhance URL regular expression to match more Unicode chars."
Diffstat (limited to 'common')
-rw-r--r--common/java/com/android/common/Patterns.java51
-rw-r--r--common/tests/src/com/android/common/PatternsTest.java13
-rwxr-xr-xcommon/tools/make-iana-tld-pattern.py32
3 files changed, 58 insertions, 38 deletions
diff --git a/common/java/com/android/common/Patterns.java b/common/java/com/android/common/Patterns.java
index 71c3a5e..3b3b038 100644
--- a/common/java/com/android/common/Patterns.java
+++ b/common/java/com/android/common/Patterns.java
@@ -24,12 +24,12 @@ import java.util.regex.Pattern;
*/
public class Patterns {
/**
- * Regular expression pattern to match all IANA top-level domains.
+ * Regular expression to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
- * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
- public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+ public static final String TOP_LEVEL_DOMAIN_STR =
"((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
+ "|(biz|b[abdefghijmnorstvwyz])"
+ "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
@@ -55,20 +55,22 @@ public class Patterns {
+ "|w[fs]"
+ "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
+ "|y[etu]"
- + "|z[amw])");
+ + "|z[amw])";
+
+ /**
+ * Regular expression pattern to match all IANA top-level domains.
+ */
+ public static final Pattern TOP_LEVEL_DOMAIN =
+ Pattern.compile(TOP_LEVEL_DOMAIN_STR);
/**
- * Regular expression pattern to match RFC 1738 URLs
+ * Regular expression to match all IANA top-level domains for WEB_URL.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
- * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
- public static final Pattern WEB_URL = Pattern.compile(
- "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
- + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
- + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
- + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
- + "(?:" // plus top level domain
+ public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
+ "(?:"
+ "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
+ "|(?:biz|b[abdefghijmnorstvwyz])"
+ "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
@@ -94,7 +96,28 @@ public class Patterns {
+ "|w[fs]"
+ "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
+ "|y[etu]"
- + "|z[amw]))"
+ + "|z[amw]))";
+
+ /**
+ * Good characters for Internationalized Resource Identifiers (IRI).
+ * This comprises most common used Unicode characters allowed in IRI
+ * as detailed in RFC 3987.
+ * Specifically, those two byte Unicode characters are not included.
+ */
+ public static final String GOOD_IRI_CHAR =
+ "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";
+
+ /**
+ * Regular expression pattern to match most part of RFC 3987
+ * Internationalized URLs, aka IRIs. Commonly used Unicode characters are
+ * added.
+ */
+ public static final Pattern WEB_URL = Pattern.compile(
+ "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host
+ + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
@@ -116,7 +139,7 @@ public class Patterns {
public static final Pattern DOMAIN_NAME
= Pattern.compile(
- "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+"
+ "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
+ TOP_LEVEL_DOMAIN + ")|"
+ IP_ADDRESS + ")");
diff --git a/common/tests/src/com/android/common/PatternsTest.java b/common/tests/src/com/android/common/PatternsTest.java
index 635601e..9e2ad58 100644
--- a/common/tests/src/com/android/common/PatternsTest.java
+++ b/common/tests/src/com/android/common/PatternsTest.java
@@ -68,6 +68,12 @@ public class PatternsTest extends TestCase {
t = Patterns.WEB_URL.matcher("xn--fsqu00a.xn--0zwm56d").matches();
assertTrue("Valid URL", t);
+ // Internationalized URL.
+ t = Patterns.WEB_URL.matcher("http://\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
+ assertTrue("Valid URL", t);
+ t = Patterns.WEB_URL.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
+ assertTrue("Valid URL", t);
+
t = Patterns.WEB_URL.matcher("ftp://www.example.com").matches();
assertFalse("Matched invalid protocol", t);
@@ -99,6 +105,13 @@ public class PatternsTest extends TestCase {
t = Patterns.DOMAIN_NAME.matcher("mail.example.com").matches();
assertTrue("Valid domain", t);
+ t = Patterns.WEB_URL.matcher("google.me").matches();
+ assertTrue("Valid domain", t);
+
+ // Internationalized domains.
+ t = Patterns.DOMAIN_NAME.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
+ assertTrue("Valid domain", t);
+
t = Patterns.DOMAIN_NAME.matcher("__+&42.xer").matches();
assertFalse("Invalid domain", t);
}
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
index ece4dcf..de81c58 100755
--- a/common/tools/make-iana-tld-pattern.py
+++ b/common/tools/make-iana-tld-pattern.py
@@ -4,43 +4,27 @@ from urllib2 import urlopen
TLD_PREFIX = r"""
/**
- * Regular expression pattern to match all IANA top-level domains.
+ * Regular expression to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
- public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+ public static final String TOP_LEVEL_DOMAIN_STR =
"""
-TLD_SUFFIX = '");'
+TLD_SUFFIX = '";'
URL_PREFIX = r"""
/**
- * Regular expression pattern to match RFC 1738 URLs
+ * Regular expression to match all IANA top-level domains for WEB_URL.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
- * This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
- public static final Pattern WEB_URL = Pattern.compile(
- "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
- + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
- + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
- + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
- + "(?:" // plus top level domain
+ public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
+ "(?:"
"""
-URL_SUFFIX = r"""
- + "|(?:(?:25[0-5]|2[0-4]" // or ip address
- + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
- + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
- + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
- + "|[1-9][0-9]|[0-9])))"
- + "(?:\\:\\d{1,5})?)" // plus option port number
- + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
- + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
- + "(?:\\b|$)"); // and finally, a word boundary or end of
- // input. This is to stop foo.sure from
- // matching as foo.su
-"""
+URL_SUFFIX = ';'
class Bucket:
def __init__(self, baseLetter):