diff options
author | Shimeng (Simon) Wang <swang@google.com> | 2010-02-16 10:49:37 -0800 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2010-02-16 10:49:37 -0800 |
commit | e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55 (patch) | |
tree | 5238bdc4e612f37150c41745129504dec8cf5402 /common/java | |
parent | 3d0ff09e35521a97ec1cfe14d57c6868e271fe9a (diff) | |
parent | 51c02dbf75fede9b2829af5b821f10e0bc2af124 (diff) | |
download | frameworks_base-e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55.zip frameworks_base-e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55.tar.gz frameworks_base-e8cb5ab7c0e9f22ffcaee8de718c83aa01765f55.tar.bz2 |
Merge "Enhance URL regular expression to match more Unicode chars."
Diffstat (limited to 'common/java')
-rw-r--r-- | common/java/com/android/common/Patterns.java | 51 |
1 file changed, 37 insertions, 14 deletions
diff --git a/common/java/com/android/common/Patterns.java b/common/java/com/android/common/Patterns.java index 71c3a5e..3b3b038 100644 --- a/common/java/com/android/common/Patterns.java +++ b/common/java/com/android/common/Patterns.java @@ -24,12 +24,12 @@ import java.util.regex.Pattern; */ public class Patterns { /** - * Regular expression pattern to match all IANA top-level domains. + * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( + public static final String TOP_LEVEL_DOMAIN_STR = "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(biz|b[abdefghijmnorstvwyz])" + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" @@ -55,20 +55,22 @@ public class Patterns { + "|w[fs]" + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" - + "|z[amw])"); + + "|z[amw])"; + + /** + * Regular expression pattern to match all IANA top-level domains. + */ + public static final Pattern TOP_LEVEL_DOMAIN = + Pattern.compile(TOP_LEVEL_DOMAIN_STR); /** - * Regular expression pattern to match RFC 1738 URLs + * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. 
List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern WEB_URL = Pattern.compile( - "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" - + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" - + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" - + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host - + "(?:" // plus top level domain + public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = + "(?:" + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(?:biz|b[abdefghijmnorstvwyz])" + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" @@ -94,7 +96,28 @@ public class Patterns { + "|w[fs]" + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" - + "|z[amw]))" + + "|z[amw]))"; + + /** + * Good characters for Internationalized Resource Identifiers (IRI). + * This comprises most common used Unicode characters allowed in IRI + * as detailed in RFC 3987. + * Specifically, those two byte Unicode characters are not included. + */ + public static final String GOOD_IRI_CHAR = + "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; + + /** + * Regular expression pattern to match most part of RFC 3987 + * Internationalized URLs, aka IRIs. Commonly used Unicode characters are + * added. + */ + public static final Pattern WEB_URL = Pattern.compile( + "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" 
+ + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host + + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" @@ -116,7 +139,7 @@ public class Patterns { public static final Pattern DOMAIN_NAME = Pattern.compile( - "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" + "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" + TOP_LEVEL_DOMAIN + ")|" + IP_ADDRESS + ")"); |