summaryrefslogtreecommitdiffstats
path: root/regex
diff options
context:
space:
mode:
authorElliott Hughes <enh@google.com>2010-04-09 16:31:13 -0700
committerElliott Hughes <enh@google.com>2010-04-09 16:31:13 -0700
commit0510f0d8ce7c20b8f6022545a70e8b868805dc60 (patch)
treef63bd19e0797e8263f25748e4a23ee139a3b051f /regex
parentc5a274a7e81a9ae7d3270bb132c9e0949d80fb03 (diff)
downloadlibcore-0510f0d8ce7c20b8f6022545a70e8b868805dc60.zip
libcore-0510f0d8ce7c20b8f6022545a70e8b868805dc60.tar.gz
libcore-0510f0d8ce7c20b8f6022545a70e8b868805dc60.tar.bz2
Make String.split 10x faster.
Almost all uses of String.split in the Android codebase use trivial single literal character separators. This patch optimizes that case to avoid the use of regular expressions entirely. The 10x speedup isn't the whole story, because the speedup is really proportional to the number of separators in the input. 10x is easily achievable, but the speedup could be arbitrarily high. Before: benchmark us logarithmic runtime PatternSplitComma 84.8 XXXXXXXXXXXXXX|||||||||||||| PatternSplitLiteralDot 85.0 XXXXXXXXXXXXXX|||||||||||||| StringSplitComma 166.3 XXXXXXXXXXXXXXXXXXXXXXXXXXXX| StringSplitHard 173.6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX StringSplitLiteralDot 167.7 XXXXXXXXXXXXXXXXXXXXXXXXXXXX| After: benchmark us logarithmic runtime PatternSplitComma 18.9 XXX||||||||||||||||||||| PatternSplitLiteralDot 19.0 XXX||||||||||||||||||||| StringSplitComma 18.8 XXX||||||||||||||||||||| StringSplitHard 174.2 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX StringSplitLiteralDot 18.8 XXX||||||||||||||||||||| (The benchmarks starting "Pattern" use a precompiled Pattern for performance. Those starting "String" use String.split and would traditional entail a temporary Pattern. As you can see, creating Patterns is very expensive for us, and each one throws a finalizer spanner in the GC works too. The new fast path avoids all this. I'll commit the benchmark -- along with all the others I've ever used -- to http://code.google.com/p/dalvik this afternoon.) Tests? We actually pass _more_ tests after this patch, because the increase in performance means we don't hit timeouts. Change-Id: I404298e21a78d72cf5ce6ea675844bf251e3825b
Diffstat (limited to 'regex')
-rw-r--r--regex/src/main/java/java/util/regex/Matcher.java3
-rw-r--r--regex/src/main/java/java/util/regex/Pattern.java52
-rw-r--r--regex/src/main/java/java/util/regex/Splitter.java122
3 files changed, 125 insertions, 52 deletions
diff --git a/regex/src/main/java/java/util/regex/Matcher.java b/regex/src/main/java/java/util/regex/Matcher.java
index be5c782..5abbbd5 100644
--- a/regex/src/main/java/java/util/regex/Matcher.java
+++ b/regex/src/main/java/java/util/regex/Matcher.java
@@ -206,8 +206,7 @@ public final class Matcher implements MatchResult {
throw new IllegalArgumentException();
}
- if (start < 0 || end < 0 || start > input.length() ||
- end > input.length() || start > end) {
+ if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) {
throw new IllegalArgumentException();
}
diff --git a/regex/src/main/java/java/util/regex/Pattern.java b/regex/src/main/java/java/util/regex/Pattern.java
index c366732..325e3e0 100644
--- a/regex/src/main/java/java/util/regex/Pattern.java
+++ b/regex/src/main/java/java/util/regex/Pattern.java
@@ -169,8 +169,6 @@ public final class Pattern implements Serializable {
* <p>Otherwise, the {@code limit} parameter controls the contents of the
* returned array as described below.
*
- * @param inputSeq
- * the input sequence.
* @param limit
* Determines the maximum number of entries in the resulting
* array, and the treatment of trailing empty strings.
@@ -188,61 +186,15 @@ public final class Pattern implements Serializable {
* special, as described above, and the limit parameter does
* not apply there.)
* </ul>
- *
- * @return the resulting array.
*/
- public String[] split(CharSequence inputSeq, int limit) {
- if (inputSeq.length() == 0) {
- // Unlike Perl, which considers the result of splitting the empty
- // string to be the empty array, Java returns an array containing
- // the empty string.
- return new String[] { "" };
- }
-
- int maxLength = limit <= 0 ? Integer.MAX_VALUE : limit;
-
- String input = inputSeq.toString();
- ArrayList<String> list = new ArrayList<String>();
-
- Matcher matcher = new Matcher(this, inputSeq);
- int savedPos = 0;
-
- // Add text preceding each occurrence, if enough space.
- while(matcher.find() && list.size() + 1 < maxLength) {
- list.add(input.substring(savedPos, matcher.start()));
- savedPos = matcher.end();
- }
-
- // Add trailing text if enough space.
- if (list.size() < maxLength) {
- if (savedPos < input.length()) {
- list.add(input.substring(savedPos));
- } else {
- list.add("");
- }
- }
-
- // Remove trailing empty matches in the limit == 0 case.
- if (limit == 0) {
- int i = list.size() - 1;
- while (i >= 0 && "".equals(list.get(i))) {
- list.remove(i);
- i--;
- }
- }
-
- return list.toArray(new String[list.size()]);
+ public String[] split(CharSequence input, int limit) {
+ return Splitter.split(this, pattern, input.toString(), limit);
}
/**
* Splits a given input around occurrences of a regular expression. This is
* a convenience method that is equivalent to calling the method
* {@link #split(java.lang.CharSequence, int)} with a limit of 0.
- *
- * @param input
- * the input sequence.
- *
- * @return the resulting array.
*/
public String[] split(CharSequence input) {
return split(input, 0);
diff --git a/regex/src/main/java/java/util/regex/Splitter.java b/regex/src/main/java/java/util/regex/Splitter.java
new file mode 100644
index 0000000..5b4c048
--- /dev/null
+++ b/regex/src/main/java/java/util/regex/Splitter.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package java.util.regex;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/*
+ * Used to make {@code String.split} fast (and to help {@code Pattern.split} too).
+ * @hide
+ */
+public class Splitter {
+ // The RI allows regular expressions beginning with ] or }, but that's probably a bug.
+ private static final String METACHARACTERS = "\\?*+[](){}^$.|";
+
+ private Splitter() {
+ }
+
+ /**
+ * Returns a result equivalent to {@code s.split(separator, limit)} if it's able
+ * to compute it more cheaply than ICU, or null if the caller should fall back to
+ * using ICU.
+ */
+ public static String[] fastSplit(String re, String input, int limit) {
+ // Can we do it cheaply?
+ int len = re.length();
+ if (len == 0) {
+ return null;
+ }
+ char ch = re.charAt(0);
+ if (len == 1 && METACHARACTERS.indexOf(ch) == -1) {
+ // We're looking for a single non-metacharacter. Easy.
+ } else if (len == 2 && ch == '\\') {
+ // We're looking for a quoted character.
+ // Quoted metacharacters are effectively single non-metacharacters.
+ ch = re.charAt(1);
+ if (METACHARACTERS.indexOf(ch) == -1) {
+ return null;
+ }
+ } else {
+ return null;
+ }
+
+ // We can do this cheaply...
+
+ // Unlike Perl, which considers the result of splitting the empty string to be the empty
+ // array, Java returns an array containing the empty string.
+ if (input.isEmpty()) {
+ return new String[] { "" };
+ }
+
+ // Collect text preceding each occurrence of the separator, while there's enough space.
+ ArrayList<String> list = new ArrayList<String>();
+ int maxSize = limit <= 0 ? Integer.MAX_VALUE : limit;
+ int begin = 0;
+ int end;
+ while ((end = input.indexOf(ch, begin)) != -1 && list.size() + 1 < maxSize) {
+ list.add(input.substring(begin, end));
+ begin = end + 1;
+ }
+ return finishSplit(list, input, begin, maxSize, limit);
+ }
+
+ public static String[] split(Pattern pattern, String re, String input, int limit) {
+ String[] fastResult = fastSplit(re, input, limit);
+ if (fastResult != null) {
+ return fastResult;
+ }
+
+ // Unlike Perl, which considers the result of splitting the empty string to be the empty
+ // array, Java returns an array containing the empty string.
+ if (input.isEmpty()) {
+ return new String[] { "" };
+ }
+
+ // Collect text preceding each occurrence of the separator, while there's enough space.
+ ArrayList<String> list = new ArrayList<String>();
+ int maxSize = limit <= 0 ? Integer.MAX_VALUE : limit;
+ Matcher matcher = new Matcher(pattern, input);
+ int begin = 0;
+ while (matcher.find() && list.size() + 1 < maxSize) {
+ list.add(input.substring(begin, matcher.start()));
+ begin = matcher.end();
+ }
+ return finishSplit(list, input, begin, maxSize, limit);
+ }
+
+ private static String[] finishSplit(List<String> list, String input, int begin, int maxSize, int limit) {
+ // Add trailing text if enough space.
+ if (list.size() < maxSize) {
+ if (begin < input.length()) {
+ list.add(input.substring(begin));
+ } else {
+ list.add("");
+ }
+ }
+ // Remove trailing empty matches in the limit == 0 case.
+ if (limit == 0) {
+ int i = list.size() - 1;
+ while (i >= 0 && "".equals(list.get(i))) {
+ list.remove(i);
+ i--;
+ }
+ }
+ // Convert to an array.
+ return list.toArray(new String[list.size()]);
+ }
+}