summaryrefslogtreecommitdiffstats
path: root/luni/src/main/java/java/text/RuleBasedCollator.java
blob: 4fd86501b2d00336a207ee66c3efcf7b039be9f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package java.text;

import libcore.icu.RuleBasedCollatorICU;

/**
 * A concrete implementation class for {@code Collator}.
 * <p>
 * {@code RuleBasedCollator} has the following restrictions for efficiency
 * (other subclasses may be used for more complex languages):
 * <ol>
 * <li> If a French secondary ordering is specified it applies to the whole
 * collator object.</li>
 * <li> All non-mentioned Unicode characters are at the end of the collation
 * order.</li>
 * <li> If a character is not located in the {@code RuleBasedCollator}, the
 * default Unicode Collation Algorithm (UCA) rule-based table is automatically
 * searched as a backup.</li>
 * </ol>
 * <p>
 * The collation table is composed of a list of collation rules, where each rule
 * takes one of three forms:
 * <blockquote>
 * <pre>
 * &lt;modifier&gt;
 * &lt;relation&gt; &lt;text-argument&gt;
 * &lt;reset&gt; &lt;text-argument&gt;
 * </pre>
 * </blockquote>
 * <p>
 * The rule elements are defined as follows:
 * <ul type="disc">
 * <li><strong>Modifier</strong>: There is a single modifier which is used to
 * specify that all accents (secondary differences) are backwards:
 * <ul type=square>
 * <li>'@' : Indicates that accents are sorted backwards, as in French.
 * </ul>
 * </li>
 * <li><strong>Relation</strong>: The relations are the following:
 * <ul type=square>
 * <li>'&lt;' : Greater, as a letter difference (primary)
 * <li>';' : Greater, as an accent difference (secondary)
 * <li>',' : Greater, as a case difference (tertiary)
 * <li>'=' : Equal
 * </ul>
 * </li>
 * <li><strong>Text-Argument</strong>: A text-argument is any sequence of
 * characters, excluding special characters (that is, common whitespace
 * characters [0009-000D, 0020] and rule syntax characters [0021-002F,
 * 003A-0040, 005B-0060, 007B-007E]). If those characters are desired, you can
 * put them in single quotes (for example, use '&amp;' for ampersand). Note that
 * unquoted white space characters are ignored; for example, {@code b c} is
 * treated as {@code bc}.</li>
 * <li><strong>Reset</strong>: There is a single reset which is used primarily
 * for contractions and expansions, but which can also be used to add a
 * modification at the end of a set of rules:
 * <ul type=square>
 * <li>'&amp;' : Indicates that the next rule follows the position to where the reset
 * text-argument would be sorted.
 * </ul>
 * </li>
 * </ul>
 * <p>
 * This sounds more complicated than it is in practice. For example, the
 * following are equivalent ways of expressing the same thing:
 * <blockquote>
 *
 * <pre>
 * a &lt; b &lt; c
 * a &lt; b &amp; b &lt; c
 * a &lt; c &amp; a &lt; b
 * </pre>
 *
 * </blockquote>
 * <p>
 * Notice that the order is important, as the subsequent item goes immediately
 * after the text-argument. The following are not equivalent:
 * <blockquote>
 *
 * <pre>
 * a &lt; b &amp; a &lt; c
 * a &lt; c &amp; a &lt; b
 * </pre>
 *
 * </blockquote>
 * <p>
 * Either the text-argument must already be present in the sequence, or some
 * initial substring of the text-argument must be present. For example
 * {@code "a < b & ae < e"} is valid since "a" is present in the sequence before
 * "ae" is reset. In this latter case, "ae" is not entered and treated as a
 * single character; instead, "e" is sorted as if it were expanded to two
 * characters: "a" followed by an "e". This difference appears in natural
 * languages: in traditional Spanish "ch" is treated as if it contracts to a
 * single character (expressed as {@code "c < ch < d"}), while in traditional
 * German a-umlaut is treated as if it expands to two characters (expressed as
 * {@code "a,A < b,B  ... & ae;\u00e4 & AE;\u00c4"}, where \u00e4 and \u00c4
 * are the escape sequences for a-umlaut).
 * <h4>Ignorable Characters</h4>
 * <p>
 * For ignorable characters, the first rule must start with a relation (the
 * examples we have used above are really fragments; {@code "a < b"} really
 * should be {@code "< a < b"}). If, however, the first relation is not
 * {@code "<"}, then all text-arguments up to the first {@code "<"} are
 * ignorable. For example, {@code ", - < a < b"} makes {@code "-"} an ignorable
 * character.
 * <h4>Normalization and Accents</h4>
 * <p>
 * {@code RuleBasedCollator} automatically processes its rule table to include
 * both pre-composed and combining-character versions of accented characters.
 * Even if the provided rule string contains only base characters and separate
 * combining accent characters, the pre-composed accented characters matching
 * all canonical combinations of characters from the rule string will be entered
 * in the table.
 * <p>
 * This allows you to use a RuleBasedCollator to compare accented strings even
 * when the collator is set to NO_DECOMPOSITION. However, if the strings to be
 * collated contain combining sequences that may not be in canonical order, you
 * should set the collator to CANONICAL_DECOMPOSITION to enable sorting of
 * combining sequences. For more information, see <a
 * href="http://www.aw.com/devpress">The Unicode Standard, Version 3.0</a>.
 * <h4>Errors</h4>
 * <p>
 * The following rules are not valid:
 * <ul type="disc">
 * <li>A text-argument contains unquoted punctuation symbols, for example
 * {@code "a < b-c < d"}.</li>
 * <li>A relation or reset character is not followed by a text-argument, for
 * example {@code "a < , b"}.</li>
 * <li>A reset where the text-argument (or an initial substring of the
 * text-argument) is not already in the sequence or allocated in the default UCA
 * table, for example {@code "a < b & e < f"}.</li>
 * </ul>
 * <p>
 * If you produce one of these errors, {@code RuleBasedCollator} throws a
 * {@code ParseException}.
 * <h4>Examples</h4>
 * <p>
 * Normally, to create a rule-based collator object, you will use
 * {@code Collator}'s factory method {@code getInstance}. However, to create a
 * rule-based collator object with specialized rules tailored to your needs, you
 * construct the {@code RuleBasedCollator} with the rules contained in a
 * {@code String} object. For example:
 * <blockquote>
 *
 * <pre>
 * String Simple = "< a < b < c < d";
 * RuleBasedCollator mySimple = new RuleBasedCollator(Simple);
 * </pre>
 *
 * </blockquote>
 * <p>
 * Or:
 * <blockquote>
 *
 * <pre>
 * String Norwegian = "< a,A< b,B< c,C< d,D< e,E< f,F< g,G< h,H< i,I"
 *         + "< j,J< k,K< l,L< m,M< n,N< o,O< p,P< q,Q< r,R"
 *         + "< s,S< t,T< u,U< v,V< w,W< x,X< y,Y< z,Z"
 *         + "< \u00E5=a\u030A,\u00C5=A\u030A"
 *         + ";aa,AA< \u00E6,\u00C6< \u00F8,\u00D8";
 * RuleBasedCollator myNorwegian = new RuleBasedCollator(Norwegian);
 * </pre>
 *
 * </blockquote>
 * <p>
 * Combining {@code Collator}s is as simple as concatenating strings. Here is
 * an example that combines two {@code Collator}s from two different locales:
 * <blockquote>
 *
 * <pre>
 * // Create an en_US Collator object
 * RuleBasedCollator en_USCollator = (RuleBasedCollator)Collator
 *         .getInstance(new Locale("en", "US", ""));
 *
 * // Create a da_DK Collator object
 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)Collator
 *         .getInstance(new Locale("da", "DK", ""));
 *
 * // Combine the two collators
 * // First, get the collation rules from en_USCollator
 * String en_USRules = en_USCollator.getRules();
 *
 * // Second, get the collation rules from da_DKCollator
 * String da_DKRules = da_DKCollator.getRules();
 *
 * RuleBasedCollator newCollator = new RuleBasedCollator(en_USRules + da_DKRules);
 * // newCollator has the combined rules
 * </pre>
 *
 * </blockquote>
 * <p>
 * The next example shows how to make changes to an existing table to create a
 * new {@code Collator} object. For example, add {@code "& C < ch, cH, Ch, CH"}
 * to the {@code en_USCollator} object to create your own:
 * <blockquote>
 *
 * <pre>
 * // Create a new Collator object with additional rules
 * String addRules = "& C < ch, cH, Ch, CH";
 *
 * RuleBasedCollator myCollator = new RuleBasedCollator(en_USCollator.getRules() + addRules);
 * // myCollator contains the new rules
 * </pre>
 *
 * </blockquote>
 * <p>
 * The following example demonstrates how to change the order of non-spacing
 * accents:
 * <blockquote>
 *
 * <pre>
 * // old rule
 * String oldRules = "= \u00a8 ; \u00af ; \u00bf" + "< a , A ; ae, AE ; \u00e6 , \u00c6"
 *         + "< b , B < c, C < e, E & C < d, D";
 *
 * // change the order of accent characters
 * String addOn = "& \u00bf ; \u00af ; \u00a8;";
 *
 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
 * </pre>
 *
 * </blockquote>
 * <p>
 * The last example shows how to put new primary ordering in before the default
 * setting. For example, in the Japanese {@code Collator}, you can either sort
 * English characters before or after Japanese characters:
 * <blockquote>
 *
 * <pre>
 * // get en_US Collator rules
 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
 *     Collator.getInstance(Locale.US);
 *
 * // add a few Japanese characters to sort before English characters
 * // suppose the last character before the first base letter 'a' in
 * // the English collation rule is \u30A2
 * String jaString = "& \u30A2 , \u30FC < \u30C8";
 *
 * RuleBasedCollator myJapaneseCollator =
 *     new RuleBasedCollator(en_USCollator.getRules() + jaString);
 * </pre>
 *
 * </blockquote>
 */
public class RuleBasedCollator extends Collator {
    /**
     * Constructs a {@code RuleBasedCollator} around an already-built
     * collator implementation. This constructor is package-private and is
     * therefore not part of the public API.
     *
     * @param wrapper
     *            the underlying collator implementation; it is passed
     *            unchanged to the {@code Collator} superclass constructor.
     */
    RuleBasedCollator(RuleBasedCollatorICU wrapper) {
        super(wrapper);
    }

    /**
     * Constructs a new instance of {@code RuleBasedCollator} using the
     * specified {@code rules}. The {@code rules} are usually either
     * hand-written based on the {@link RuleBasedCollator class description} or
     * the result of a former {@link #getRules()} call.
     * <p>
     * Note that the {@code rules} are actually interpreted as a delta to the
     * standard Unicode Collation Algorithm (UCA). This differs
     * slightly from other implementations which work with full {@code rules}
     * specifications and may result in different behavior.
     *
     * @param rules
     *            the collation rules; must be neither {@code null} nor empty.
     * @throws NullPointerException
     *             if {@code rules == null}.
     * @throws ParseException
     *             if {@code rules} is empty (error offset 0), if it contains
     *             rules with invalid collation rule syntax, or if any other
     *             failure occurs while building the collator (error offset
     *             -1, with the original failure attached as the cause).
     */
    public RuleBasedCollator(String rules) throws ParseException {
        if (rules == null) {
            throw new NullPointerException("rules == null");
        }
        if (rules.isEmpty()) {
            throw new ParseException("empty rules", 0);
        }
        try {
            icuColl = new RuleBasedCollatorICU(rules);
        } catch (Exception e) {
            if (e instanceof ParseException) {
                throw (ParseException) e;
            }
            /*
             * -1 means it's not a ParseException. Maybe IOException thrown when
             * an error occurred while reading internal data.
             */
            ParseException wrapped = new ParseException(e.getMessage(), -1);
            // ParseException has no cause-taking constructor, so attach the
            // original failure explicitly to keep its stack trace available.
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Obtains a {@code CollationElementIterator} for the given
     * {@code CharacterIterator}. The source iterator's integrity will be
     * preserved since a new copy will be created for use.
     *
     * @param source
     *            the source character iterator.
     * @return a {@code CollationElementIterator} for {@code source}.
     * @throws NullPointerException
     *             if {@code source == null}.
     */
    public CollationElementIterator getCollationElementIterator(CharacterIterator source) {
        if (source == null) {
            throw new NullPointerException("source == null");
        }
        return new CollationElementIterator(icuColl.getCollationElementIterator(source));
    }

    /**
     * Obtains a {@code CollationElementIterator} for the given string.
     *
     * @param source
     *            the source string.
     * @return the {@code CollationElementIterator} for {@code source}.
     * @throws NullPointerException
     *             if {@code source == null}.
     */
    public CollationElementIterator getCollationElementIterator(String source) {
        if (source == null) {
            throw new NullPointerException("source == null");
        }
        return new CollationElementIterator(icuColl.getCollationElementIterator(source));
    }

    /**
     * Returns the collation rules of this collator. A non-empty result can be
     * fed into the {@code RuleBasedCollator(String)} constructor.
     * <p>
     * Note that the {@code rules} are actually interpreted as a delta to the
     * standard Unicode Collation Algorithm (UCA). This differs
     * slightly from other implementations which work with full {@code rules}
     * specifications and may result in different behavior.
     * <p>
     * The {@code RuleBasedCollator(String)} constructor rejects an empty
     * {@code rules} string with a {@code ParseException}, so if this method
     * returns an empty string it cannot be passed back to that constructor.
     *
     * @return the collation rules, as reported by the underlying collator
     *         implementation.
     */
    public String getRules() {
        return icuColl.getRules();
    }

    /**
     * Returns a new collator with the same collation rules, decomposition mode and
     * strength value as this collator. The copy itself is produced by the
     * superclass {@code clone()} implementation.
     *
     * @return a copy of this collator, as produced by {@code Collator.clone()}.
     * @see java.lang.Cloneable
     */
    @Override
    public Object clone() {
        return (RuleBasedCollator) super.clone();
    }

    /**
     * Compares the {@code source} text to the {@code target} text according to
     * the collation rules, strength and decomposition mode for this
     * {@code RuleBasedCollator}. See the {@code Collator} class description
     * for an example of use.
     * <p>
     * General recommendation: If comparisons are to be done with the same strings
     * multiple times, it is more efficient to generate {@code CollationKey}
     * objects for the strings and use
     * {@code CollationKey.compareTo(CollationKey)} for the comparisons. If each
     * string is compared to only once, using
     * {@code RuleBasedCollator.compare(String, String)} has better performance.
     *
     * @param source
     *            the source text.
     * @param target
     *            the target text.
     * @return an integer which may be a negative value, zero, or else a
     *         positive value depending on whether {@code source} is less than,
     *         equivalent to, or greater than {@code target}.
     * @throws NullPointerException
     *             if {@code source == null} or {@code target == null}.
     */
    @Override
    public int compare(String source, String target) {
        // Checked separately so the exception message names the bad argument.
        if (source == null) {
            throw new NullPointerException("source == null");
        }
        if (target == null) {
            throw new NullPointerException("target == null");
        }
        return icuColl.compare(source, target);
    }

    /**
     * Returns the {@code CollationKey} for the given source text. The argument
     * is passed straight to the underlying collator implementation; no null
     * check is performed here.
     *
     * @param source
     *            the specified source text.
     * @return the {@code CollationKey} for the given source text.
     */
    @Override
    public CollationKey getCollationKey(String source) {
        return icuColl.getCollationKey(source);
    }

    /**
     * Returns a hash code for this collator, computed from the hash code of
     * its rules string.
     *
     * @return the hash code of this collator's rules string.
     * @see #equals(Object)
     */
    @Override
    public int hashCode() {
        return icuColl.getRules().hashCode();
    }

    /**
     * Compares the specified object with this {@code RuleBasedCollator} and
     * indicates if they are equal. In order to be equal, {@code object} must be
     * an instance of {@code Collator} with the same collation rules and the
     * same attributes. Objects that are not a {@code Collator} are rejected
     * here; the remaining comparison is delegated to
     * {@code Collator.equals(Object)}.
     *
     * @param obj
     *            the object to compare with this object.
     * @return {@code true} if the specified object is equal to this
     *         {@code RuleBasedCollator}; {@code false} otherwise.
     * @see #hashCode
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Collator)) {
            return false;
        }
        return super.equals(obj);
    }
}