diff options
Diffstat (limited to 'lint/libs/lint_checks/src/com/android/tools/lint/checks/TypoLookup.java')
-rw-r--r-- | lint/libs/lint_checks/src/com/android/tools/lint/checks/TypoLookup.java | 785 |
1 files changed, 0 insertions, 785 deletions
diff --git a/lint/libs/lint_checks/src/com/android/tools/lint/checks/TypoLookup.java b/lint/libs/lint_checks/src/com/android/tools/lint/checks/TypoLookup.java deleted file mode 100644 index 2dcd6c3..0000000 --- a/lint/libs/lint_checks/src/com/android/tools/lint/checks/TypoLookup.java +++ /dev/null @@ -1,785 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.tools.lint.checks; - -import static com.android.SdkConstants.DOT_XML; -import static com.android.tools.lint.detector.api.LintUtils.assertionsEnabled; - -import com.android.annotations.NonNull; -import com.android.annotations.Nullable; -import com.android.annotations.VisibleForTesting; -import com.android.tools.lint.client.api.LintClient; -import com.android.tools.lint.detector.api.LintUtils; -import com.google.common.base.Charsets; -import com.google.common.base.Splitter; -import com.google.common.io.Files; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel.MapMode; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.WeakHashMap; - -/** - * Database of common typos / misspellings. - */ -public class TypoLookup { - private static final TypoLookup NONE = new TypoLookup(); - - /** String separating misspellings and suggested replacements in the text file */ - private static final String WORD_SEPARATOR = "->"; //$NON-NLS-1$ - - /** Relative path to the typos database file within the Lint installation */ - private static final String XML_FILE_PATH = "tools/support/typos-%1$s.txt"; //$NON-NLS-1$ - private static final String FILE_HEADER = "Typo database used by Android lint\000"; - private static final int BINARY_FORMAT_VERSION = 2; - private static final boolean DEBUG_FORCE_REGENERATE_BINARY = false; - private static final boolean DEBUG_SEARCH = false; - private static final boolean WRITE_STATS = false; - /** Default size to reserve for each API entry when creating byte buffer to build up data */ - private static final int BYTES_PER_ENTRY = 28; - - private final LintClient mClient; - private final File mXmlFile; - private final File mBinaryFile; - private byte[] mData; - private int[] mIndices; - private int mWordCount; - - private static WeakHashMap<String, TypoLookup> sInstanceMap = - new WeakHashMap<String, TypoLookup>(); - - /** - * Returns an instance of the Typo database for the given locale - * - * @param client the client to associate with this database - used only for - * logging. The database object may be shared among repeated - * invocations, and in that case client used will be the one - * originally passed in. In other words, this parameter may be - * ignored if the client created is not new. - * @param locale the locale to look up a typo database for (should be a - * language code (ISO 639-1, two lowercase character names) - * @param region the region to look up a typo database for (should be a two - * letter ISO 3166-1 alpha-2 country code in upper case) language - * code - * @return a (possibly shared) instance of the typo database, or null if its - * data can't be found - */ - @Nullable - public static TypoLookup get(@NonNull LintClient client, @NonNull String locale, - @Nullable String region) { - synchronized (TypoLookup.class) { - String key = locale; - - if (region != null) { - // Allow for region-specific dictionaries. See for example - // http://en.wikipedia.org/wiki/American_and_British_English_spelling_differences - assert region.length() == 2 - && Character.isUpperCase(region.charAt(0)) - && Character.isUpperCase(region.charAt(1)) : region; - // Look for typos-en-rUS.txt etc - key = locale + 'r' + region; - } - - TypoLookup db = sInstanceMap.get(key); - if (db == null) { - String path = String.format(XML_FILE_PATH, key); - File file = client.findResource(path); - if (file == null) { - // AOSP build environment? - String build = System.getenv("ANDROID_BUILD_TOP"); //$NON-NLS-1$ - if (build != null) { - file = new File(build, ("sdk/files/" //$NON-NLS-1$ - + path.substring(path.lastIndexOf('/') + 1)) - .replace('/', File.separatorChar)); - } - } - - if (file == null || !file.exists()) { - if (region != null) { - // Fall back to the generic locale (non-region-specific) database - return get(client, locale, null); - } - db = NONE; - } else { - db = get(client, file); - assert db != null : file; - } - sInstanceMap.put(key, db); - } - - if (db == NONE) { - return null; - } else { - return db; - } - } - } - - /** - * Returns an instance of the typo database - * - * @param client the client to associate with this database - used only for - * logging - * @param xmlFile the XML file containing configuration data to use for this - * database - * @return a (possibly shared) instance of the typo database, or null - * if its data can't be found - */ - @Nullable - private static TypoLookup get(LintClient client, File xmlFile) { - if (!xmlFile.exists()) { - client.log(null, "The typo database file %1$s does not exist", xmlFile); - return null; - } - - String name = xmlFile.getName(); - if (LintUtils.endsWith(name, DOT_XML)) { - name = name.substring(0, name.length() - DOT_XML.length()); - } - File cacheDir = client.getCacheDir(true/*create*/); - if (cacheDir == null) { - cacheDir = xmlFile.getParentFile(); - } - - File binaryData = new File(cacheDir, name - // Incorporate version number in the filename to avoid upgrade filename - // conflicts on Windows (such as issue #26663) - + "-" + BINARY_FORMAT_VERSION + ".bin"); //$NON-NLS-1$ //$NON-NLS-2$ - - if (DEBUG_FORCE_REGENERATE_BINARY) { - System.err.println("\nTemporarily regenerating binary data unconditionally \nfrom " - + xmlFile + "\nto " + binaryData); - if (!createCache(client, xmlFile, binaryData)) { - return null; - } - } else if (!binaryData.exists() || binaryData.lastModified() < xmlFile.lastModified()) { - if (!createCache(client, xmlFile, binaryData)) { - return null; - } - } - - if (!binaryData.exists()) { - client.log(null, "The typo database file %1$s does not exist", binaryData); - return null; - } - - return new TypoLookup(client, xmlFile, binaryData); - } - - private static boolean createCache(LintClient client, File xmlFile, File binaryData) { - long begin = 0; - if (WRITE_STATS) { - begin = System.currentTimeMillis(); - } - - // Read in data - List<String> lines; - try { - lines = Files.readLines(xmlFile, Charsets.UTF_8); - } catch (IOException e) { - client.log(e, "Can't read typo database file"); - return false; - } - - if (WRITE_STATS) { - long end = System.currentTimeMillis(); - System.out.println("Reading data structures took " + (end - begin) + " ms)"); - } - - try { - writeDatabase(binaryData, lines); - return true; - } catch (IOException ioe) { - client.log(ioe, "Can't write typo cache file"); - } - - return false; - } - - /** Use one of the {@link #get} factory methods instead */ - private TypoLookup( - @NonNull LintClient client, - @NonNull File xmlFile, - @Nullable File binaryFile) { - mClient = client; - mXmlFile = xmlFile; - mBinaryFile = binaryFile; - - if (binaryFile != null) { - readData(); - } - } - - private TypoLookup() { - mClient = null; - mXmlFile = null; - mBinaryFile = null; - } - - private void readData() { - if (!mBinaryFile.exists()) { - mClient.log(null, "%1$s does not exist", mBinaryFile); - return; - } - long start = System.currentTimeMillis(); - try { - MappedByteBuffer buffer = Files.map(mBinaryFile, MapMode.READ_ONLY); - assert buffer.order() == ByteOrder.BIG_ENDIAN; - - // First skip the header - byte[] expectedHeader = FILE_HEADER.getBytes(Charsets.US_ASCII); - buffer.rewind(); - for (int offset = 0; offset < expectedHeader.length; offset++) { - if (expectedHeader[offset] != buffer.get()) { - mClient.log(null, "Incorrect file header: not an typo database cache " + - "file, or a corrupt cache file"); - return; - } - } - - // Read in the format number - if (buffer.get() != BINARY_FORMAT_VERSION) { - // Force regeneration of new binary data with up to date format - if (createCache(mClient, mXmlFile, mBinaryFile)) { - readData(); // Recurse - } - - return; - } - - mWordCount = buffer.getInt(); - - // Read in the word table indices; - int count = mWordCount; - int[] offsets = new int[count]; - - // Another idea: I can just store the DELTAS in the file (and add them up - // when reading back in) such that it takes just ONE byte instead of four! - - for (int i = 0; i < count; i++) { - offsets[i] = buffer.getInt(); - } - - // No need to read in the rest -- we'll just keep the whole byte array in memory - // TODO: Make this code smarter/more efficient. - int size = buffer.limit(); - byte[] b = new byte[size]; - buffer.rewind(); - buffer.get(b); - mData = b; - mIndices = offsets; - - // TODO: We only need to keep the data portion here since we've initialized - // the offset array separately. - // TODO: Investigate (profile) accessing the byte buffer directly instead of - // accessing a byte array. - } catch (IOException e) { - mClient.log(e, null); - } - if (WRITE_STATS) { - long end = System.currentTimeMillis(); - System.out.println("\nRead typo database in " + (end - start) - + " milliseconds."); - System.out.println("Size of data table: " + mData.length + " bytes (" - + Integer.toString(mData.length/1024) + "k)\n"); - } - } - - /** See the {@link #readData()} for documentation on the data format. */ - private static void writeDatabase(File file, List<String> lines) throws IOException { - /* - * 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded - * as ASCII characters. The purpose of the header is to identify what the file - * is for, for anyone attempting to open the file. - * 2. A file version number. If the binary file does not match the reader's expected - * version, it can ignore it (and regenerate the cache from XML). - */ - - // Drop comments etc - List<String> words = new ArrayList<String>(lines.size()); - for (String line : lines) { - if (!line.isEmpty() && Character.isLetter(line.charAt(0))) { - int end = line.indexOf(WORD_SEPARATOR); - if (end == -1) { - end = line.trim().length(); - } - String typo = line.substring(0, end).trim(); - String replacements = line.substring(end + WORD_SEPARATOR.length()).trim(); - if (replacements.isEmpty()) { - // We don't support empty replacements - continue; - } - String combined = typo + (char) 0 + replacements; - - words.add(combined); - } - } - - byte[][] wordArrays = new byte[words.size()][]; - for (int i = 0, n = words.size(); i < n; i++) { - String word = words.get(i); - wordArrays[i] = word.getBytes(Charsets.UTF_8); - } - // Sort words, using our own comparator to ensure that it matches the - // binary search in getTypos() - Comparator<byte[]> comparator = new Comparator<byte[]>() { - @Override - public int compare(byte[] o1, byte[] o2) { - return TypoLookup.compare(o1, 0, (byte) 0, o2, 0, o2.length); - } - }; - Arrays.sort(wordArrays, comparator); - - int entryCount = wordArrays.length; - int capacity = entryCount * BYTES_PER_ENTRY; - ByteBuffer buffer = ByteBuffer.allocate(capacity); - buffer.order(ByteOrder.BIG_ENDIAN); - // 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded - // as ASCII characters. The purpose of the header is to identify what the file - // is for, for anyone attempting to open the file. - buffer.put(FILE_HEADER.getBytes(Charsets.US_ASCII)); - - // 2. A file version number. If the binary file does not match the reader's expected - // version, it can ignore it (and regenerate the cache from XML). - buffer.put((byte) BINARY_FORMAT_VERSION); - - // 3. The number of words [1 int] - buffer.putInt(entryCount); - - // 4. Word offset table (one integer per word, pointing to the byte offset in the - // file (relative to the beginning of the file) where each word begins. - // The words are always sorted alphabetically. - int wordOffsetTable = buffer.position(); - - // Reserve enough room for the offset table here: we will backfill it with pointers - // as we're writing out the data structures below - for (int i = 0, n = entryCount; i < n; i++) { - buffer.putInt(0); - } - - int nextEntry = buffer.position(); - int nextOffset = wordOffsetTable; - - // 7. Word entry table. Each word entry consists of the word, followed by the byte 0 - // as a terminator, followed by a comma separated list of suggestions (which - // may be empty), or a final 0. - for (int i = 0; i < entryCount; i++) { - byte[] word = wordArrays[i]; - buffer.position(nextOffset); - buffer.putInt(nextEntry); - nextOffset = buffer.position(); - buffer.position(nextEntry); - - buffer.put(word); // already embeds 0 to separate typo from words - buffer.put((byte) 0); - - nextEntry = buffer.position(); - } - - int size = buffer.position(); - assert size <= buffer.limit(); - buffer.mark(); - - if (WRITE_STATS) { - System.out.println("Wrote " + words.size() + " word entries"); - System.out.print("Actual binary size: " + size + " bytes"); - System.out.println(String.format(" (%.1fM)", size/(1024*1024.f))); - - System.out.println("Allocated size: " + (entryCount * BYTES_PER_ENTRY) + " bytes"); - System.out.println("Required bytes per entry: " + (size/ entryCount) + " bytes"); - } - - // Now dump this out as a file - // There's probably an API to do this more efficiently; TODO: Look into this. - byte[] b = new byte[size]; - buffer.rewind(); - buffer.get(b); - FileOutputStream output = Files.newOutputStreamSupplier(file).getOutput(); - output.write(b); - output.close(); - } - - // For debugging only - private String dumpEntry(int offset) { - if (DEBUG_SEARCH) { - int end = offset; - while (mData[end] != 0) { - end++; - } - return new String(mData, offset, end - offset, Charsets.UTF_8); - } else { - return "<disabled>"; //$NON-NLS-1$ - } - } - - /** Comparison function: *only* used for ASCII strings */ - @VisibleForTesting - static int compare(byte[] data, int offset, byte terminator, CharSequence s, - int begin, int end) { - int i = offset; - int j = begin; - for (; ; i++, j++) { - byte b = data[i]; - if (b == ' ') { - // We've matched up to the space in a split-word typo, such as - // in German all zu=>allzu; here we've matched just past "all". - // Rather than terminating, attempt to continue in the buffer. - if (j == end) { - int max = s.length(); - if (end < max && s.charAt(end) == ' ') { - // Find next word - for (; end < max; end++) { - char c = s.charAt(end); - if (!Character.isLetter(c)) { - if (c == ' ' && end == j) { - continue; - } - break; - } - } - } - } - } - - if (j == end) { - break; - } - - if (b == '*') { - // Glob match (only supported at the end) - return 0; - } - char c = s.charAt(j); - byte cb = (byte) c; - int delta = b - cb; - if (delta != 0) { - cb = (byte) Character.toLowerCase(c); - if (b != cb) { - // Ensure that it has the right sign - b = (byte) Character.toLowerCase(b); - delta = b - cb; - if (delta != 0) { - return delta; - } - } - } - } - - return data[i] - terminator; - } - - /** Comparison function used for general UTF-8 encoded strings */ - @VisibleForTesting - static int compare(byte[] data, int offset, byte terminator, byte[] s, - int begin, int end) { - int i = offset; - int j = begin; - for (; ; i++, j++) { - byte b = data[i]; - if (b == ' ') { - // We've matched up to the space in a split-word typo, such as - // in German all zu=>allzu; here we've matched just past "all". - // Rather than terminating, attempt to continue in the buffer. - // We've matched up to the space in a split-word typo, such as - // in German all zu=>allzu; here we've matched just past "all". - // Rather than terminating, attempt to continue in the buffer. - if (j == end) { - int max = s.length; - if (end < max && s[end] == ' ') { - // Find next word - for (; end < max; end++) { - byte cb = s[end]; - if (!isLetter(cb)) { - if (cb == ' ' && end == j) { - continue; - } - break; - } - } - } - } - } - - if (j == end) { - break; - } - if (b == '*') { - // Glob match (only supported at the end) - return 0; - } - byte cb = s[j]; - int delta = b - cb; - if (delta != 0) { - cb = toLowerCase(cb); - b = toLowerCase(b); - delta = b - cb; - if (delta != 0) { - return delta; - } - } - - if (b == terminator || cb == terminator) { - return delta; - } - } - - return data[i] - terminator; - } - - /** - * Look up whether this word is a typo, and if so, return the typo itself - * and one or more likely meanings - * - * @param text the string containing the word - * @param begin the index of the first character in the word - * @param end the index of the first character after the word. Note that the - * search may extend <b>beyond</b> this index, if for example the - * word matches a multi-word typo in the dictionary - * @return a list of the typo itself followed by the replacement strings if - * the word represents a typo, and null otherwise - */ - @Nullable - public List<String> getTypos(@NonNull CharSequence text, int begin, int end) { - assert end <= text.length(); - - if (assertionsEnabled()) { - for (int i = begin; i < end; i++) { - char c = text.charAt(i); - if (c >= 128) { - assert false : "Call the UTF-8 version of this method instead"; - return null; - } - } - } - - int low = 0; - int high = mWordCount - 1; - while (low <= high) { - int middle = (low + high) >>> 1; - int offset = mIndices[middle]; - - if (DEBUG_SEARCH) { - System.out.println("Comparing string " + text +" with entry at " + offset - + ": " + dumpEntry(offset)); - } - - // Compare the word at the given index. - int compare = compare(mData, offset, (byte) 0, text, begin, end); - - if (compare == 0) { - offset = mIndices[middle]; - - // Don't allow matching uncapitalized words, such as "enlish", when - // the dictionary word is capitalized, "Enlish". - if (mData[offset] != text.charAt(begin) - && Character.isLowerCase(text.charAt(begin))) { - return null; - } - - // Make sure there is a case match; we only want to allow - // matching capitalized words to capitalized typos or uncapitalized typos - // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized - // typos (e.g. "enlish" to "Enlish"). - String glob = null; - for (int i = begin; ; i++) { - byte b = mData[offset++]; - if (b == 0) { - offset--; - break; - } else if (b == '*') { - int globEnd = i; - while (globEnd < text.length() - && Character.isLetter(text.charAt(globEnd))) { - globEnd++; - } - glob = text.subSequence(i, globEnd).toString(); - break; - } - char c = text.charAt(i); - byte cb = (byte) c; - if (b != cb && i > begin) { - return null; - } - } - - return computeSuggestions(mIndices[middle], offset, glob); - } - - if (compare < 0) { - low = middle + 1; - } else if (compare > 0) { - high = middle - 1; - } else { - assert false; // compare == 0 already handled above - return null; - } - } - - return null; - } - - /** - * Look up whether this word is a typo, and if so, return the typo itself - * and one or more likely meanings - * - * @param utf8Text the string containing the word, encoded as UTF-8 - * @param begin the index of the first character in the word - * @param end the index of the first character after the word. Note that the - * search may extend <b>beyond</b> this index, if for example the - * word matches a multi-word typo in the dictionary - * @return a list of the typo itself followed by the replacement strings if - * the word represents a typo, and null otherwise - */ - @Nullable - public List<String> getTypos(@NonNull byte[] utf8Text, int begin, int end) { - assert end <= utf8Text.length; - - int low = 0; - int high = mWordCount - 1; - while (low <= high) { - int middle = (low + high) >>> 1; - int offset = mIndices[middle]; - - if (DEBUG_SEARCH) { - String s = new String(Arrays.copyOfRange(utf8Text, begin, end), Charsets.UTF_8); - System.out.println("Comparing string " + s +" with entry at " + offset - + ": " + dumpEntry(offset)); - System.out.println(" middle=" + middle + ", low=" + low + ", high=" + high); - } - - // Compare the word at the given index. - int compare = compare(mData, offset, (byte) 0, utf8Text, begin, end); - - if (DEBUG_SEARCH) { - System.out.println(" signum=" + (int)Math.signum(compare) + ", delta=" + compare); - } - - if (compare == 0) { - offset = mIndices[middle]; - - // Don't allow matching uncapitalized words, such as "enlish", when - // the dictionary word is capitalized, "Enlish". - if (mData[offset] != utf8Text[begin] && isUpperCase(mData[offset])) { - return null; - } - - // Make sure there is a case match; we only want to allow - // matching capitalized words to capitalized typos or uncapitalized typos - // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized - // typos (e.g. "enlish" to "Enlish"). - String glob = null; - for (int i = begin; ; i++) { - byte b = mData[offset++]; - if (b == 0) { - offset--; - break; - } else if (b == '*') { - int globEnd = i; - while (globEnd < utf8Text.length && isLetter(utf8Text[globEnd])) { - globEnd++; - } - glob = new String(utf8Text, i, globEnd - i, Charsets.UTF_8); - break; - } - byte cb = utf8Text[i]; - if (b != cb && i > begin) { - return null; - } - } - - return computeSuggestions(mIndices[middle], offset, glob); - } - - if (compare < 0) { - low = middle + 1; - } else if (compare > 0) { - high = middle - 1; - } else { - assert false; // compare == 0 already handled above - return null; - } - } - - return null; - } - - private List<String> computeSuggestions(int begin, int offset, String glob) { - String typo = new String(mData, begin, offset - begin, Charsets.UTF_8); - - if (glob != null) { - typo = typo.replaceAll("\\*", glob); //$NON-NLS-1$ - } - - assert mData[offset] == 0; - offset++; - int replacementEnd = offset; - while (mData[replacementEnd] != 0) { - replacementEnd++; - } - String replacements = new String(mData, offset, replacementEnd - offset, Charsets.UTF_8); - List<String> words = new ArrayList<String>(); - words.add(typo); - - // The first entry should be the typo itself. We need to pass this back since due - // to multi-match words and globbing it could extend beyond the initial word range - - for (String s : Splitter.on(',').omitEmptyStrings().trimResults().split(replacements)) { - if (glob != null) { - // Need to append the glob string to each result - words.add(s.replaceAll("\\*", glob)); //$NON-NLS-1$ - } else { - words.add(s); - } - } - - return words; - } - - // "Character" handling for bytes. This assumes that the bytes correspond to Unicode - // characters in the ISO 8859-1 range, which is are encoded the same way in UTF-8. - // This obviously won't work to for example uppercase to lowercase conversions for - // multi byte characters, which means we simply won't catch typos if the dictionaries - // contain these. None of the currently included dictionaries do. However, it does - // help us properly deal with punctuation and spacing characters. - - static final boolean isUpperCase(byte b) { - return Character.isUpperCase((char) b); - } - - static final byte toLowerCase(byte b) { - return (byte) Character.toLowerCase((char) b); - } - - static final boolean isSpace(byte b) { - return Character.isWhitespace((char) b); - } - - static final boolean isLetter(byte b) { - // Assume that multi byte characters represent letters in other languages. - // Obviously, it could be unusual punctuation etc but letters are more likely - // in this context. - return Character.isLetter((char) b) || (b & 0x80) != 0; - } -} |