diff options
Diffstat (limited to 'guava/src/com/google/common/hash/BloomFilterStrategies.java')
-rw-r--r-- | guava/src/com/google/common/hash/BloomFilterStrategies.java | 137 |
1 files changed, 137 insertions, 0 deletions
diff --git a/guava/src/com/google/common/hash/BloomFilterStrategies.java b/guava/src/com/google/common/hash/BloomFilterStrategies.java new file mode 100644 index 0000000..9709e0b --- /dev/null +++ b/guava/src/com/google/common/hash/BloomFilterStrategies.java @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2011 The Guava Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package com.google.common.hash; + +import static com.google.common.base.Preconditions.checkArgument; + +import com.google.common.math.IntMath; + +import java.math.RoundingMode; +import java.util.Arrays; + +/** + * Collections of strategies of generating the k * log(M) bits required for an element to + * be mapped to a BloomFilter of M bits and k hash functions. These + * strategies are part of the serialized form of the Bloom filters that use them, thus they must be + * preserved as is (no updates allowed, only introduction of new versions). + * + * Important: the order of the constants cannot change, and they cannot be deleted - we depend + * on their ordinal for BloomFilter serialization. + * + * @author Dimitris Andreou + */ +enum BloomFilterStrategies implements BloomFilter.Strategy { + /** + * See "Less Hashing, Same Performance: Building a Better Bloom Filter" by Adam Kirsch and + * Michael Mitzenmacher. The paper argues that this trick doesn't significantly deteriorate the + * performance of a Bloom filter (yet only needs two 32bit hash functions). + */ + MURMUR128_MITZ_32() { + @Override public <T> boolean put(T object, Funnel<? super T> funnel, + int numHashFunctions, BitArray bits) { + // TODO(user): when the murmur's shortcuts are implemented, update this code + long hash64 = Hashing.murmur3_128().newHasher().putObject(object, funnel).hash().asLong(); + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + boolean bitsChanged = false; + for (int i = 1; i <= numHashFunctions; i++) { + int nextHash = hash1 + i * hash2; + if (nextHash < 0) { + nextHash = ~nextHash; + } + bitsChanged |= bits.set(nextHash % bits.size()); + } + return bitsChanged; + } + + @Override public <T> boolean mightContain(T object, Funnel<? super T> funnel, + int numHashFunctions, BitArray bits) { + long hash64 = Hashing.murmur3_128().newHasher().putObject(object, funnel).hash().asLong(); + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + for (int i = 1; i <= numHashFunctions; i++) { + int nextHash = hash1 + i * hash2; + if (nextHash < 0) { + nextHash = ~nextHash; + } + if (!bits.get(nextHash % bits.size())) { + return false; + } + } + return true; + } + }; + + // Note: We use this instead of java.util.BitSet because we need access to the long[] data field + static class BitArray { + final long[] data; + int bitCount; + + BitArray(int bits) { + this(new long[IntMath.divide(bits, 64, RoundingMode.CEILING)]); + } + + // Used by serialization + BitArray(long[] data) { + checkArgument(data.length > 0, "data length is zero!"); + this.data = data; + int bitCount = 0; + for (long value : data) { + bitCount += Long.bitCount(value); + } + this.bitCount = bitCount; + } + + /** Returns true if the bit changed value. */ + boolean set(int index) { + if (!get(index)) { + data[index >> 6] |= (1L << index); + bitCount++; + return true; + } + return false; + } + + boolean get(int index) { + return (data[index >> 6] & (1L << index)) != 0; + } + + /** Number of bits */ + int size() { + return data.length * Long.SIZE; + } + + /** Number of set bits (1s) */ + int bitCount() { + return bitCount; + } + + BitArray copy() { + return new BitArray(data.clone()); + } + + @Override public boolean equals(Object o) { + if (o instanceof BitArray) { + BitArray bitArray = (BitArray) o; + return Arrays.equals(data, bitArray.data); + } + return false; + } + + @Override public int hashCode() { + return Arrays.hashCode(data); + } + } +} |