// Package bloom provides a space-efficient, concurrent-safe Bloom filter
// with zero allocations per operation.
//
// A Bloom filter is a probabilistic data structure that can answer the question
// "Is this item in the set?" with either a definitive No or a probabilistic
// Probably. There are no false negatives: if Contains returns false, the item
// was never added. False positives are possible but their rate is controlled
// by the filter's size and number of hash functions.
//
// The filter uses [atomic.Uint64] for its bitset, making Add and Contains
// fully safe for concurrent use without any locks. Hashers are recycled via
// [sync.Pool] to eliminate per-call heap allocations.
//
// Hash independence is achieved through the Kirsch-Mitzenmacher optimization:
// a single 64-bit hash is split into two 32-bit halves to simulate k
// independent hash functions. The default hasher is FNV-1a from the standard
// library; a custom [hash.Hash64] can be injected via [WithHashFunc].
//
// Filters implement [encoding.BinaryMarshaler] and [encoding.BinaryUnmarshaler]
// for compact serialization.
package bloom
import (
"encoding/binary"
"errors"
"fmt"
"hash"
"hash/fnv"
"math"
"sync"
"sync/atomic"
)
// HashFunc is a function that returns a new hash.Hash64.
// By defining an interface for our hasher, we avoid hardcoding a
// specific algorithm. This gives users the flexibility to swap
// the default hasher (FNV-1a) with a faster one (like Murmur3 or xxHash)
// if their use case requires maximum throughput.
type HashFunc func() hash.Hash64
// Filter represents a Bloom filter.
// We use atomic.Uint64 so that Add and Contains are fully concurrent-safe
// without requiring an external mutex. On modern 64-bit architectures,
// atomic operations on aligned 64-bit words are essentially free.
type Filter struct {
bitset []atomic.Uint64
m uint64 // Total number of bits in the filter
k uint64 // Number of hash functions to apply per element
hasher HashFunc
pool sync.Pool // Pool of hash.Hash64 instances to avoid per-call allocations
}
// Option configures the Filter using the functional options pattern.
// This pattern allows us to provide sensible defaults while
// remaining open for extension later, without breaking the API.
type Option func(*Filter)
// WithHashFunc allows providing a custom hash function constructor.
func WithHashFunc(h HashFunc) Option {
return func(f *Filter) {
f.hasher = h
}
}
// optimalM calculates the ideal total number of bits (m) needed to store
// `n` elements while maintaining a target false positive rate `p`.
// The formula is: m = - (n * ln(p)) / (ln(2)^2)
func optimalM(n int, p float64) uint64 {
return uint64(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
}
// optimalK calculates the ideal number of hash functions (k) to use
// for a given number of bits (`m`) and expected elements (`n`).
// The formula is: k = (m / n) * ln(2)
func optimalK(m uint64, n int) uint64 {
return uint64(math.Ceil((float64(m) / float64(n)) * math.Log(2)))
}
// NewFilter creates a Bloom filter directly with a specific bit size (m)
// and number of hash iterations (k).
func NewFilter(m int, k int, opts ...Option) *Filter {
// Since each uint64 holds 64 bits, we divide `m` by 64.
// We add 63 before dividing to ensure we round up.
bits := uint64(m)
size := (bits + 63) / 64
f := &Filter{
bitset: make([]atomic.Uint64, size),
m: bits,
k: uint64(k),
hasher: fnv.New64a, // FNV-1a is in the standard library and computationally cheap.
}
for _, opt := range opts {
opt(f)
}
// Initialize the sync.Pool with the (possibly overridden) hasher.
// This eliminates the per-call heap allocation of the hash.Hash64 interface.
f.pool = sync.Pool{
New: func() any { return f.hasher() },
}
return f
}
// NewFilterFromProbability creates a Bloom filter tailored for an expected
// number of elements (`n`) and a desired false-positive probability (`p`).
// Example: NewFilterFromProbability(10000, 0.01) creates a filter for 10k items
// with a 1% chance of a false positive.
func NewFilterFromProbability(n int, p float64, opts ...Option) *Filter {
m := optimalM(n, p)
k := optimalK(m, n)
return NewFilter(int(m), int(k), opts...)
}
// iter builds an iterator that yields each bit index for data.
// It computes one 64-bit hash, splits it into two 32-bit halves,
// and applies Kirsch-Mitzenmacher to simulate k independent hashes.
// It pools the hasher to avoid per-call allocations and is safe for concurrent use.
func (f *Filter) iter(data []byte) func(func(uint64) bool) {
return func(yield func(uint64) bool) {
h := f.pool.Get().(hash.Hash64)
h.Reset()
_, _ = h.Write(data)
sum := h.Sum64()
f.pool.Put(h)
h1, h2 := sum&0xffffffff, sum>>32
for i := range f.k {
bitIdx := (h1 + i*h2) % f.m
if !yield(bitIdx) {
return
}
}
}
}
// Add inserts data into the Bloom filter.
// It hashes the data once, splits the 64-bit result into two 32-bit halves,
// and simulates `k` independent hashes via Kirsch-Mitzenmacher.
// This method is safe for concurrent use.
func (f *Filter) Add(data []byte) {
for bitIdx := range f.iter(data) {
// Atomically OR the bit to 1, safe for concurrent writes
f.bitset[bitIdx/64].Or(1 << (bitIdx % 64))
}
}
// Contains checks if data might be in the set.
// If *any* of the `k` hash positions are 0, the element was definitively never added.
// If *all* of the positions are 1, it *might* have been added (or we have a collision).
// This method is safe for concurrent use.
func (f *Filter) Contains(data []byte) bool {
for bitIdx := range f.iter(data) {
// Atomically load the word and check the specific bit
if (f.bitset[bitIdx/64].Load() & (1 << (bitIdx % 64))) == 0 {
return false // Definitively not in the set
}
}
return true // Probably in the set
}
// Binary encoding format (little-endian):
// [0:8] m (uint64, total number of bits)
// [8:16] k (uint64, number of hash functions)
// [16:] bitset ((m+63)/64 × 8 bytes, raw uint64 words)
// MarshalBinary implements the encoding.BinaryMarshaler interface.
// It serializes the filter's parameters and bitset into a portable binary format.
// The hash function is NOT serialized — upon UnmarshalBinary, the default
// FNV-1a hasher is used. Use WithHashFunc after unmarshaling if needed.
func (f *Filter) MarshalBinary() ([]byte, error) {
nwords := uint64(len(f.bitset))
buf := make([]byte, 8+8+nwords*8) // m + k + bitset
binary.LittleEndian.PutUint64(buf[0:8], f.m)
binary.LittleEndian.PutUint64(buf[8:16], f.k)
for i := range nwords {
binary.LittleEndian.PutUint64(buf[16+i*8:16+i*8+8], f.bitset[i].Load())
}
return buf, nil
}
// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
// It restores the filter's parameters and bitset from data produced by
// MarshalBinary. If the receiver was created with a custom hasher (via
// WithHashFunc), that hasher is preserved. Otherwise, it defaults to FNV-1a.
func (f *Filter) UnmarshalBinary(data []byte) error {
if len(data) < 16 {
return errors.New("bloom: binary data too short")
}
m := binary.LittleEndian.Uint64(data[0:8])
k := binary.LittleEndian.Uint64(data[8:16])
nwords := (m + 63) / 64
expected := 16 + nwords*8
if uint64(len(data)) != expected {
return fmt.Errorf("bloom: expected %d bytes, got %d", expected, len(data))
}
bitset := make([]atomic.Uint64, nwords)
for i := range nwords {
bitset[i].Store(binary.LittleEndian.Uint64(data[16+i*8 : 16+i*8+8]))
}
f.m = m
f.k = k
f.bitset = bitset
if f.hasher == nil {
f.hasher = fnv.New64a
}
f.pool = sync.Pool{
New: func() any { return f.hasher() },
}
return nil
}