indexmap: use bloom filter to drastically speed up check for unknown blobs

Only in use on 64-bit systems. Use the upper 28 bits of the id of an index entry as a bloom filter. This allows skipping the index entry traversal most of the time if an id is not stored in the hashmap. The bloom filter embedded in the index entry id is checked each time before following a reference to an index entry. This further reduces the risk of false positives. The bloom filter itself is basically free on modern CPUs. The main performance cost of checking for unknown blobs in the index is the essentially random RAM accesses for the initial bucket lookup as well as for following the next pointer in the index entries. With the bloom filter, most of the time only the initial bucket lookup is necessary. This speeds up checking for unknown blobs by a factor of 5 (!), while having no effect on the lookup of known blobs: $ benchstat no-bloom with-bloom name old time/op new time/op delta IndexHasUnknown-16 49.0ms ± 2% 9.9ms ± 7% -79.70% (p=0.000 n=10+10) IndexHasKnown-16 48.0ms ± 3% 47.9ms ± 3% ~ (p=0.968 n=10+9) The bloom filter parameters m=28 k=1 were derived empirically, while also leaving sufficient room for very large repositories: the remaining 36 bits can address 2^36 (roughly 69 billion) index entries. Before this commit, the final merge index step took roughly 1 second per million index entries. At that rate, it would currently take about 19 hours just to merge an index that exhausts those 36 bits. It is safe to assume that such large repositories don't exist.
Comparison with other parameter sets: $ m=28 k=1 versus m=32 k=1 name old time/op new time/op delta IndexHasUnknown-16 49.0ms ± 2% 9.7ms ±16% -80.17% (p=0.000 n=10+10) IndexHasKnown-16 48.0ms ± 3% 48.4ms ± 3% ~ (p=0.436 n=10+10) $ m=28 k=1 versus m=24 k=1 name old time/op new time/op delta IndexHasUnknown-16 49.0ms ± 2% 10.8ms ±13% -77.90% (p=0.000 n=10+10) IndexHasKnown-16 48.0ms ± 3% 47.9ms ± 3% ~ (p=0.684 n=10+10) $ m=28 k=1 versus m=28 k=2 name old time/op new time/op delta IndexHasUnknown-16 49.0ms ± 2% 24.9ms ± 5% -49.27% (p=0.000 n=10+10) IndexHasKnown-16 48.0ms ± 3% 48.0ms ± 4% ~ (p=1.000 n=10+10) `k=2` outright wrecks the performance. This is most likely because it performs worse on longer index entry chains, which also happen to be the expensive ones to process. `m=32` yields diminishing returns, while the remaining 32 index bits get within an order of magnitude of the largest known restic repositories. Design alternatives: In principle it would be possible to add a single large bloom filter instead of embedding a small one in each index entry id. However, this bloom filter would necessarily incur additional random memory accesses and thus slow things down overall.
2026-05-11 21:15:23 +00:00 · 2026-02-14 21:49:17 +01:00
parent 320f709fbc
commit ba638b6602
1 changed files with 62 additions and 8 deletions
@@ -3,6 +3,7 @@ package index
 import (
 	"hash/maphash"
 	"iter"
+	"math"

 	"github.com/restic/restic/internal/restic"
 )
@@ -16,6 +17,15 @@ import (
 // The buckets in this hash table contain only pointers, rather than inlined
 // key-value pairs like the standard Go map. This way, only a pointer array
 // needs to be resized when the table grows, preventing memory usage spikes.
+//
+// On 64-bit systems, the id of an indexEntry is a 64-bit uint holding the
+// index of the entry in the block list. This id is stored in the `buckets`
+// slice and in the `next` field of an indexEntry. However, the actual number
+// of entries is far lower than 2^64. Thus, the upper 28 bits are used to store
+// a bloom filter, leaving the lower 36 bits for the index in the block list.
+// The bloom filter is used to quickly check if an id might be present in an
+// entry chain before traversing it. This significantly reduces the number of
+// cache misses when following the `next` field chain for unknown ids.
 type indexMap struct {
 	// The number of buckets is always a power of two and never zero.
 	buckets    []uint
@@ -50,7 +60,7 @@ func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompr
 	e.length = length
 	e.uncompressedLength = uncompressedLength

-	m.buckets[h] = idx
+	m.buckets[h] = bloomInsertID(idx, e.next, id)
 	m.numentries++
 }

@@ -75,7 +85,9 @@ func (m *indexMap) valuesWithID(id restic.ID) iter.Seq[*indexEntry] {

 		h := m.hash(id)
 		ei := m.buckets[h]
-		for ei != 0 {
+		// checking before resolving each entry is significantly faster than
+		// checking only once at the start.
+		for bloomHasID(ei, id) {
 			e := m.resolve(ei)
 			ei = e.next
 			if e.id != id {
@@ -96,7 +108,7 @@ func (m *indexMap) get(id restic.ID) *indexEntry {

 	h := m.hash(id)
 	ei := m.buckets[h]
-	for ei != 0 {
+	for bloomHasID(ei, id) {
 		e := m.resolve(ei)
 		if e.id == id {
 			return e
@@ -116,9 +128,9 @@ func (m *indexMap) firstIndex(id restic.ID) int {
 	idx := -1
 	h := m.hash(id)
 	ei := m.buckets[h]
-	for ei != 0 {
+	for bloomHasID(ei, id) {
 		e := m.resolve(ei)
-		cur := ei
+		cur := bloomCleanID(ei)
 		ei = e.next
 		if e.id != id {
 			continue
@@ -141,7 +153,7 @@ func (m *indexMap) grow() {

 		h := m.hash(e.id)
 		e.next = m.buckets[h]
-		m.buckets[h] = i
+		m.buckets[h] = bloomInsertID(i, e.next, e.id)
 	}
 }

@@ -169,11 +181,53 @@ func (m *indexMap) init() {
 func (m *indexMap) len() uint { return m.numentries }

 func (m *indexMap) newEntry() (*indexEntry, uint) {
-	return m.blockList.Alloc()
+	entry, idx := m.blockList.Alloc()
+	if idx != bloomCleanID(idx) {
+		panic("repository index size overflow")
+	}
+	return entry, idx
 }

 func (m *indexMap) resolve(idx uint) *indexEntry {
-	return m.blockList.Ref(idx)
+	return m.blockList.Ref(bloomCleanID(idx))
+}
+
+// On 32-bit systems, the bloom filter compiles away into a no-op.
+const bloomShift = 36
+const bloomMask = 1<<bloomShift - 1
+
+func bloomCleanID(idx uint) uint {
+	// Go through a uint64 variable: converting the constant directly to uint overflows and fails to compile on 32-bit systems.
+	bloomMask := uint64(bloomMask)
+	return idx & uint(bloomMask)
+}
+
+func bloomForID(id restic.ID) uint {
+	// A bloom filter with a single hash function seems to work best.
+	// This is probably because the entry chains can be quite long, such that several entries end
+	// up in the same bloom filter. In this case, a single hash function yields the lowest false positive rate.
+	k1 := id[0] % (64 - bloomShift)
+	return uint(1 << k1)
+}
+
+// bloomHasID reports whether the entry chain referenced by idx could contain the id. It returns false only if the chain cannot contain the id.
+// It may return true even if the id is not present in the entry chain. However, those false positives are expected to be rare.
+func bloomHasID(idx uint, id restic.ID) bool {
+	if math.MaxUint == math.MaxUint32 {
+		// On 32-bit systems, the bloom filter is empty for all entries.
+		// Thus, simply check if there is a next entry.
+		return idx != 0
+	}
+	bloom := idx >> bloomShift
+	return bloom&bloomForID(id) != 0
+}
+
+func bloomInsertID(idx uint, nextIdx uint, id restic.ID) uint {
+	// Go through a uint64 variable: converting the constant directly to uint overflows and fails to compile on 32-bit systems.
+	bloomMask := uint64(bloomMask)
+	oldBloom := (nextIdx & ^uint(bloomMask))
+	newBloom := bloomForID(id) << bloomShift
+	return idx | oldBloom | newBloom
 }

 type indexEntry struct {