Files
restic/internal/repository/index/indexmap.go
T
2026-05-10 00:35:17 +02:00

344 lines
8.7 KiB
Go

package index
import (
"hash/maphash"
"iter"
"math"
"github.com/restic/restic/internal/restic"
)
// An indexMap is a chained hash table that maps blob IDs to indexEntries.
// It allows storing multiple entries with the same key.
//
// indexMap uses some optimizations that are not compatible with supporting
// deletions.
//
// The buckets in this hash table contain only references to entries, rather
// than inlined key-value pairs like the standard Go map. This way, only the
// array of references needs to be reallocated when the table grows,
// preventing memory usage spikes.
//
// A reference is a uint holding the position of an entry in blockList. The
// same encoding is used for the `next` field of an indexEntry, which links
// the entries of one bucket into a chain. On 64-bit systems the actual number
// of entries is far lower than what a uint64 can address. Thus, the upper 28
// bits are used to store a bloom filter, leaving the lower 36 bits for the
// position in the block list. The bloom filter of a reference covers the id
// of the referenced entry and the ids of all entries that follow it in the
// chain. It is used to quickly check if an entry might be present in the map
// before traversing the chain. This significantly reduces the number of cache
// misses when following the `next` field chain for unknown ids.
type indexMap struct {
// Bucket heads. The slice is empty until the map is lazily initialized;
// afterwards the number of buckets is always a power of two and never zero.
buckets []uint
// Number of entries that were added to the map.
numentries uint
// Provides the seed that is used when hashing ids into buckets.
mh maphash.Hash
// Storage for all entries. Position 0 is reserved as the null entry, so a
// reference of 0 marks an empty bucket or the end of a chain.
blockList hashedArrayTree
}
const (
// maxLoad bounds the average chain length: the bucket array is grown until
// it has at least numEntries/maxLoad (rounded up) buckets. Individual
// buckets may still hold more entries than this.
maxLoad = 4 // Max. average number of entries per bucket.
)
// add stores a new indexEntry built from the given arguments in the map,
// using id as the key. Entries that already exist for id are kept; the new
// entry is placed in front of them in the chain of its bucket.
func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompressedLength uint32) {
	// Grow (or lazily initialize) the table before hashing, as the bucket
	// index depends on the current number of buckets.
	m.preallocate(int(m.numentries) + 1)

	bucket := m.hash(id)
	entry, pos := m.newEntry()
	head := m.buckets[bucket]

	*entry = indexEntry{
		id:                 id,
		next:               head, // Prepend to existing chain.
		packIndex:          packIdx,
		offset:             offset,
		length:             length,
		uncompressedLength: uncompressedLength,
	}

	// The bucket now refers to the new entry. Its bloom filter is the filter
	// of the previous chain head extended by id.
	m.buckets[bucket] = bloomInsertID(pos, head, id)
	m.numentries++
}
// values returns an iterator over all entries in the map, in the order of
// their position in the block list. The number of entries is read once when
// the iteration starts.
func (m *indexMap) values() iter.Seq[*indexEntry] {
	return func(yield func(*indexEntry) bool) {
		// Position 0 holds the reserved null entry, so start at 1.
		for pos, end := uint(1), m.blockList.Size(); pos < end; pos++ {
			if keepGoing := yield(m.resolve(pos)); !keepGoing {
				return
			}
		}
	}
}
// valuesWithID returns an iterator over all entries stored under the given
// id. An uninitialized map yields nothing.
func (m *indexMap) valuesWithID(id restic.ID) iter.Seq[*indexEntry] {
	return func(yield func(*indexEntry) bool) {
		if len(m.buckets) == 0 {
			return
		}

		// Consulting the bloom filter before resolving each entry is
		// significantly faster than checking only once at the start.
		for ref := m.buckets[m.hash(id)]; bloomHasID(ref, id); {
			entry := m.resolve(ref)
			// Advance first; the entry is handed to the caller below.
			ref = entry.next

			if entry.id == id && !yield(entry) {
				return
			}
		}
	}
}
// get returns the first entry found for the given id when walking the chain
// of its bucket, or nil if the map holds no entry for id.
func (m *indexMap) get(id restic.ID) *indexEntry {
	if len(m.buckets) == 0 {
		return nil
	}

	// The bloom filter check ends the walk at the end of the chain and, on
	// 64-bit systems, as soon as the remaining chain cannot contain id.
	for ref := m.buckets[m.hash(id)]; bloomHasID(ref, id); {
		entry := m.resolve(ref)
		if entry.id == id {
			return entry
		}
		ref = entry.next
	}
	return nil
}
// firstIndex returns the lowest block list position among all entries stored
// under id, or -1 if there is none.
// This index is guaranteed to never change.
func (m *indexMap) firstIndex(id restic.ID) int {
	if len(m.buckets) == 0 {
		return -1
	}

	lowest := -1
	for ref := m.buckets[m.hash(id)]; bloomHasID(ref, id); {
		entry := m.resolve(ref)
		// casting from uint to int is unproblematic as we'd run out of memory
		// before this can result in an overflow.
		pos := int(bloomCleanID(ref))
		ref = entry.next

		if entry.id != id {
			continue
		}
		if lowest == -1 || pos < lowest {
			lowest = pos
		}
	}
	return lowest
}
// preallocate makes room for numEntries entries. It lazily initializes the
// map, and if the bucket array has to grow it rebuilds all chains and asks
// the block list to reserve space as well. Nothing happens for numEntries == 0
// or when the current number of buckets already suffices.
func (m *indexMap) preallocate(numEntries int) {
	if numEntries == 0 {
		return
	}
	if len(m.buckets) == 0 {
		m.init() // Perform lazy initialization.
	}

	// Keep doubling so that the number of buckets stays a power of two.
	wanted := (numEntries + maxLoad - 1) / maxLoad
	size := len(m.buckets)
	for size < wanted {
		size *= 2
	}
	if size == len(m.buckets) {
		return
	}

	// Rehash: start from empty buckets and prepend every stored entry to its
	// new chain. Position 0 is the null entry and is skipped.
	m.buckets = make([]uint, size)
	for pos, end := uint(1), m.blockList.Size(); pos < end; pos++ {
		entry := m.resolve(pos)
		bucket := m.hash(entry.id)
		entry.next = m.buckets[bucket]
		m.buckets[bucket] = bloomInsertID(pos, entry.next, entry.id)
	}

	m.blockList.preallocate(uint(numEntries))
}
// hash returns the index of the bucket that id belongs to.
//
// m.buckets must not be empty when hash is called; callers check this or
// initialize the map first.
func (m *indexMap) hash(id restic.ID) uint {
	// We use maphash to prevent backups of specially crafted inputs
	// from degrading performance.
	// While SHA-256 should be collision-resistant, for hash table indices
	// we use only a few bits of it and finding collisions for those is
	// much easier than breaking the whole algorithm.
	//
	// maphash.Bytes is the one-shot equivalent of creating a Hash, seeding
	// it, writing the bytes and calling Sum64.
	h := uint(maphash.Bytes(m.mh.Seed(), id[:]))

	// The number of buckets is a power of two, so masking with len-1 selects
	// a valid bucket index.
	return h & uint(len(m.buckets)-1)
}
// init sets up an empty map: a fresh block list whose first slot is reserved,
// and the initial bucket array.
func (m *indexMap) init() {
	const initialBuckets = 64

	// The first entry in blockList serves as the null entry. It is allocated
	// here and never handed out, so position 0 cannot refer to a real entry.
	m.blockList = *newHAT()
	m.newEntry()

	m.buckets = make([]uint, initialBuckets)
}
// len returns the number of entries that were added to the map.
func (m *indexMap) len() uint {
	return m.numentries
}
// newEntry allocates the next slot of the block list and returns it together
// with its position. It panics if the position does not fit into the bits
// that a reference reserves for it.
func (m *indexMap) newEntry() (*indexEntry, uint) {
	slot, pos := m.blockList.Alloc()
	if bloomCleanID(pos) == pos {
		return slot, pos
	}
	// The position would collide with the bloom filter bits.
	panic("repository index size overflow")
}
// resolve returns the entry that the reference idx points to. Bloom filter
// bits contained in idx are ignored.
func (m *indexMap) resolve(idx uint) *indexEntry {
	pos := bloomCleanID(idx)
	return m.blockList.Ref(pos)
}
// bloomShift is the number of low bits of a reference that hold the position
// of an entry in the block list. The bits above it hold the bloom filter.
// On 32-bit systems, the bloom filter compiles away into a no-op.
const bloomShift = 36
// bloomMask selects the position bits of a reference. As an untyped constant
// it does not fit into a 32-bit uint, which is why the helpers convert it via
// a uint64 variable.
const bloomMask = 1<<bloomShift - 1
// bloomCleanID strips the bloom filter bits from the reference idx and
// returns the plain block list position.
func bloomCleanID(idx uint) uint {
	// Convert via a uint64 variable: a direct constant conversion to uint
	// would not compile on 32-bit systems, whereas converting a variable
	// simply truncates.
	mask := uint64(bloomMask)
	return idx & uint(mask)
}
// bloomForID returns the (unshifted) bloom filter for a single id. Exactly
// one bit is set, selected by the first byte of the id.
func bloomForID(id restic.ID) uint {
	// A bloom filter with a single hash function seems to work best.
	// This is probably because the entry chains can be quite long, such that
	// several entries end up in the same bloom filter. In this case, a single
	// hash function yields the lowest false positive rate.
	bit := id[0] % (64 - bloomShift)
	return uint(1) << bit
}
// bloomHasID reports whether the chain that starts at the reference idx could
// contain the id. It returns false only if the chain cannot contain the id.
// It may return true even if the id is not present in the chain. However,
// those false positives are expected to be rare.
func bloomHasID(idx uint, id restic.ID) bool {
	if math.MaxUint != math.MaxUint32 {
		// Test the filter bits stored above the position bits.
		return (idx>>bloomShift)&bloomForID(id) != 0
	}
	// On 32-bit systems, the bloom filter is empty for all entries.
	// Thus, simply check if there is a next entry.
	return idx != 0
}
// bloomInsertID builds the reference for the entry at position idx whose
// successor in the chain is referenced by nextIdx. The result combines idx
// with the bloom filter of nextIdx and the filter bit for id.
func bloomInsertID(idx uint, nextIdx uint, id restic.ID) uint {
	// extra variable to compile on 32bit systems
	mask := uint64(bloomMask)

	inherited := nextIdx &^ uint(mask)
	own := bloomForID(id) << bloomShift
	return idx | inherited | own
}
// An indexEntry is a single value stored in an indexMap.
type indexEntry struct {
// Key under which the entry is stored.
id restic.ID
// Reference to the next entry in the chain of the same bucket. Like the
// bucket heads it may carry bloom filter bits above the position.
next uint
packIndex int // Position in containing Index's packs field.
// The remaining fields are stored as passed to indexMap.add; their meaning
// is defined by the callers.
offset uint32
length uint32
uncompressedLength uint32
}
// A hashedArrayTree stores indexEntries in a list of equally sized blocks.
// An entry position is split into a block number (upper bits) and an offset
// within that block (lower bits). Use newHAT to create a usable tree.
type hashedArrayTree struct {
// Extracts the offset within a block from a position (blockSize - 1).
mask uint
// Number of bits to shift a position right to get its block number.
maskShift uint
// Number of entries per block; blockList is allocated with this length too.
blockSize uint
// Number of entries allocated so far.
size uint
// The blocks. Blocks that were not needed yet are nil.
blockList [][]indexEntry
}
// newHAT returns an empty hashedArrayTree. It starts with a small block size
// of four entries and a block list with room for four blocks; no block is
// allocated yet.
func newHAT() *hashedArrayTree {
	const (
		initialShift     = 2
		initialBlockSize = 1 << initialShift
	)

	tree := new(hashedArrayTree)
	tree.mask = initialBlockSize - 1
	tree.maskShift = initialShift
	tree.blockSize = initialBlockSize
	tree.size = 0
	tree.blockList = make([][]indexEntry, initialBlockSize)
	return tree
}
// Alloc appends a new entry to the tree and returns a pointer to it together
// with its position. Storage is grown first if necessary.
func (h *hashedArrayTree) Alloc() (*indexEntry, uint) {
	h.grow()

	// The new entry takes the first unused position.
	pos := h.size
	h.size = pos + 1

	block, offset := h.index(pos)
	return &h.blockList[block][offset], pos
}
// index splits the position pos into the number of its block (idx) and the
// offset within that block (subIdx).
func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) {
	return pos >> h.maskShift, pos & h.mask
}
// Ref returns a pointer to the entry at position pos. It panics if pos was
// not allocated yet.
func (h *hashedArrayTree) Ref(pos uint) *indexEntry {
	if pos < h.size {
		block, offset := h.index(pos)
		return &h.blockList[block][offset]
	}
	panic("array index out of bounds")
}
// Size returns the number of entries allocated in the tree so far.
func (h *hashedArrayTree) Size() uint {
	allocated := h.size
	return allocated
}
// preallocate makes sure that the block list is large enough to address
// numEntries entries. If it is not, both the block size and the length of the
// block list are doubled until it is, and the existing blocks are merged
// pairwise into blocks of the new size. Blocks that hold no data yet are not
// allocated here; grow does that on demand.
//
// Merging copies the entries into new blocks, so pointers obtained from Alloc
// or Ref before such a resize keep referring to the old copies.
//
// The tree must have been created by newHAT: with a block size of zero the
// doubling below would never make progress.
func (h *hashedArrayTree) preallocate(numEntries uint) {
	if numEntries == 0 {
		// Nothing to reserve. Without this guard numEntries-1 wraps around to
		// the largest uint and the loop below would try to grow the tree to
		// an absurd size.
		return
	}

	// Block number of the last entry that has to be addressable.
	idx, _ := h.index(numEntries - 1)
	for int(idx) >= len(h.blockList) {
		// blockList is too short -> double list and block size
		h.blockSize *= 2
		h.mask = h.mask*2 + 1
		h.maskShift++
		// With blocks twice as large the entry lives in half the block number.
		idx = idx / 2

		oldBlocks := h.blockList
		h.blockList = make([][]indexEntry, h.blockSize)

		// pairwise merging of blocks
		for i := 0; i < len(oldBlocks); i += 2 {
			if oldBlocks[i] == nil && oldBlocks[i+1] == nil {
				// merged all blocks with data. Grow will allocate the block later on
				break
			}

			block := make([]indexEntry, 0, h.blockSize)
			block = append(block, oldBlocks[i]...)
			block = append(block, oldBlocks[i+1]...)
			// make sure to set the correct length as not all old blocks may contain entries yet
			h.blockList[i/2] = block[0:h.blockSize]

			// allow GC
			oldBlocks[i] = nil
			oldBlocks[i+1] = nil
		}
	}
}
// grow makes sure that the slot for the next entry exists: the block list is
// extended if required, and a new block is allocated when the next position
// is the first one of its block.
func (h *hashedArrayTree) grow() {
	h.preallocate(h.size + 1)

	if block, offset := h.index(h.size); offset == 0 {
		// new index entry batch
		h.blockList[block] = make([]indexEntry, h.blockSize)
	}
}