package index import ( "hash/maphash" "iter" "math" "github.com/restic/restic/internal/restic" ) // An indexMap is a chained hash table that maps blob IDs to indexEntries. // It allows storing multiple entries with the same key. // // IndexMap uses some optimizations that are not compatible with supporting // deletions. // // The buckets in this hash table contain only pointers, rather than inlined // key-value pairs like the standard Go map. This way, only a pointer array // needs to be resized when the table grows, preventing memory usage spikes. // // On 64-bit systems, the id of an indexEntry is a uint64 containing the index // of the entry in the `buckets` slice. This index is also stored in the // `next` field of an indexEntry. However, the actual number of entries // is far lower. Thus, the upper 28 bits are used to store a bloom filter, // leaving the lower 36 bits for the index in the block list. The bloom filter // is used to quickly check if an entry might be present in the map before // traversing the block list. This significantly reduces the number of cache // misses when following the `next` field chain for unknown ids. type indexMap struct { // The number of buckets is always a power of two and never zero. buckets []uint numentries uint mh maphash.Hash blockList hashedArrayTree } const ( maxLoad = 4 // Max. number of entries per bucket. ) // add inserts an indexEntry for the given arguments into the map, // using id as the key. func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompressedLength uint32) { // Make sure there is enough space for the new entry. m.preallocate(int(m.numentries) + 1) h := m.hash(id) e, idx := m.newEntry() e.id = id e.next = m.buckets[h] // Prepend to existing chain. e.packIndex = packIdx e.offset = offset e.length = length e.uncompressedLength = uncompressedLength m.buckets[h] = bloomInsertID(idx, e.next, id) m.numentries++ } // values returns an iterator over all entries in the map. func (m *indexMap) values() iter.Seq[*indexEntry] { return func(yield func(*indexEntry) bool) { blockCount := m.blockList.Size() for i := uint(1); i < blockCount; i++ { if !yield(m.resolve(i)) { return } } } } // valuesWithID returns an iterator over all entries with the given id. func (m *indexMap) valuesWithID(id restic.ID) iter.Seq[*indexEntry] { return func(yield func(*indexEntry) bool) { if len(m.buckets) == 0 { return } h := m.hash(id) ei := m.buckets[h] // checking before resolving each entry is significantly faster than // checking only once at the start. for bloomHasID(ei, id) { e := m.resolve(ei) ei = e.next if e.id != id { continue } if !yield(e) { return } } } } // get returns the first entry for the given id. func (m *indexMap) get(id restic.ID) *indexEntry { if len(m.buckets) == 0 { return nil } h := m.hash(id) ei := m.buckets[h] for bloomHasID(ei, id) { e := m.resolve(ei) if e.id == id { return e } ei = e.next } return nil } // firstIndex returns the index of the first entry for ID id. // This index is guaranteed to never change. func (m *indexMap) firstIndex(id restic.ID) int { if len(m.buckets) == 0 { return -1 } idx := -1 h := m.hash(id) ei := m.buckets[h] for bloomHasID(ei, id) { e := m.resolve(ei) cur := bloomCleanID(ei) ei = e.next if e.id != id { continue } if int(cur) < idx || idx == -1 { // casting from uint to int is unproblematic as we'd run out of memory // before this can result in an overflow. idx = int(cur) } } return idx } func (m *indexMap) preallocate(numEntries int) { if numEntries == 0 { return } if len(m.buckets) == 0 { m.init() // Perform lazy initialization. } // new size must be a power of two newSize := len(m.buckets) for newSize < (numEntries+maxLoad-1)/maxLoad { newSize *= 2 } if newSize == len(m.buckets) { return } m.buckets = make([]uint, newSize) blockCount := m.blockList.Size() for i := uint(1); i < blockCount; i++ { e := m.resolve(i) h := m.hash(e.id) e.next = m.buckets[h] m.buckets[h] = bloomInsertID(i, e.next, e.id) } m.blockList.preallocate(uint(numEntries)) } func (m *indexMap) hash(id restic.ID) uint { // We use maphash to prevent backups of specially crafted inputs // from degrading performance. // While SHA-256 should be collision-resistant, for hash table indices // we use only a few bits of it and finding collisions for those is // much easier than breaking the whole algorithm. mh := maphash.Hash{} mh.SetSeed(m.mh.Seed()) _, _ = mh.Write(id[:]) h := uint(mh.Sum64()) return h & uint(len(m.buckets)-1) } func (m *indexMap) init() { const initialBuckets = 64 m.buckets = make([]uint, initialBuckets) // first entry in blockList serves as null byte m.blockList = *newHAT() m.newEntry() } func (m *indexMap) len() uint { return m.numentries } func (m *indexMap) newEntry() (*indexEntry, uint) { entry, idx := m.blockList.Alloc() if idx != bloomCleanID(idx) { panic("repository index size overflow") } return entry, idx } func (m *indexMap) resolve(idx uint) *indexEntry { return m.blockList.Ref(bloomCleanID(idx)) } // On 32-bit systems, the bloom filter compiles away into a no-op. const bloomShift = 36 const bloomMask = 1<> bloomShift return bloom&bloomForID(id) != 0 } func bloomInsertID(idx uint, nextIdx uint, id restic.ID) uint { // extra variable to compile on 32bit systems bloomMask := uint64(bloomMask) oldBloom := (nextIdx & ^uint(bloomMask)) newBloom := bloomForID(id) << bloomShift return idx | oldBloom | newBloom } type indexEntry struct { id restic.ID next uint packIndex int // Position in containing Index's packs field. offset uint32 length uint32 uncompressedLength uint32 } type hashedArrayTree struct { mask uint maskShift uint blockSize uint size uint blockList [][]indexEntry } func newHAT() *hashedArrayTree { // start with a small block size blockSizePower := uint(2) blockSize := uint(1 << blockSizePower) return &hashedArrayTree{ mask: blockSize - 1, maskShift: blockSizePower, blockSize: blockSize, size: 0, blockList: make([][]indexEntry, blockSize), } } func (h *hashedArrayTree) Alloc() (*indexEntry, uint) { h.grow() size := h.size idx, subIdx := h.index(size) h.size++ return &h.blockList[idx][subIdx], size } func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) { subIdx = pos & h.mask idx = pos >> h.maskShift return } func (h *hashedArrayTree) Ref(pos uint) *indexEntry { if pos >= h.size { panic("array index out of bounds") } idx, subIdx := h.index(pos) return &h.blockList[idx][subIdx] } func (h *hashedArrayTree) Size() uint { return h.size } func (h *hashedArrayTree) preallocate(numEntries uint) { idx, _ := h.index(numEntries - 1) for int(idx) >= len(h.blockList) { // blockList is too short -> double list and block size h.blockSize *= 2 h.mask = h.mask*2 + 1 h.maskShift++ idx = idx / 2 oldBlocks := h.blockList h.blockList = make([][]indexEntry, h.blockSize) // pairwise merging of blocks for i := 0; i < len(oldBlocks); i += 2 { if oldBlocks[i] == nil && oldBlocks[i+1] == nil { // merged all blocks with data. Grow will allocate the block later on break } block := make([]indexEntry, 0, h.blockSize) block = append(block, oldBlocks[i]...) block = append(block, oldBlocks[i+1]...) // make sure to set the correct length as not all old blocks may contain entries yet h.blockList[i/2] = block[0:h.blockSize] // allow GC oldBlocks[i] = nil oldBlocks[i+1] = nil } } } func (h *hashedArrayTree) grow() { h.preallocate(h.size + 1) idx, subIdx := h.index(h.size) if subIdx == 0 { // new index entry batch h.blockList[idx] = make([]indexEntry, h.blockSize) } }