restic prune: Merge three loops over the index

There were three loops over the index in restic prune: one to find
duplicates, one to determine sizes (in pack.Size) and one to generate packInfos.
These three are now one loop. This way, prune doesn't need to construct
a set of duplicate blobs, pack.Size doesn't need to contain special
logic for prune's use case (the onlyHdr argument) and pack.Size doesn't
need to construct a map only to have it immediately transformed into a
different map.

Some quick testing on a 160GiB local repo shows no significant change in
the running time or memory use of restic prune --dry-run.
This commit is contained in:
greatroar
2022-06-05 10:14:32 +02:00
parent b7c990871f
commit 8bdfcf779f
4 changed files with 42 additions and 59 deletions
+33 -47
View File
@@ -242,11 +242,26 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
Verbosef("searching used packs...\n")
indexPack := make(map[restic.ID]packInfo)
keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet()
// iterate over all blobs in index to find out which blobs are duplicates
// iterate over all blobs in index to generate packInfo and find duplicates
for blob := range repo.Index().Each(ctx) {
ip, seen := indexPack[blob.PackID]
if seen {
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
} else {
ip = packInfo{
tpe: blob.Type,
usedSize: pack.HeaderSize,
}
}
ip.usedSize += uint64(pack.CalculateEntrySize(blob.Blob))
bh := blob.BlobHandle
size := uint64(blob.Length)
switch {
@@ -255,14 +270,27 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
keepBlobs.Insert(bh)
stats.size.used += size
stats.blobs.used++
case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh)
ip.usedSize += size
ip.usedBlobs++
case keepBlobs.Has(bh): // duplicate of a blob that we want to keep
stats.size.duplicate += size
stats.blobs.duplicate++
default:
ip.usedSize += size
ip.duplicateBlobs++
default: // unused, don't care if it's a duplicate
stats.size.unused += size
stats.blobs.unused++
ip.unusedSize += size
ip.unusedBlobs++
}
if !blob.IsCompressed() {
ip.uncompressed = true
}
// update indexPack
indexPack[blob.PackID] = ip
}
// Check if all used blobs have been found in index
@@ -275,48 +303,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
return errorIndexIncomplete
}
indexPack := make(map[restic.ID]packInfo)
// save computed pack header size
for pid, hdrSize := range pack.Size(ctx, repo.Index(), true) {
// initialize tpe with NumBlobTypes to indicate it's not set
indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
}
// iterate over all blobs in index to generate packInfo
for blob := range repo.Index().Each(ctx) {
ip := indexPack[blob.PackID]
// Set blob type if not yet set
if ip.tpe == restic.NumBlobTypes {
ip.tpe = blob.Type
}
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
bh := blob.BlobHandle
size := uint64(blob.Length)
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size
ip.duplicateBlobs++
case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
default: // unused blob
ip.unusedSize += size
ip.unusedBlobs++
}
if !blob.IsCompressed() {
ip.uncompressed = true
}
// update indexPack
indexPack[blob.PackID] = ip
}
Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()
+1 -1
View File
@@ -98,7 +98,7 @@ func rebuildIndex(opts RebuildIndexOptions, gopts GlobalOptions, repo *repositor
if err != nil {
return err
}
packSizeFromIndex = pack.Size(ctx, repo.Index(), false)
packSizeFromIndex = pack.Size(ctx, repo.Index())
}
Verbosef("getting pack files to read...\n")