prune more aggresively (#21803)

Co-authored-by: Michael Eischer <michael.eischer@fau.de>
This commit is contained in:
Winfried Plappert
2026-05-16 16:49:08 +01:00
committed by GitHub
parent b7d8c98214
commit 990329013e
6 changed files with 98 additions and 60 deletions
+8
View File
@@ -0,0 +1,8 @@
Change: prune small packfiles more aggressively
The `prune` command now repacks more small packfiles by default. The option `--repack-small`
is not longer needed and has been marked as deprecated. You can still use `--repack-smaller-than`
to further control repacking of small pack files.
https://github.com/restic/restic/issues/5293
https://github.com/restic/restic/pull/21803
+32 -26
View File
@@ -64,7 +64,6 @@ type PruneOptions struct {
MaxRepackBytes uint64
RepackCacheableOnly bool
RepackSmall bool
RepackUncompressed bool
SmallPackSize string
@@ -78,12 +77,19 @@ func (opts *PruneOptions) AddFlags(f *pflag.FlagSet) {
}
func (opts *PruneOptions) AddLimitedFlags(f *pflag.FlagSet) {
var unused bool
f.StringVar(&opts.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')")
f.StringVar(&opts.MaxRepackSize, "max-repack-size", "", "stop after repacking this much data in total (allowed suffixes for `size`: k/K, m/M, g/G, t/T)")
f.BoolVar(&opts.RepackCacheableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable")
f.BoolVar(&opts.RepackSmall, "repack-small", false, "repack pack files below 80% of target pack size")
f.BoolVar(&unused, "repack-small", false, "deprecated. Use --repack-smaller-than to specify a minimum size")
f.BoolVar(&opts.RepackUncompressed, "repack-uncompressed", false, "repack all uncompressed data")
f.StringVar(&opts.SmallPackSize, "repack-smaller-than", "", "pack `below-limit` packfiles (allowed suffixes: k/K, m/M)")
f.StringVar(&opts.SmallPackSize, "repack-smaller-than", "", "pack `below-limit` packfiles (allowed suffixes: m/M)")
err := f.MarkDeprecated("repack-small", "small files are automatically repacked. Use --repack-smaller-than to specify a minimum size")
if err != nil {
// MarkDeprecated only returns an error when the flag is not found
panic(err)
}
}
func verifyPruneOptions(opts *PruneOptions) error {
@@ -146,9 +152,10 @@ func verifyPruneOptions(opts *PruneOptions) error {
size, err := ui.ParseBytes(opts.SmallPackSize)
if err != nil {
return errors.Fatalf("invalid number of bytes %q for --repack-smaller-than: %v", opts.SmallPackSize, err)
} else if size <= 0 {
return errors.Fatalf("--repack-smaller-than must be larger than zero")
}
opts.SmallPackBytes = uint64(size)
opts.RepackSmall = true
}
return nil
@@ -206,7 +213,6 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, repo *repository.R
SmallPackBytes: opts.SmallPackBytes,
RepackCacheableOnly: opts.RepackCacheableOnly,
RepackSmall: opts.RepackSmall,
RepackUncompressed: opts.RepackUncompressed,
}
@@ -237,40 +243,40 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, repo *repository.R
// printPruneStats prints out the statistics
func printPruneStats(printer progress.Printer, stats repository.PruneStats) error {
printer.V("\nused: %10d blobs / %s\n", stats.Blobs.Used, ui.FormatBytes(stats.Size.Used))
printer.V("\nused: %10d blobs / %s", stats.Blobs.Used, ui.FormatBytes(stats.Size.Used))
if stats.Blobs.Duplicate > 0 {
printer.V("duplicates: %10d blobs / %s\n", stats.Blobs.Duplicate, ui.FormatBytes(stats.Size.Duplicate))
printer.V("duplicates: %10d blobs / %s", stats.Blobs.Duplicate, ui.FormatBytes(stats.Size.Duplicate))
}
printer.V("unused: %10d blobs / %s\n", stats.Blobs.Unused, ui.FormatBytes(stats.Size.Unused))
printer.V("unused: %10d blobs / %s", stats.Blobs.Unused, ui.FormatBytes(stats.Size.Unused))
if stats.Size.Unref > 0 {
printer.V("unreferenced: %s\n", ui.FormatBytes(stats.Size.Unref))
printer.V("unreferenced: %s", ui.FormatBytes(stats.Size.Unref))
}
totalBlobs := stats.Blobs.Used + stats.Blobs.Unused + stats.Blobs.Duplicate
totalSize := stats.Size.Used + stats.Size.Duplicate + stats.Size.Unused + stats.Size.Unref
unusedSize := stats.Size.Duplicate + stats.Size.Unused
printer.V("total: %10d blobs / %s\n", totalBlobs, ui.FormatBytes(totalSize))
printer.V("unused size: %s of total size\n", ui.FormatPercent(unusedSize, totalSize))
printer.V("total: %10d blobs / %s", totalBlobs, ui.FormatBytes(totalSize))
printer.V("unused size: %s of total size", ui.FormatPercent(unusedSize, totalSize))
printer.P("\nto repack: %10d blobs / %s\n", stats.Blobs.Repack, ui.FormatBytes(stats.Size.Repack))
printer.P("this removes: %10d blobs / %s\n", stats.Blobs.Repackrm, ui.FormatBytes(stats.Size.Repackrm))
printer.P("to delete: %10d blobs / %s\n", stats.Blobs.Remove, ui.FormatBytes(stats.Size.Remove+stats.Size.Unref))
printer.P("\nto repack: %10d blobs / %s", stats.Blobs.Repack, ui.FormatBytes(stats.Size.Repack))
printer.P("this removes: %10d blobs / %s", stats.Blobs.Repackrm, ui.FormatBytes(stats.Size.Repackrm))
printer.P("to delete: %10d blobs / %s", stats.Blobs.Remove, ui.FormatBytes(stats.Size.Remove+stats.Size.Unref))
totalPruneSize := stats.Size.Remove + stats.Size.Repackrm + stats.Size.Unref
printer.P("total prune: %10d blobs / %s\n", stats.Blobs.Remove+stats.Blobs.Repackrm, ui.FormatBytes(totalPruneSize))
printer.P("total prune: %10d blobs / %s", stats.Blobs.Remove+stats.Blobs.Repackrm, ui.FormatBytes(totalPruneSize))
if stats.Size.Uncompressed > 0 {
printer.P("not yet compressed: %s\n", ui.FormatBytes(stats.Size.Uncompressed))
printer.P("not yet compressed: %s", ui.FormatBytes(stats.Size.Uncompressed))
}
printer.P("remaining: %10d blobs / %s\n", totalBlobs-(stats.Blobs.Remove+stats.Blobs.Repackrm), ui.FormatBytes(totalSize-totalPruneSize))
printer.P("remaining: %10d blobs / %s", totalBlobs-(stats.Blobs.Remove+stats.Blobs.Repackrm), ui.FormatBytes(totalSize-totalPruneSize))
unusedAfter := unusedSize - stats.Size.Remove - stats.Size.Repackrm
printer.P("unused size after prune: %s (%s of remaining size)\n",
printer.P("unused size after prune: %s (%s of remaining size)",
ui.FormatBytes(unusedAfter), ui.FormatPercent(unusedAfter, totalSize-totalPruneSize))
printer.P("\n")
printer.V("totally used packs: %10d\n", stats.Packs.Used)
printer.V("partly used packs: %10d\n", stats.Packs.PartlyUsed)
printer.P("")
printer.V("totally used packs: %10d", stats.Packs.Used)
printer.V("partly used packs: %10d", stats.Packs.PartlyUsed)
printer.V("unused packs: %10d\n\n", stats.Packs.Unused)
printer.V("to keep: %10d packs\n", stats.Packs.Keep)
printer.V("to repack: %10d packs\n", stats.Packs.Repack)
printer.V("to delete: %10d packs\n", stats.Packs.Remove)
printer.V("to keep: %10d packs", stats.Packs.Keep)
printer.V("to repack: %10d packs", stats.Packs.Repack)
printer.V("to delete: %10d packs", stats.Packs.Remove)
if stats.Packs.Unref > 0 {
printer.V("to delete: %10d unreferenced packs\n\n", stats.Packs.Unref)
}
@@ -279,7 +285,7 @@ func printPruneStats(printer progress.Printer, stats repository.PruneStats) erro
func getUsedBlobs(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet, ignoreSnapshots restic.IDSet, printer progress.Printer) error {
var snapshotTrees restic.IDs
printer.P("loading all snapshots...\n")
printer.P("loading all snapshots...")
err := data.ForAllSnapshots(ctx, repo, repo, ignoreSnapshots,
func(id restic.ID, sn *data.Snapshot, err error) error {
if err != nil {
@@ -294,7 +300,7 @@ func getUsedBlobs(ctx context.Context, repo restic.Repository, usedBlobs restic.
return errors.Fatalf("failed loading snapshot: %v", err)
}
printer.P("finding data that is still in use for %d snapshots\n", len(snapshotTrees))
printer.P("finding data that is still in use for %d snapshots", len(snapshotTrees))
bar := printer.NewCounter("snapshots")
bar.SetMax(uint64(len(snapshotTrees)))
-5
View File
@@ -67,11 +67,6 @@ func testPruneVariants(t *testing.T, unsafeNoSpaceRecovery bool) {
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("Small", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "unlimited", RepackSmall: true}
checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
}
func createPrunableRepo(t *testing.T, env *testEnvironment) {
+5 -8
View File
@@ -286,7 +286,7 @@ snapshot only need to match a single option to be kept (the results are ORed).
This means that the most recent snapshot would match both hourly,
daily and weekly ``--keep-*`` options, and possibly more depending on calendar.
Let's look at a simple example: Suppose you have made backups every weekday for the past two weeks,
Let's look at a simple example: Suppose you have made backups every weekday for the past two weeks,
and on Fridays, you make an additional backup:
.. code-block:: console
@@ -309,7 +309,7 @@ and on Fridays, you make an additional backup:
---------------------------------------------------------------
11 snapshots
However, for some reason, you missed making a backup last Wednesday.
However, for some reason, you missed making a backup last Wednesday.
Then ``forget --keep-daily 5`` will keep only one snapshot per day for the last
five days that had a snapshot and remove the other snapshots:
@@ -341,9 +341,9 @@ five days that had a snapshot and remove the other snapshots:
---------------------------------------------------------------
6 snapshots
As you can see, this kept a snapshot from the previous Friday since you missed the Wednesday backup.
If you use ``forget --keep-within-daily 7d`` instead, you will only keep
at most one daily backup from any given day for the last seven days. In this example, it means that no backups from the first week will be kept,
As you can see, this kept a snapshot from the previous Friday since you missed the Wednesday backup.
If you use ``forget --keep-within-daily 7d`` instead, you will only keep
at most one daily backup from any given day for the last seven days. In this example, it means that no backups from the first week will be kept,
regardless of how many backups exist in the second week:
.. code-block:: console
@@ -497,9 +497,6 @@ The ``prune`` command accepts the following options:
your repository exceeds the value given by ``--max-unused``.
The default value is false.
- ``--repack-small`` if set will repack pack files below 80% of target pack size.
The default value is false.
- ``--repack-smaller-than`` will repack all packfiles below the size of
``--repack-smaller-than``. This allows repacking packfiles that initially came from a
repository with a smaller ``--pack-size`` to be compacted into larger packfiles.
+53 -19
View File
@@ -1,11 +1,14 @@
package repository
import (
"cmp"
"context"
"fmt"
"math"
"slices"
"sort"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/repository/index"
"github.com/restic/restic/internal/repository/pack"
@@ -27,7 +30,6 @@ type PruneOptions struct {
SmallPackBytes uint64
RepackCacheableOnly bool
RepackSmall bool
RepackUncompressed bool
}
@@ -188,7 +190,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs *i
"Integrity check failed: Data seems to be missing.\n"+
"Will not start prune to prevent (additional) data loss!\n"+
"Please report this error (along with the output of the 'prune' run) at\n"+
"https://github.com/restic/restic/issues/new/choose\n", missingBlobs)
"https://github.com/restic/restic/issues/new/choose", missingBlobs)
return nil, nil, ErrIndexIncomplete
}
@@ -320,6 +322,43 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs *i
return usedBlobs, indexPack, nil
}
// calculateTargetPacksize calculates the packsize as
// 0.8 * max(4MB, third percentile of all packfile sizes)
func calculateTargetPacksize(opts PruneOptions, indexPack map[restic.ID]packInfo) (targetPackSize uint) {
// For an empty repository, the target pack size does not matter.
targetPackSize = 0
if len(indexPack) > 0 {
type ToSort struct {
packID restic.ID
size uint64
}
// calculate 3rd percentile from sorted packfiles
toSort := make([]ToSort, 0, len(indexPack))
for packID, ip := range indexPack {
toSort = append(toSort, ToSort{packID, uint64(ip.usedSize + ip.unusedSize)})
}
slices.SortFunc(toSort, func(a, b ToSort) int {
return cmp.Compare(a.size, b.size)
})
// Using the approximatelly 3rd percentile is just a heuristic and may not always be the optimal choice.
// However, using a low percentile ensures that only a small fraction of the repository
// may end up being repacked. By using 80% of that perecentile or the minimum pack size,
// we ensure that no repacking happens if the repository already has no small pack files.
index := len(indexPack) * 3 / 100
targetPackSize = max(MinPackSize, uint(toSort[index].size)) * 4 / 5
debug.Log("targetPackSize %d, minimum pack size %d, 3rd percentile %d", targetPackSize, MinPackSize, toSort[index].size)
}
if opts.SmallPackBytes > 0 {
// used option --repack-smaller-than if it is set
targetPackSize = uint(opts.SmallPackBytes)
}
return targetPackSize
}
func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, indexPack map[restic.ID]packInfo, stats *PruneStats, printer progress.Printer) (PrunePlan, error) {
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()
@@ -328,15 +367,8 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
var repackCandidates []packInfoWithID
var repackSmallCandidates []packInfoWithID
repoVersion := repo.Config().Version
// only repack very small files by default
targetPackSize := repo.PackSize() / 25
if opts.SmallPackBytes > 0 {
targetPackSize = uint(opts.SmallPackBytes)
} else if opts.RepackSmall {
// consider files with at least 80% of the target size as large enough
targetPackSize = repo.PackSize() / 5 * 4
}
targetPackSize := calculateTargetPacksize(opts, indexPack)
// loop over all packs and decide what to do
bar := printer.NewCounter("packs processed")
bar.SetMax(uint64(len(indexPack)))
@@ -344,7 +376,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
p, ok := indexPack[id]
if !ok {
// Pack was not referenced in index and is not used => immediately remove!
printer.V("will remove pack %v as it is unused and not indexed\n", id.Str())
printer.V("will remove pack %v as it is unused and not indexed", id.Str())
removePacksFirst.Insert(id)
stats.Size.Unref += uint64(packSize)
return nil
@@ -354,7 +386,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
// Pack size does not fit and pack is needed => error
// If the pack is not needed, this is no error, the pack can
// and will be simply removed, see below.
printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'.\n",
printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'",
id.Str(), p.unusedSize+p.usedSize, packSize)
return ErrSizeNotMatching
}
@@ -428,16 +460,16 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
}
if len(indexPack) != 0 {
printer.E("The index references %d needed pack files which are missing from the repository:\n", len(indexPack))
printer.E("The index references %d needed pack files which are missing from the repository:", len(indexPack))
for id := range indexPack {
printer.E(" %v\n", id)
printer.E(" %v", id)
}
return PrunePlan{}, ErrPacksMissing
}
if len(ignorePacks) != 0 {
printer.E("Missing but unneeded pack files are referenced in the index, will be repaired\n")
for id := range ignorePacks {
printer.E("will forget missing pack file %v\n", id)
printer.E("will forget missing pack file %v", id)
}
}
@@ -454,6 +486,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
// Moreover packs containing trees and too short packs are sorted to the beginning
debug.Log("%d candidate packfiles to repack", len(repackCandidates))
sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo
@@ -479,6 +512,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
if p.uncompressed {
stats.Size.Uncompressed -= p.unusedSize + p.usedSize
}
debug.Log("repack %s(%s) %10d %10d", id.Str(), p.tpe, p.usedSize, p.unusedSize)
}
// calculate limit for number of unused bytes in the repo after repacking
@@ -579,7 +613,7 @@ func (plan *PrunePlan) Execute(ctx context.Context, printer progress.Printer) er
printer.E("%v was not repacked\n\n"+
"Integrity check failed.\n"+
"Please report this error (along with the output of the 'prune' run) at\n"+
"https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs)
"https://github.com/restic/restic/issues/new/choose", plan.keepBlobs)
return errors.Fatal("internal error: blobs were not repacked")
}
@@ -608,7 +642,7 @@ func (plan *PrunePlan) Execute(ctx context.Context, printer progress.Printer) er
}
if len(plan.removePacks) != 0 {
printer.P("removing %d old packs\n", len(plan.removePacks))
printer.P("removing %d old packs", len(plan.removePacks))
_ = deleteFiles(ctx, true, &internalRepository{repo}, plan.removePacks, restic.PackFile, printer)
}
if ctx.Err() != nil {
@@ -637,12 +671,12 @@ func deleteFiles(ctx context.Context, ignoreError bool, repo restic.RemoverUnpac
return restic.ParallelRemove(ctx, repo, fileList, fileType, func(id restic.ID, err error) error {
if err != nil {
printer.E("unable to remove %v/%v from the repository\n", fileType, id)
printer.E("unable to remove %v/%v from the repository", fileType, id)
if !ignoreError {
return err
}
}
printer.VV("removed %v/%v\n", fileType, id)
printer.VV("removed %v/%v", fileType, id)
return nil
}, bar)
}
-2
View File
@@ -96,7 +96,6 @@ func TestPrune(t *testing.T) {
opts: repository.PruneOptions{
MaxRepackBytes: math.MaxUint64,
MaxUnusedBytes: func(used uint64) (unused uint64) { return math.MaxUint64 },
RepackSmall: true,
},
errOnUnused: true,
},
@@ -160,7 +159,6 @@ func TestPruneSmall(t *testing.T) {
MaxRepackBytes: math.MaxUint64,
MaxUnusedBytes: func(used uint64) (unused uint64) { return blobSize / 4 },
SmallPackBytes: 5 * 1024 * 1024,
RepackSmall: true,
}
plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet) error {
for blob := range keep {