diff --git a/changelog/unreleased/issue-5293 b/changelog/unreleased/issue-5293 new file mode 100644 index 000000000..7c5cc8e04 --- /dev/null +++ b/changelog/unreleased/issue-5293 @@ -0,0 +1,8 @@ +Change: prune small packfiles more aggressively + +The `prune` command now repacks more small packfiles by default. The option `--repack-small` +is not longer needed and has been marked as deprecated. You can still use `--repack-smaller-than` +to further control repacking of small pack files. + +https://github.com/restic/restic/issues/5293 +https://github.com/restic/restic/pull/21803 diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 87251c672..df08c3a33 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -64,7 +64,6 @@ type PruneOptions struct { MaxRepackBytes uint64 RepackCacheableOnly bool - RepackSmall bool RepackUncompressed bool SmallPackSize string @@ -78,12 +77,19 @@ func (opts *PruneOptions) AddFlags(f *pflag.FlagSet) { } func (opts *PruneOptions) AddLimitedFlags(f *pflag.FlagSet) { + var unused bool f.StringVar(&opts.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')") f.StringVar(&opts.MaxRepackSize, "max-repack-size", "", "stop after repacking this much data in total (allowed suffixes for `size`: k/K, m/M, g/G, t/T)") f.BoolVar(&opts.RepackCacheableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") - f.BoolVar(&opts.RepackSmall, "repack-small", false, "repack pack files below 80% of target pack size") + f.BoolVar(&unused, "repack-small", false, "deprecated. Use --repack-smaller-than to specify a minimum size") f.BoolVar(&opts.RepackUncompressed, "repack-uncompressed", false, "repack all uncompressed data") - f.StringVar(&opts.SmallPackSize, "repack-smaller-than", "", "pack `below-limit` packfiles (allowed suffixes: k/K, m/M)") + f.StringVar(&opts.SmallPackSize, "repack-smaller-than", "", "pack `below-limit` packfiles (allowed suffixes: m/M)") + + err := f.MarkDeprecated("repack-small", "small files are automatically repacked. Use --repack-smaller-than to specify a minimum size") + if err != nil { + // MarkDeprecated only returns an error when the flag is not found + panic(err) + } } func verifyPruneOptions(opts *PruneOptions) error { @@ -146,9 +152,10 @@ func verifyPruneOptions(opts *PruneOptions) error { size, err := ui.ParseBytes(opts.SmallPackSize) if err != nil { return errors.Fatalf("invalid number of bytes %q for --repack-smaller-than: %v", opts.SmallPackSize, err) + } else if size <= 0 { + return errors.Fatalf("--repack-smaller-than must be larger than zero") } opts.SmallPackBytes = uint64(size) - opts.RepackSmall = true } return nil @@ -206,7 +213,6 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, repo *repository.R SmallPackBytes: opts.SmallPackBytes, RepackCacheableOnly: opts.RepackCacheableOnly, - RepackSmall: opts.RepackSmall, RepackUncompressed: opts.RepackUncompressed, } @@ -237,40 +243,40 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, repo *repository.R // printPruneStats prints out the statistics func printPruneStats(printer progress.Printer, stats repository.PruneStats) error { - printer.V("\nused: %10d blobs / %s\n", stats.Blobs.Used, ui.FormatBytes(stats.Size.Used)) + printer.V("\nused: %10d blobs / %s", stats.Blobs.Used, ui.FormatBytes(stats.Size.Used)) if stats.Blobs.Duplicate > 0 { - printer.V("duplicates: %10d blobs / %s\n", stats.Blobs.Duplicate, ui.FormatBytes(stats.Size.Duplicate)) + printer.V("duplicates: %10d blobs / %s", stats.Blobs.Duplicate, ui.FormatBytes(stats.Size.Duplicate)) } - printer.V("unused: %10d blobs / %s\n", stats.Blobs.Unused, ui.FormatBytes(stats.Size.Unused)) + printer.V("unused: %10d blobs / %s", stats.Blobs.Unused, ui.FormatBytes(stats.Size.Unused)) if stats.Size.Unref > 0 { - printer.V("unreferenced: %s\n", ui.FormatBytes(stats.Size.Unref)) + printer.V("unreferenced: %s", ui.FormatBytes(stats.Size.Unref)) } totalBlobs := stats.Blobs.Used + stats.Blobs.Unused + stats.Blobs.Duplicate totalSize := stats.Size.Used + stats.Size.Duplicate + stats.Size.Unused + stats.Size.Unref unusedSize := stats.Size.Duplicate + stats.Size.Unused - printer.V("total: %10d blobs / %s\n", totalBlobs, ui.FormatBytes(totalSize)) - printer.V("unused size: %s of total size\n", ui.FormatPercent(unusedSize, totalSize)) + printer.V("total: %10d blobs / %s", totalBlobs, ui.FormatBytes(totalSize)) + printer.V("unused size: %s of total size", ui.FormatPercent(unusedSize, totalSize)) - printer.P("\nto repack: %10d blobs / %s\n", stats.Blobs.Repack, ui.FormatBytes(stats.Size.Repack)) - printer.P("this removes: %10d blobs / %s\n", stats.Blobs.Repackrm, ui.FormatBytes(stats.Size.Repackrm)) - printer.P("to delete: %10d blobs / %s\n", stats.Blobs.Remove, ui.FormatBytes(stats.Size.Remove+stats.Size.Unref)) + printer.P("\nto repack: %10d blobs / %s", stats.Blobs.Repack, ui.FormatBytes(stats.Size.Repack)) + printer.P("this removes: %10d blobs / %s", stats.Blobs.Repackrm, ui.FormatBytes(stats.Size.Repackrm)) + printer.P("to delete: %10d blobs / %s", stats.Blobs.Remove, ui.FormatBytes(stats.Size.Remove+stats.Size.Unref)) totalPruneSize := stats.Size.Remove + stats.Size.Repackrm + stats.Size.Unref - printer.P("total prune: %10d blobs / %s\n", stats.Blobs.Remove+stats.Blobs.Repackrm, ui.FormatBytes(totalPruneSize)) + printer.P("total prune: %10d blobs / %s", stats.Blobs.Remove+stats.Blobs.Repackrm, ui.FormatBytes(totalPruneSize)) if stats.Size.Uncompressed > 0 { - printer.P("not yet compressed: %s\n", ui.FormatBytes(stats.Size.Uncompressed)) + printer.P("not yet compressed: %s", ui.FormatBytes(stats.Size.Uncompressed)) } - printer.P("remaining: %10d blobs / %s\n", totalBlobs-(stats.Blobs.Remove+stats.Blobs.Repackrm), ui.FormatBytes(totalSize-totalPruneSize)) + printer.P("remaining: %10d blobs / %s", totalBlobs-(stats.Blobs.Remove+stats.Blobs.Repackrm), ui.FormatBytes(totalSize-totalPruneSize)) unusedAfter := unusedSize - stats.Size.Remove - stats.Size.Repackrm - printer.P("unused size after prune: %s (%s of remaining size)\n", + printer.P("unused size after prune: %s (%s of remaining size)", ui.FormatBytes(unusedAfter), ui.FormatPercent(unusedAfter, totalSize-totalPruneSize)) - printer.P("\n") - printer.V("totally used packs: %10d\n", stats.Packs.Used) - printer.V("partly used packs: %10d\n", stats.Packs.PartlyUsed) + printer.P("") + printer.V("totally used packs: %10d", stats.Packs.Used) + printer.V("partly used packs: %10d", stats.Packs.PartlyUsed) printer.V("unused packs: %10d\n\n", stats.Packs.Unused) - printer.V("to keep: %10d packs\n", stats.Packs.Keep) - printer.V("to repack: %10d packs\n", stats.Packs.Repack) - printer.V("to delete: %10d packs\n", stats.Packs.Remove) + printer.V("to keep: %10d packs", stats.Packs.Keep) + printer.V("to repack: %10d packs", stats.Packs.Repack) + printer.V("to delete: %10d packs", stats.Packs.Remove) if stats.Packs.Unref > 0 { printer.V("to delete: %10d unreferenced packs\n\n", stats.Packs.Unref) } @@ -279,7 +285,7 @@ func printPruneStats(printer progress.Printer, stats repository.PruneStats) erro func getUsedBlobs(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet, ignoreSnapshots restic.IDSet, printer progress.Printer) error { var snapshotTrees restic.IDs - printer.P("loading all snapshots...\n") + printer.P("loading all snapshots...") err := data.ForAllSnapshots(ctx, repo, repo, ignoreSnapshots, func(id restic.ID, sn *data.Snapshot, err error) error { if err != nil { @@ -294,7 +300,7 @@ func getUsedBlobs(ctx context.Context, repo restic.Repository, usedBlobs restic. return errors.Fatalf("failed loading snapshot: %v", err) } - printer.P("finding data that is still in use for %d snapshots\n", len(snapshotTrees)) + printer.P("finding data that is still in use for %d snapshots", len(snapshotTrees)) bar := printer.NewCounter("snapshots") bar.SetMax(uint64(len(snapshotTrees))) diff --git a/cmd/restic/cmd_prune_integration_test.go b/cmd/restic/cmd_prune_integration_test.go index 1061966b3..74dc6d363 100644 --- a/cmd/restic/cmd_prune_integration_test.go +++ b/cmd/restic/cmd_prune_integration_test.go @@ -67,11 +67,6 @@ func testPruneVariants(t *testing.T, unsafeNoSpaceRecovery bool) { checkOpts := CheckOptions{ReadData: true} testPrune(t, opts, checkOpts) }) - t.Run("Small", func(t *testing.T) { - opts := PruneOptions{MaxUnused: "unlimited", RepackSmall: true} - checkOpts := CheckOptions{ReadData: true, CheckUnused: true} - testPrune(t, opts, checkOpts) - }) } func createPrunableRepo(t *testing.T, env *testEnvironment) { diff --git a/doc/060_forget.rst b/doc/060_forget.rst index 711644d14..346c13f32 100644 --- a/doc/060_forget.rst +++ b/doc/060_forget.rst @@ -286,7 +286,7 @@ snapshot only need to match a single option to be kept (the results are ORed). This means that the most recent snapshot would match both hourly, daily and weekly ``--keep-*`` options, and possibly more depending on calendar. -Let's look at a simple example: Suppose you have made backups every weekday for the past two weeks, +Let's look at a simple example: Suppose you have made backups every weekday for the past two weeks, and on Fridays, you make an additional backup: .. code-block:: console @@ -309,7 +309,7 @@ and on Fridays, you make an additional backup: --------------------------------------------------------------- 11 snapshots -However, for some reason, you missed making a backup last Wednesday. +However, for some reason, you missed making a backup last Wednesday. Then ``forget --keep-daily 5`` will keep only one snapshot per day for the last five days that had a snapshot and remove the other snapshots: @@ -341,9 +341,9 @@ five days that had a snapshot and remove the other snapshots: --------------------------------------------------------------- 6 snapshots -As you can see, this kept a snapshot from the previous Friday since you missed the Wednesday backup. -If you use ``forget --keep-within-daily 7d`` instead, you will only keep -at most one daily backup from any given day for the last seven days. In this example, it means that no backups from the first week will be kept, +As you can see, this kept a snapshot from the previous Friday since you missed the Wednesday backup. +If you use ``forget --keep-within-daily 7d`` instead, you will only keep +at most one daily backup from any given day for the last seven days. In this example, it means that no backups from the first week will be kept, regardless of how many backups exist in the second week: .. code-block:: console @@ -497,9 +497,6 @@ The ``prune`` command accepts the following options: your repository exceeds the value given by ``--max-unused``. The default value is false. -- ``--repack-small`` if set will repack pack files below 80% of target pack size. - The default value is false. - - ``--repack-smaller-than`` will repack all packfiles below the size of ``--repack-smaller-than``. This allows repacking packfiles that initially came from a repository with a smaller ``--pack-size`` to be compacted into larger packfiles. diff --git a/internal/repository/prune.go b/internal/repository/prune.go index 843837617..89c0a8b7a 100644 --- a/internal/repository/prune.go +++ b/internal/repository/prune.go @@ -1,11 +1,14 @@ package repository import ( + "cmp" "context" "fmt" "math" + "slices" "sort" + "github.com/restic/restic/internal/debug" "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/repository/index" "github.com/restic/restic/internal/repository/pack" @@ -27,7 +30,6 @@ type PruneOptions struct { SmallPackBytes uint64 RepackCacheableOnly bool - RepackSmall bool RepackUncompressed bool } @@ -188,7 +190,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs *i "Integrity check failed: Data seems to be missing.\n"+ "Will not start prune to prevent (additional) data loss!\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+ - "https://github.com/restic/restic/issues/new/choose\n", missingBlobs) + "https://github.com/restic/restic/issues/new/choose", missingBlobs) return nil, nil, ErrIndexIncomplete } @@ -320,6 +322,43 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs *i return usedBlobs, indexPack, nil } +// calculateTargetPacksize calculates the packsize as +// 0.8 * max(4MB, third percentile of all packfile sizes) +func calculateTargetPacksize(opts PruneOptions, indexPack map[restic.ID]packInfo) (targetPackSize uint) { + // For an empty repository, the target pack size does not matter. + targetPackSize = 0 + if len(indexPack) > 0 { + type ToSort struct { + packID restic.ID + size uint64 + } + + // calculate 3rd percentile from sorted packfiles + toSort := make([]ToSort, 0, len(indexPack)) + for packID, ip := range indexPack { + toSort = append(toSort, ToSort{packID, uint64(ip.usedSize + ip.unusedSize)}) + } + slices.SortFunc(toSort, func(a, b ToSort) int { + return cmp.Compare(a.size, b.size) + }) + + // Using the approximatelly 3rd percentile is just a heuristic and may not always be the optimal choice. + // However, using a low percentile ensures that only a small fraction of the repository + // may end up being repacked. By using 80% of that perecentile or the minimum pack size, + // we ensure that no repacking happens if the repository already has no small pack files. + index := len(indexPack) * 3 / 100 + targetPackSize = max(MinPackSize, uint(toSort[index].size)) * 4 / 5 + debug.Log("targetPackSize %d, minimum pack size %d, 3rd percentile %d", targetPackSize, MinPackSize, toSort[index].size) + } + + if opts.SmallPackBytes > 0 { + // used option --repack-smaller-than if it is set + targetPackSize = uint(opts.SmallPackBytes) + } + + return targetPackSize +} + func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, indexPack map[restic.ID]packInfo, stats *PruneStats, printer progress.Printer) (PrunePlan, error) { removePacksFirst := restic.NewIDSet() removePacks := restic.NewIDSet() @@ -328,15 +367,8 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, var repackCandidates []packInfoWithID var repackSmallCandidates []packInfoWithID repoVersion := repo.Config().Version - // only repack very small files by default - targetPackSize := repo.PackSize() / 25 - if opts.SmallPackBytes > 0 { - targetPackSize = uint(opts.SmallPackBytes) - } else if opts.RepackSmall { - // consider files with at least 80% of the target size as large enough - targetPackSize = repo.PackSize() / 5 * 4 - } + targetPackSize := calculateTargetPacksize(opts, indexPack) // loop over all packs and decide what to do bar := printer.NewCounter("packs processed") bar.SetMax(uint64(len(indexPack))) @@ -344,7 +376,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, p, ok := indexPack[id] if !ok { // Pack was not referenced in index and is not used => immediately remove! - printer.V("will remove pack %v as it is unused and not indexed\n", id.Str()) + printer.V("will remove pack %v as it is unused and not indexed", id.Str()) removePacksFirst.Insert(id) stats.Size.Unref += uint64(packSize) return nil @@ -354,7 +386,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, // Pack size does not fit and pack is needed => error // If the pack is not needed, this is no error, the pack can // and will be simply removed, see below. - printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'.\n", + printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'", id.Str(), p.unusedSize+p.usedSize, packSize) return ErrSizeNotMatching } @@ -428,16 +460,16 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, } if len(indexPack) != 0 { - printer.E("The index references %d needed pack files which are missing from the repository:\n", len(indexPack)) + printer.E("The index references %d needed pack files which are missing from the repository:", len(indexPack)) for id := range indexPack { - printer.E(" %v\n", id) + printer.E(" %v", id) } return PrunePlan{}, ErrPacksMissing } if len(ignorePacks) != 0 { printer.E("Missing but unneeded pack files are referenced in the index, will be repaired\n") for id := range ignorePacks { - printer.E("will forget missing pack file %v\n", id) + printer.E("will forget missing pack file %v", id) } } @@ -454,6 +486,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, // Instead of unused[i] / used[i] > unused[j] / used[j] we use // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 // Moreover packs containing trees and too short packs are sorted to the beginning + debug.Log("%d candidate packfiles to repack", len(repackCandidates)) sort.Slice(repackCandidates, func(i, j int) bool { pi := repackCandidates[i].packInfo pj := repackCandidates[j].packInfo @@ -479,6 +512,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, if p.uncompressed { stats.Size.Uncompressed -= p.unusedSize + p.usedSize } + debug.Log("repack %s(%s) %10d %10d", id.Str(), p.tpe, p.usedSize, p.unusedSize) } // calculate limit for number of unused bytes in the repo after repacking @@ -579,7 +613,7 @@ func (plan *PrunePlan) Execute(ctx context.Context, printer progress.Printer) er printer.E("%v was not repacked\n\n"+ "Integrity check failed.\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+ - "https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs) + "https://github.com/restic/restic/issues/new/choose", plan.keepBlobs) return errors.Fatal("internal error: blobs were not repacked") } @@ -608,7 +642,7 @@ func (plan *PrunePlan) Execute(ctx context.Context, printer progress.Printer) er } if len(plan.removePacks) != 0 { - printer.P("removing %d old packs\n", len(plan.removePacks)) + printer.P("removing %d old packs", len(plan.removePacks)) _ = deleteFiles(ctx, true, &internalRepository{repo}, plan.removePacks, restic.PackFile, printer) } if ctx.Err() != nil { @@ -637,12 +671,12 @@ func deleteFiles(ctx context.Context, ignoreError bool, repo restic.RemoverUnpac return restic.ParallelRemove(ctx, repo, fileList, fileType, func(id restic.ID, err error) error { if err != nil { - printer.E("unable to remove %v/%v from the repository\n", fileType, id) + printer.E("unable to remove %v/%v from the repository", fileType, id) if !ignoreError { return err } } - printer.VV("removed %v/%v\n", fileType, id) + printer.VV("removed %v/%v", fileType, id) return nil }, bar) } diff --git a/internal/repository/prune_test.go b/internal/repository/prune_test.go index a363acd41..668866808 100644 --- a/internal/repository/prune_test.go +++ b/internal/repository/prune_test.go @@ -96,7 +96,6 @@ func TestPrune(t *testing.T) { opts: repository.PruneOptions{ MaxRepackBytes: math.MaxUint64, MaxUnusedBytes: func(used uint64) (unused uint64) { return math.MaxUint64 }, - RepackSmall: true, }, errOnUnused: true, }, @@ -160,7 +159,6 @@ func TestPruneSmall(t *testing.T) { MaxRepackBytes: math.MaxUint64, MaxUnusedBytes: func(used uint64) (unused uint64) { return blobSize / 4 }, SmallPackBytes: 5 * 1024 * 1024, - RepackSmall: true, } plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet) error { for blob := range keep {