Add cron method to gc LFS MetaObjects (#22385)

This PR adds a task to the cron service to allow garbage collection of
LFS meta objects. As repositories may have a large number of
LFSMetaObjects, an updated column is added to this table and it is used
to perform a generational GC to attempt to reduce the amount of work.
(There may need to be a bit more work here but this is probably enough
for the moment.)

Fix #7045

Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
zeripath
2023-01-16 19:50:53 +00:00
committed by GitHub
parent 04c97aa364
commit 2cc3a6381c
9 changed files with 255 additions and 35 deletions

View File

@ -175,6 +175,48 @@ func registerDeleteOldSystemNotices() {
})
}
// registerGCLFS registers the "gc_lfs" cron task, which runs
// repo_service.GarbageCollectLFSMetaObjects with AutoFix enabled. Nothing is registered when
// setting.LFS.StartServer is false. By default the task is disabled, does not run at start
// and is scheduled every 24 hours.
func registerGCLFS() {
	if !setting.LFS.StartServer {
		return
	}

	// GCLFSConfig is the configuration of the gc_lfs task.
	type GCLFSConfig struct {
		OlderThanConfig
		// LastUpdatedMoreThanAgo is how long ago a meta object must last have been updated
		// before it is looked at again.
		LastUpdatedMoreThanAgo time.Duration
		// NumberToCheckPerRepo and ProportionToCheckPerRepo are the per-repository limits
		// handed to the garbage collector.
		NumberToCheckPerRepo     int64
		ProportionToCheckPerRepo float64
	}

	RegisterTaskFatal("gc_lfs", &GCLFSConfig{
		OlderThanConfig: OlderThanConfig{
			BaseConfig: BaseConfig{
				Enabled:    false,
				RunAtStart: false,
				Schedule:   "@every 24h",
			},
			// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
			// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
			// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
			// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
			// objects.
			//
			// It is likely that a week is potentially excessive but it should definitely be enough that any
			// unassociated LFS object is genuinely unassociated.
			OlderThan: 24 * time.Hour * 7,
		},
		// Only GC things that haven't been looked at in the past 3 days
		LastUpdatedMoreThanAgo:   24 * time.Hour * 3,
		NumberToCheckPerRepo:     100,
		ProportionToCheckPerRepo: 0.6,
	}, func(ctx context.Context, _ *user_model.User, config Config) error {
		gcLFSConfig := config.(*GCLFSConfig)

		// Take a single timestamp so that both cut-offs are measured from the same instant.
		now := time.Now()

		// The per-repo limits must be forwarded explicitly; otherwise the configured values
		// are silently ignored and every run falls back to the zero-value options.
		return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{
			AutoFix:                  true,
			OlderThan:                now.Add(-gcLFSConfig.OlderThan),
			UpdatedLessRecentlyThan:  now.Add(-gcLFSConfig.LastUpdatedMoreThanAgo),
			NumberToCheckPerRepo:     gcLFSConfig.NumberToCheckPerRepo,
			ProportionToCheckPerRepo: gcLFSConfig.ProportionToCheckPerRepo,
		})
	})
}
func initExtendedTasks() {
registerDeleteInactiveUsers()
registerDeleteRepositoryArchives()
@ -188,4 +230,5 @@ func initExtendedTasks() {
registerDeleteOldActions()
registerUpdateGiteaChecker()
registerDeleteOldSystemNotices()
registerGCLFS()
}

View File

@ -5,49 +5,67 @@ package repository
import (
"context"
"errors"
"fmt"
"time"
"code.gitea.io/gitea/models/db"
git_model "code.gitea.io/gitea/models/git"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log"
"xorm.io/builder"
"code.gitea.io/gitea/modules/setting"
)
func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error {
log.Trace("Doing: GarbageCollectLFSMetaObjects")
if err := db.Iterate(
ctx,
builder.And(builder.Gt{"id": 0}),
func(ctx context.Context, repo *repo_model.Repository) error {
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix)
},
); err != nil {
return err
}
log.Trace("Finished: GarbageCollectLFSMetaObjects")
return nil
// GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function
type GarbageCollectLFSMetaObjectsOptions struct {
// Logger, when non-nil, receives Info-level progress messages.
Logger log.Logger
// AutoFix selects whether orphaned meta objects are acted upon; when false they are only counted.
AutoFix bool
// OlderThan and UpdatedLessRecentlyThan are time cut-offs that are forwarded to the
// per-repository meta object iteration.
OlderThan time.Time
UpdatedLessRecentlyThan time.Time
// NumberToCheckPerRepo caps how many meta objects are examined per repository; the cap is
// only enforced when it is greater than zero.
NumberToCheckPerRepo int64
// ProportionToCheckPerRepo is multiplied by the count reported for a repository to obtain an
// alternative cap, which replaces NumberToCheckPerRepo when it is larger and
// NumberToCheckPerRepo is non-zero.
ProportionToCheckPerRepo float64
}
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error {
if logger != nil {
logger.Info("Checking %-v", repo)
// GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories
//
// It returns nil without doing any work when setting.LFS.StartServer is false. Otherwise it
// calls GarbageCollectLFSMetaObjectsForRepo for each repository ID handed to the callback of
// git_model.IterateRepositoryIDsWithLFSMetaObjects and returns that iteration's result.
func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMetaObjectsOptions) error {
log.Trace("Doing: GarbageCollectLFSMetaObjects")
defer log.Trace("Finished: GarbageCollectLFSMetaObjects")
if !setting.LFS.StartServer {
if opts.Logger != nil {
opts.Logger.Info("LFS support is disabled")
}
return nil
}
total, orphaned, collected, deleted := 0, 0, 0, 0
if logger != nil {
return git_model.IterateRepositoryIDsWithLFSMetaObjects(ctx, func(ctx context.Context, repoID, count int64) error {
repo, err := repo_model.GetRepositoryByID(ctx, repoID)
if err != nil {
return err
}
// Raise the per-repo cap to the configured proportion of count when that is larger than
// the current cap; a cap of 0 is left untouched.
// NOTE(review): opts is captured by this closure, so a raised cap carries over to the
// repositories visited afterwards — confirm whether that is intended.
if newMinimum := int64(float64(count) * opts.ProportionToCheckPerRepo); newMinimum > opts.NumberToCheckPerRepo && opts.NumberToCheckPerRepo != 0 {
opts.NumberToCheckPerRepo = newMinimum
}
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts)
})
}
// GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, opts GarbageCollectLFSMetaObjectsOptions) error {
if opts.Logger != nil {
opts.Logger.Info("Checking %-v", repo)
}
total, orphaned, collected, deleted := int64(0), 0, 0, 0
if opts.Logger != nil {
defer func() {
if orphaned == 0 {
logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
} else if !autofix {
logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
opts.Logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
} else if !opts.AutoFix {
opts.Logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
} else {
logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
opts.Logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
}
}()
}
@ -60,17 +78,21 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
defer gitRepo.Close()
store := lfs.NewContentStore()
errStop := errors.New("STOPERR")
return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo {
return errStop
}
total++
pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent()))
if gitRepo.IsObjectExist(pointerSha.String()) {
return nil
return git_model.MarkLFSMetaObject(ctx, metaObject.ID)
}
orphaned++
if !autofix {
if !opts.AutoFix {
return nil
}
// Non-existent pointer file
@ -100,6 +122,19 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
//
// It is likely that a week is potentially excessive but it should definitely be enough that any
// unassociated LFS object is genuinely unassociated.
OlderThan: time.Now().Add(-24 * 7 * time.Hour),
OlderThan: opts.OlderThan,
UpdatedLessRecentlyThan: opts.UpdatedLessRecentlyThan,
OrderByUpdated: true,
LoopFunctionAlwaysUpdates: true,
})
if err == errStop {
if opts.Logger != nil {
opts.Logger.Info("Processing stopped at %d total LFSMetaObjects in %-v", total, repo)
}
return nil
} else if err != nil {
return err
}
return nil
}