Skip to content

Commit 651fe4b

Browse files
authored
Add doctor command for full GC of LFS (#21978)
The recent PR adding orphaned checks to the LFS storage is not sufficient to completely GC LFS, as it is possible for LFSMetaObjects to remain associated with repos but still need to be garbage collected. Imagine a situation where a branch is uploaded containing LFS files but that branch is later completely deleted. The LFSMetaObjects will remain associated with the Repository but the Repository will no longer contain any pointers to the object. This PR adds a second doctor command to perform a full GC. Signed-off-by: Andrew Thornton <[email protected]>
1 parent 3243dbe commit 651fe4b

File tree

5 files changed

+245
-39
lines changed

5 files changed

+245
-39
lines changed

Diff for: models/git/lfs.go

+54
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package git
66
import (
77
"context"
88
"fmt"
9+
"time"
910

1011
"code.gitea.io/gitea/models/db"
1112
"code.gitea.io/gitea/models/perm"
@@ -14,6 +15,7 @@ import (
1415
user_model "code.gitea.io/gitea/models/user"
1516
"code.gitea.io/gitea/modules/lfs"
1617
"code.gitea.io/gitea/modules/log"
18+
"code.gitea.io/gitea/modules/setting"
1719
"code.gitea.io/gitea/modules/timeutil"
1820
"code.gitea.io/gitea/modules/util"
1921

@@ -180,6 +182,12 @@ func GetLFSMetaObjectByOid(repoID int64, oid string) (*LFSMetaObject, error) {
180182
// RemoveLFSMetaObjectByOid removes a LFSMetaObject entry from database by its OID.
181183
// It may return ErrLFSObjectNotExist or a database error.
182184
func RemoveLFSMetaObjectByOid(repoID int64, oid string) (int64, error) {
185+
return RemoveLFSMetaObjectByOidFn(repoID, oid, nil)
186+
}
187+
188+
// RemoveLFSMetaObjectByOidFn removes a LFSMetaObject entry from database by its OID.
189+
// It may return ErrLFSObjectNotExist or a database error. If fn is non-nil, it is run with the current count within the transaction; an error from fn is returned without committing.
190+
func RemoveLFSMetaObjectByOidFn(repoID int64, oid string, fn func(count int64) error) (int64, error) {
183191
if len(oid) == 0 {
184192
return 0, ErrLFSObjectNotExist
185193
}
@@ -200,6 +208,12 @@ func RemoveLFSMetaObjectByOid(repoID int64, oid string) (int64, error) {
200208
return count, err
201209
}
202210

211+
if fn != nil {
212+
if err := fn(count); err != nil {
213+
return count, err
214+
}
215+
}
216+
203217
return count, committer.Commit()
204218
}
205219

@@ -319,3 +333,43 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
319333
}
320334
return lfsSize, nil
321335
}
336+
337+
type IterateLFSMetaObjectsForRepoOptions struct {
338+
OlderThan time.Time
339+
}
340+
341+
// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
342+
func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(context.Context, *LFSMetaObject, int64) error, opts *IterateLFSMetaObjectsForRepoOptions) error {
343+
var start int
344+
batchSize := setting.Database.IterateBufferSize
345+
engine := db.GetEngine(ctx)
346+
type CountLFSMetaObject struct {
347+
Count int64
348+
LFSMetaObject
349+
}
350+
351+
for {
352+
beans := make([]*CountLFSMetaObject, 0, batchSize)
353+
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
354+
sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
355+
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
356+
Where("`lfs_meta_object`.repository_id = ?", repoID)
357+
if !opts.OlderThan.IsZero() {
358+
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
359+
}
360+
sess.GroupBy("`lfs_meta_object`.id")
361+
if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
362+
return err
363+
}
364+
if len(beans) == 0 {
365+
return nil
366+
}
367+
start += len(beans)
368+
369+
for _, bean := range beans {
370+
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
371+
return err
372+
}
373+
}
374+
}
375+
}

Diff for: modules/doctor/lfs.go

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package doctor
5+
6+
import (
7+
"context"
8+
"fmt"
9+
10+
"code.gitea.io/gitea/modules/log"
11+
"code.gitea.io/gitea/modules/setting"
12+
"code.gitea.io/gitea/services/repository"
13+
)
14+
15+
func init() {
16+
Register(&Check{
17+
Title: "Garbage collect LFS",
18+
Name: "gc-lfs",
19+
IsDefault: false,
20+
Run: garbageCollectLFSCheck,
21+
AbortIfFailed: false,
22+
SkipDatabaseInitialization: false,
23+
Priority: 1,
24+
})
25+
}
26+
27+
func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool) error {
28+
if !setting.LFS.StartServer {
29+
return fmt.Errorf("LFS support is disabled")
30+
}
31+
32+
if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
33+
return err
34+
}
35+
36+
return checkStorage(&checkStorageOptions{LFS: true})(ctx, logger, autofix)
37+
}

Diff for: services/cron/tasks_basic.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func registerRepoHealthCheck() {
6363
for _, arg := range rhcConfig.Args {
6464
args = append(args, git.CmdArg(arg))
6565
}
66-
return repo_service.GitFsck(ctx, rhcConfig.Timeout, args)
66+
return repo_service.GitFsckRepos(ctx, rhcConfig.Timeout, args)
6767
})
6868
}
6969

Diff for: services/repository/check.go

+48-38
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ import (
2222
"xorm.io/builder"
2323
)
2424

25-
// GitFsck calls 'git fsck' to check repository health.
26-
func GitFsck(ctx context.Context, timeout time.Duration, args []git.CmdArg) error {
25+
// GitFsckRepos calls 'git fsck' to check repository health.
26+
func GitFsckRepos(ctx context.Context, timeout time.Duration, args []git.CmdArg) error {
2727
log.Trace("Doing: GitFsck")
2828

2929
if err := db.Iterate(
@@ -35,15 +35,7 @@ func GitFsck(ctx context.Context, timeout time.Duration, args []git.CmdArg) erro
3535
return db.ErrCancelledf("before fsck of %s", repo.FullName())
3636
default:
3737
}
38-
log.Trace("Running health check on repository %v", repo)
39-
repoPath := repo.RepoPath()
40-
if err := git.Fsck(ctx, repoPath, timeout, args...); err != nil {
41-
log.Warn("Failed to health check repository (%v): %v", repo, err)
42-
if err = system_model.CreateRepositoryNotice("Failed to health check repository (%s): %v", repo.FullName(), err); err != nil {
43-
log.Error("CreateRepositoryNotice: %v", err)
44-
}
45-
}
46-
return nil
38+
return GitFsckRepo(ctx, repo, timeout, args)
4739
},
4840
); err != nil {
4941
log.Trace("Error: GitFsck: %v", err)
@@ -54,6 +46,19 @@ func GitFsck(ctx context.Context, timeout time.Duration, args []git.CmdArg) erro
5446
return nil
5547
}
5648

49+
// GitFsckRepo calls 'git fsck' to check an individual repository's health.
50+
func GitFsckRepo(ctx context.Context, repo *repo_model.Repository, timeout time.Duration, args []git.CmdArg) error {
51+
log.Trace("Running health check on repository %-v", repo)
52+
repoPath := repo.RepoPath()
53+
if err := git.Fsck(ctx, repoPath, timeout, args...); err != nil {
54+
log.Warn("Failed to health check repository (%-v): %v", repo, err)
55+
if err = system_model.CreateRepositoryNotice("Failed to health check repository (%s): %v", repo.FullName(), err); err != nil {
56+
log.Error("CreateRepositoryNotice: %v", err)
57+
}
58+
}
59+
return nil
60+
}
61+
5762
// GitGcRepos calls 'git gc' to remove unnecessary files and optimize the local repository
5863
func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg) error {
5964
log.Trace("Doing: GitGcRepos")
@@ -68,33 +73,7 @@ func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg)
6873
return db.ErrCancelledf("before GC of %s", repo.FullName())
6974
default:
7075
}
71-
log.Trace("Running git gc on %v", repo)
72-
command := git.NewCommand(ctx, args...).
73-
SetDescription(fmt.Sprintf("Repository Garbage Collection: %s", repo.FullName()))
74-
var stdout string
75-
var err error
76-
stdout, _, err = command.RunStdString(&git.RunOpts{Timeout: timeout, Dir: repo.RepoPath()})
77-
78-
if err != nil {
79-
log.Error("Repository garbage collection failed for %v. Stdout: %s\nError: %v", repo, stdout, err)
80-
desc := fmt.Sprintf("Repository garbage collection failed for %s. Stdout: %s\nError: %v", repo.RepoPath(), stdout, err)
81-
if err = system_model.CreateRepositoryNotice(desc); err != nil {
82-
log.Error("CreateRepositoryNotice: %v", err)
83-
}
84-
return fmt.Errorf("Repository garbage collection failed in repo: %s: Error: %w", repo.FullName(), err)
85-
}
86-
87-
// Now update the size of the repository
88-
if err := repo_module.UpdateRepoSize(ctx, repo); err != nil {
89-
log.Error("Updating size as part of garbage collection failed for %v. Stdout: %s\nError: %v", repo, stdout, err)
90-
desc := fmt.Sprintf("Updating size as part of garbage collection failed for %s. Stdout: %s\nError: %v", repo.RepoPath(), stdout, err)
91-
if err = system_model.CreateRepositoryNotice(desc); err != nil {
92-
log.Error("CreateRepositoryNotice: %v", err)
93-
}
94-
return fmt.Errorf("Updating size as part of garbage collection failed in repo: %s: Error: %w", repo.FullName(), err)
95-
}
96-
97-
return nil
76+
return GitGcRepo(ctx, repo, timeout, args)
9877
},
9978
); err != nil {
10079
return err
@@ -104,6 +83,37 @@ func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg)
10483
return nil
10584
}
10685

86+
// GitGcRepo calls 'git gc' to remove unnecessary files and optimize the local repository
87+
func GitGcRepo(ctx context.Context, repo *repo_model.Repository, timeout time.Duration, args []git.CmdArg) error {
88+
log.Trace("Running git gc on %-v", repo)
89+
command := git.NewCommand(ctx, args...).
90+
SetDescription(fmt.Sprintf("Repository Garbage Collection: %s", repo.FullName()))
91+
var stdout string
92+
var err error
93+
stdout, _, err = command.RunStdString(&git.RunOpts{Timeout: timeout, Dir: repo.RepoPath()})
94+
95+
if err != nil {
96+
log.Error("Repository garbage collection failed for %v. Stdout: %s\nError: %v", repo, stdout, err)
97+
desc := fmt.Sprintf("Repository garbage collection failed for %s. Stdout: %s\nError: %v", repo.RepoPath(), stdout, err)
98+
if err = system_model.CreateRepositoryNotice(desc); err != nil {
99+
log.Error("CreateRepositoryNotice: %v", err)
100+
}
101+
return fmt.Errorf("Repository garbage collection failed in repo: %s: Error: %w", repo.FullName(), err)
102+
}
103+
104+
// Now update the size of the repository
105+
if err := repo_module.UpdateRepoSize(ctx, repo); err != nil {
106+
log.Error("Updating size as part of garbage collection failed for %-v. Stdout: %s\nError: %v", repo, stdout, err)
107+
desc := fmt.Sprintf("Updating size as part of garbage collection failed for %s. Stdout: %s\nError: %v", repo.RepoPath(), stdout, err)
108+
if err = system_model.CreateRepositoryNotice(desc); err != nil {
109+
log.Error("CreateRepositoryNotice: %v", err)
110+
}
111+
return fmt.Errorf("Updating size as part of garbage collection failed in repo: %s: Error: %w", repo.FullName(), err)
112+
}
113+
114+
return nil
115+
}
116+
107117
func gatherMissingRepoRecords(ctx context.Context) ([]*repo_model.Repository, error) {
108118
repos := make([]*repo_model.Repository, 0, 10)
109119
if err := db.Iterate(

Diff for: services/repository/lfs.go

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package repository
5+
6+
import (
7+
"context"
8+
"fmt"
9+
"time"
10+
11+
"code.gitea.io/gitea/models/db"
12+
git_model "code.gitea.io/gitea/models/git"
13+
repo_model "code.gitea.io/gitea/models/repo"
14+
"code.gitea.io/gitea/modules/git"
15+
"code.gitea.io/gitea/modules/lfs"
16+
"code.gitea.io/gitea/modules/log"
17+
18+
"xorm.io/builder"
19+
)
20+
21+
func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error {
22+
log.Trace("Doing: GarbageCollectLFSMetaObjects")
23+
24+
if err := db.Iterate(
25+
ctx,
26+
builder.And(builder.Gt{"id": 0}),
27+
func(ctx context.Context, repo *repo_model.Repository) error {
28+
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix)
29+
},
30+
); err != nil {
31+
return err
32+
}
33+
34+
log.Trace("Finished: GarbageCollectLFSMetaObjects")
35+
return nil
36+
}
37+
38+
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error {
39+
if logger != nil {
40+
logger.Info("Checking %-v", repo)
41+
}
42+
total, orphaned, collected, deleted := 0, 0, 0, 0
43+
if logger != nil {
44+
defer func() {
45+
if orphaned == 0 {
46+
logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
47+
} else if !autofix {
48+
logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
49+
} else {
50+
logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
51+
}
52+
}()
53+
}
54+
55+
gitRepo, err := git.OpenRepository(ctx, repo.RepoPath())
56+
if err != nil {
57+
log.Error("Unable to open git repository %-v: %v", repo, err)
58+
return err
59+
}
60+
defer gitRepo.Close()
61+
62+
store := lfs.NewContentStore()
63+
64+
return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
65+
total++
66+
pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent()))
67+
68+
if gitRepo.IsObjectExist(pointerSha.String()) {
69+
return nil
70+
}
71+
orphaned++
72+
73+
if !autofix {
74+
return nil
75+
}
76+
// Non-existent pointer file
77+
_, err = git_model.RemoveLFSMetaObjectByOidFn(repo.ID, metaObject.Oid, func(count int64) error {
78+
if count > 0 {
79+
return nil
80+
}
81+
82+
if err := store.Delete(metaObject.RelativePath()); err != nil {
83+
log.Error("Unable to remove lfs metaobject %s from store: %v", metaObject.Oid, err)
84+
}
85+
deleted++
86+
return nil
87+
})
88+
if err != nil {
89+
return fmt.Errorf("unable to remove meta-object %s in %s: %w", metaObject.Oid, repo.FullName(), err)
90+
}
91+
collected++
92+
93+
return nil
94+
}, &git_model.IterateLFSMetaObjectsForRepoOptions{
95+
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
96+
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
97+
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
98+
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
99+
// objects.
100+
//
101+
// It is likely that a week is potentially excessive but it should definitely be enough that any
102+
// unassociated LFS object is genuinely unassociated.
103+
OlderThan: time.Now().Add(-24 * 7 * time.Hour),
104+
})
105+
}

0 commit comments

Comments
 (0)