From 2cc3a6381cab34113fd2dc2c24ef0efc22c4336d Mon Sep 17 00:00:00 2001
From: zeripath <art27@cantab.net>
Date: Mon, 16 Jan 2023 19:50:53 +0000
Subject: [PATCH] Add cron method to gc LFS MetaObjects (#22385)

This PR adds a task to the cron service to allow garbage collection of
LFS meta objects. As repositories may have a large number of
LFSMetaObjects, an updated column is added to this table and it is used
to perform a generational GC to attempt to reduce the amount of work.
(There may need to be a bit more work here but this is probably enough
for the moment.)

Fix #7045

Signed-off-by: Andrew Thornton <art27@cantab.net>
---
 custom/conf/app.example.ini                   | 22 +++++
 .../doc/advanced/config-cheat-sheet.en-us.md  | 10 ++
 models/git/lfs.go                             | 69 ++++++++++++-
 models/migrations/migrations.go               |  5 +
 models/migrations/v1_19/v238.go               | 27 ++++++
 modules/doctor/lfs.go                         | 16 ++-
 options/locale/locale_en-US.ini               |  1 +
 services/cron/tasks_extended.go               | 43 ++++++++
 services/repository/lfs.go                    | 97 +++++++++++++------
 9 files changed, 255 insertions(+), 35 deletions(-)
 create mode 100644 models/migrations/v1_19/v238.go

diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini
index 3233135e9d..eca1184ff9 100644
--- a/custom/conf/app.example.ini
+++ b/custom/conf/app.example.ini
@@ -2213,6 +2213,28 @@ ROUTER = console
 ;SCHEDULE = @every 168h
 ;OLDER_THAN = 8760h
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Garbage collect LFS pointers in repositories
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;[cron.gc_lfs]
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Garbage collect LFS pointers in repositories (default false)
+;ENABLED = false
+;RUN_AT_START = false
+;; Interval as a duration between each gc run (default every 24h)
+;SCHEDULE = @every 24h
+;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
+;OLDER_THAN = 168h
+;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
+;LAST_UPDATED_MORE_THAN_AGO = 72h
+;; Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
+;NUMBER_TO_CHECK_PER_REPO = 100
+;; Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)
+;PROPORTION_TO_CHECK_PER_REPO = 0.6
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Git Operation timeout in seconds
diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
index 7dbcad6b40..295fa713a4 100644
--- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md
+++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
@@ -1039,6 +1039,16 @@ Default templates for project boards:
 - `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check.
 - `OLDER_THAN`: **@every 8760h**: any system notice older than this expression will be deleted from database.
 
+#### Cron - Garbage collect LFS pointers in repositories (`cron.gc_lfs`)
+
+- `ENABLED`: **false**: Enable service.
+- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED).
+- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check.
+- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
+- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
+- `NUMBER_TO_CHECK_PER_REPO`: **100**: Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
+- `PROPORTION_TO_CHECK_PER_REPO`: **0.6**: Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)
+
 ## Git (`git`)
 
 - `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment.
diff --git a/models/git/lfs.go b/models/git/lfs.go
index 3494264688..0ba8e919d0 100644
--- a/models/git/lfs.go
+++ b/models/git/lfs.go
@@ -115,6 +115,7 @@ type LFSMetaObject struct {
 	RepositoryID int64              `xorm:"UNIQUE(s) INDEX NOT NULL"`
 	Existing     bool               `xorm:"-"`
 	CreatedUnix  timeutil.TimeStamp `xorm:"created"`
+	UpdatedUnix  timeutil.TimeStamp `xorm:"INDEX updated"`
 }
 
 func init() {
@@ -334,8 +335,45 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
 	return lfsSize, nil
 }
 
+// IterateRepositoryIDsWithLFSMetaObjects calls f with each repository ID that has LFSMetaObjects and its object count, in ascending repository ID order; the first error stops the iteration
+func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error {
+	batchSize := setting.Database.IterateBufferSize
+	sess := db.GetEngine(ctx)
+	id := int64(0)
+	type RepositoryCount struct {
+		RepositoryID int64
+		Count        int64
+	}
+	for {
+		counts := make([]*RepositoryCount, 0, batchSize)
+		// Build and execute the query as one chain so every clause applies to the statement that runs.
+		if err := sess.Select("repository_id, COUNT(id) AS count").
+			Table("lfs_meta_object").
+			Where("repository_id > ?", id).
+			GroupBy("repository_id").
+			OrderBy("repository_id ASC").
+			Limit(batchSize, 0).Find(&counts); err != nil {
+			return err
+		}
+		if len(counts) == 0 {
+			return nil
+		}
+
+		for _, count := range counts {
+			if err := f(ctx, count.RepositoryID, count.Count); err != nil {
+				return err
+			}
+		}
+		id = counts[len(counts)-1].RepositoryID
+	}
+}
+
+// IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo
 type IterateLFSMetaObjectsForRepoOptions struct {
-	OlderThan time.Time
+	OlderThan                 time.Time // if non-zero, only objects with created_unix before this are iterated
+	UpdatedLessRecentlyThan   time.Time // if non-zero, only objects with updated_unix before this are iterated
+	OrderByUpdated            bool      // order by updated_unix ASC; otherwise page by ascending id
+	LoopFunctionAlwaysUpdates bool      // when set, the Limit offset is not advanced between batches
 }
 
 // IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
@@ -348,28 +386,53 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
 		LFSMetaObject
 	}
 
+	id := int64(0)
+
 	for {
 		beans := make([]*CountLFSMetaObject, 0, batchSize)
-		// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
 		sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
 			Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
 			Where("`lfs_meta_object`.repository_id = ?", repoID)
 		if !opts.OlderThan.IsZero() {
 			sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
 		}
+		if !opts.UpdatedLessRecentlyThan.IsZero() {
+			sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan)
+		}
 		sess.GroupBy("`lfs_meta_object`.id")
+		if opts.OrderByUpdated {
+			sess.OrderBy("`lfs_meta_object`.updated_unix ASC")
+		} else {
+			sess.And("`lfs_meta_object`.id > ?", id)
+			sess.OrderBy("`lfs_meta_object`.id ASC")
+		}
 		if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
 			return err
 		}
 		if len(beans) == 0 {
 			return nil
 		}
-		start += len(beans)
+		if !opts.LoopFunctionAlwaysUpdates {
+			start += len(beans)
+		}
 
 		for _, bean := range beans {
 			if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
 				return err
 			}
 		}
+		id = beans[len(beans)-1].ID
 	}
 }
+
+// MarkLFSMetaObject sets the updated time of the LFSMetaObject with the given ID to now and returns any update error
+func MarkLFSMetaObject(ctx context.Context, id int64) error {
+	obj := &LFSMetaObject{
+		UpdatedUnix: timeutil.TimeStampNow(),
+	}
+	count, err := db.GetEngine(ctx).ID(id).Update(obj)
+	if err == nil && count != 1 { // the row count is only meaningful when the update itself succeeded
+		log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id)
+	}
+	return err
+}
diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go
index 9d9c8f5165..4e211617c0 100644
--- a/models/migrations/migrations.go
+++ b/models/migrations/migrations.go
@@ -432,6 +432,9 @@ var migrations = []Migration{
 	NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts),
 	// v230 -> v231
 	NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable),
+
+	// Gitea 1.18.0 ends at v231
+
 	// v231 -> v232
 	NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask),
 	// v232 -> v233
@@ -446,6 +449,8 @@ var migrations = []Migration{
 	NewMigration("Create secrets table", v1_19.CreateSecretsTable),
 	// v237 -> v238
 	NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable),
+	// v238 -> v239
+	NewMigration("Add updated unix to LFSMetaObject", v1_19.AddUpdatedUnixToLFSMetaObject),
 }
 
 // GetCurrentDBVersion returns the current db version
diff --git a/models/migrations/v1_19/v238.go b/models/migrations/v1_19/v238.go
new file mode 100644
index 0000000000..266e6cea58
--- /dev/null
+++ b/models/migrations/v1_19/v238.go
@@ -0,0 +1,27 @@
+// Copyright 2022 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package v1_19 //nolint
+
+import (
+	"code.gitea.io/gitea/modules/timeutil"
+
+	"xorm.io/xorm"
+)
+
+// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection
+func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error {
+	// Local copy of the LFSMetaObject table definition with the new indexed UpdatedUnix column;
+	// it is handed to x.Sync below so the migration does not depend on the live model type.
+	// LFSMetaObject stores metadata for LFS tracked files.
+	type LFSMetaObject struct {
+		ID           int64              `xorm:"pk autoincr"`
+		Oid          string             `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"`
+		Size         int64              `json:"size" xorm:"NOT NULL"`
+		RepositoryID int64              `xorm:"UNIQUE(s) INDEX NOT NULL"`
+		CreatedUnix  timeutil.TimeStamp `xorm:"created"`
+		UpdatedUnix  timeutil.TimeStamp `xorm:"INDEX updated"`
+	}
+
+	return x.Sync(new(LFSMetaObject))
+}
diff --git a/modules/doctor/lfs.go b/modules/doctor/lfs.go
index 410ed5a9a5..64ee4c40bf 100644
--- a/modules/doctor/lfs.go
+++ b/modules/doctor/lfs.go
@@ -6,6 +6,7 @@ package doctor
 import (
 	"context"
 	"fmt"
+	"time"
 
 	"code.gitea.io/gitea/modules/log"
 	"code.gitea.io/gitea/modules/setting"
@@ -29,7 +30,20 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool
 		return fmt.Errorf("LFS support is disabled")
 	}
 
-	if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
+	if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{
+		Logger:  logger,
+		AutoFix: autofix,
+		// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
+		// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
+		// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
+		// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
+		// objects.
+		//
+		// It is likely that a week is potentially excessive but it should definitely be enough that any
+		// unassociated LFS object is genuinely unassociated.
+		OlderThan: time.Now().Add(-24 * time.Hour * 7),
+		// We don't set the UpdatedLessRecentlyThan because we want to do a full GC
+	}); err != nil {
 		return err
 	}
 
diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini
index 39aef9d993..8ddd8e40d6 100644
--- a/options/locale/locale_en-US.ini
+++ b/options/locale/locale_en-US.ini
@@ -2554,6 +2554,7 @@ dashboard.delete_old_actions = Delete all old actions from database
 dashboard.delete_old_actions.started = Delete all old actions from database started.
 dashboard.update_checker = Update checker
 dashboard.delete_old_system_notices = Delete all old system notices from database
+dashboard.gc_lfs = Garbage collect LFS meta objects
 
 users.user_manage_panel = User Account Management
 users.new_account = Create User Account
diff --git a/services/cron/tasks_extended.go b/services/cron/tasks_extended.go
index 4486be0c2f..520d940edf 100644
--- a/services/cron/tasks_extended.go
+++ b/services/cron/tasks_extended.go
@@ -175,6 +175,48 @@ func registerDeleteOldSystemNotices() {
 	})
 }
 
+// registerGCLFS registers the "gc_lfs" cron task (skipped when the LFS server is disabled) and forwards all of its settings to the LFS garbage collector
+func registerGCLFS() {
+	if !setting.LFS.StartServer {
+		return
+	}
+	type GCLFSConfig struct {
+		OlderThanConfig
+		LastUpdatedMoreThanAgo   time.Duration
+		NumberToCheckPerRepo     int64
+		ProportionToCheckPerRepo float64
+	}
+	RegisterTaskFatal("gc_lfs", &GCLFSConfig{
+		OlderThanConfig: OlderThanConfig{
+			BaseConfig: BaseConfig{
+				Enabled:    false,
+				RunAtStart: false,
+				Schedule:   "@every 24h",
+			},
+			// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
+			// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
+			// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
+			// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
+			// objects. It is likely that a week is potentially excessive but it should definitely be enough
+			// that any unassociated LFS object is genuinely unassociated.
+			OlderThan: 24 * time.Hour * 7,
+		},
+		// Only GC things that haven't been looked at in the past 3 days
+		LastUpdatedMoreThanAgo:   24 * time.Hour * 3,
+		NumberToCheckPerRepo:     100,
+		ProportionToCheckPerRepo: 0.6,
+	}, func(ctx context.Context, _ *user_model.User, config Config) error {
+		gcLFSConfig := config.(*GCLFSConfig)
+		return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{
+			AutoFix:                  true,
+			OlderThan:                time.Now().Add(-gcLFSConfig.OlderThan),
+			UpdatedLessRecentlyThan:  time.Now().Add(-gcLFSConfig.LastUpdatedMoreThanAgo),
+			NumberToCheckPerRepo:     gcLFSConfig.NumberToCheckPerRepo,
+			ProportionToCheckPerRepo: gcLFSConfig.ProportionToCheckPerRepo,
+		})
+	})
+}
+
 func initExtendedTasks() {
 	registerDeleteInactiveUsers()
 	registerDeleteRepositoryArchives()
@@ -188,4 +230,5 @@ func initExtendedTasks() {
 	registerDeleteOldActions()
 	registerUpdateGiteaChecker()
 	registerDeleteOldSystemNotices()
+	registerGCLFS()
 }
diff --git a/services/repository/lfs.go b/services/repository/lfs.go
index 7806e20a9f..aeb808a72f 100644
--- a/services/repository/lfs.go
+++ b/services/repository/lfs.go
@@ -5,49 +5,67 @@ package repository
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"time"
 
-	"code.gitea.io/gitea/models/db"
 	git_model "code.gitea.io/gitea/models/git"
 	repo_model "code.gitea.io/gitea/models/repo"
 	"code.gitea.io/gitea/modules/git"
 	"code.gitea.io/gitea/modules/lfs"
 	"code.gitea.io/gitea/modules/log"
-
-	"xorm.io/builder"
+	"code.gitea.io/gitea/modules/setting"
 )
 
-func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error {
-	log.Trace("Doing: GarbageCollectLFSMetaObjects")
-
-	if err := db.Iterate(
-		ctx,
-		builder.And(builder.Gt{"id": 0}),
-		func(ctx context.Context, repo *repo_model.Repository) error {
-			return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix)
-		},
-	); err != nil {
-		return err
-	}
-
-	log.Trace("Finished: GarbageCollectLFSMetaObjects")
-	return nil
+// GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function
+type GarbageCollectLFSMetaObjectsOptions struct {
+	Logger                   log.Logger // optional; progress is reported through it when non-nil
+	AutoFix                  bool       // when false, orphaned objects are only counted
+	OlderThan                time.Time  // forwarded as IterateLFSMetaObjectsForRepoOptions.OlderThan
+	UpdatedLessRecentlyThan  time.Time  // forwarded as IterateLFSMetaObjectsForRepoOptions.UpdatedLessRecentlyThan
+	NumberToCheckPerRepo     int64      // per-repo cap on objects examined before stopping; 0 disables the cap
+	ProportionToCheckPerRepo float64    // if this fraction of a repo's object count exceeds a non-zero cap, it replaces the cap
 }
 
-func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error {
-	if logger != nil {
-		logger.Info("Checking %-v", repo)
+// GarbageCollectLFSMetaObjects runs GarbageCollectLFSMetaObjectsForRepo for every repository that has LFSMetaObjects; it does nothing when the LFS server is disabled
+func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMetaObjectsOptions) error {
+	log.Trace("Doing: GarbageCollectLFSMetaObjects")
+	defer log.Trace("Finished: GarbageCollectLFSMetaObjects")
+
+	if !setting.LFS.StartServer {
+		if opts.Logger != nil {
+			opts.Logger.Info("LFS support is disabled")
+		}
+		return nil
 	}
-	total, orphaned, collected, deleted := 0, 0, 0, 0
-	if logger != nil {
+
+	return git_model.IterateRepositoryIDsWithLFSMetaObjects(ctx, func(ctx context.Context, repoID, count int64) error {
+		repo, err := repo_model.GetRepositoryByID(ctx, repoID)
+		if err != nil {
+			return err
+		}
+		repoOpts := opts // per-repo copy: a minimum raised for one repository must not carry over to the next
+		if newMinimum := int64(float64(count) * repoOpts.ProportionToCheckPerRepo); newMinimum > repoOpts.NumberToCheckPerRepo && repoOpts.NumberToCheckPerRepo != 0 {
+			repoOpts.NumberToCheckPerRepo = newMinimum
+		}
+		return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, repoOpts)
+	})
+}
+
+// GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository
+func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, opts GarbageCollectLFSMetaObjectsOptions) error {
+	if opts.Logger != nil {
+		opts.Logger.Info("Checking %-v", repo)
+	}
+	total, orphaned, collected, deleted := int64(0), 0, 0, 0
+	if opts.Logger != nil {
 		defer func() {
 			if orphaned == 0 {
-				logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
-			} else if !autofix {
-				logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
+				opts.Logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
+			} else if !opts.AutoFix {
+				opts.Logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
 			} else {
-				logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
+				opts.Logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
 			}
 		}()
 	}
@@ -60,17 +78,21 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
 	defer gitRepo.Close()
 
 	store := lfs.NewContentStore()
+	errStop := errors.New("STOPERR")
 
-	return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
+	err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
+		if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo {
+			return errStop
+		}
 		total++
 		pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent()))
 
 		if gitRepo.IsObjectExist(pointerSha.String()) {
-			return nil
+			return git_model.MarkLFSMetaObject(ctx, metaObject.ID)
 		}
 		orphaned++
 
-		if !autofix {
+		if !opts.AutoFix {
 			return nil
 		}
 		// Non-existent pointer file
@@ -100,6 +122,19 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
 		//
 		// It is likely that a week is potentially excessive but it should definitely be enough that any
 		// unassociated LFS object is genuinely unassociated.
-		OlderThan: time.Now().Add(-24 * 7 * time.Hour),
+		OlderThan:                 opts.OlderThan,
+		UpdatedLessRecentlyThan:   opts.UpdatedLessRecentlyThan,
+		OrderByUpdated:            true,
+		LoopFunctionAlwaysUpdates: true,
 	})
+
+	if err == errStop {
+		if opts.Logger != nil {
+			opts.Logger.Info("Processing stopped at %d total LFSMetaObjects in %-v", total, repo)
+		}
+		return nil
+	} else if err != nil {
+		return err
+	}
+	return nil
 }