1
0
mirror of https://github.com/rclone/rclone.git synced 2025-12-16 00:04:40 +00:00

cmd/dedupe: make largest directory primary to minimize data moved (#3648)

This change makes dedupe recursively count elements in same-named directories
and make the largest one primary. This minimizes the amount of data
moved (or at least the number of API calls) when dedupe merges them.
It also adds a new fs.Object interface `ParentIDer` with the method `ParentID` and
implements it for the drive and opendrive backends. This method returns the
parent directory ID for objects on filesystems that allow same-named dirs.
We use it to correctly count the sizes of same-named directories.

Fixes #2568

Co-authored-by: Ivan Andreev <ivandeex@gmail.com>
This commit is contained in:
Saksham Khanna
2021-03-11 23:10:29 +05:30
committed by GitHub
parent 6a9ae32012
commit 4d8ef7bca7
5 changed files with 187 additions and 67 deletions

View File

@@ -247,20 +247,82 @@ func (x *DeduplicateMode) Type() string {
return "string"
}
// dedupeDir is a directory with its entry count and a link to its parent.
//
// Instances are created lazily by dedupeDirsMap.get, so a dedupeDir may
// exist with only count set (dir nil, parent empty) when it has been
// referenced as a parent but not filled in as a directory entry itself.
type dedupeDir struct {
	// dir is the directory entry this record describes.
	dir fs.Directory
	// parent is the map key of the enclosing directory; the empty string
	// stops the upward walk in dedupeDirsMap.increment.
	parent string
	// count is bumped by dedupeDirsMap.increment once per call for this
	// directory and once per call for anything whose parent chain reaches it.
	count int
}
// dedupeDirsMap is a map of directories by ID with recursive counts.
//
// Entries are created on demand by get and their counts are maintained
// by increment.
type dedupeDirsMap map[string]*dedupeDir
// get returns the record stored under id. When the map holds nothing
// (or a nil pointer) for that key, a fresh zero-valued record is stored
// under id and returned instead, so the result is never nil.
func (dm dedupeDirsMap) get(id string) *dedupeDir {
	if existing := dm[id]; existing != nil {
		return existing
	}
	created := new(dedupeDir)
	dm[id] = created
	return created
}
// increment adds one to the count of the directory keyed by parent and
// to the count of every ancestor reachable by following the parent
// links, stopping at the first empty key. Records that do not exist yet
// are created on the way via get. An empty parent is a no-op.
func (dm dedupeDirsMap) increment(parent string) {
	id := parent
	for id != "" {
		ancestor := dm.get(id)
		ancestor.count++
		// Step up one level; a record with no known parent ends the walk.
		id = ancestor.parent
	}
}
// dedupeFindDuplicateDirs scans f for duplicate directories
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) {
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
dirsByID := dedupeDirsMap{}
dirs := map[string][]*dedupeDir{}
ci := fs.GetConfig(ctx)
dirs := map[string][]fs.Directory{}
err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error {
entries.ForDir(func(d fs.Directory) {
dirs[d.Remote()] = append(dirs[d.Remote()], d)
})
err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
for _, entry := range entries {
remote := entry.Remote()
parentRemote := path.Dir(remote)
if parentRemote == "." {
parentRemote = ""
}
// Obtain ID of the object parent, if known.
// (This usually means that backend allows duplicate paths)
// Fall back to remote parent path, if unavailable.
var parent string
if entryParentIDer, ok := entry.(fs.ParentIDer); ok {
parent = entryParentIDer.ParentID()
}
if parent == "" {
parent = parentRemote
}
var ID string
if entryIDer, ok := entry.(fs.IDer); ok {
ID = entryIDer.ID()
}
if ID == "" {
ID = remote
}
if fsDir, ok := entry.(fs.Directory); ok {
d := dirsByID.get(ID)
d.dir = fsDir
d.parent = parent
dirs[remote] = append(dirs[remote], d)
}
dirsByID.increment(parent)
}
return nil
})
if err != nil {
return nil, errors.Wrap(err, "find duplicate dirs")
}
// make sure parents are before children
// Make sure parents are before children
duplicateNames := []string{}
for name, ds := range dirs {
if len(ds) > 1 {
@@ -268,15 +330,15 @@ func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, er
}
}
sort.Strings(duplicateNames)
duplicateDirs := [][]fs.Directory{}
for _, name := range duplicateNames {
duplicateDirs = append(duplicateDirs, dirs[name])
}
return duplicateDirs, nil
return
}
// dedupeMergeDuplicateDirs merges all the duplicate directories found
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error {
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
mergeDirs := f.Features().MergeDirs
if mergeDirs == nil {
return errors.Errorf("%v: can't merge directories", f)
@@ -285,15 +347,30 @@ func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs
if dirCacheFlush == nil {
return errors.Errorf("%v: can't flush dir cache", f)
}
for _, dirs := range duplicateDirs {
if !SkipDestructive(ctx, dirs[0], "merge duplicate directories") {
fs.Infof(dirs[0], "Merging contents of duplicate directories")
err := mergeDirs(ctx, dirs)
if err != nil {
err = fs.CountError(err)
fs.Errorf(nil, "merge duplicate dirs: %v", err)
for _, dedupeDirs := range duplicateDirs {
if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") {
continue
}
// Put largest directory in front to minimize movements
fsDirs := []fs.Directory{}
largestCount := -1
largestIdx := 0
for i, d := range dedupeDirs {
fsDirs = append(fsDirs, d.dir)
if d.count > largestCount {
largestIdx = i
largestCount = d.count
}
}
fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx]
fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
err := mergeDirs(ctx, fsDirs)
if err != nil {
err = fs.CountError(err)
fs.Errorf(nil, "merge duplicate dirs: %v", err)
}
}
dirCacheFlush()
return nil
@@ -335,15 +412,16 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool
if err != nil {
return err
}
if len(duplicateDirs) != 0 {
if len(duplicateDirs) > 0 {
if mode != DeduplicateList {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if err != nil {
return err
}
} else {
for _, dir := range duplicateDirs {
fmt.Printf("%s: %d duplicates of this directory\n", dir[0].Remote(), len(dir))
for _, dedupeDirs := range duplicateDirs {
remote := dedupeDirs[0].dir.Remote()
fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
}
}
}
@@ -375,42 +453,43 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool
}
for remote, objs := range files {
if len(objs) > 1 {
fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
if !byHash && mode != DeduplicateList {
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
}
}
switch mode {
case DeduplicateInteractive:
dedupeInteractive(ctx, f, ht, remote, objs, byHash)
case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest:
sortOldestFirst(objs)
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateOldest:
sortOldestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateRename:
dedupeRename(ctx, f, remote, objs)
case DeduplicateLargest:
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateSmallest:
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateSkip:
fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
case DeduplicateList:
dedupeList(ctx, f, ht, remote, objs, byHash)
default:
//skip
if len(objs) <= 1 {
continue
}
fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
if !byHash && mode != DeduplicateList {
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
}
}
switch mode {
case DeduplicateInteractive:
dedupeInteractive(ctx, f, ht, remote, objs, byHash)
case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest:
sortOldestFirst(objs)
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateOldest:
sortOldestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateRename:
dedupeRename(ctx, f, remote, objs)
case DeduplicateLargest:
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateSmallest:
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateSkip:
fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
case DeduplicateList:
dedupeList(ctx, f, ht, remote, objs, byHash)
default:
//skip
}
}
return nil
}