cmd/dedupe: make largest directory primary to minimize data moved (#3648)
This change makes dedupe recursively count the entries in same-named directories and make the largest one primary. This minimizes the amount of data moved (or at least the number of API calls) when dedupe merges them.

It also adds a new optional fs.Object interface `ParentIDer` with a single method `ParentID` and implements it for the drive and opendrive backends. The method returns the parent directory ID for objects on filesystems that allow same-named directories; we use it to count the sizes of same-named directories correctly.

Fixes #2568

Co-authored-by: Ivan Andreev <ivandeex@gmail.com>
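For context, the new optional interface is tiny. Below is a minimal sketch of its shape plus a hypothetical object type that satisfies it; the canonical definition lives in rclone's fs package, and exampleObject and its fields are purely illustrative, not the actual drive or opendrive code:

// ParentIDer is an optional interface: an entry reports the ID of its
// parent directory, which lets dedupe attribute it to the right directory
// even when several directories share the same path.
type ParentIDer interface {
	// ParentID returns the parent directory ID, or "" if unknown.
	ParentID() string
}

// exampleObject is a hypothetical backend object that satisfies ParentIDer.
type exampleObject struct {
	remote   string // path of the object within the remote
	parentID string // backend ID of the containing directory
}

// ParentID returns the stored parent directory ID ("" when not known).
func (o *exampleObject) ParentID() string { return o.parentID }

dedupe probes for support with a runtime type assertion (entry.(fs.ParentIDer), as in the diff below) and falls back to the parent's remote path when no ID is available.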
@@ -247,20 +247,82 @@ func (x *DeduplicateMode) Type() string {
 	return "string"
 }
 
+// Directory with entry count and links to parents
+type dedupeDir struct {
+	dir    fs.Directory
+	parent string
+	count  int
+}
+
+// Map of directories by ID with recursive counts
+type dedupeDirsMap map[string]*dedupeDir
+
+func (dm dedupeDirsMap) get(id string) *dedupeDir {
+	d := dm[id]
+	if d == nil {
+		d = &dedupeDir{}
+		dm[id] = d
+	}
+	return d
+}
+
+func (dm dedupeDirsMap) increment(parent string) {
+	if parent != "" {
+		d := dm.get(parent)
+		d.count++
+		dm.increment(d.parent)
+	}
+}
+
 // dedupeFindDuplicateDirs scans f for duplicate directories
-func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) {
+func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
+	dirsByID := dedupeDirsMap{}
+	dirs := map[string][]*dedupeDir{}
+
 	ci := fs.GetConfig(ctx)
-	dirs := map[string][]fs.Directory{}
-	err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error {
-		entries.ForDir(func(d fs.Directory) {
-			dirs[d.Remote()] = append(dirs[d.Remote()], d)
-		})
+	err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
+		for _, entry := range entries {
+			remote := entry.Remote()
+			parentRemote := path.Dir(remote)
+			if parentRemote == "." {
+				parentRemote = ""
+			}
+
+			// Obtain ID of the object parent, if known.
+			// (This usually means that backend allows duplicate paths)
+			// Fall back to remote parent path, if unavailable.
+			var parent string
+			if entryParentIDer, ok := entry.(fs.ParentIDer); ok {
+				parent = entryParentIDer.ParentID()
+			}
+			if parent == "" {
+				parent = parentRemote
+			}
+
+			var ID string
+			if entryIDer, ok := entry.(fs.IDer); ok {
+				ID = entryIDer.ID()
+			}
+			if ID == "" {
+				ID = remote
+			}
+
+			if fsDir, ok := entry.(fs.Directory); ok {
+				d := dirsByID.get(ID)
+				d.dir = fsDir
+				d.parent = parent
+				dirs[remote] = append(dirs[remote], d)
+			}
+
+			dirsByID.increment(parent)
+		}
 		return nil
 	})
 	if err != nil {
 		return nil, errors.Wrap(err, "find duplicate dirs")
 	}
-	// make sure parents are before children
+
+	// Make sure parents are before children
 	duplicateNames := []string{}
 	for name, ds := range dirs {
 		if len(ds) > 1 {
@@ -268,15 +330,15 @@ func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, er
 		}
 	}
 	sort.Strings(duplicateNames)
-	duplicateDirs := [][]fs.Directory{}
 	for _, name := range duplicateNames {
 		duplicateDirs = append(duplicateDirs, dirs[name])
 	}
-	return duplicateDirs, nil
+
+	return
 }
 
 // dedupeMergeDuplicateDirs merges all the duplicate directories found
-func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error {
+func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
 	mergeDirs := f.Features().MergeDirs
 	if mergeDirs == nil {
 		return errors.Errorf("%v: can't merge directories", f)
@@ -285,15 +347,30 @@ func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs
 	if dirCacheFlush == nil {
 		return errors.Errorf("%v: can't flush dir cache", f)
 	}
-	for _, dirs := range duplicateDirs {
-		if !SkipDestructive(ctx, dirs[0], "merge duplicate directories") {
-			fs.Infof(dirs[0], "Merging contents of duplicate directories")
-			err := mergeDirs(ctx, dirs)
-			if err != nil {
-				err = fs.CountError(err)
-				fs.Errorf(nil, "merge duplicate dirs: %v", err)
-			}
+	for _, dedupeDirs := range duplicateDirs {
+		if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") {
+			continue
+		}
+
+		// Put largest directory in front to minimize movements
+		fsDirs := []fs.Directory{}
+		largestCount := -1
+		largestIdx := 0
+		for i, d := range dedupeDirs {
+			fsDirs = append(fsDirs, d.dir)
+			if d.count > largestCount {
+				largestIdx = i
+				largestCount = d.count
+			}
+		}
+		fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx]
+
+		fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
+		err := mergeDirs(ctx, fsDirs)
+		if err != nil {
+			err = fs.CountError(err)
+			fs.Errorf(nil, "merge duplicate dirs: %v", err)
 		}
 	}
 	dirCacheFlush()
 	return nil
@@ -335,15 +412,16 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool
 	if err != nil {
 		return err
 	}
-	if len(duplicateDirs) != 0 {
+	if len(duplicateDirs) > 0 {
 		if mode != DeduplicateList {
 			err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
 			if err != nil {
 				return err
 			}
 		} else {
-			for _, dir := range duplicateDirs {
-				fmt.Printf("%s: %d duplicates of this directory\n", dir[0].Remote(), len(dir))
+			for _, dedupeDirs := range duplicateDirs {
+				remote := dedupeDirs[0].dir.Remote()
+				fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
 			}
 		}
 	}
@@ -375,42 +453,43 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool
 	}
 
 	for remote, objs := range files {
-		if len(objs) > 1 {
-			fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
-			if !byHash && mode != DeduplicateList {
-				objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
-				if len(objs) <= 1 {
-					fs.Logf(remote, "All duplicates removed")
-					continue
-				}
-			}
-			switch mode {
-			case DeduplicateInteractive:
-				dedupeInteractive(ctx, f, ht, remote, objs, byHash)
-			case DeduplicateFirst:
-				dedupeDeleteAllButOne(ctx, 0, remote, objs)
-			case DeduplicateNewest:
-				sortOldestFirst(objs)
-				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
-			case DeduplicateOldest:
-				sortOldestFirst(objs)
-				dedupeDeleteAllButOne(ctx, 0, remote, objs)
-			case DeduplicateRename:
-				dedupeRename(ctx, f, remote, objs)
-			case DeduplicateLargest:
-				sortSmallestFirst(objs)
-				dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
-			case DeduplicateSmallest:
-				sortSmallestFirst(objs)
-				dedupeDeleteAllButOne(ctx, 0, remote, objs)
-			case DeduplicateSkip:
-				fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
-			case DeduplicateList:
-				dedupeList(ctx, f, ht, remote, objs, byHash)
-			default:
-				//skip
-			}
+		if len(objs) <= 1 {
+			continue
+		}
+		fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
+		if !byHash && mode != DeduplicateList {
+			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
+			if len(objs) <= 1 {
+				fs.Logf(remote, "All duplicates removed")
+				continue
+			}
+		}
+		switch mode {
+		case DeduplicateInteractive:
+			dedupeInteractive(ctx, f, ht, remote, objs, byHash)
+		case DeduplicateFirst:
+			dedupeDeleteAllButOne(ctx, 0, remote, objs)
+		case DeduplicateNewest:
+			sortOldestFirst(objs)
+			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
+		case DeduplicateOldest:
+			sortOldestFirst(objs)
+			dedupeDeleteAllButOne(ctx, 0, remote, objs)
+		case DeduplicateRename:
+			dedupeRename(ctx, f, remote, objs)
+		case DeduplicateLargest:
+			sortSmallestFirst(objs)
+			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
+		case DeduplicateSmallest:
+			sortSmallestFirst(objs)
+			dedupeDeleteAllButOne(ctx, 0, remote, objs)
+		case DeduplicateSkip:
+			fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
+		case DeduplicateList:
+			dedupeList(ctx, f, ht, remote, objs, byHash)
+		default:
+			//skip
+		}
 	}
 	return nil
 }