
cluster: make workers write status and controller read the status

The controller will retry the batches if it loses contact with the
worker.
Nick Craig-Wood
2025-10-02 17:27:51 +01:00
parent 09535a06f7
commit ab60a77aba
4 changed files with 261 additions and 8 deletions

View File

@@ -119,6 +119,10 @@ The controller only sends transfer jobs to the workers. All the other
tasks (eg listing, comparing) are done by the controller. The
controller does not execute any transfer tasks itself.
The controller reads the worker status files written to `queue/status` and
uses them to detect workers which have stopped. If it detects a failed
worker it will re-assign that worker's outstanding work.
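
A minimal sketch of that check (illustrative only, not part of rclone; it
copies just the fields it needs from the `WorkerStatus` type and the
`workerTimeout` constant added in this commit):

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// workerStatus is a trimmed stand-in for the cluster package's WorkerStatus -
// only the fields the staleness check needs are included here.
type workerStatus struct {
	ID      string    `json:"id"`
	Updated time.Time `json:"updated"`
}

// workerTimeout mirrors the constant added in cluster.go.
const workerTimeout = 2 * time.Second

// workerLooksDead reports whether a status file is stale enough for the
// controller to re-assign that worker's jobs back to queue/pending.
func workerLooksDead(buf []byte, now time.Time) (bool, error) {
	var status workerStatus
	if err := json.Unmarshal(buf, &status); err != nil {
		return false, fmt.Errorf("decode worker status: %w", err)
	}
	return now.Sub(status.Updated) > workerTimeout, nil
}

func main() {
	// A made-up status file written long ago - the controller would treat
	// this worker as dead and move its processing jobs back to pending.
	buf := []byte(`{"id":"abcdefghij","updated":"2025-10-02T16:00:00Z"}`)
	dead, err := workerLooksDead(buf, time.Now())
	fmt.Println(dead, err)
}
```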
## Workers
The workers' job is entirely to act as API endpoints that receive their
@@ -137,6 +141,27 @@ work via files in `/work`. Then
allows it.
- Repeat
Every second the worker will write a status file in `queue/status` to
be read by the controller.
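
For illustration, the status file is the JSON encoding of the `WorkerStatus`
struct added in this commit. A self-contained sketch of roughly what gets
written (the worker and job IDs are made up, and `rc.Params` is simplified to
a plain map):

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// workerStatus is a simplified copy of the cluster package's WorkerStatus,
// with rc.Params replaced by a plain map so the sketch stands alone.
type workerStatus struct {
	ID      string                    `json:"id"`
	Running map[string]map[string]any `json:"running"` // job ID => stats snapshot
	Done    map[string]bool           `json:"done"`    // job ID => finished
	Updated time.Time                 `json:"updated"`
}

func main() {
	status := workerStatus{
		ID:      "abcdefghij", // hypothetical worker ID
		Running: map[string]map[string]any{"job-0001": {"bytes": 12345}},
		Done:    map[string]bool{"job-0000": true},
		Updated: time.Now().UTC(),
	}
	buf, err := json.MarshalIndent(status, "", "\t")
	if err != nil {
		panic(err)
	}
	// The real worker writes this buffer to queue/status/<worker ID>.json
	// once a second and the controller reads it back on the same interval.
	fmt.Println(string(buf))
}
```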
## Layout of the work directory
The format of the files in this directory may change without notice, but the
layout is documented here as it can help with debugging.
```text
/work - root of the work directory
└── queue - files to control the queue
    ├── done - job files that are finished and read
    ├── finished - job files that are finished but not yet read
    ├── pending - job files that are not started yet
    ├── processing - job files that are running
    └── status - worker status files
```
When debugging, use `--cluster-cleanup none` to leave the completed files in
the directory layout.
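
For example, assuming the shared work directory lives on `remote:work` (and
with a made-up worker ID), the leftover files can be inspected with ordinary
rclone commands:

```text
rclone lsl remote:work/queue                          # list every job and status file
rclone cat remote:work/queue/status/abcdefghij.json   # show one worker's last status
```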
## Flags
### --cluster string

View File

@@ -4,6 +4,7 @@ package cluster
import (
"context"
"encoding/json"
"errors"
"fmt"
"path"
@@ -17,11 +18,17 @@ import (
"github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fs/rc"
"github.com/rclone/rclone/lib/atexit"
"github.com/rclone/rclone/lib/errcount"
"golang.org/x/sync/errgroup"
)
// ErrClusterNotConfigured is returned from creation functions.
var ErrClusterNotConfigured = errors.New("cluster is not configured")
// If we don't hear from workers in this time we assume they have timed out
// and re-assign their jobs.
const workerTimeout = 2 * time.Second
// Cluster describes the workings of the current cluster.
type Cluster struct {
jobs *Jobs
@@ -37,6 +44,9 @@ type Cluster struct {
sync chan chan<- struct{} // sync the current jobs
quitWorkers bool // if set, send workers a stop signal on Shutdown
workers map[string]*WorkerStatus // worker ID => status
deadWorkers map[string]struct{} // worker IDs which have been declared dead
mu sync.Mutex
currentBatch Batch
inflight map[string]Batch
@@ -88,6 +98,8 @@ func NewCluster(ctx context.Context) (*Cluster, error) {
quit: make(chan struct{}),
sync: make(chan chan<- struct{}),
inflight: make(map[string]Batch),
workers: make(map[string]*WorkerStatus),
deadWorkers: make(map[string]struct{}),
}
// Configure _config
@@ -379,6 +391,90 @@ func (c *Cluster) processCompletedJob(ctx context.Context, obj fs.Object) error
return nil
}
// loadWorkerStatus reads the status files in queue/status and updates c.workers
func (c *Cluster) loadWorkerStatus(ctx context.Context) error {
objs, err := c.jobs.listDir(ctx, clusterStatus)
if err != nil {
return fmt.Errorf("cluster: get worker status list failed: %w", err)
}
ec := errcount.New()
g, gCtx := errgroup.WithContext(ctx)
var mu sync.Mutex
for _, obj := range objs {
g.Go(func() error {
buf, err := c.jobs.readFile(gCtx, obj)
if err != nil {
ec.Add(fmt.Errorf("read object: %w", err))
return nil
}
workerStatus := new(WorkerStatus)
err = json.Unmarshal(buf, workerStatus)
if err != nil {
ec.Add(fmt.Errorf("status json: %w", err))
return nil
}
mu.Lock()
c.workers[workerStatus.ID] = workerStatus
mu.Unlock()
return nil
})
}
// Wait for all the status reads to finish before collecting the errors
if err := g.Wait(); err != nil {
ec.Add(err)
}
return ec.Err("cluster: load status")
}
// checkWorkers loads the worker status and re-assigns jobs from workers which appear to be dead
func (c *Cluster) checkWorkers(ctx context.Context) {
err := c.loadWorkerStatus(ctx)
if err != nil {
fs.Errorf(nil, "failed to read some worker status: %v", err)
}
for workerID, status := range c.workers {
timeSinceUpdated := time.Since(status.Updated)
if timeSinceUpdated > workerTimeout {
if _, isDead := c.deadWorkers[workerID]; isDead {
continue
}
fs.Errorf(nil, "cluster: haven't heard from worker %q for %v - assuming dead", workerID, timeSinceUpdated)
// Find any jobs claimed by worker and restart
objs, err := c.jobs.listDir(ctx, clusterProcessing)
if err != nil {
fs.Errorf(nil, "cluster: failed to find pending jobs: %v", err)
continue
}
for _, obj := range objs {
fs.Errorf(obj, "cluster: checking job")
// Jobs are named {jobID}-{workerID}.json
name := strings.TrimSuffix(path.Base(obj.Remote()), ".json")
dash := strings.LastIndex(name, "-")
if dash < 0 {
fs.Errorf(nil, "cluster: failed to find dash in job %q", name)
continue
}
jobID, jobWorkerID := name[:dash], name[dash+1:]
fs.Errorf(obj, "cluster: checking jobID %q, workerID %q", jobID, jobWorkerID)
if workerID != jobWorkerID {
fs.Debugf(nil, "cluster: job %q doesn't match %q", jobWorkerID, workerID)
continue
}
// Found a job running on worker - rename it back to Pending
newRemote := path.Join(clusterPending, jobID+".json")
_, err = c.jobs.rename(ctx, obj, newRemote)
if err != nil {
fs.Errorf(nil, "cluster: failed to restart job %q: %v", jobID, err)
continue
}
fs.Errorf(nil, "cluster: restarted job %q", jobID)
}
c.deadWorkers[workerID] = struct{}{}
} else {
if _, isDead := c.deadWorkers[workerID]; isDead {
fs.Errorf(nil, "cluster: dead worker %q came back to life!", workerID)
delete(c.deadWorkers, workerID)
}
}
}
}
// checkJobs sees if there are any completed jobs
func (c *Cluster) checkJobs(ctx context.Context) {
objs, err := c.jobs.listDir(ctx, clusterDone)
@@ -404,6 +500,8 @@ func (c *Cluster) run(ctx context.Context) {
defer c.wg.Done()
checkJobs := time.NewTicker(clusterCheckJobsInterval)
defer checkJobs.Stop()
checkWorkers := time.NewTicker(clusterCheckWorkersInterval)
defer checkWorkers.Stop()
var syncedChans []chan<- struct{}
for {
select {
@@ -415,6 +513,8 @@ func (c *Cluster) run(ctx context.Context) {
case synced := <-c.sync:
syncedChans = append(syncedChans, synced)
fs.Debugf(nil, "cluster: sync request received")
case <-checkWorkers.C:
c.checkWorkers(ctx)
case <-checkJobs.C:
}
c.checkJobs(ctx)

View File

@@ -31,6 +31,7 @@ const (
clusterProcessing = clusterQueue + "/processing"
clusterDone = clusterQueue + "/done"
clusterFinished = clusterQueue + "/finished"
clusterStatus = clusterQueue + "/status"
minSleep = 10 * time.Millisecond
maxSleep = 2 * time.Second
@@ -39,6 +40,12 @@ const (
// Read the queue this often
clusterCheckJobsInterval = time.Second
// Write the worker status this often
clusterWriteStatusInterval = time.Second
// Read the worker status this often
clusterCheckWorkersInterval = time.Second
// Name of job which signals to the workers to quit
quitJob = "QUIT"
)
@@ -82,7 +89,7 @@ func NewJobs(ctx context.Context) (*Jobs, error) {
// Create the cluster directory structure
func (jobs *Jobs) createDirectoryStructure(ctx context.Context) (err error) {
for _, dir := range []string{clusterPending, clusterProcessing, clusterDone, clusterFinished, clusterStatus} {
err = jobs.f.Mkdir(ctx, dir)
if err != nil {
return fmt.Errorf("cluster mkdir %q: %w", dir, err)
@@ -165,6 +172,17 @@ func (jobs *Jobs) writeFile(ctx context.Context, remote string, modTime time.Tim
return nil
}
// Remove the file if it exists
func (jobs *Jobs) removeFile(ctx context.Context, remote string) error {
obj, err := jobs.f.NewObject(ctx, remote)
if errors.Is(err, fs.ErrorObjectNotFound) || errors.Is(err, fs.ErrorDirNotFound) {
return nil
} else if err != nil {
return err
}
return obj.Remove(ctx)
}
// write a job to a file returning the name
func (jobs *Jobs) writeJob(ctx context.Context, where string, job any) (name string, err error) {
now := time.Now().UTC()

View File

@@ -2,21 +2,40 @@ package cluster
import (
"context"
"encoding/json"
"path"
"sync"
"time"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/accounting"
"github.com/rclone/rclone/fs/rc"
"github.com/rclone/rclone/fs/rc/jobs"
"github.com/rclone/rclone/lib/random"
)
const maxWorkersDone = 16 // maximum jobs in the done list
// Worker describes a single instance of a cluster worker.
type Worker struct {
jobs *Jobs
cancel func() // stop bg job
wg sync.WaitGroup // bg job finished
id string // id of this worker
status string // remote where this worker writes its status
jobsMu sync.Mutex // protects running and done
running map[string]struct{} // IDs of the jobs being processed
done []string // IDs of finished jobs
}
// WorkerStatus shows the status of this worker including jobs
// running.
type WorkerStatus struct {
ID string `json:"id"`
Running map[string]rc.Params `json:"running"` // Job ID => accounting.RemoteStats
Done map[string]bool `json:"done"` // Job ID => finished status
Updated time.Time `json:"updated"`
}
// NewWorker creates a new cluster worker from the config in ctx.
@@ -32,18 +51,22 @@ func NewWorker(ctx context.Context) (*Worker, error) {
return nil, err
}
w := &Worker{
jobs: jobs,
id: ci.ClusterID,
running: make(map[string]struct{}),
}
if w.id == "" {
w.id = random.String(10)
}
w.status = path.Join(clusterStatus, w.id+".json")
// Start the background workers
bgCtx, cancel := context.WithCancel(context.Background())
w.cancel = cancel
w.wg.Add(1)
go w.runJobs(bgCtx)
w.wg.Add(1)
go w.runStatus(bgCtx)
fs.Logf(w.jobs.f, "Started cluster worker")
@@ -60,6 +83,27 @@ func (w *Worker) checkJobs(ctx context.Context) {
if obj == nil {
return // no jobs available
}
// make a stats group for this job
ctx = accounting.WithStatsGroup(ctx, name)
// Add job ID
w.jobsMu.Lock()
w.running[name] = struct{}{}
w.jobsMu.Unlock()
fs.Infof(nil, "write jobID %q", name)
// Remove job ID on exit
defer func() {
w.jobsMu.Lock()
delete(w.running, name)
w.done = append(w.done, name)
if len(w.done) > maxWorkersDone {
w.done = w.done[len(w.done)-maxWorkersDone:]
}
w.jobsMu.Unlock()
}()
fs.Debugf(nil, "cluster: processing pending job %q", name)
inBuf, err := w.jobs.readFile(ctx, obj)
if err != nil {
@@ -67,7 +111,6 @@ func (w *Worker) checkJobs(ctx context.Context) {
w.jobs.finish(ctx, obj, "input-error", false)
return
}
outBuf := jobs.NewJobFromBytes(ctx, inBuf)
remote := path.Join(clusterDone, name+".json")
err = w.jobs.writeFile(ctx, remote, time.Now(), outBuf)
@@ -75,11 +118,12 @@ func (w *Worker) checkJobs(ctx context.Context) {
fs.Errorf(nil, "check jobs failed to write output: %v", err)
return
}
w.jobs.finish(ctx, obj, "input-ok", true)
fs.Debugf(nil, "cluster: processed pending job %q", name)
}
// Run the background process to pick up jobs
func (w *Worker) runJobs(ctx context.Context) {
defer w.wg.Done()
checkJobs := time.NewTicker(clusterCheckJobsInterval)
defer checkJobs.Stop()
@@ -93,6 +137,72 @@ func (w *Worker) run(ctx context.Context) {
}
}
// Write the worker status
func (w *Worker) writeStatus(ctx context.Context) {
// Create the worker status from the jobIDs and the short stats
status := WorkerStatus{
ID: w.id,
Running: make(map[string]rc.Params),
Updated: time.Now(),
Done: make(map[string]bool),
}
w.jobsMu.Lock()
for _, jobID := range w.done {
status.Done[jobID] = true
}
for jobID := range w.running {
fs.Infof(nil, "read jobID %q", jobID)
si := accounting.StatsGroup(ctx, jobID)
out, err := si.RemoteStats(true)
if err != nil {
fs.Errorf(nil, "cluster: write status: stats: %v", err)
status.Running[jobID] = rc.Params{}
} else {
status.Running[jobID] = out
}
status.Done[jobID] = false
}
w.jobsMu.Unlock()
// Write the stats to a file
buf, err := json.MarshalIndent(status, "", "\t")
if err != nil {
fs.Errorf(nil, "cluster: write status: json: %w", err)
return
}
err = w.jobs.writeFile(ctx, w.status, status.Updated, buf)
if err != nil {
fs.Errorf(nil, "cluster: write status: %w", err)
}
}
// Remove the worker status
func (w *Worker) clearStatus(ctx context.Context) {
err := w.jobs.removeFile(ctx, w.status)
if err != nil {
fs.Errorf(nil, "cluster: clear status: %w", err)
}
}
// Run the background process to write status
func (w *Worker) runStatus(ctx context.Context) {
defer w.wg.Done()
w.writeStatus(ctx)
defer w.clearStatus(ctx)
writeStatus := time.NewTicker(clusterWriteStatusInterval)
defer writeStatus.Stop()
for {
select {
case <-ctx.Done():
return
case <-writeStatus.C:
t0 := time.Now()
w.writeStatus(ctx)
fs.Debugf(nil, "write status took %v at %v", time.Since(t0), t0)
}
}
}
// Shutdown the worker regardless of whether it has work to process or not.
func (w *Worker) Shutdown(ctx context.Context) error {
w.cancel()