fix: improve job tracking

Signed-off-by: allanice001 <allanice001@gmail.com>
This commit is contained in:
allanice001
2025-12-26 15:04:15 +00:00
parent 865270312c
commit 169283b6c7
2 changed files with 77 additions and 31 deletions

View File

@@ -36,6 +36,21 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
var args ClusterActionArgs
_ = j.ParseArguments(&args)
runID, _ := uuid.Parse(j.ID)
updateRun := func(status string, errMsg string) {
updates := map[string]any{
"status": status,
"error": errMsg,
}
if status == "succeeded" || status == "failed" {
updates["finised_at"] = time.Now().UTC()
}
db.Model(&models.ClusterRun{}).Where("id = ?", runID).Updates(updates)
}
updateRun("running", "")
logger := log.With().
Str("job", j.ID).
Str("cluster_id", args.ClusterID.String()).
@@ -56,18 +71,20 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
Preload("NodePools.Servers.SshKey").
Where("id = ? AND organization_id = ?", args.ClusterID, args.OrgID).
First(&c).Error; err != nil {
updateRun("failed", fmt.Errorf("load cluster: %w", err).Error())
return nil, fmt.Errorf("load cluster: %w", err)
}
// ---- Step 1: Prepare (mostly lifted from ClusterPrepareWorker)
if err := setClusterStatus(db, c.ID, clusterStatusBootstrapping, ""); err != nil {
updateRun("failed", err.Error())
return nil, fmt.Errorf("mark bootstrapping: %w", err)
}
c.Status = clusterStatusBootstrapping
if err := validateClusterForPrepare(&c); err != nil {
_ = setClusterStatus(db, c.ID, clusterStatusFailed, err.Error())
updateRun("failed", err.Error())
return nil, fmt.Errorf("validate: %w", err)
}
@@ -75,6 +92,7 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
keyPayloads, sshConfig, err := buildSSHAssetsForCluster(db, &c, allServers)
if err != nil {
_ = setClusterStatus(db, c.ID, clusterStatusFailed, err.Error())
updateRun("failed", err.Error())
return nil, fmt.Errorf("build ssh assets: %w", err)
}
@@ -98,6 +116,7 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
orgKey, orgSecret, err := findOrCreateClusterAutomationKey(db, c.OrganizationID, c.ID, 24*time.Hour)
if err != nil {
_ = setClusterStatus(db, c.ID, clusterStatusFailed, err.Error())
updateRun("failed", err.Error())
return nil, fmt.Errorf("org key: %w", err)
}
dtoCluster.OrgKey = &orgKey
@@ -106,6 +125,7 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
payloadJSON, err := json.MarshalIndent(dtoCluster, "", " ")
if err != nil {
_ = setClusterStatus(db, c.ID, clusterStatusFailed, err.Error())
updateRun("failed", err.Error())
return nil, fmt.Errorf("marshal payload: %w", err)
}
@@ -115,11 +135,13 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
cancel()
if err != nil {
_ = setClusterStatus(db, c.ID, clusterStatusFailed, err.Error())
updateRun("failed", err.Error())
return nil, fmt.Errorf("push assets: %w", err)
}
}
if err := setClusterStatus(db, c.ID, clusterStatusPending, ""); err != nil {
updateRun("failed", err.Error())
return nil, fmt.Errorf("mark pending: %w", err)
}
c.Status = clusterStatusPending
@@ -132,11 +154,13 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
if err != nil {
logger.Error().Err(err).Str("output", out).Msg("ping-servers failed")
_ = setClusterStatus(db, c.ID, clusterStatusFailed, fmt.Sprintf("make ping-servers: %v", err))
updateRun("failed", err.Error())
return nil, fmt.Errorf("ping-servers: %w", err)
}
}
if err := setClusterStatus(db, c.ID, clusterStatusProvisioning, ""); err != nil {
updateRun("failed", err.Error())
return nil, fmt.Errorf("mark provisioning: %w", err)
}
c.Status = clusterStatusProvisioning
@@ -149,13 +173,18 @@ func ClusterActionWorker(db *gorm.DB) archer.WorkerFn {
if err != nil {
logger.Error().Err(err).Str("output", out).Msg("bootstrap target failed")
_ = setClusterStatus(db, c.ID, clusterStatusFailed, fmt.Sprintf("make %s: %v", args.MakeTarget, err))
updateRun("failed", err.Error())
return nil, fmt.Errorf("make %s: %w", args.MakeTarget, err)
}
}
if err := setClusterStatus(db, c.ID, clusterStatusReady, ""); err != nil {
updateRun("failed", err.Error())
return nil, fmt.Errorf("mark ready: %w", err)
}
updateRun("succeeded", "")
return ClusterActionResult{
Status: "ok",
Action: args.Action,