Skip to content

Commit

Permalink
fix: scale old runners only (#65)
Browse files Browse the repository at this point in the history

---------

Signed-off-by: Mario Constanti <[email protected]>
  • Loading branch information
bavarianbidi committed Jan 23, 2024
1 parent f1dd79e commit 1334d0e
Show file tree
Hide file tree
Showing 13 changed files with 1,123 additions and 155 deletions.
7 changes: 3 additions & 4 deletions api/v1alpha1/pool_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,9 @@ type PoolSpec struct {

// PoolStatus defines the observed state of Pool
type PoolStatus struct {
ID string `json:"id"`
IdleRunners uint `json:"idleRunners"`
Runners uint `json:"runners"`
Selector string `json:"selector"`
ID string `json:"id"`
LongRunningIdleRunners uint `json:"longRunningIdleRunners"`
Selector string `json:"selector"`

LastSyncError string `json:"lastSyncError,omitempty"`
}
Expand Down
7 changes: 2 additions & 5 deletions config/crd/bases/garm-operator.mercedes-benz.com_pools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -149,18 +149,15 @@ spec:
properties:
id:
type: string
idleRunners:
type: integer
lastSyncError:
type: string
runners:
longRunningIdleRunners:
type: integer
selector:
type: string
required:
- id
- idleRunners
- runners
- longRunningIdleRunners
- selector
type: object
type: object
Expand Down
1 change: 1 addition & 0 deletions config/overlays/local/manager_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ spec:
- --garm-username=admin
- --garm-password=LmrBG1KcBOsDfNKq4cQTGpc0hJ0kejkk
- --operator-watch-namespace=garm-operator-system
- --operator-min-idle-runners-age=1m
4 changes: 4 additions & 0 deletions docs/config/configuration-parsing.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ OPERATOR_METRICS_BIND_ADDRESS
OPERATOR_HEALTH_PROBE_BIND_ADDRESS
OPERATOR_LEADER_ELECTION
OPERATOR_SYNC_PERIOD
OPERATOR_MIN_IDLE_RUNNERS_AGE
OPERATOR_WATCH_NAMESPACE
OPERATOR_ENTERPRISE_CONCURRENCY
Expand All @@ -68,6 +69,7 @@ The following flags will be parsed and can be found in the [flags package](../..
--operator-health-probe-bind-address
--operator-leader-election
--operator-sync-period
--operator-min-idle-runners-age
--operator-watch-namespace
--operator-enterprise-concurrency
Expand Down Expand Up @@ -103,6 +105,7 @@ operator:
healthProbeBindAddress: :8081
leaderElection: false
syncPeriod: 5m0s
minIdleRunnersAge: 5m0s
watchNamespace: garm-operator-system
enterpriseConcurrency: 1
organizationConcurrency: 3
Expand All @@ -129,6 +132,7 @@ operator:
healthProbeBindAddress: ":7001"
leaderElection: true
syncPeriod: "10m"
minIdleRunnersAge: "5m"
watchNamespace: "garm-operator-namespace"
enterpriseConcurrency: 1
organizationConcurrency: 3
Expand Down
18 changes: 17 additions & 1 deletion docs/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,23 @@ Garm will take care of this request and will create the missing runners to fulfi
Scaling down the number of runners is done by decreasing the `minIdleRunners` field. If we want to scale down from `6` to `4`,
`garm-operator` will make the same API call towards the garm-server where the `minIdleRunners` field is set to `4`.

As garm doesn't remove any idle runners (by design), the `garm-operator` will make another API call towards the garm-server,
As garm doesn't remove any idle runners (by design), `garm-operator` will take care of this request and scale the pool down in a pessimistic manner.

This means, that two configuration parameters are taken into account:

- `minIdleRunners` (on the `pool.spec`): defines the new value for `minIdleRunners`
- `minIdleRunnersAge` ([configured on the `operator` itself](config/configuration-parsing.md)): defines the age of the idle runners which should be removed

If the `minIdleRunnersAge` is set to `5m` and we scale down from `6` to `4`, `garm-operator` will only delete idle runners which are older than `5m`.

> [!IMPORTANT]
> The intention behind this approach was to prevent the deletion of too many idle runners, as garm itself is responsible for the lifecycle of the runners in a pool.
##### scaling down to zero

If we scale down to zero, `garm-operator` will delete all idle runners, no matter how old they are.

the `garm-operator` will make another API call towards the garm-server,
where it get the current number of idle runners and will remove the difference between the current number of idle runners and the new `minIdleRunners` value.

### pause reconciliation
Expand Down
109 changes: 60 additions & 49 deletions internal/controller/pool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"sort"
"time"

"github.com/cloudbase/garm/client/instances"
"github.com/cloudbase/garm/client/pools"
"github.com/cloudbase/garm/params"
apierrors "k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -30,6 +31,7 @@ import (
garmoperatorv1alpha1 "github.com/mercedes-benz/garm-operator/api/v1alpha1"
garmClient "github.com/mercedes-benz/garm-operator/pkg/client"
"github.com/mercedes-benz/garm-operator/pkg/client/key"
"github.com/mercedes-benz/garm-operator/pkg/config"
"github.com/mercedes-benz/garm-operator/pkg/event"
"github.com/mercedes-benz/garm-operator/pkg/util/annotations"
poolUtil "github.com/mercedes-benz/garm-operator/pkg/util/pool"
Expand Down Expand Up @@ -151,18 +153,13 @@ func (r *PoolReconciler) reconcileUpdate(ctx context.Context, garmClient garmCli
return r.handleUpdateError(ctx, pool, err)
}

poolNeedsUpdate, runners, err := r.comparePoolSpecs(ctx, pool, image.Spec.Tag, garmClient)
poolCRdiffersFromGarmPool, idleRunners, err := r.comparePoolSpecs(ctx, pool, image.Spec.Tag, garmClient)
if err != nil {
log.Error(err, "error comparing pool specs")
return r.handleUpdateError(ctx, pool, err)
}

idleRunners, err := poolUtil.ExtractIdleRunners(ctx, runners)
if err != nil {
return r.handleUpdateError(ctx, pool, err)
}

if !poolNeedsUpdate {
if !poolCRdiffersFromGarmPool {
log.Info("pool CR differs from pool on garm side. Trigger a garm pool update")

if err = poolUtil.UpdatePool(ctx, garmClient, pool, image); err != nil {
Expand All @@ -171,23 +168,52 @@ func (r *PoolReconciler) reconcileUpdate(ctx context.Context, garmClient garmCli
}
}

// update pool idle runners count in status
pool.Status.IdleRunners = uint(len(idleRunners))
pool.Status.Runners = uint(len(runners))
if err := r.updatePoolCRStatus(ctx, pool); err != nil {
return ctrl.Result{}, err
}
longRunningIdleRunnersCount := len(poolUtil.OldIdleRunners(config.Config.Operator.MinIdleRunnersAge, idleRunners))

switch {
case pool.Spec.MinIdleRunners == 0:
// scale to zero
// when scale to zero is desired, we immediately scale down to zero by deleting all idle runners

// If there are more idle Runners than minIdleRunners are defined in
// the spec, we force a runner deletion.
if pool.Status.IdleRunners > pool.Spec.MinIdleRunners {
log.Info("Scaling pool", "pool", pool.Name)
event.Scaling(r.Recorder, pool, fmt.Sprintf("scale idle runners down to %d", pool.Spec.MinIdleRunners))
if err := poolUtil.AlignIdleRunners(ctx, pool, idleRunners, instanceClient); err != nil {
return ctrl.Result{}, err

runners := poolUtil.DeletableRunners(ctx, idleRunners)
for _, runner := range runners {
if err := instanceClient.DeleteInstance(instances.NewDeleteInstanceParams().WithInstanceName(runner.Name)); err != nil {
log.Error(err, "unable to delete runner", "runner", runner.Name)
}
longRunningIdleRunnersCount--
}
default:
// If there are more old idle Runners than minIdleRunners are defined in
// the spec, we delete old idle runners

// get all idle runners that are older than minRunnerAge
longRunningIdleRunners := poolUtil.OldIdleRunners(config.Config.Operator.MinIdleRunnersAge, idleRunners)

// calculate how many old runners need to be deleted to match the desired minIdleRunners
alignedRunners := poolUtil.AlignIdleRunners(int(pool.Spec.MinIdleRunners), longRunningIdleRunners)

// extract runners which are deletable
runners := poolUtil.DeletableRunners(ctx, alignedRunners)
for _, runner := range runners {
log.Info("Scaling pool", "pool", pool.Name)
event.Scaling(r.Recorder, pool, fmt.Sprintf("scale long running idle runners down to %d", pool.Spec.MinIdleRunners))

if err := instanceClient.DeleteInstance(instances.NewDeleteInstanceParams().WithInstanceName(runner.Name)); err != nil {
log.Error(err, "unable to delete runner", "runner", runner.Name)
}
longRunningIdleRunnersCount--
}
}

// update pool idle runners count in status
pool.Status.LongRunningIdleRunners = uint(longRunningIdleRunnersCount)
if err := r.updatePoolCRStatus(ctx, pool); err != nil {
return ctrl.Result{}, err
}

err = annotations.SetLastSyncTime(pool, r.Client)
if err != nil {
log.Error(err, "can not set annotation")
Expand Down Expand Up @@ -225,46 +251,33 @@ func (r *PoolReconciler) reconcileDelete(ctx context.Context, garmClient garmCli
return r.handleUpdateError(ctx, pool, err)
}

// get current idle runners
// get all runners
runners, err := poolUtil.GetAllRunners(ctx, pool, instanceClient)
if err != nil {
return ctrl.Result{Requeue: true, RequeueAfter: 1 * time.Minute}, err
}

idleRunners, err := poolUtil.ExtractIdleRunners(ctx, runners)
// get a list of all idle runners to trigger deletion
deletableRunners := poolUtil.DeletableRunners(ctx, runners)
if err != nil {
return ctrl.Result{Requeue: true, RequeueAfter: 1 * time.Minute}, err
}

// set current idle runners count in status
pool.Status.IdleRunners = uint(len(idleRunners))
pool.Status.Runners = uint(len(runners))
if err := r.updatePoolCRStatus(ctx, pool); err != nil {
return r.handleUpdateError(ctx, pool, err)
}
pool.Status.LongRunningIdleRunners = uint(len(deletableRunners))

// scale pool down only needed if spec is smaller than current min idle runners
// doesn't catch the case where minIdleRunners is increased and to many idle runners exist
if pool.Status.IdleRunners > pool.Spec.MinIdleRunners {
log.Info("Scaling pool", "pool", pool.Name)
event.Scaling(r.Recorder, pool, fmt.Sprintf("scale idle runners down to %d", pool.Spec.MinIdleRunners))
if err := poolUtil.AlignIdleRunners(ctx, pool, idleRunners, instanceClient); err != nil {
return ctrl.Result{Requeue: true, RequeueAfter: 1 * time.Minute}, err
}
}
// scale pool down that all idle runners are deleted
log.Info("Scaling pool", "pool", pool.Name)
event.Scaling(r.Recorder, pool, fmt.Sprintf("scale idle runners down to %d before deleting", pool.Spec.MinIdleRunners))

// if the pool still contains runners, we need
if pool.Status.Runners != 0 {
log.Info("scaling pool down before deleting")
event.Info(r.Recorder, pool, fmt.Sprintf("pool still contains %d runners. Reconcile again", pool.Status.Runners))
return ctrl.Result{Requeue: true, RequeueAfter: 1 * time.Minute}, err
for _, runner := range deletableRunners {
if err := instanceClient.DeleteInstance(instances.NewDeleteInstanceParams().WithInstanceName(runner.Name)); err != nil {
log.Error(err, "unable to delete runner", "runner", runner.Name)
}
}

err = garmClient.DeletePool(
pools.NewDeletePoolParams().
WithPoolID(pool.Status.ID),
)
if err != nil {
// delete pool in garm
if err = garmClient.DeletePool(pools.NewDeletePoolParams().WithPoolID(pool.Status.ID)); err != nil {
return r.handleUpdateError(ctx, pool, fmt.Errorf("error deleting pool %s: %w", pool.Name, err))
}

Expand Down Expand Up @@ -306,7 +319,7 @@ func (r *PoolReconciler) handleSuccessfulUpdate(ctx context.Context, pool *garmo
log := log.FromContext(ctx)

pool.Status.ID = garmPool.ID
pool.Status.IdleRunners = garmPool.MinIdleRunners
pool.Status.LongRunningIdleRunners = garmPool.MinIdleRunners
pool.Status.LastSyncError = ""

if err := r.updatePoolCRStatus(ctx, pool); err != nil {
Expand Down Expand Up @@ -391,10 +404,8 @@ func (r *PoolReconciler) comparePoolSpecs(ctx context.Context, pool *garmoperato
tmpGarmPool.RepoName = gitHubScopeRef.GetName()
}

idleInstances, err := poolUtil.ExtractIdleRunners(ctx, garmPool.Payload.Instances)
if err != nil {
return false, []params.Instance{}, err
}
// we are only interested in IdleRunners
idleInstances := poolUtil.IdleRunners(ctx, garmPool.Payload.Instances)

// empty instances for comparison
garmPool.Payload.Instances = nil
Expand Down
Loading

0 comments on commit 1334d0e

Please sign in to comment.