-
Notifications
You must be signed in to change notification settings - Fork 554
[RayJob] Support deletion policies based on job status #3731
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c57a0b5
b0ce08c
aca42ec
4b19d17
fd5f1c8
e1b0280
810be41
477b681
1f8d199
f6d79ac
e8e64a0
3943530
722dc48
7df9eb2
6fd0ef5
b42d6fa
02060a0
fd98ccb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -382,24 +382,44 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) | |||||||||||||||||
"ShutdownTime", shutdownTime) | ||||||||||||||||||
|
||||||||||||||||||
if features.Enabled(features.RayJobDeletionPolicy) && | ||||||||||||||||||
rayJobInstance.Spec.DeletionPolicy != nil && | ||||||||||||||||||
*rayJobInstance.Spec.DeletionPolicy != rayv1.DeleteNoneDeletionPolicy && | ||||||||||||||||||
rayJobInstance.Spec.DeletionStrategy != nil && | ||||||||||||||||||
len(rayJobInstance.Spec.ClusterSelector) == 0 { | ||||||||||||||||||
logger.Info("Shutdown behavior is defined by the deletion policy", "deletionPolicy", rayJobInstance.Spec.DeletionPolicy) | ||||||||||||||||||
|
||||||||||||||||||
if shutdownTime.After(nowTime) { | ||||||||||||||||||
delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) | ||||||||||||||||||
logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) | ||||||||||||||||||
return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
policy := rayv1.DeleteNone | ||||||||||||||||||
if rayJobInstance.Status.JobStatus == rayv1.JobStatusSucceeded { | ||||||||||||||||||
policy = *rayJobInstance.Spec.DeletionStrategy.OnSuccess.Policy | ||||||||||||||||||
} else if rayJobInstance.Status.JobStatus == rayv1.JobStatusFailed { | ||||||||||||||||||
policy = *rayJobInstance.Spec.DeletionStrategy.OnFailure.Policy | ||||||||||||||||||
} else { | ||||||||||||||||||
logger.Info("jobStatus not valid for deletion", "jobStatus", rayJobInstance.Status.JobStatus) | ||||||||||||||||||
} | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add an else here with error logging, there are other job status' to consider kuberay/ray-operator/apis/ray/v1alpha1/rayjob_types.go Lines 14 to 21 in 280902f
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||||||||||||||||||
|
||||||||||||||||||
// no need to continue as the selected policy is DeleteNone | ||||||||||||||||||
if policy == rayv1.DeleteNone { | ||||||||||||||||||
break | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
logger.Info("Shutdown behavior is defined by the deletion policy", "deletionPolicy", rayJobInstance.Spec.DeletionStrategy) | ||||||||||||||||||
if shutdownTime.After(nowTime) { | ||||||||||||||||||
delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) | ||||||||||||||||||
logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) | ||||||||||||||||||
return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
switch *rayJobInstance.Spec.DeletionPolicy { | ||||||||||||||||||
case rayv1.DeleteClusterDeletionPolicy: | ||||||||||||||||||
switch policy { | ||||||||||||||||||
case rayv1.DeleteCluster: | ||||||||||||||||||
logger.Info("Deleting RayCluster", "RayCluster", rayJobInstance.Status.RayClusterName) | ||||||||||||||||||
_, err = r.deleteClusterResources(ctx, rayJobInstance) | ||||||||||||||||||
case rayv1.DeleteWorkersDeletionPolicy: | ||||||||||||||||||
case rayv1.DeleteWorkers: | ||||||||||||||||||
logger.Info("Suspending all worker groups", "RayCluster", rayJobInstance.Status.RayClusterName) | ||||||||||||||||||
err = r.suspendWorkerGroups(ctx, rayJobInstance) | ||||||||||||||||||
case rayv1.DeleteSelfDeletionPolicy: | ||||||||||||||||||
case rayv1.DeleteSelf: | ||||||||||||||||||
logger.Info("Deleting RayJob") | ||||||||||||||||||
err = r.Client.Delete(ctx, rayJobInstance) | ||||||||||||||||||
default: | ||||||||||||||||||
|
@@ -409,7 +429,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) | |||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
if (!features.Enabled(features.RayJobDeletionPolicy) || rayJobInstance.Spec.DeletionPolicy == nil) && rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { | ||||||||||||||||||
if (!features.Enabled(features.RayJobDeletionPolicy) || rayJobInstance.Spec.DeletionStrategy == nil) && rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { | ||||||||||||||||||
logger.Info("Shutdown behavior is defined by the 506D `ShutdownAfterJobFinishes` flag", "shutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes) | ||||||||||||||||||
if shutdownTime.After(nowTime) { | ||||||||||||||||||
delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) | ||||||||||||||||||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the expected behavior if only onSuccess or onFailure is set? What should be the default deletion policy in that case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In ray-operator/controllers/ray/utils/validation.go, it's validated that both onSuccess and onFailure should be set, otherwise it return.