Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ _Appears in:_

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `upgradeStrategy` _[RayClusterUpgradeStrategy](#rayclusterupgradestrategy)_ | UpgradeStrategy defines the scaling policy used when upgrading the RayCluster | | |
| `authOptions` _[AuthOptions](#authoptions)_ | AuthOptions specifies the authentication options for the RayCluster. | | |
| `suspend` _boolean_ | Suspend indicates whether a RayCluster should be suspended.<br />A suspended RayCluster will have head pods and worker pods deleted. | | |
| `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayCluster.<br />The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.<br />The kuberay-operator reconciles a RayCluster which doesn't have this field at all or<br />the field value is the reserved string 'ray.io/kuberay-operator',<br />but delegates reconciling the RayCluster with 'kueue.x-k8s.io/multikueue' to the Kueue.<br />The field is immutable. | | |
Expand All @@ -309,6 +310,36 @@ _Appears in:_
| `workerGroupSpecs` _[WorkerGroupSpec](#workergroupspec) array_ | WorkerGroupSpecs are the specs for the worker pods | | |


#### RayClusterUpgradeStrategy







_Appears in:_
- [RayClusterSpec](#rayclusterspec)

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _[RayClusterUpgradeType](#rayclusterupgradetype)_ | Type represents the strategy used when upgrading the RayCluster Pods. Currently supports `Recreate` and `None`. | | Enum: [Recreate None] <br /> |


#### RayClusterUpgradeType

_Underlying type:_ _string_



_Validation:_
- Enum: [Recreate None]

_Appears in:_
- [RayClusterUpgradeStrategy](#rayclusterupgradestrategy)



#### RayJob


Expand Down Expand Up @@ -425,7 +456,7 @@ _Appears in:_

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | |
| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster`, `NewClusterWithIncrementalUpgrade` and `None`. | | |
| `clusterUpgradeOptions` _[ClusterUpgradeOptions](#clusterupgradeoptions)_ | ClusterUpgradeOptions defines the behavior of a NewClusterWithIncrementalUpgrade type.<br />RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. | | |


Expand Down
8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions ray-operator/apis/ray/v1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ import (

// RayClusterSpec defines the desired state of RayCluster
type RayClusterSpec struct {
// UpgradeStrategy defines the scaling policy used when upgrading the RayCluster
// +optional
UpgradeStrategy *RayClusterUpgradeStrategy `json:"upgradeStrategy,omitempty"`
// AuthOptions specifies the authentication options for the RayCluster.
// +optional
AuthOptions *AuthOptions `json:"authOptions,omitempty"`
Expand Down Expand Up @@ -49,6 +52,22 @@ type RayClusterSpec struct {
WorkerGroupSpecs []WorkerGroupSpec `json:"workerGroupSpecs,omitempty"`
}

// RayClusterUpgradeType represents the strategy used when upgrading the RayCluster Pods.
// +kubebuilder:validation:Enum=Recreate;None
type RayClusterUpgradeType string

const (
// RayClusterRecreate is the upgrade strategy that deletes all existing pods before creating new ones.
RayClusterRecreate RayClusterUpgradeType = "Recreate"
// RayClusterUpgradeNone disables upgrades: no new pod will be created while the strategy is set to None.
RayClusterUpgradeNone RayClusterUpgradeType = "None"
)

// RayClusterUpgradeStrategy describes how the RayCluster Pods should be replaced
// when the cluster is upgraded.
type RayClusterUpgradeStrategy struct {
// Type represents the strategy used when upgrading the RayCluster Pods. Currently supports `Recreate` and `None`.
// +optional
Type *RayClusterUpgradeType `json:"type,omitempty"`
}

// AuthMode describes the authentication mode for the Ray cluster.
type AuthMode string

Expand Down
8 changes: 4 additions & 4 deletions ray-operator/apis/ray/v1/rayservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ type RayServiceUpgradeType string
const (
// RayServiceNewClusterWithIncrementalUpgrade creates an upgraded cluster and gradually
// scales it and migrates traffic to it using the Gateway API during an upgrade.
RayServiceNewClusterWithIncrementalUpgrade RayServiceUpgradeType = "NewClusterWithIncrementalUpgrade"
// RayServiceNewCluster creates a new upgraded cluster and switches to it when it becomes ready.
RayServiceNewCluster RayServiceUpgradeType = "NewCluster"
// RayServiceUpgradeNone disables upgrades: no new cluster will be created while the strategy is set to None.
RayServiceUpgradeNone RayServiceUpgradeType = "None"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also rename two constants above to start with RayService*

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, updated in f3a9b81.

)

// These statuses should match Ray Serve's application statuses
Expand Down Expand Up @@ -75,7 +75,7 @@ type ClusterUpgradeOptions struct {
}

type RayServiceUpgradeStrategy struct {
// Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`.
// Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster`, `NewClusterWithIncrementalUpgrade` and `None`.
// +optional
Type *RayServiceUpgradeType `json:"type,omitempty"`
// ClusterUpgradeOptions defines the behavior of a NewClusterWithIncrementalUpgrade type.
Expand Down
25 changes: 25 additions & 0 deletions ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,27 @@ func configureGCSFaultTolerance(podTemplate *corev1.PodTemplateSpec, instance ra
}
}

// GeneratePodTemplateHash computes a hash of the given Pod template by delegating to
// utils.GenerateJsonHash. Callers use the result to detect changes to the user-defined
// template (e.g. for upgrade detection); an error is returned if hashing fails.
func GeneratePodTemplateHash(template corev1.PodTemplateSpec) (string, error) {
return utils.GenerateJsonHash(template)
}

// DefaultHeadPodTemplate sets the config values
func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, headSpec rayv1.HeadGroupSpec, podName string, headPort string) corev1.PodTemplateSpec {
// TODO (Dmitri) The argument headPort is essentially unused;
// headPort is passed into setMissingRayStartParams but unused there for the head pod.
// To mitigate this awkwardness and reduce code redundancy, unify head and worker pod configuration logic.

log := ctrl.LoggerFrom(ctx)
// Calculate the pod template hash before any modifications
// This ensures the hash reflects the original user-defined template for upgrade detection
templateHash := ""
podTemplate := headSpec.Template
if hash, err := GeneratePodTemplateHash(podTemplate); err == nil {
templateHash = hash
} else {
log.Error(err, "Failed to generate pod template hash for head group")
}

if utils.IsDeterministicHeadPodNameEnabled() {
podTemplate.Name = podName
} else {
Expand All @@ -173,6 +188,13 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head
// This ensures privilege of KubeRay users are contained within the namespace of the RayCluster.
podTemplate.ObjectMeta.Namespace = instance.Namespace

if templateHash != "" {
if podTemplate.Annotations == nil {
podTemplate.Annotations = make(map[string]string)
}
podTemplate.Annotations[utils.PodTemplateHashKey] = templateHash
}

// Update rayStartParams with top-level Resources for head group.
updateRayStartParamsResources(ctx, headSpec.RayStartParams, headSpec.Resources)

Expand Down Expand Up @@ -296,12 +318,29 @@ func getEnableProbesInjection() bool {

// DefaultWorkerPodTemplate sets the config values
func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, workerSpec rayv1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string, replicaGrpName string, replicaIndex int, numHostIndex int) corev1.PodTemplateSpec {
log := ctrl.LoggerFrom(ctx)

podTemplate := workerSpec.Template
// Calculate the pod template hash before any modifications
// This ensures the hash reflects the original user-defined template for upgrade detection
templateHash := ""
if hash, err := GeneratePodTemplateHash(podTemplate); err == nil {
templateHash = hash
} else {
log.Error(err, "Failed to generate pod template hash for worker group", "groupName", workerSpec.GroupName)
}
podTemplate.GenerateName = podName

// Pods created by RayCluster should be restricted to the namespace of the RayCluster.
// This ensures privilege of KubeRay users are contained within the namespace of the RayCluster.
podTemplate.ObjectMeta.Namespace = instance.Namespace

if templateHash != "" {
if podTemplate.Annotations == nil {
podTemplate.Annotations = make(map[string]string)
}
podTemplate.Annotations[utils.PodTemplateHashKey] = templateHash
}
// The Ray worker should only start once the GCS server is ready.
// only inject init container only when ENABLE_INIT_CONTAINER_INJECTION is true
enableInitContainerInjection := getEnableInitContainerInjection()
Expand Down
Loading
Loading