Skip to content

Commit 7be6c82

Browse files
committed
test: add verification for idleTimeoutSeconds config per worker groups
Signed-off-by: alimaazamat <alima.azamat2003@gmail.com> Signed-off-by: alimaazamat <alima.azamat2003@gmail.com> Signed-off-by: alimaazamat <alima.azamat2003@gmail.com> Signed-off-by: alimaazamat <alima.azamat2003@gmail.com>
1 parent 8aa4d10 commit 7be6c82

File tree

2 files changed

+212
-0
lines changed

2 files changed

+212
-0
lines changed

ray-operator/controllers/ray/utils/validation.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s
104104
if err := validateRayGroupLabels(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Labels); err != nil {
105105
return err
106106
}
107+
if err := validateWorkerGroupIdleTimeout(workerGroup, spec); err != nil {
108+
return err
109+
}
107110
}
108111

109112
if annotations[RayFTEnabledAnnotationKey] != "" && spec.GcsFaultToleranceOptions != nil {
@@ -596,3 +599,21 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error {
596599

597600
return nil
598601
}
602+
603+
// validateWorkerGroupIdleTimeout validates the idleTimeoutSeconds field in a worker group spec
604+
func validateWorkerGroupIdleTimeout(workerGroup rayv1.WorkerGroupSpec, spec *rayv1.RayClusterSpec) error {
605+
idleTimeoutSeconds := workerGroup.IdleTimeoutSeconds
606+
if idleTimeoutSeconds != nil {
607+
if *idleTimeoutSeconds < 0 {
608+
return fmt.Errorf("idleTimeoutSeconds must be non-negative, got %d", *idleTimeoutSeconds)
609+
}
610+
611+
// idleTimeoutSeconds only allowed on autoscaler v2
612+
envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env)
613+
if !exists || (envVar.Value != "1" && envVar.Value != "true") {
614+
return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but %s environment variable is not set to 'true' in the head pod", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2)
615+
}
616+
}
617+
618+
return nil
619+
}

ray-operator/controllers/ray/utils/validation_test.go

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1882,3 +1882,194 @@ func TestValidateClusterUpgradeOptions(t *testing.T) {
18821882
})
18831883
}
18841884
}
1885+
1886+
func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
1887+
tests := map[string]struct {
1888+
expectedErr string
1889+
spec rayv1.RayClusterSpec
1890+
}{
1891+
"should accept worker group with valid idleTimeoutSeconds": {
1892+
spec: rayv1.RayClusterSpec{
1893+
EnableInTreeAutoscaling: ptr.To(true),
1894+
HeadGroupSpec: rayv1.HeadGroupSpec{
1895+
Template: podTemplateSpec([]corev1.EnvVar{
1896+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
1897+
}, nil),
1898+
},
1899+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
1900+
{
1901+
GroupName: "worker-group-1",
1902+
Template: podTemplateSpec(nil, nil),
1903+
IdleTimeoutSeconds: ptr.To(int32(60)),
1904+
MinReplicas: ptr.To(int32(0)),
1905+
MaxReplicas: ptr.To(int32(10)),
1906+
},
1907+
},
1908+
},
1909+
expectedErr: "",
1910+
},
1911+
"should reject negative idleTimeoutSeconds": {
1912+
spec: rayv1.RayClusterSpec{
1913+
EnableInTreeAutoscaling: ptr.To(true),
1914+
HeadGroupSpec: rayv1.HeadGroupSpec{
1915+
Template: podTemplateSpec([]corev1.EnvVar{
1916+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
1917+
}, nil),
1918+
},
1919+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
1920+
{
1921+
GroupName: "worker-group-1",
1922+
Template: podTemplateSpec(nil, nil),
1923+
IdleTimeoutSeconds: ptr.To(int32(-10)),
1924+
MinReplicas: ptr.To(int32(0)),
1925+
MaxReplicas: ptr.To(int32(10)),
1926+
},
1927+
},
1928+
},
1929+
expectedErr: "idleTimeoutSeconds must be non-negative, got -10",
1930+
},
1931+
"should accept zero idleTimeoutSeconds": {
1932+
spec: rayv1.RayClusterSpec{
1933+
EnableInTreeAutoscaling: ptr.To(true),
1934+
HeadGroupSpec: rayv1.HeadGroupSpec{
1935+
Template: podTemplateSpec([]corev1.EnvVar{
1936+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
1937+
}, nil),
1938+
},
1939+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
1940+
{
1941+
GroupName: "worker-group-1",
1942+
Template: podTemplateSpec(nil, nil),
1943+
IdleTimeoutSeconds: ptr.To(int32(0)),
1944+
MinReplicas: ptr.To(int32(0)),
1945+
MaxReplicas: ptr.To(int32(10)),
1946+
},
1947+
},
1948+
},
1949+
expectedErr: "",
1950+
},
1951+
"should reject idleTimeoutSeconds when autoscaler version is not v2": {
1952+
spec: rayv1.RayClusterSpec{
1953+
EnableInTreeAutoscaling: ptr.To(true),
1954+
HeadGroupSpec: rayv1.HeadGroupSpec{
1955+
Template: podTemplateSpec(nil, nil),
1956+
},
1957+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
1958+
{
1959+
GroupName: "worker-group-1",
1960+
Template: podTemplateSpec(nil, nil),
1961+
IdleTimeoutSeconds: ptr.To(int32(60)),
1962+
MinReplicas: ptr.To(int32(0)),
1963+
MaxReplicas: ptr.To(int32(10)),
1964+
},
1965+
},
1966+
},
1967+
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
1968+
},
1969+
"should reject idleTimeoutSeconds when autoscaler version is not set": {
1970+
spec: rayv1.RayClusterSpec{
1971+
EnableInTreeAutoscaling: ptr.To(true),
1972+
HeadGroupSpec: rayv1.HeadGroupSpec{
1973+
Template: podTemplateSpec(nil, nil),
1974+
},
1975+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
1976+
{
1977+
GroupName: "worker-group-1",
1978+
Template: podTemplateSpec(nil, nil),
1979+
IdleTimeoutSeconds: ptr.To(int32(60)),
1980+
MinReplicas: ptr.To(int32(0)),
1981+
MaxReplicas: ptr.To(int32(10)),
1982+
},
1983+
},
1984+
},
1985+
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
1986+
},
1987+
"should reject idleTimeoutSeconds when AutoscalerOptions is nil": {
1988+
spec: rayv1.RayClusterSpec{
1989+
EnableInTreeAutoscaling: ptr.To(true),
1990+
HeadGroupSpec: rayv1.HeadGroupSpec{
1991+
Template: podTemplateSpec(nil, nil),
1992+
},
1993+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
1994+
{
1995+
GroupName: "worker-group-1",
1996+
Template: podTemplateSpec(nil, nil),
1997+
IdleTimeoutSeconds: ptr.To(int32(60)),
1998+
MinReplicas: ptr.To(int32(0)),
1999+
MaxReplicas: ptr.To(int32(10)),
2000+
},
2001+
},
2002+
},
2003+
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
2004+
},
2005+
"should reject idleTimeoutSeconds when env var is set to invalid value": {
2006+
spec: rayv1.RayClusterSpec{
2007+
EnableInTreeAutoscaling: ptr.To(true),
2008+
HeadGroupSpec: rayv1.HeadGroupSpec{
2009+
Template: podTemplateSpec([]corev1.EnvVar{
2010+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "false"},
2011+
}, nil),
2012+
},
2013+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
2014+
{
2015+
GroupName: "worker-group-1",
2016+
Template: podTemplateSpec(nil, nil),
2017+
IdleTimeoutSeconds: ptr.To(int32(60)),
2018+
MinReplicas: ptr.To(int32(0)),
2019+
MaxReplicas: ptr.To(int32(10)),
2020+
},
2021+
},
2022+
},
2023+
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
2024+
},
2025+
"should accept worker group with idleTimeoutSeconds when env var is set to true": {
2026+
spec: rayv1.RayClusterSpec{
2027+
EnableInTreeAutoscaling: ptr.To(true),
2028+
HeadGroupSpec: rayv1.HeadGroupSpec{
2029+
Template: podTemplateSpec([]corev1.EnvVar{
2030+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "true"},
2031+
}, nil),
2032+
},
2033+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
2034+
{
2035+
GroupName: "worker-group-1",
2036+
Template: podTemplateSpec(nil, nil),
2037+
IdleTimeoutSeconds: ptr.To(int32(60)),
2038+
MinReplicas: ptr.To(int32(0)),
2039+
MaxReplicas: ptr.To(int32(10)),
2040+
},
2041+
},
2042+
},
2043+
expectedErr: "",
2044+
},
2045+
"should accept worker group without idleTimeoutSeconds and without autoscaler v2": {
2046+
spec: rayv1.RayClusterSpec{
2047+
EnableInTreeAutoscaling: ptr.To(true),
2048+
HeadGroupSpec: rayv1.HeadGroupSpec{
2049+
Template: podTemplateSpec(nil, nil),
2050+
},
2051+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
2052+
{
2053+
GroupName: "worker-group-1",
2054+
Template: podTemplateSpec(nil, nil),
2055+
MinReplicas: ptr.To(int32(0)),
2056+
MaxReplicas: ptr.To(int32(10)),
2057+
},
2058+
},
2059+
},
2060+
expectedErr: "",
2061+
},
2062+
}
2063+
2064+
for name, tc := range tests {
2065+
t.Run(name, func(t *testing.T) {
2066+
err := ValidateRayClusterSpec(&tc.spec, nil)
2067+
if tc.expectedErr != "" {
2068+
require.Error(t, err)
2069+
require.EqualError(t, err, tc.expectedErr)
2070+
} else {
2071+
require.NoError(t, err)
2072+
}
2073+
})
2074+
}
2075+
}

0 commit comments

Comments
 (0)