diff --git a/examples/spark-operator-rapids-test.yaml b/examples/rapids-sparkoperator-gpu-test.yaml similarity index 100% rename from examples/spark-operator-rapids-test.yaml rename to examples/rapids-sparkoperator-gpu-test.yaml diff --git a/examples/yunikorn-spark-operator-rapids.yaml b/examples/rapids-sparkoperator-yunikorn.yaml similarity index 100% rename from examples/yunikorn-spark-operator-rapids.yaml rename to examples/rapids-sparkoperator-yunikorn.yaml diff --git a/infra/aws/terraform/addons.tf b/infra/aws/terraform/addons.tf index 33dbdaa..d4165d6 100644 --- a/infra/aws/terraform/addons.tf +++ b/infra/aws/terraform/addons.tf @@ -1,23 +1,4 @@ - - -# FluentBit -# Prometheus -# Spark History Server - -# #--------------------------------------------------------------- -# # EKS Blueprints Addons -# #--------------------------------------------------------------- -# module "eks_blueprints_addons" { -# source = "aws-ia/eks-blueprints-addons/aws" -# version = "~> 1.16" - - -# cluster_name = module.eks.cluster_name -# cluster_endpoint = module.eks.cluster_endpoint -# cluster_version = module.eks.cluster_version -# oidc_provider_arn = module.eks.oidc_provider_arn - -# } +# TODO: Add FluentBit, Prometheus & Grafana and Spark History Server #--------------------------------------------------------------- # Data on EKS Kubernetes Addons @@ -52,7 +33,7 @@ module "eks_data_addons" { version = "2.0.1" } - enable_yunikorn = true + enable_yunikorn = var.enable_yunikorn yunikorn_helm_config = { version = "1.6.0" } diff --git a/infra/aws/terraform/eks.tf b/infra/aws/terraform/eks.tf index 65fe7dc..a5fd2b2 100644 --- a/infra/aws/terraform/eks.tf +++ b/infra/aws/terraform/eks.tf @@ -109,16 +109,6 @@ module "eks" { labels = { NodeGroupType = "system-nodegrp" } - - # taints = { - # # The pods that do not tolerate this taint should run on nodes - # # created by Karpenter - # karpenter = { - # key = "CriticalAddonsOnly" - # value = "true" - # effect = "NO_SCHEDULE" - # } - # } } } diff --git a/infra/aws/terraform/main.tf b/infra/aws/terraform/main.tf index 54e128b..27106ee 100644 --- a/infra/aws/terraform/main.tf +++ b/infra/aws/terraform/main.tf @@ -55,13 +55,6 @@ locals { # --------------------------------------------------------------- # AWS Data Sources # --------------------------------------------------------------- -# Data sources used to retrieve AWS-specific information such as current identity, region, and session context. - -# EKS cluster authentication data -# data "aws_eks_cluster_auth" "this" { -# name = module.eks.cluster_name -# } - # Retrieves an authorization token for public ECR registry to authenticate image pulls. data "aws_ecrpublic_authorization_token" "token" { provider = aws.ecr @@ -70,9 +63,6 @@ data "aws_ecrpublic_authorization_token" "token" { # Retrieves all available AWS availability zones in the selected region. data "aws_availability_zones" "available" {} -# Retrieves the current AWS region. -# data "aws_region" "current" {} - # Retrieves the AWS account and caller identity details for the session. data "aws_caller_identity" "current" {} @@ -83,44 +73,3 @@ data "aws_caller_identity" "current" {} data "aws_iam_session_context" "current" { arn = data.aws_caller_identity.current.arn } - -# --------------------------------------------------------------- -# IAM Policy Document for Spark Operator -# --------------------------------------------------------------- -# This IAM policy document allows the Spark operator to interact with S3 and CloudWatch Logs for logging and object storage. - -# Policy granting permissions for S3 operations required by Spark jobs. -# data "aws_iam_policy_document" "spark_operator" { -# statement { -# sid = "AllowS3AccessForSparkJobs" -# effect = "Allow" -# # Grants access to all S3 resources in the current AWS partition. -# resources = ["arn:${data.aws_partition.current.partition}:s3:::*"] - -# actions = [ -# "s3:DeleteObject", -# "s3:DeleteObjectVersion", -# "s3:GetObject", -# "s3:ListBucket", -# "s3:PutObject", -# ] -# } - -# # Policy granting permissions for CloudWatch Logs operations. -# statement { -# sid = "AllowCloudWatchLogsAccessForSpark" -# effect = "Allow" -# # Grants access to all CloudWatch Log Groups in the current AWS region and account. -# resources = [ -# "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*" -# ] - -# actions = [ -# "logs:CreateLogGroup", -# "logs:CreateLogStream", -# "logs:DescribeLogGroups", -# "logs:DescribeLogStreams", -# "logs:PutLogEvents", -# ] -# } -# } diff --git a/infra/aws/terraform/variables.tf b/infra/aws/terraform/variables.tf index df4f0d9..2bcdfc3 100644 --- a/infra/aws/terraform/variables.tf +++ b/infra/aws/terraform/variables.tf @@ -60,16 +60,14 @@ EOT default = {} } -# # Enable JupyterHub for interactive workloads -# variable "enable_jupyterhub" { -# description = "Flag to enable the deployment of JupyterHub on the Kubernetes cluster. Set to true if interactive Jupyter notebooks are required for data science workloads." -# type = bool -# default = false -# } +variable "enable_yunikorn" { + description = "Flag to enable the Apache Yunikorn batch scheduler on the Kubernetes cluster." + type = bool + default = false +} -# Enable Volcano for batch scheduling of Spark jobs or other workloads variable "enable_volcano" { - description = "Flag to enable the Volcano batch scheduler on the Kubernetes cluster. Volcano is typically used for high-performance batch job scheduling in AI/ML workloads." + description = "Flag to enable the Volcano batch scheduler on the Kubernetes cluster." type = bool default = false }