From 40547c04fbe1add470d2170f1d6828aa5f2a19d0 Mon Sep 17 00:00:00 2001 From: Vara Bonthu Date: Sat, 5 Oct 2024 12:55:34 -0700 Subject: [PATCH] Added Terraform templates --- .github/workflows/codespell.yaml | 3 +- .github/workflows/dependabot-automerge.yml | 6 +- .github/workflows/website-deploy.yml | 2 +- .github/workflows/website-test-deploy.yaml | 2 +- .gitignore | 2 - .pre-commit-config.yaml | 44 +++++++ infra/aws/terraform/eks.tf | 122 ++++++++++++++++++++ infra/aws/terraform/main.tf | 126 +++++++++++++++++++++ infra/aws/terraform/outputs.tf | 4 + infra/aws/terraform/variables.tf | 75 ++++++++++++ infra/aws/terraform/versions.tf | 14 +++ infra/aws/terraform/vpc.tf | 53 +++++++++ website/package.json | 47 ++++++++ website/static/img/logo.svg | 2 +- 14 files changed, 492 insertions(+), 10 deletions(-) create mode 100644 infra/aws/terraform/main.tf create mode 100644 website/package.json diff --git a/.github/workflows/codespell.yaml b/.github/workflows/codespell.yaml index 1a2b741..ac1c1f3 100644 --- a/.github/workflows/codespell.yaml +++ b/.github/workflows/codespell.yaml @@ -1,4 +1,3 @@ ---- name: Codespell on: @@ -24,4 +23,4 @@ jobs: check_filenames: true # When using this Action in other repos, the --skip option below can be removed skip: "*.excalidraw,*.git,*.png,*.jpg,*.svg,go.mod,go.sum" - continue-on-error: true # The PR checks will not fail, but the possible spelling issues will still be reported for review and correction \ No newline at end of file + continue-on-error: true # The PR checks will not fail, but the possible spelling issues will still be reported for review and correction diff --git a/.github/workflows/dependabot-automerge.yml b/.github/workflows/dependabot-automerge.yml index bdea584..f5bc42e 100644 --- a/.github/workflows/dependabot-automerge.yml +++ b/.github/workflows/dependabot-automerge.yml @@ -2,8 +2,8 @@ name: Dependabot auto-merge on: pull_request_target: - branches: [ main ] - types: [ opened ] + branches: [main] + types: [opened] 
permissions: pull-requests: write @@ -18,4 +18,4 @@ jobs: run: gh pr merge --auto --merge "$PR_URL" env: PR_URL: ${{github.event.pull_request.html_url}} - GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} \ No newline at end of file + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/website-deploy.yml b/.github/workflows/website-deploy.yml index c8cfd23..aee2d1a 100644 --- a/.github/workflows/website-deploy.yml +++ b/.github/workflows/website-deploy.yml @@ -41,4 +41,4 @@ jobs: # The GH actions bot is used by default if you didn't specify the two fields. # You can swap them out with your own user credentials. user_name: github-actions[bot] - user_email: github-actions[bot]@users.noreply.github.com \ No newline at end of file + user_email: github-actions[bot]@users.noreply.github.com diff --git a/.github/workflows/website-test-deploy.yaml b/.github/workflows/website-test-deploy.yaml index f96b818..d051510 100644 --- a/.github/workflows/website-test-deploy.yaml +++ b/.github/workflows/website-test-deploy.yaml @@ -24,4 +24,4 @@ jobs: - name: Install dependencies run: npm ci - name: Test build website - run: npm run build \ No newline at end of file + run: npm run build diff --git a/.gitignore b/.gitignore index c3f31af..fbb5b95 100644 --- a/.gitignore +++ b/.gitignore @@ -56,5 +56,3 @@ site # node modules node_modules -website/package-lock.json -website/package.json \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e69de29..9102f20 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -0,0 +1,44 @@ +repos: + - repo: https://github.com/streetsidesoftware/cspell-cli + rev: v8.13.3 + hooks: + - id: cspell + args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', 
--exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh'] + - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.14.0 + hooks: + - id: pretty-format-yaml + args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-merge-conflict + - id: detect-private-key + - id: detect-aws-credentials + args: [--allow-missing-credentials] + - repo: https://github.com/antonbabenko/pre-commit-terraform + rev: v1.96.1 + hooks: + - id: terraform_fmt + - id: terraform_docs + args: + - --args=--lockfile=false + - id: terraform_tflint + args: + - --args=--only=terraform_deprecated_interpolation + - --args=--only=terraform_deprecated_index + - --args=--only=terraform_unused_declarations + - --args=--only=terraform_comment_syntax + - --args=--only=terraform_documented_outputs + - --args=--only=terraform_documented_variables + - --args=--only=terraform_typed_variables + - --args=--only=terraform_module_pinned_source + - --args=--only=terraform_naming_convention + - --args=--only=terraform_required_version + - --args=--only=terraform_required_providers + - --args=--only=terraform_unused_required_providers + - --args=--only=terraform_workspace_remote + - id: terraform_validate + exclude: (docs|modules) diff --git a/infra/aws/terraform/eks.tf b/infra/aws/terraform/eks.tf index e69de29..5166742 100644 --- a/infra/aws/terraform/eks.tf +++ b/infra/aws/terraform/eks.tf @@ -0,0 +1,122 @@ + +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 20.24" + + cluster_name = local.name + cluster_version = var.eks_cluster_version + + # EKS Addons + cluster_addons = { + coredns = {} + eks-pod-identity-agent = {} + kube-proxy = {} + vpc-cni = {} + } + + # Give the Terraform identity admin 
access to the cluster + # which will allow it to deploy resources into the cluster + enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access = true + access_entries = var.access_entries + + vpc_id = module.vpc.vpc_id + # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) + + # Combine root account, current user/role and additional roles to be able to access the cluster KMS key - required for terraform updates + kms_key_administrators = distinct(concat([ + "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"], + var.kms_key_admin_roles, + [data.aws_iam_session_context.current.issuer_arn] + )) + + #--------------------------------------- + # Note: This can be further restricted to the specific ports required for each Add-on and your application + #--------------------------------------- + # Extend cluster security group rules + cluster_security_group_additional_rules = { + ingress_nodes_ephemeral_ports_tcp = { + description = "Nodes on ephemeral ports" + protocol = "tcp" + from_port = 0 + to_port = 65535 + type = "ingress" + source_node_security_group = true + } + } + + # Extend node security group rules + node_security_group_additional_rules = { + # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. + # This can be restricted further to specific ports based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc. 
+ # Update this according to your security requirements if needed + ingress_cluster_to_node_all_traffic = { + description = "Cluster API to Nodegroup all traffic" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + } + + eks_managed_node_group_defaults = { + iam_role_additional_policies = { + # Not required, but used in the example to access the nodes to inspect mounted volumes + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + + ebs_optimized = true + # This block device is used only for root volume. Adjust volume according to your size. + # NOTE: Don't use this volume for ML workloads + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } + } + + eks_managed_node_groups = { + # It's recommended to have a Managed Node group for hosting critical add-ons + # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups + # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes. + system_node_group = { + name = "system-node-group" + description = "EKS Core node group for hosting system add-ons" + # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null] + ) + + # aws ssm get-parameters --names /aws/service/eks/optimized-ami/${var.eks_cluster_version}/amazon-linux-2023/x86_64/standard/recommended/release_version --region us-west-2 + ami_type = "AL2023_x86_64_STANDARD" # Use this for Graviton AL2023_ARM_64_STANDARD + min_size = 2 + max_size = 8 + desired_size = 2 + + instance_types = ["m6i.large"] + + labels = { + NodeGroupType = "system-nodegroup" + } + + tags = merge(local.tags, { + Name = "system-nodegroup" + }) + } + } + + node_security_group_tags = merge(local.tags, { + # NOTE - if creating multiple security groups with this module, only tag the + # security group that Karpenter should utilize with the following tag + # (i.e. - at most, only one security group should have this tag in your account) + "karpenter.sh/discovery" = local.name + }) + +} diff --git a/infra/aws/terraform/main.tf b/infra/aws/terraform/main.tf new file mode 100644 index 0000000..0066778 --- /dev/null +++ b/infra/aws/terraform/main.tf @@ -0,0 +1,126 @@ +# --------------------------------------------------------------- +# AWS Provider Configuration +# --------------------------------------------------------------- +# The primary AWS provider, used for interacting with resources in the region specified by 'var.region'. +provider "aws" { + region = local.region +} + +# Secondary AWS provider for ECR (Elastic Container Registry) authentication. +# ECR public authentication requires the 'us-east-1' region, which is hardcoded here. +# If your main region is 'us-east-1', you can remove this second provider and use the primary one. +provider "aws" { + alias = "ecr" + region = "us-east-1" +} + +# --------------------------------------------------------------- +# Helm Provider Configuration +# --------------------------------------------------------------- +# The Helm provider is used to manage Kubernetes applications, relying on the EKS cluster. 
+provider "helm" { + kubernetes { + # The EKS cluster API endpoint and certificate are retrieved from the EKS module. + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + # Retrieves an authentication token for Kubernetes API using the AWS CLI. + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + # Note: The AWS CLI must be installed locally where Terraform is executed. + } + } +} + +# --------------------------------------------------------------- +# Local Variables +# --------------------------------------------------------------- +# These locals store reusable values for the project, such as the name, region, and tags. +locals { + # Name and region variables for naming consistency across resources. + name = var.name + region = var.region + + # Limiting Availability Zones to two for resource allocation. + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + # Project tags for tracking and referencing the GitHub repository. + tags = { + GithubRepo = "https://github.com/KubedAI/spark-rapids-on-kubernetes" + } +} + +# --------------------------------------------------------------- +# AWS Data Sources +# --------------------------------------------------------------- +# Data sources used to retrieve AWS-specific information such as current identity, region, and session context. + +# EKS cluster authentication data +# data "aws_eks_cluster_auth" "this" { +# name = module.eks.cluster_name +# } + +# Retrieves an authorization token for public ECR registry to authenticate image pulls. +# data "aws_ecrpublic_authorization_token" "token" { +# provider = aws.ecr +# } + +# Retrieves all available AWS availability zones in the selected region. +data "aws_availability_zones" "available" {} + +# Retrieves the current AWS region. 
+# data "aws_region" "current" {} + +# Retrieves the AWS account and caller identity details for the session. +data "aws_caller_identity" "current" {} + +# Retrieves the current AWS partition (useful for AWS GovCloud or China regions). +# data "aws_partition" "current" {} + +# Retrieves the IAM session context, including the ARN of the currently logged-in user/role. +data "aws_iam_session_context" "current" { + arn = data.aws_caller_identity.current.arn +} + +# --------------------------------------------------------------- +# IAM Policy Document for Spark Operator +# --------------------------------------------------------------- +# This IAM policy document allows the Spark operator to interact with S3 and CloudWatch Logs for logging and object storage. + +# Policy granting permissions for S3 operations required by Spark jobs. +# data "aws_iam_policy_document" "spark_operator" { +# statement { +# sid = "AllowS3AccessForSparkJobs" +# effect = "Allow" +# # Grants access to all S3 resources in the current AWS partition. +# resources = ["arn:${data.aws_partition.current.partition}:s3:::*"] + +# actions = [ +# "s3:DeleteObject", +# "s3:DeleteObjectVersion", +# "s3:GetObject", +# "s3:ListBucket", +# "s3:PutObject", +# ] +# } + +# # Policy granting permissions for CloudWatch Logs operations. +# statement { +# sid = "AllowCloudWatchLogsAccessForSpark" +# effect = "Allow" +# # Grants access to all CloudWatch Log Groups in the current AWS region and account. 
+# resources = [ +# "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*" +# ] + +# actions = [ +# "logs:CreateLogGroup", +# "logs:CreateLogStream", +# "logs:DescribeLogGroups", +# "logs:DescribeLogStreams", +# "logs:PutLogEvents", +# ] +# } +# } diff --git a/infra/aws/terraform/outputs.tf b/infra/aws/terraform/outputs.tf index e69de29..c624023 100644 --- a/infra/aws/terraform/outputs.tf +++ b/infra/aws/terraform/outputs.tf @@ -0,0 +1,4 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} diff --git a/infra/aws/terraform/variables.tf b/infra/aws/terraform/variables.tf index e69de29..047cee4 100644 --- a/infra/aws/terraform/variables.tf +++ b/infra/aws/terraform/variables.tf @@ -0,0 +1,75 @@ +# Name of the VPC and EKS Cluster +variable "name" { + description = "Name of the VPC and EKS Cluster. This will be used as a prefix for all resources created in this project." + type = string + default = "spark-rapids-eks" +} + +# AWS region where the infrastructure will be deployed +variable "region" { + description = "AWS region where the EKS cluster and associated infrastructure will be deployed. Ensure that the selected region supports the required services (e.g., EKS, EC2 with GPU, etc.)." + type = string + default = "us-west-2" +} + +# EKS Cluster version +variable "eks_cluster_version" { + description = "Version of the EKS Kubernetes cluster to be deployed. Ensure this is compatible with the desired workload and add-ons (e.g., Spark, JupyterHub)." + type = string + default = "1.30" +} + +# VPC CIDR block for the primary network +variable "vpc_cidr" { + description = "The primary CIDR block for the VPC, defining the IP address range. 
This should be a valid private (RFC 1918) CIDR block, typically used for internal networks." + type = string + default = "10.1.0.0/21" +} + +# Secondary CIDR blocks for extended networking capabilities +variable "secondary_cidr_blocks" { + description = < Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] +} + +#--------------------------------------------------------------- +# VPC +#--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = var.vpc_cidr + azs = local.azs + + # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods + secondary_cidr_blocks = var.secondary_cidr_blocks + + # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods + # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc. + private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) + + # ------------------------------ + # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments + # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW + public_subnets = local.public_subnets + enable_nat_gateway = true + single_nat_gateway = true + #------------------------------- + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + # Tags subnets for Karpenter auto-discovery + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} diff --git a/website/package.json b/website/package.json new file mode 100644 index 0000000..f884b5c --- /dev/null +++ b/website/package.json @@ -0,0 +1,47 @@ +{ + "name": "website", + "version": "0.0.0", + "private": true, + "scripts": { + "docusaurus": "docusaurus", + "start": "docusaurus start", + "build": "docusaurus build", + "swizzle": "docusaurus swizzle", + "deploy": "docusaurus deploy", + "clear": "docusaurus clear", + "serve": "docusaurus serve", + "write-translations": "docusaurus write-translations", + "write-heading-ids": "docusaurus write-heading-ids", + "typecheck": "tsc" + }, + 
"dependencies": { + "@docusaurus/core": "3.5.2", + "@docusaurus/preset-classic": "3.5.2", + "@mdx-js/react": "^3.0.0", + "clsx": "^2.0.0", + "prism-react-renderer": "^2.3.0", + "react": "^18.0.0", + "react-dom": "^18.0.0" + }, + "devDependencies": { + "@docusaurus/module-type-aliases": "3.5.2", + "@docusaurus/tsconfig": "3.5.2", + "@docusaurus/types": "3.5.2", + "typescript": "~5.5.2" + }, + "browserslist": { + "production": [ + ">0.5%", + "not dead", + "not op_mini all" + ], + "development": [ + "last 3 chrome version", + "last 3 firefox version", + "last 5 safari version" + ] + }, + "engines": { + "node": ">=18.0" + } +} diff --git a/website/static/img/logo.svg b/website/static/img/logo.svg index 9db6d0d..ad9d11a 100644 --- a/website/static/img/logo.svg +++ b/website/static/img/logo.svg @@ -1 +1 @@ - \ No newline at end of file +