Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added Terraform templates #1

Merged
merged 1 commit into from
Oct 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/codespell.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
---
name: Codespell

on:
Expand All @@ -24,4 +23,4 @@ jobs:
check_filenames: true
# When using this Action in other repos, the --skip option below can be removed
skip: "*.excalidraw,*.git,*.png,*.jpg,*.svg,go.mod,go.sum"
continue-on-error: true # The PR checks will not fail, but the possible spelling issues will still be reported for review and correction
continue-on-error: true # The PR checks will not fail, but the possible spelling issues will still be reported for review and correction
6 changes: 3 additions & 3 deletions .github/workflows/dependabot-automerge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ name: Dependabot auto-merge

on:
pull_request_target:
branches: [ main ]
types: [ opened ]
branches: [main]
types: [opened]

permissions:
pull-requests: write
Expand All @@ -18,4 +18,4 @@ jobs:
run: gh pr merge --auto --merge "$PR_URL"
env:
PR_URL: ${{github.event.pull_request.html_url}}
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
2 changes: 1 addition & 1 deletion .github/workflows/website-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ jobs:
# The GH actions bot is used by default if you didn't specify the two fields.
# You can swap them out with your own user credentials.
user_name: github-actions[bot]
user_email: github-actions[bot]@users.noreply.github.com
user_email: github-actions[bot]@users.noreply.github.com
2 changes: 1 addition & 1 deletion .github/workflows/website-test-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ jobs:
- name: Install dependencies
run: npm ci
- name: Test build website
run: npm run build
run: npm run build
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,3 @@ site

# node modules
node_modules
website/package-lock.json
website/package.json
44 changes: 44 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Pre-commit hooks for this repository: spell checking, YAML formatting,
# generic hygiene checks, and Terraform formatting, docs, lint and validation.
repos:
  # Spell checking. Every --exclude pair below names a path cspell skips.
  - repo: https://github.com/streetsidesoftware/cspell-cli
    rev: v8.13.3
    hooks:
      - id: cspell
        args:
          - --exclude
          - 'ADOPTERS.md'
          - --exclude
          - '.pre-commit-config.yaml'
          - --exclude
          - '.gitignore'
          - --exclude
          - '*.drawio'
          - --exclude
          - 'mkdocs.yml'
          - --exclude
          - '.helmignore'
          - --exclude
          - '.github/workflows/*'
          - --exclude
          - 'patterns/istio-multi-cluster/*'
          - --exclude
          - 'patterns/blue-green-upgrade/*'
          - --exclude
          - '/patterns/vpc-lattice/cross-cluster-pod-communication/*'
          - --exclude
          - 'patterns/bottlerocket/*'
          - --exclude
          - 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh'

  # YAML formatter: rewrites files in place with a 2-space indent and a
  # 2-space sequence offset, leaving existing quoting untouched.
  - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
    rev: v2.14.0
    hooks:
      - id: pretty-format-yaml
        args:
          - --autofix
          - --indent
          - '2'
          - --offset
          - '2'
          - --preserve-quotes

  # Generic whitespace, merge-conflict and credential-leak checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-merge-conflict
      - id: detect-private-key
      - id: detect-aws-credentials
        args:
          - --allow-missing-credentials

  # Terraform formatting, documentation, linting and validation.
  - repo: https://github.com/antonbabenko/pre-commit-terraform
    rev: v1.96.1
    hooks:
      - id: terraform_fmt
      - id: terraform_docs
        args:
          - --args=--lockfile=false
      # tflint is restricted to the rules listed here via repeated --only flags.
      - id: terraform_tflint
        args:
          - --args=--only=terraform_deprecated_interpolation
          - --args=--only=terraform_deprecated_index
          - --args=--only=terraform_unused_declarations
          - --args=--only=terraform_comment_syntax
          - --args=--only=terraform_documented_outputs
          - --args=--only=terraform_documented_variables
          - --args=--only=terraform_typed_variables
          - --args=--only=terraform_module_pinned_source
          - --args=--only=terraform_naming_convention
          - --args=--only=terraform_required_version
          - --args=--only=terraform_required_providers
          - --args=--only=terraform_unused_required_providers
          - --args=--only=terraform_workspace_remote
      # Paths matching "docs" or "modules" are not validated.
      - id: terraform_validate
        exclude: (docs|modules)
122 changes: 122 additions & 0 deletions infra/aws/terraform/eks.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@

# ---------------------------------------------------------------
# EKS Cluster
# ---------------------------------------------------------------
# Creates the EKS control plane, its core add-ons, the cluster/node security
# group rules and one managed node group for system add-ons.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 20.24"

  cluster_name    = local.name
  cluster_version = var.eks_cluster_version

  # EKS Addons
  cluster_addons = {
    coredns                = {}
    eks-pod-identity-agent = {}
    kube-proxy             = {}
    vpc-cni                = {}
  }

  # Give the Terraform identity admin access to the cluster
  # which will allow it to deploy resources into the cluster
  enable_cluster_creator_admin_permissions = true
  cluster_endpoint_public_access           = true
  access_entries                           = var.access_entries

  vpc_id = module.vpc.vpc_id

  # Subnet IDs where the EKS Control Plane ENIs will be created.
  # Only the Secondary CIDR private subnets (CIDR starting with "100.") are kept.
  subnet_ids = [
    for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
    subnet_id if substr(cidr_block, 0, 4) == "100."
  ]

  # Combine root account, current user/role and additional roles to be able to
  # access the cluster KMS key - required for terraform updates
  kms_key_administrators = distinct(concat(
    ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"],
    var.kms_key_admin_roles,
    [data.aws_iam_session_context.current.issuer_arn]
  ))

  #---------------------------------------
  # Note: This can be further restricted to the specific ports required by each
  # Add-on and your application
  #---------------------------------------
  # Extend cluster security group rules
  cluster_security_group_additional_rules = {
    ingress_nodes_ephemeral_ports_tcp = {
      description                = "Nodes on ephemeral ports"
      protocol                   = "tcp"
      from_port                  = 0
      to_port                    = 65535
      type                       = "ingress"
      source_node_security_group = true
    }
  }

  # Extend node security group rules
  node_security_group_additional_rules = {
    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
    # This can be restricted further to specific port based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
    # Update this according to your security requirements if needed
    ingress_cluster_to_node_all_traffic = {
      description                   = "Cluster API to Nodegroup all traffic"
      protocol                      = "-1"
      from_port                     = 0
      to_port                       = 0
      type                          = "ingress"
      source_cluster_security_group = true
    }
  }

  # Defaults shared by every managed node group declared below.
  eks_managed_node_group_defaults = {
    iam_role_additional_policies = {
      # Not required, but used in the example to access the nodes to inspect mounted volumes
      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    }

    ebs_optimized = true
    # This block device is used only for root volume. Adjust volume according to your size.
    # NOTE: Don't use this volume for ML workloads
    block_device_mappings = {
      xvda = {
        device_name = "/dev/xvda"
        ebs = {
          volume_size = 100
          volume_type = "gp3"
        }
      }
    }
  }

  # Every key in this map is created as a managed node group, so only node
  # group definitions belong here.
  eks_managed_node_groups = {
    # It's recommended to have a Managed Node group for hosting critical add-ons
    # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
    # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
    system_node_group = {
      name        = "system-node-group"
      description = "EKS Core node group for hosting system add-ons"

      # Subnet IDs where the nodes/node groups will be provisioned.
      # Only the Secondary CIDR private subnets (CIDR starting with "100.") are kept.
      subnet_ids = [
        for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
        subnet_id if substr(cidr_block, 0, 4) == "100."
      ]

      # The release version of the recommended AMI can be looked up with:
      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/${var.eks_cluster_version}/amazon-linux-2023/x86_64/standard/recommended/release_version --region us-west-2
      ami_type     = "AL2023_x86_64_STANDARD" # Use this for Graviton AL2023_ARM_64_STANDARD
      min_size     = 2
      max_size     = 8
      desired_size = 2

      instance_types = ["m6i.large"]

      labels = {
        NodeGroupType = "system-nodegroup"
      }

      tags = merge(local.tags, {
        Name = "system-nodegroup"
      })
    }
  }

  # Tags for the node security group only.
  node_security_group_tags = {
    # NOTE - if creating multiple security groups with this module, only tag the
    # security group that Karpenter should utilize with the following tag
    # (i.e. - at most, only one security group should have this tag in your account)
    "karpenter.sh/discovery" = local.name
  }

  # Project tags applied to every resource the module creates.
  tags = local.tags
}
126 changes: 126 additions & 0 deletions infra/aws/terraform/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# ---------------------------------------------------------------
# AWS Provider Configuration
# ---------------------------------------------------------------
# Default AWS provider. Its region comes from 'local.region', which is set
# from the 'var.region' input in the locals block of this file.
provider "aws" {
  region = local.region
}

# Aliased AWS provider ('aws.ecr') for ECR (Elastic Container Registry)
# authentication. ECR public authentication requires the 'us-east-1' region,
# so the region is hardcoded rather than taken from a variable.
# When the main region is already 'us-east-1', this alias can be removed and
# the default provider used instead.
provider "aws" {
  region = "us-east-1"
  alias  = "ecr"
}

# ---------------------------------------------------------------
# Helm Provider Configuration
# ---------------------------------------------------------------
# Helm manages Kubernetes applications on the cluster created by the EKS
# module, so all connection details are read from that module's outputs.
provider "helm" {
  kubernetes {
    # Cluster CA bundle (base64-decoded) and API endpoint from the EKS module.
    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
    host                   = module.eks.cluster_endpoint

    # Authentication token for the Kubernetes API, obtained by running the
    # AWS CLI. The CLI must be installed locally where Terraform is executed.
    exec {
      command     = "aws"
      args        = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
      api_version = "client.authentication.k8s.io/v1beta1"
    }
  }
}

# ---------------------------------------------------------------
# Local Variables
# ---------------------------------------------------------------
# Values shared across the configuration: resource name, region,
# availability zones and common tags.
locals {
  # Pass-through of the input variables so resources reference a single place.
  region = var.region
  name   = var.name

  # Only the first two availability zones returned by the data source are used.
  azs = slice(data.aws_availability_zones.available.names, 0, 2)

  # Common project tags pointing back to the GitHub repository.
  tags = {
    GithubRepo = "https://github.com/KubedAI/spark-rapids-on-kubernetes"
  }
}

# ---------------------------------------------------------------
# AWS Data Sources
# ---------------------------------------------------------------
# Data sources used to retrieve AWS-specific information such as current identity, region, and session context.

# EKS cluster authentication data
# data "aws_eks_cluster_auth" "this" {
# name = module.eks.cluster_name
# }

# Retrieves an authorization token for public ECR registry to authenticate image pulls.
# data "aws_ecrpublic_authorization_token" "token" {
# provider = aws.ecr
# }

# Retrieves the availability zones of the selected region.
# The filter keeps only zones that need no opt-in. Without it, Local Zones the
# account has opted into are returned too, and because their names sort before
# regular zone names they could be picked by the slice in 'local.azs'.
data "aws_availability_zones" "available" {
  filter {
    name   = "opt-in-status"
    values = ["opt-in-not-required"]
  }
}

# Retrieves the current AWS region.
# data "aws_region" "current" {}

# Retrieves the AWS account and caller identity details for the session.
# Its 'account_id' builds the root account ARN for the EKS KMS key administrators,
# and its 'arn' is the input of the IAM session context lookup.
data "aws_caller_identity" "current" {}

# Retrieves the current AWS partition (useful for AWS GovCloud or China regions).
# data "aws_partition" "current" {}

# Looks up the IAM session context for the ARN of the current caller.
# The resulting 'issuer_arn' is added to the EKS KMS key administrators.
data "aws_iam_session_context" "current" {
  arn = data.aws_caller_identity.current.arn
}

# ---------------------------------------------------------------
# IAM Policy Document for Spark Operator
# ---------------------------------------------------------------
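# This IAM policy document allows the Spark operator to interact with S3 and CloudWatch Logs for logging and object storage.
# NOTE: the policy document is currently commented out. Before re-enabling it, also uncomment the
# 'aws_partition' and 'aws_region' data sources it references.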

# Policy granting permissions for S3 operations required by Spark jobs.
# data "aws_iam_policy_document" "spark_operator" {
# statement {
# sid = "AllowS3AccessForSparkJobs"
# effect = "Allow"
# # Grants access to all S3 resources in the current AWS partition.
# resources = ["arn:${data.aws_partition.current.partition}:s3:::*"]

# actions = [
# "s3:DeleteObject",
# "s3:DeleteObjectVersion",
# "s3:GetObject",
# "s3:ListBucket",
# "s3:PutObject",
# ]
# }

# # Policy granting permissions for CloudWatch Logs operations.
# statement {
# sid = "AllowCloudWatchLogsAccessForSpark"
# effect = "Allow"
# # Grants access to all CloudWatch Log Groups in the current AWS region and account.
# resources = [
# "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*"
# ]

# actions = [
# "logs:CreateLogGroup",
# "logs:CreateLogStream",
# "logs:DescribeLogGroups",
# "logs:DescribeLogStreams",
# "logs:PutLogEvents",
# ]
# }
# }
4 changes: 4 additions & 0 deletions infra/aws/terraform/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# AWS CLI command that adds the new cluster to the local kubeconfig, built
# from the configured region and the cluster name reported by the EKS module.
output "configure_kubectl" {
  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
}
Loading
Loading