From e380f8763ac14946b84d023431a509e2f40fdf81 Mon Sep 17 00:00:00 2001
From: shiva kumar
Date: Tue, 17 Dec 2024 08:59:45 +0530
Subject: [PATCH] unused aws instance and vpcs cleanup

Signed-off-by: shiva kumar
---
 .github/workflows/awscleanup.yaml |  82 +++++++++++++++++
 .github/workflows/codeql.yml      |   8 +-
 .github/workflows/e2e.yml         |   4 +-
 .github/workflows/go.yml          |   8 +-
 .github/workflows/image.yml       |   8 +-
 pkg/provider/aws/aws.go           |  23 +++++
 pkg/provider/aws/create.go        |  23 ++++-
 scripts/awscleanup.sh             | 148 ++++++++++++++++++++++++++++++
 scripts/checkdependency.sh        |  89 ++++++++++++++++++
 9 files changed, 378 insertions(+), 15 deletions(-)
 create mode 100644 .github/workflows/awscleanup.yaml
 create mode 100755 scripts/awscleanup.sh
 create mode 100755 scripts/checkdependency.sh

diff --git a/.github/workflows/awscleanup.yaml b/.github/workflows/awscleanup.yaml
new file mode 100644
index 00000000..ff1e32e1
--- /dev/null
+++ b/.github/workflows/awscleanup.yaml
@@ -0,0 +1,82 @@
+name: Daily AWS Cleanup Bot
+
+# on:
+#   schedule:
+#     - cron: '0 8 * * *'
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      - awsresourcecleanup
+  push:
+    branches:
+      - awsresourcecleanup
+
+jobs:
+  cleanup:
+    runs-on: linux-amd64-cpu4
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up AWS CLI
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Identify resources for deletion
+        id: identify-resources
+        run: |
+          # Find vpcs with names ci*
+          vpcs=$(aws ec2 describe-vpcs \
+            --filters "Name=tag:Name,Values=ci*" \
+            --query "Vpcs[].VpcId" \
+            --output text | tr -d '\r' | tr '\n' ' ')
+          echo "Found VPCs: $vpcs"
+          echo "vpcs=$vpcs" >> "$GITHUB_ENV"
+
+      - name: Clean up VPCs
+        if: env.vpcs != ''
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Print the value of tag $2 on resource $1.
+          get_tag_value(){
+            local vpc_id=$1
+            local key=$2
+            aws ec2 describe-tags --filters "Name=resource-id,Values=$vpc_id" "Name=key,Values=$key" \
+              --query "Tags[0].Value" --output text
+          }
+          for vpc in $vpcs; do
+            github_repository=$(get_tag_value "$vpc" "GithubRepository")
+            run_id=$(get_tag_value "$vpc" "GithubRunId")
+            job_name=$(get_tag_value "$vpc" "GithubJob")
+            response=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
+              "https://api.github.com/repos/NVIDIA/${github_repository}/actions/runs/${run_id}/jobs")
+            if [[ -z "$response" || "$response" == "null" ]]; then
+              continue
+            fi
+            # One status line per job whose name starts with the tagged job name.
+            status=$(echo "$response" | jq -r --arg job "$job_name" \
+              '.jobs[] | select(.name | startswith($job)) | .status' 2>/dev/null || echo "null")
+            if [[ -z "$status" || "$status" == "null" ]]; then
+              continue
+            fi
+            # Keep the VPC while any matching job is still queued or running.
+            if echo "$status" | grep -qE '^(queued|in_progress)$'; then
+              echo "Holodeck job for VPC $vpc is still running, skipping"
+              continue
+            fi
+            echo "Holodeck Job status is not in running stage , Delete the dependent resources"
+            scripts/awscleanup.sh "$vpc"
+          done
+
+      - name: Post cleanup
+        run: |
+          echo "Cleanup completed."
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 674cb838..51f4df14 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -20,12 +20,12 @@ on:
       - opened
       - synchronize
     branches:
-      - main
-      - release-*
+      - main-no
+      - release-no
   push:
     branches:
-      - main
-      - release-*
+      - main-no
+      - release-no
   schedule:
     - cron: '31 11 * * 4'
 
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index a186b981..03348975 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -21,8 +21,8 @@ on:
       - completed
     branches:
       - "pull-request/[0-9]+"
-      - main
-      - release-*
+      - main-no
+      - release-no
 
 jobs:
   e2e-test:
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 9e215402..e99a49e6 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -17,12 +17,12 @@ name: Go
 on:
   push:
     branches:
-      - main
-      - release-*
+      - main-no
+      - release-no
   pull_request:
     branches:
-      - main
-      - release-*
+      - main-no
+      - release-no
 
 jobs:
   build:
diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
index f57058f8..c0666205 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -17,14 +17,14 @@ name: Image
 on:
   pull_request:
     branches:
-      - 'main'
-      - 'release-*'
+      - 'main-no'
+      - 'release-no'
   push:
     tags:
       - 'v*.*.*'
     branches:
-      - 'main'
-      - 'release-*'
+      - 'main-no'
+      - 'release-no'
 
 jobs:
   docker:
diff --git a/pkg/provider/aws/aws.go b/pkg/provider/aws/aws.go
index d833cfd0..da9bae37 100644
--- a/pkg/provider/aws/aws.go
+++ b/pkg/provider/aws/aws.go
@@ -19,6 +19,7 @@ package aws
 import (
 	"context"
 	"os"
+	"strings"
 
 	"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"
 	"github.com/NVIDIA/holodeck/internal/logger"
@@ -87,6 +88,21 @@ func New(log *logger.FunLogger, env v1alpha1.Environment, cacheFile string) (*Pr
 	if envRegion := os.Getenv("AWS_REGION"); envRegion != "" {
 		region = envRegion
 	}
+	// GitHub Actions metadata recorded as tags; empty when the variables are unset.
+	sha := os.Getenv("GITHUB_SHA")
+	// short sha
+	if len(sha) > 8 {
+		sha = sha[:8]
+	}
+	actor := os.Getenv("GITHUB_ACTOR")
+	branch := os.Getenv("GITHUB_REF_NAME")
+	repository := os.Getenv("GITHUB_REPOSITORY")
+	parts := strings.Split(repository, "/")
+	repoName := parts[len(parts)-1]
+	githubRunID := os.Getenv("GITHUB_RUN_ID")
+	githubRunNumber := os.Getenv("GITHUB_RUN_NUMBER")
+	githubJob := os.Getenv("GITHUB_JOB")
+
 	cfg, err := config.LoadDefaultConfig(context.TODO(), config.WithRegion(region))
 	if err != nil {
 		return nil, err
@@ -100,6 +116,13 @@ func New(log *logger.FunLogger, env v1alpha1.Environment, cacheFile string) (*Pr
 			{Key: aws.String("Name"), Value: aws.String(env.Name)},
 			{Key: aws.String("Project"), Value: aws.String("holodeck")},
 			{Key: aws.String("Environment"), Value: aws.String("cicd")},
+			{Key: aws.String("Sha"), Value: aws.String(sha)},
+			{Key: aws.String("Actor"), Value: aws.String(actor)},
+			{Key: aws.String("Branch"), Value: aws.String(branch)},
+			{Key: aws.String("GithubRepository"), Value: aws.String(repoName)},
+			{Key: aws.String("GithubRunId"), Value: aws.String(githubRunID)},
+			{Key: aws.String("GithubRunNumber"), Value: aws.String(githubRunNumber)},
+			{Key: aws.String("GithubJob"), Value: aws.String(githubJob)},
 		},
 		client,
 		r53,
diff --git a/pkg/provider/aws/create.go b/pkg/provider/aws/create.go
index 184225a6..5b274d36 100644
--- a/pkg/provider/aws/create.go
+++ b/pkg/provider/aws/create.go
@@ -145,7 +145,14 @@ func (p *Provider) createInternetGateway(cache *AWS) error {
 	p.log.Wg.Add(1)
 	go p.log.Loading("Creating Internet Gateway")
 
-	gwInput := &ec2.CreateInternetGatewayInput{}
+	gwInput := &ec2.CreateInternetGatewayInput{
+		TagSpecifications: []types.TagSpecification{
+			{
+				ResourceType: types.ResourceTypeInternetGateway,
+				Tags:         p.Tags,
+			},
+		},
+	}
 	gwOutput, err := p.ec2.CreateInternetGateway(context.TODO(), gwInput)
 	if err != nil {
 		p.fail()
@@ -357,6 +364,20 @@ func (p *Provider) createEC2Instance(cache *AWS) error {
 	}
 	cache.PublicDnsName = *instanceRunning.Reservations[0].Instances[0].PublicDnsName
 
+	// Tag the instance's first network interface with the provider tags.
+	networkInterfaces := instanceOut.Instances[0].NetworkInterfaces
+	if len(networkInterfaces) == 0 || networkInterfaces[0].NetworkInterfaceId == nil {
+		p.fail()
+		return fmt.Errorf("instance %q has no network interface to tag", cache.PublicDnsName)
+	}
+	_, err = p.ec2.CreateTags(context.TODO(), &ec2.CreateTagsInput{
+		Resources: []string{*networkInterfaces[0].NetworkInterfaceId},
+		Tags:      p.Tags,
+	})
+	if err != nil {
+		p.fail()
+		return fmt.Errorf("tagging network interface: %w", err)
+	}
 	p.done()
 	return nil
 }
diff --git a/scripts/awscleanup.sh b/scripts/awscleanup.sh
new file mode 100755
index 00000000..4d3f2bc7
--- /dev/null
+++ b/scripts/awscleanup.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
+#
+# Delete a VPC together with the resources that block its deletion.
+#
+# Usage: awscleanup.sh <vpc-id>
+#
+# Every step is best effort: a failing AWS call is reported by the CLI and the
+# script carries on, so one stuck resource does not stop the rest of the run.
+
+if [[ $# -ne 1 ]]; then
+    echo " vpcid required for deletion"
+    exit 1
+fi
+export vpc=$1
+
+# Run "aws ec2 <args...>" and print the returned IDs separated by spaces.
+list_ids() {
+    aws ec2 "$@" --output text | tr -d '\r' | tr '\n\t' '  '
+}
+
+echo "Start Deleting VPC: $vpc resource"
+
+# Record the Elastic IPs in use before anything is torn down. An address has no
+# VPC id of its own; it is only reachable through the network interface it is
+# associated with, and that link disappears with the instance.
+eips=$(list_ids describe-network-interfaces \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query "NetworkInterfaces[].Association.AllocationId")
+
+# Delete Instance
+instances=$(list_ids describe-instances \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query "Reservations[].Instances[].InstanceId")
+for instance in $instances; do
+    echo "Terminating instance: $instance"
+    aws ec2 terminate-instances --instance-ids "$instance"
+done
+if [[ -n "${instances// /}" ]]; then
+    # Termination is asynchronous: network interfaces, subnets and security
+    # groups stay in use until the instances are actually gone.
+    # shellcheck disable=SC2086 # word splitting of the ID list is intended
+    aws ec2 wait instance-terminated --instance-ids $instances
+fi
+
+# Delete Internet Gateway
+internet_gateways=$(list_ids describe-internet-gateways \
+    --filters "Name=attachment.vpc-id,Values=$vpc" \
+    --query "InternetGateways[].InternetGatewayId")
+for igw in $internet_gateways; do
+    aws ec2 detach-internet-gateway --internet-gateway-id "$igw" --vpc-id "$vpc"
+    aws ec2 delete-internet-gateway --internet-gateway-id "$igw"
+done
+
+# Delete NAT Gateways (already deleted ones stay listed for a while, skip them)
+nat_gateways=$(list_ids describe-nat-gateways \
+    --filter "Name=vpc-id,Values=$vpc" "Name=state,Values=pending,available" \
+    --query "NatGateways[].NatGatewayId")
+for ngw in $nat_gateways; do
+    aws ec2 delete-nat-gateway --nat-gateway-id "$ngw"
+done
+
+# Delete Elastic IPs
+for eip in $eips; do
+    aws ec2 release-address --allocation-id "$eip"
+done
+
+# Detach and Delete Security Groups
+# Interfaces that still use a group are moved to this VPC's default group first.
+default_sg=$(aws ec2 describe-security-groups \
+    --filters "Name=vpc-id,Values=$vpc" "Name=group-name,Values=default" \
+    --query "SecurityGroups[0].GroupId" \
+    --output text)
+security_groups=$(list_ids describe-security-groups \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query "SecurityGroups[?GroupName!='default'].GroupId")
+for sg in $security_groups; do
+    enis=$(list_ids describe-network-interfaces \
+        --filters "Name=group-id,Values=$sg" \
+        --query "NetworkInterfaces[].NetworkInterfaceId")
+    for eni in $enis; do
+        aws ec2 modify-network-interface-attribute \
+            --network-interface-id "$eni" \
+            --groups "$default_sg"
+    done
+    aws ec2 delete-security-group --group-id "$sg"
+done
+
+# Delete Route Tables
+# The main route table cannot be deleted separately; it is removed with the
+# VPC. Every other table is deletable once its associations are gone.
+associations=$(list_ids describe-route-tables \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query 'RouteTables[].Associations[] | [?Main==`false`].RouteTableAssociationId')
+for assoc_id in $associations; do
+    aws ec2 disassociate-route-table --association-id "$assoc_id"
+done
+route_tables=$(list_ids describe-route-tables \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query 'RouteTables[?Associations[0].Main!=`true`].RouteTableId')
+for rt in $route_tables; do
+    aws ec2 delete-route-table --route-table-id "$rt"
+done
+
+# Delete Network Interfaces (subnets cannot be deleted while they hold any)
+eni_ids=$(list_ids describe-network-interfaces \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query "NetworkInterfaces[].NetworkInterfaceId")
+for eni in $eni_ids; do
+    aws ec2 delete-network-interface --network-interface-id "$eni"
+done
+
+# Delete Subnets
+subnets=$(list_ids describe-subnets \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query "Subnets[].SubnetId")
+for subnet in $subnets; do
+    aws ec2 delete-subnet --subnet-id "$subnet"
+done
+
+# Delete Network ACLs (the default ACL is removed together with the VPC)
+nw_acls=$(list_ids describe-network-acls \
+    --filters "Name=vpc-id,Values=$vpc" \
+    --query 'NetworkAcls[?IsDefault==`false`].NetworkAclId')
+for acl in $nw_acls; do
+    echo "Deleting Network ACL: $acl"
+    aws ec2 delete-network-acl --network-acl-id "$acl"
+done
+
+# Report what is still attached. Deletion is attempted either way because the
+# retry loop below tolerates resources that are still being torn down.
+"$(dirname "$0")/checkdependency.sh" "$vpc" || echo "Dependencies remain for VPC: $vpc"
+
+# Delete vpc
+# try 3 times with 30 seconds interval
+max_attempts=3
+echo "All resource Deleted for VPC: $vpc , now delete vpc"
+for ((attempt = 1; attempt <= max_attempts; attempt++)); do
+    echo "Attempting to delete VPC: $vpc (Attempt $attempt)"
+    if aws ec2 delete-vpc --vpc-id "$vpc"; then
+        echo "Successfully deleted VPC: $vpc"
+        exit 0
+    fi
+    if ((attempt < max_attempts)); then
+        echo "Failed to delete VPC: $vpc. Retrying in 30 seconds..."
+        sleep 30
+    fi
+done
+echo "Failed to delete VPC: $vpc after $max_attempts attempts. Skipping."
diff --git a/scripts/checkdependency.sh b/scripts/checkdependency.sh
new file mode 100755
index 00000000..e6306575
--- /dev/null
+++ b/scripts/checkdependency.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Report the resources that still block the deletion of a VPC.
+#
+# Usage: checkdependency.sh <vpc-id>
+#
+# Exits 0 when nothing is left, 1 when a dependency remains or no VPC id was
+# given.
+
+vpc="$1"
+if [[ -z "$vpc" ]]; then
+    echo " vpcid required for dependency check"
+    exit 1
+fi
+
+found=0
+
+# check <message> <aws ec2 args...>
+# Run the query and, when it returns any IDs, print them under <message> and
+# record that a dependency was found.
+check() {
+    local message=$1
+    shift
+    local ids
+    ids=$(aws ec2 "$@" --output text)
+    if [[ -n "$ids" ]]; then
+        echo "$message"
+        # shellcheck disable=SC2086 # collapse tabs and newlines into spaces
+        echo $ids
+        found=1
+    fi
+}
+
+check "Please delete the following subnets before deleting the VPC:" \
+    describe-subnets --filters "Name=vpc-id,Values=$vpc" \
+    --query "Subnets[].SubnetId"
+
+check "Please delete the following security groups before deleting the VPC:" \
+    describe-security-groups --filters "Name=vpc-id,Values=$vpc" \
+    --query "SecurityGroups[?GroupName!='default'].GroupId"
+
+check "Please detach and delete the following Internet Gateways before deleting the VPC:" \
+    describe-internet-gateways --filters "Name=attachment.vpc-id,Values=$vpc" \
+    --query "InternetGateways[].InternetGatewayId"
+
+# Deleted NAT gateways stay listed for a while; only live ones block the VPC.
+check "Please delete the following NAT Gateways before deleting the VPC:" \
+    describe-nat-gateways \
+    --filter "Name=vpc-id,Values=$vpc" "Name=state,Values=pending,available,deleting" \
+    --query "NatGateways[].NatGatewayId"
+
+# An address has no VPC id of its own, so look it up through the network
+# interfaces of this VPC instead of listing every address in the account.
+check "Please release the following Elastic IPs before deleting the VPC:" \
+    describe-network-interfaces --filters "Name=vpc-id,Values=$vpc" \
+    --query "NetworkInterfaces[].Association.AllocationId"
+
+check "Please delete or detach the following network interfaces before deleting the VPC:" \
+    describe-network-interfaces --filters "Name=vpc-id,Values=$vpc" \
+    --query "NetworkInterfaces[].NetworkInterfaceId"
+
+check "Please delete the following VPC Peering Connections before deleting the VPC:" \
+    describe-vpc-peering-connections --filters "Name=requester-vpc-info.vpc-id,Values=$vpc" \
+    --query "VpcPeeringConnections[].VpcPeeringConnectionId"
+
+# VPN connections cannot be filtered by VPC; the attached virtual private
+# gateway is what ties a VPN to this VPC.
+check "Please detach the following VPN Gateways before deleting the VPC:" \
+    describe-vpn-gateways \
+    --filters "Name=attachment.vpc-id,Values=$vpc" "Name=attachment.state,Values=attaching,attached" \
+    --query "VpnGateways[].VpnGatewayId"
+
+# The main route table and the default network ACL are removed together with
+# the VPC, so only the additional ones count as dependencies.
+check "Please delete the following Route Tables before deleting the VPC:" \
+    describe-route-tables --filters "Name=vpc-id,Values=$vpc" \
+    --query 'RouteTables[?Associations[0].Main!=`true`].RouteTableId'
+
+check "Please delete the following Network ACLs before deleting the VPC:" \
+    describe-network-acls --filters "Name=vpc-id,Values=$vpc" \
+    --query 'NetworkAcls[?IsDefault==`false`].NetworkAclId'
+
+if [[ $found -ne 0 ]]; then
+    exit 1
+fi
+
+aws ec2 describe-vpcs --vpc-ids "$vpc" --query 'Vpcs[0].State'
+
+echo "No dependencies found. Proceeding with VPC deletion..."