Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up orphaned instances and security groups (HMS-3632) #4513

Merged
18 changes: 15 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,13 @@ man: $(MANPAGES_TROFF)
# They are not supported nor is their use recommended in scripts.
#

# Build the osbuild-service-maintenance binary and its integration test
# binary. Both outputs are placed in the first prerequisite, $(BUILDDIR)/bin/.
.PHONY: build-maintenance
build-maintenance: $(BUILDDIR)/bin/
go build -o $<osbuild-service-maintenance ./cmd/osbuild-service-maintenance
go test -c -tags=integration -o $<osbuild-composer-maintenance-tests ./cmd/osbuild-service-maintenance/

.PHONY: build
build: $(BUILDDIR)/bin/
build: $(BUILDDIR)/bin/ build-maintenance
go build -o $<osbuild-composer ./cmd/osbuild-composer/
go build -o $<osbuild-worker ./cmd/osbuild-worker/
go build -o $<osbuild-worker-executor ./cmd/osbuild-worker-executor/
Expand All @@ -141,7 +146,6 @@ build: $(BUILDDIR)/bin/
go build -o $<osbuild-upload-oci ./cmd/osbuild-upload-oci/
go build -o $<osbuild-upload-generic-s3 ./cmd/osbuild-upload-generic-s3/
go build -o $<osbuild-mock-openid-provider ./cmd/osbuild-mock-openid-provider
go build -o $<osbuild-service-maintenance ./cmd/osbuild-service-maintenance
go build -o $<osbuild-jobsite-manager ./cmd/osbuild-jobsite-manager
go build -o $<osbuild-jobsite-builder ./cmd/osbuild-jobsite-builder
# also build the test binaries
Expand All @@ -152,7 +156,6 @@ build: $(BUILDDIR)/bin/
go test -c -tags=integration -o $<osbuild-auth-tests ./cmd/osbuild-auth-tests/
go test -c -tags=integration -o $<osbuild-koji-tests ./cmd/osbuild-koji-tests/
go test -c -tags=integration -o $<osbuild-composer-dbjobqueue-tests ./cmd/osbuild-composer-dbjobqueue-tests/
go test -c -tags=integration -o $<osbuild-composer-maintenance-tests ./cmd/osbuild-service-maintenance/

.PHONY: install
install: build
Expand Down Expand Up @@ -373,3 +376,12 @@ $(PROCESSED_TEMPLATE_DIR)/%.yml: $(PROCESSED_TEMPLATE_DIR) $(OPENSHIFT_TEMPLATES
.PHONY: process-templates
process-templates: $(addprefix $(PROCESSED_TEMPLATE_DIR)/, $(OPENSHIFT_TEMPLATES))

# Run the maintenance binary with DRY_RUN=true and AWS maintenance enabled.
#
# AWS access to your deployment is required. Either set AWS_ACCESS_KEY_ID
# and AWS_SECRET_ACCESS_KEY in the environment, or provide a token for the
# "default" profile in ~/.aws/credentials.
.PHONY: osbuild-service-maintenance-dry-run-aws
osbuild-service-maintenance-dry-run-aws: build-maintenance
env DRY_RUN=true \
ENABLE_AWS_MAINTENANCE=true \
$(BUILDDIR)/bin/osbuild-service-maintenance
219 changes: 208 additions & 11 deletions cmd/osbuild-service-maintenance/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package main

import (
"context"
"errors"
"fmt"
"slices"
"sync"
"time"

Expand All @@ -13,10 +15,29 @@ import (
"github.com/osbuild/osbuild-composer/internal/cloud/awscloud"
)

// ChildToParentAssociation pairs an instance with the instance named in its
// "parent" tag. Child is the instance ID of the tagged instance; Parent is
// the value of that tag (expected to be the parent's instance ID).
type ChildToParentAssociation struct {
	Child, Parent string
}

func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey string, cutoff time.Time) error {
a, err := awscloud.New("us-east-1", accessKeyID, accessKey, "")
if err != nil {
return err
const region = "us-east-1"
var a *awscloud.AWS
var err error

ctx := context.Background()

if accessKeyID != "" && accessKey != "" {
a, err = awscloud.New(region, accessKeyID, accessKey, "")
if err != nil {
return err
}
} else {
logrus.Infof("One of AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY is missing, trying default credentials…")
a, err = awscloud.NewDefault(region)
if err != nil {
return err
}
}

regions, err := a.Regions()
Expand Down Expand Up @@ -65,7 +86,7 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
continue
}

if err = sem.Acquire(context.Background(), 1); err != nil {
if err = sem.Acquire(ctx, 1); err != nil {
logrus.Errorf("Error acquiring semaphore: %v", err)
continue
}
Expand All @@ -84,33 +105,209 @@ func AWSCleanup(maxConcurrentRequests int, dryRun bool, accessKeyID, accessKey s
wg.Wait()
}

// using `errs` to collect all errors as we want to
// continue execution if only one cleanup fails
var errs []error

err = terminateOrphanedSecureInstances(a, dryRun)
if err != nil {
logrus.Errorf("Error in terminating secure instances: %v, continuing other cleanup.", err)
errs = append(errs, err)
}

err = searchSGAndCleanup(ctx, a, dryRun)
if err != nil {
logrus.Errorf("Error in cleaning up security groups: %v", err)
errs = append(errs, err)
}

err = searchLTAndCleanup(ctx, a, dryRun)
if err != nil {
logrus.Errorf("Error in cleaning up launch templates: %v", err)
errs = append(errs, err)
}

return errors.Join(errs...)
}

func terminateOrphanedSecureInstances(a *awscloud.AWS, dryRun bool) error {
// Terminate leftover secure instances
reservations, err := a.DescribeInstancesByTag("parent", "i-*")
if err != nil {
return fmt.Errorf("Unable to describe instances by tag %w", err)
}

instanceIDs := filterReservations(reservations)
instanceData := getChildParentAssociations(reservations)

var instanceIDs []string
for _, data := range instanceData {
parent, err := a.DescribeInstancesByInstanceID(data.Parent)
if err != nil {
logrus.Errorf("Error getting info of %s (parent of %s): %v", data.Parent, data.Child, err)
continue
}

if !checkValidParent(data.Child, parent) {
instanceIDs = append(instanceIDs, data.Child)
}
}

instanceIDs = filterOnTooOld(instanceIDs, reservations)
logrus.Infof("Cleaning up executor instances: %v", instanceIDs)
if !dryRun {
err = a.TerminateInstances(instanceIDs)
if err != nil {
return fmt.Errorf("Unable to terminate secure instances: %w", err)
if len(instanceIDs) > 0 {
err = a.TerminateInstances(instanceIDs)
if err != nil {
return fmt.Errorf("Unable to terminate secure instances: %w", err)
}
}
} else {
logrus.Info("Dry run, didn't actually terminate any instances")
}
return nil
}

func filterReservations(reservations []ec2types.Reservation) []string {
var instanceIDs []string
func filterOnTooOld(instanceIDs []string, reservations []ec2types.Reservation) []string {
for _, res := range reservations {
for _, i := range res.Instances {
if i.LaunchTime.Before(time.Now().Add(-time.Hour * 2)) {
instanceIDs = append(instanceIDs, *i.InstanceId)
logrus.Infof("Instance %s is too old", *i.InstanceId)
if !slices.Contains(instanceIDs, *i.InstanceId) {
instanceIDs = append(instanceIDs, *i.InstanceId)
}
}
}
}
return instanceIDs
}

// getChildParentAssociations collects one ChildToParentAssociation for every
// "parent" tag found on the instances of the given reservations. Child is the
// ID of the tagged instance and Parent is the tag's value.
//
// Instances without an instance ID and tags without a key or value are
// skipped rather than dereferenced, so incomplete API data cannot cause a
// nil pointer panic.
func getChildParentAssociations(reservations []ec2types.Reservation) []ChildToParentAssociation {
	var associations []ChildToParentAssociation

	for _, res := range reservations {
		for _, instance := range res.Instances {
			if instance.InstanceId == nil {
				continue
			}
			for _, tag := range instance.Tags {
				if tag.Key == nil || tag.Value == nil || *tag.Key != "parent" {
					continue
				}
				associations = append(associations, ChildToParentAssociation{
					Child:  *instance.InstanceId,
					Parent: *tag.Value,
				})
			}
		}
	}
	return associations
}

func checkValidParent(childId string, parent []ec2types.Reservation) bool {
if len(parent) == 0 {
logrus.Infof("Instance %s has no parent, removing it", childId)
return false
}
if len(parent) != 1 {
logrus.Errorf("Instance %s has %d parents. That should never happen, not changing anything here.", childId, len(parent))
return true
}
if len(parent[0].Instances) == 0 {
logrus.Infof("Instance %s has no parent instance, removing it", childId)
return false
}
if len(parent[0].Instances) != 1 {
schuellerf marked this conversation as resolved.
Show resolved Hide resolved
logrus.Errorf("Instance %s has %d parent instances. That should never happen, not changing anything here.", childId, len(parent[0].Instances))
return true
}

parentState := parent[0].Instances[0].State.Name
if parentState != ec2types.InstanceStateNameTerminated {
return true
}
logrus.Infof("Instance %s has a parent (%s) in state %s, so we'll terminate %s.", childId, *parent[0].Instances[0].InstanceId, parentState, childId)
return false
}

// searchSGAndCleanup looks up all security groups whose name starts with
// "SG for i-" and deletes those for which allTerminated reports that no
// associated instance is left in a non-terminated state.
//
// Only a failure to list the security groups is returned as an error.
// Problems with an individual group (missing ID or name, failing instance
// lookup, failing deletion) are logged and the loop moves on to the next
// group. With dryRun set, the groups that would be deleted are logged but
// not deleted.
func searchSGAndCleanup(ctx context.Context, a *awscloud.AWS, dryRun bool) error {
	groups, err := a.DescribeSecurityGroupsByPrefix(ctx, "SG for i-")
	if err != nil {
		return err
	}

	for _, group := range groups {
		if group.GroupId == nil || group.GroupName == nil {
			logrus.Errorf(
				"Security Group needs to have a GroupId (%v) and a GroupName (%v).",
				group.GroupId,
				group.GroupName)
			continue
		}
		groupID, groupName := *group.GroupId, *group.GroupName

		reservations, err := a.DescribeInstancesBySecurityGroupID(groupID)
		if err != nil {
			logrus.Errorf("Failed to describe security group %s: %v", groupID, err)
			continue
		}

		// Keep the group as long as any instance using it is not terminated.
		if !allTerminated(reservations) {
			logrus.Debugf("Security group %s has non terminated instances associated with it.", groupID)
			continue
		}

		logrus.Infof("Deleting security group: %s (%s)", groupName, groupID)
		if dryRun {
			continue
		}
		if err := a.DeleteSecurityGroupById(ctx, group.GroupId); err != nil {
			logrus.Errorf("Failed to delete security group %s: %v", groupID, err)
		}
	}
	return nil
}

// allTerminated reports whether none of the instances in reservations is
// still in use. It returns false as soon as it finds an instance whose state
// is known and is anything other than terminated; otherwise it returns true.
// Instances without state information are skipped, and an empty input
// yields true.
func allTerminated(reservations []ec2types.Reservation) bool {
	for _, r := range reservations {
		for _, inst := range r.Instances {
			if inst.State == nil {
				continue
			}
			if inst.State.Name != ec2types.InstanceStateNameTerminated {
				return false
			}
		}
	}
	return true
}

// searchLTAndCleanup looks up all launch templates whose name starts with
// "launch-template-for-i-" and deletes those for which allTerminated reports
// that no associated instance is left in a non-terminated state.
//
// Only a failure to list the launch templates is returned as an error.
// Problems with an individual template (missing name or ID, failing instance
// lookup, failing deletion) are logged and the loop moves on to the next
// template. With dryRun set, the templates that would be deleted are logged
// but not deleted.
func searchLTAndCleanup(ctx context.Context, a *awscloud.AWS, dryRun bool) error {
	launchTemplates, err := a.DescribeLaunchTemplatesByPrefix(ctx, "launch-template-for-i-")
	if err != nil {
		return err
	}

	for _, lt := range launchTemplates {
		if lt.LaunchTemplateName == nil || lt.LaunchTemplateId == nil {
			logrus.Errorf(
				"Launch template needs to have a LaunchTemplateName (%v) and a LaunchTemplateId (%v).",
				lt.LaunchTemplateName,
				lt.LaunchTemplateId)
			continue
		}

		reservations, err := a.DescribeInstancesByLaunchTemplateID(*lt.LaunchTemplateId)
		if err != nil {
			logrus.Errorf("Failed to describe launch template %s: %v", *lt.LaunchTemplateId, err)
			continue
		}

		// Keep the template as long as any instance using it is not
		// terminated. This goes through the logger at debug level, like the
		// security group cleanup, instead of printing to stdout.
		if !allTerminated(reservations) {
			logrus.Debugf("Launch template %s has non terminated instances associated with it.", *lt.LaunchTemplateId)
			continue
		}

		logrus.Infof("Deleting launch template: %s (%s)", *lt.LaunchTemplateName, *lt.LaunchTemplateId)
		if dryRun {
			continue
		}
		if err := a.DeleteLaunchTemplateById(ctx, lt.LaunchTemplateId); err != nil {
			logrus.Errorf("Failed to delete launch template %s: %v", *lt.LaunchTemplateId, err)
		}
	}
	return nil
}
Loading
Loading