sweet: force-kill CockroachDB if SIGTERM isn't working
This change uses SIGKILL when benchmarks are finished running but, for
whatever reason, the CockroachDB cluster isn't responding to SIGTERM. At
this point it's fine to forcibly kill the server, but let's also make
sure we kill all other instances too (since there's no clean shutdown).

Currently, timeouts in the cockroachdb benchmark are causing loss of data
(a separate issue we should fix), but even if that weren't the case, we'd
still be losing data for CockroachDB. This CL fixes the problem with the
benchmark: locally I couldn't get it to succeed with 20 runs, but with
this patch, it has no problem finishing. We should investigate why
CockroachDB isn't responding to SIGTERM and whether there's a cleaner
way to ensure a shutdown. This is OK for now, and we may want to keep
this behavior long-term anyway (useful when benchmarking unvetted
patches that cause a hang, for example).

This CL also adds a bunch more logging to the benchmark runner.
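
For orientation, the shutdown sequence this change adopts (send SIGTERM, wait
with a bound, then fall back to SIGKILL) looks roughly like the sketch below.
This is a minimal, standalone illustration rather than the code in the diff;
the helper name terminateWithFallback, the one-minute timeout argument, and
the placeholder sleep command are invented for the example.

// shutdown_sketch.go: illustrative only; not part of the x/benchmarks tree.
package main

import (
	"log"
	"os/exec"
	"syscall"
	"time"
)

// terminateWithFallback (hypothetical helper) sends SIGTERM to cmd's process
// and waits up to timeout for it to exit. If the process ignores SIGTERM, it
// falls back to SIGKILL. The boolean reports whether the fallback was needed.
func terminateWithFallback(cmd *exec.Cmd, timeout time.Duration) (killed bool, err error) {
	if cmd == nil || cmd.Process == nil {
		return false, nil // nothing was started, nothing to stop
	}
	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		return false, err
	}
	done := make(chan error, 1)
	go func() {
		// Waiting in a goroutine lets the select below race the process
		// exit against the timeout. (The benchmark likewise calls
		// Process.Wait directly rather than exec.Cmd.Wait.)
		_, werr := cmd.Process.Wait()
		done <- werr
	}()
	select {
	case werr := <-done:
		return false, werr // clean shutdown via SIGTERM
	case <-time.After(timeout):
		// SIGTERM didn't work; force the process down.
		if err := cmd.Process.Signal(syscall.SIGKILL); err != nil {
			return true, err
		}
		return true, <-done // SIGKILL is not ignorable, so this returns
	}
}

func main() {
	// Placeholder child process standing in for a CockroachDB node.
	cmd := exec.Command("sleep", "300")
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	killed, err := terminateWithFallback(cmd, time.Minute)
	log.Printf("killed=%v err=%v", killed, err)
}

Doing Process.Wait in its own goroutine is what lets the select race the exit
against the timer, and the buffered channel keeps that goroutine from leaking
if the timeout path wins and the process only exits after SIGKILL.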

Change-Id: I57cf27f35b71b6c69a8ca2ec38107e1c912a5167
Cq-Include-Trybots: luci.golang.try:x_benchmarks-gotip-linux-amd64-longtest,x_benchmarks-go1.22-linux-amd64-longtest
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/594775
Auto-Submit: Michael Knyszek <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
mknyszek authored and gopherbot committed Jun 25, 2024
1 parent 1cf02eb commit 1a6c737
Showing 1 changed file with 72 additions and 15 deletions.
--- a/sweet/benchmarks/cockroachdb/main.go
+++ b/sweet/benchmarks/cockroachdb/main.go
@@ -12,9 +12,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
-	"golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
-	"golang.org/x/benchmarks/sweet/benchmarks/internal/server"
-	"golang.org/x/benchmarks/sweet/common/diagnostics"
+	"log"
 	"os"
 	"os/exec"
 	"regexp"
@@ -25,6 +23,10 @@ import (
 	"sync/atomic"
 	"syscall"
 	"time"
+
+	"golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
+	"golang.org/x/benchmarks/sweet/benchmarks/internal/server"
+	"golang.org/x/benchmarks/sweet/common/diagnostics"
 )
 
 const (
@@ -256,20 +258,53 @@ func (i *cockroachdbInstance) httpAddr() string {
 	return fmt.Sprintf("%s:%d", cliCfg.host, i.httpPort)
 }
 
-func (i *cockroachdbInstance) shutdown() error {
+func (i *cockroachdbInstance) shutdown() (killed bool, err error) {
 	// Only attempt to shut down the process if we got to a point
 	// that a command was constructed and started.
-	if i.cmd != nil && i.cmd.Process != nil {
-		// Send SIGTERM and wait instead of just killing as sending SIGKILL
-		// bypasses node shutdown logic and will leave the node in a bad state.
-		// Normally not an issue unless you want to restart the cluster i.e.
-		// to poke around and see what's wrong.
-		if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
-			return err
-		}
+	if i.cmd == nil || i.cmd.Process == nil {
+		return false, nil
+	}
+	// Send SIGTERM and wait instead of just killing as sending SIGKILL
+	// bypasses node shutdown logic and will leave the node in a bad state.
+	// Normally not an issue unless you want to restart the cluster i.e.
+	// to poke around and see what's wrong.
+	if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+		return false, err
+	}
+	done := make(chan struct{})
+	go func() {
 		if _, err := i.cmd.Process.Wait(); err != nil {
-			return err
+			fmt.Fprintf(os.Stderr, "failed to wait for instance %s: %v\n", i.name, err)
 		}
+		done <- struct{}{}
+	}()
+	select {
+	case <-done:
+	case <-time.After(1 * time.Minute):
+		// If it takes a full minute to shut down, just kill the instance
+		// and report that we did it. We *probably* won't need it again.
+		if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+			return false, fmt.Errorf("failed to send SIGKILL to instance %s: %v", i.name, err)
+		}
+		killed = true
+
+		// Wait again -- this time it should happen.
+		log.Println("sent kill signal to", i.name, "and waiting for exit")
+		<-done
 	}
+	return killed, nil
+}
+
+func (i *cockroachdbInstance) kill() error {
+	if i.cmd == nil || i.cmd.Process == nil {
+		// Nothing to kill.
+		return nil
+	}
+	if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+		return err
+	}
+	if _, err := i.cmd.Process.Wait(); err != nil {
+		return fmt.Errorf("failed to wait for instance %s: %v", i.name, err)
+	}
 	return nil
 }
@@ -345,6 +380,7 @@ func runBenchmark(b *driver.B, cfg *config, instances []*cockroachdbInstance) (e
 		pgurls = append(pgurls, fmt.Sprintf(`postgres://root@%s?sslmode=disable`, host))
 	}
 	// Load in the schema needed for the workload via `workload init`
+	log.Println("loading the schema")
 	initArgs := []string{"workload", "init", cfg.bench.workload}
 	initArgs = append(initArgs, pgurls...)
 	initCmd := exec.Command(cfg.cockroachdbBin, initArgs...)
@@ -355,6 +391,8 @@ func runBenchmark(b *driver.B, cfg *config, instances []*cockroachdbInstance) (e
 		return err
 	}
 
+	log.Println("sleeping")
+
 	// If we try and start the workload right after loading in the schema
 	// it will spam us with database does not exist errors. We could repeatedly
 	// retry until the database exists by parsing the output, or we can just
@@ -369,6 +407,7 @@ func runBenchmark(b *driver.B, cfg *config, instances []*cockroachdbInstance) (e
 	}
 	args = append(args, pgurls...)
 
+	log.Println("running benchmark timeout")
 	cmd := exec.Command(cfg.cockroachdbBin, args...)
 	fmt.Fprintln(os.Stderr, cmd.String())
 
@@ -499,6 +538,7 @@ func reportMetrics(b *driver.B, metricType string, metrics benchmarkMetrics) {
 }
 
 func run(cfg *config) (err error) {
+	log.Println("launching cluster")
 	var instances []*cockroachdbInstance
 	// Launch the server.
 	instances, err = launchCockroachCluster(cfg)
@@ -509,26 +549,41 @@
 
 	// Clean up the cluster after we're done.
 	defer func() {
-		// We only need send SIGTERM to one instance, attempting to
+		log.Println("shutting down cluster")
+
+		// We only need send a shutdown signal to one instance, attempting to
 		// send it again will cause it to hang.
 		inst := instances[0]
-		if r := inst.shutdown(); r != nil {
+		killed, r := inst.shutdown()
+		if r != nil {
 			if err == nil {
 				err = r
 			} else {
 				fmt.Fprintf(os.Stderr, "failed to shutdown %s: %v", inst.name, r)
 			}
 		}
+		if killed {
+			log.Println("killed instance", inst.name, "and killing others")
+
+			// If we ended up killing an instance, try to kill the other instances too.
+			for _, inst := range instances[1:] {
+				if err := inst.kill(); err != nil {
+					fmt.Fprintf(os.Stderr, "failed to kill %s: %v", inst.name, err)
+				}
+			}
+		}
 		if err != nil && inst.output.Len() != 0 {
 			fmt.Fprintf(os.Stderr, "=== Instance %q stdout+stderr ===\n", inst.name)
 			fmt.Fprintln(os.Stderr, inst.output.String())
 		}
 	}()
 
+	log.Println("waiting for cluster")
 	if err = waitForCluster(instances, cfg); err != nil {
 		return err
 	}
 
+	log.Println("setting cluster settings")
 	if err = instances[0].setClusterSettings(cfg); err != nil {
 		return err
 	}
@@ -593,6 +648,7 @@ func run(cfg *config) (err error) {
 		if len(finishers) != 0 {
 			// Finish all the diagnostic collections in concurrently. Otherwise we could be waiting a while.
 			defer func() {
+				log.Println("running finishers")
 				var wg sync.WaitGroup
 				for _, finish := range finishers {
 					finish := finish
@@ -606,6 +662,7 @@ func run(cfg *config) (err error) {
 			}()
 		}
 		// Actually run the benchmark.
+		log.Println("running benchmark")
 		return runBenchmark(d, cfg, instances)
 	}, opts...)
 }
