sweet: force-kill CockroachDB if SIGTERM isn't working
This change uses SIGKILL when benchmarks are finished running but, for
whatever reason, the CockroachDB cluster isn't responding to SIGTERM. At
this point it's fine to forcibly kill the server, but let's also make
sure we kill all other instances too (since there's no clean shutdown).

Currently, timeouts in the cockroachdb benchmark are causing loss of data
(a separate issue we should fix), but even if that weren't the case, we'd
still be losing data for CockroachDB. This CL fixes the problem with the
benchmark: locally I couldn't get it to succeed with 20 runs, but with
this patch, it has no problem finishing. We should investigate why
CockroachDB isn't responding to SIGTERM and whether there's a cleaner
way to ensure a shutdown. This is OK for now, and we may want to keep
this behavior long-term anyway (useful when benchmarking unvetted
patches that cause a hang, for example).

This CL also adds a bunch more logging to the benchmark runner.
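
For orientation, the shutdown sequence this change adopts (send SIGTERM, wait
with a bound, then fall back to SIGKILL) looks roughly like the sketch below.
This is a minimal, standalone illustration rather than the code in the diff;
the helper name terminateWithFallback, the one-minute timeout argument, and
the placeholder sleep command are invented for the example.

// shutdown_sketch.go: illustrative only; not part of the x/benchmarks tree.
package main

import (
	"log"
	"os/exec"
	"syscall"
	"time"
)

// terminateWithFallback (hypothetical helper) sends SIGTERM to cmd's process
// and waits up to timeout for it to exit. If the process ignores SIGTERM, it
// falls back to SIGKILL. The boolean reports whether the fallback was needed.
func terminateWithFallback(cmd *exec.Cmd, timeout time.Duration) (killed bool, err error) {
	if cmd == nil || cmd.Process == nil {
		return false, nil // nothing was started, nothing to stop
	}
	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		return false, err
	}
	done := make(chan error, 1)
	go func() {
		// Waiting in a goroutine lets the select below race the process
		// exit against the timeout. (The benchmark likewise calls
		// Process.Wait directly rather than exec.Cmd.Wait.)
		_, werr := cmd.Process.Wait()
		done <- werr
	}()
	select {
	case werr := <-done:
		return false, werr // clean shutdown via SIGTERM
	case <-time.After(timeout):
		// SIGTERM didn't work; force the process down.
		if err := cmd.Process.Signal(syscall.SIGKILL); err != nil {
			return true, err
		}
		return true, <-done // SIGKILL is not ignorable, so this returns
	}
}

func main() {
	// Placeholder child process standing in for a CockroachDB node.
	cmd := exec.Command("sleep", "300")
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	killed, err := terminateWithFallback(cmd, time.Minute)
	log.Printf("killed=%v err=%v", killed, err)
}

Doing Process.Wait in its own goroutine is what lets the select race the exit
against the timer, and the buffered channel keeps that goroutine from leaking
if the timeout path wins and the process only exits after SIGKILL.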

Change-Id: I57cf27f35b71b6c69a8ca2ec38107e1c912a5167
Cq-Include-Trybots: luci.golang.try:x_benchmarks-gotip-linux-amd64-longtest,x_benchmarks-go1.22-linux-amd64-longtest
Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/594775
Auto-Submit: Michael Knyszek <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
mknyszek authored and gopherbot committed Jun 25, 2024
1 parent 1cf02eb commit 1a6c737
Showing 1 changed file with 72 additions and 15 deletions.
--- a/sweet/benchmarks/cockroachdb/main.go
+++ b/sweet/benchmarks/cockroachdb/main.go
@@ -12,9 +12,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
-	"golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
-	"golang.org/x/benchmarks/sweet/benchmarks/internal/server"
-	"golang.org/x/benchmarks/sweet/common/diagnostics"
+	"log"
 	"os"
 	"os/exec"
 	"regexp"
@@ -25,6 +23,10 @@ import (
 	"sync/atomic"
 	"syscall"
 	"time"
+
+	"golang.org/x/benchmarks/sweet/benchmarks/internal/driver"
+	"golang.org/x/benchmarks/sweet/benchmarks/internal/server"
+	"golang.org/x/benchmarks/sweet/common/diagnostics"
 )
 
 const (
@@ -256,20 +258,53 @@ func (i *cockroachdbInstance) httpAddr() string {
 	return fmt.Sprintf("%s:%d", cliCfg.host, i.httpPort)
 }
 
-func (i *cockroachdbInstance) shutdown() error {
+func (i *cockroachdbInstance) shutdown() (killed bool, err error) {
 	// Only attempt to shut down the process if we got to a point
 	// that a command was constructed and started.
-	if i.cmd != nil && i.cmd.Process != nil {
-		// Send SIGTERM and wait instead of just killing as sending SIGKILL
-		// bypasses node shutdown logic and will leave the node in a bad state.
-		// Normally not an issue unless you want to restart the cluster i.e.
-		// to poke around and see what's wrong.
-		if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
-			return err
-		}
+	if i.cmd == nil || i.cmd.Process == nil {
+		return false, nil
+	}
+	// Send SIGTERM and wait instead of just killing as sending SIGKILL
+	// bypasses node shutdown logic and will leave the node in a bad state.
+	// Normally not an issue unless you want to restart the cluster i.e.
+	// to poke around and see what's wrong.
+	if err := i.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+		return false, err
+	}
+	done := make(chan struct{})
+	go func() {
 		if _, err := i.cmd.Process.Wait(); err != nil {
-			return err
+			fmt.Fprintf(os.Stderr, "failed to wait for instance %s: %v\n", i.name, err)
 		}
+		done <- struct{}{}
+	}()
+	select {
+	case <-done:
+	case <-time.After(1 * time.Minute):
+		// If it takes a full minute to shut down, just kill the instance
+		// and report that we did it. We *probably* won't need it again.
+		if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+			return false, fmt.Errorf("failed to send SIGKILL to instance %s: %v", i.name, err)
+		}
+		killed = true
+
+		// Wait again -- this time it should happen.
+		log.Println("sent kill signal to", i.name, "and waiting for exit")
+		<-done
 	}
+	return killed, nil
+}
+
+func (i *cockroachdbInstance) kill() error {
+	if i.cmd == nil || i.cmd.Process == nil {
+		// Nothing to kill.
+		return nil
+	}
+	if err := i.cmd.Process.Signal(syscall.SIGKILL); err != nil {
+		return err
+	}
+	if _, err := i.cmd.Process.Wait(); err != nil {
+		return fmt.Errorf("failed to wait for instance %s: %v", i.name, err)
+	}
 	return nil
 }
@@ -345,6 +380,7 @@ func runBenchmark(b *driver.B, cfg *config, instances []*cockroachdbInstance) (e
 		pgurls = append(pgurls, fmt.Sprintf(`postgres://root@%s?sslmode=disable`, host))
 	}
 	// Load in the schema needed for the workload via `workload init`
+	log.Println("loading the schema")
 	initArgs := []string{"workload", "init", cfg.bench.workload}
 	initArgs = append(initArgs, pgurls...)
 	initCmd := exec.Command(cfg.cockroachdbBin, initArgs...)
@@ -355,6 +391,8 @@ func runBenchmark(b *driver.B, cfg *config, instances []*cockroachdbInstance) (e
 		return err
 	}
 
+	log.Println("sleeping")
+
 	// If we try and start the workload right after loading in the schema
 	// it will spam us with database does not exist errors. We could repeatedly
 	// retry until the database exists by parsing the output, or we can just
@@ -369,6 +407,7 @@ func runBenchmark(b *driver.B, cfg *config, instances []*cockroachdbInstance) (e
 	}
 	args = append(args, pgurls...)
 
+	log.Println("running benchmark timeout")
 	cmd := exec.Command(cfg.cockroachdbBin, args...)
 	fmt.Fprintln(os.Stderr, cmd.String())
 
@@ -499,6 +538,7 @@ func reportMetrics(b *driver.B, metricType string, metrics benchmarkMetrics) {
 }
 
 func run(cfg *config) (err error) {
+	log.Println("launching cluster")
 	var instances []*cockroachdbInstance
 	// Launch the server.
 	instances, err = launchCockroachCluster(cfg)
@@ -509,26 +549,41 @@
 
 	// Clean up the cluster after we're done.
 	defer func() {
-		// We only need send SIGTERM to one instance, attempting to
+		log.Println("shutting down cluster")
+
+		// We only need send a shutdown signal to one instance, attempting to
 		// send it again will cause it to hang.
 		inst := instances[0]
-		if r := inst.shutdown(); r != nil {
+		killed, r := inst.shutdown()
+		if r != nil {
 			if err == nil {
 				err = r
 			} else {
 				fmt.Fprintf(os.Stderr, "failed to shutdown %s: %v", inst.name, r)
 			}
 		}
+		if killed {
+			log.Println("killed instance", inst.name, "and killing others")
+
+			// If we ended up killing an instance, try to kill the other instances too.
+			for _, inst := range instances[1:] {
+				if err := inst.kill(); err != nil {
+					fmt.Fprintf(os.Stderr, "failed to kill %s: %v", inst.name, err)
+				}
+			}
+		}
 		if err != nil && inst.output.Len() != 0 {
 			fmt.Fprintf(os.Stderr, "=== Instance %q stdout+stderr ===\n", inst.name)
 			fmt.Fprintln(os.Stderr, inst.output.String())
 		}
 	}()
 
+	log.Println("waiting for cluster")
 	if err = waitForCluster(instances, cfg); err != nil {
 		return err
 	}
 
+	log.Println("setting cluster settings")
 	if err = instances[0].setClusterSettings(cfg); err != nil {
 		return err
 	}
@@ -593,6 +648,7 @@ func run(cfg *config) (err error) {
 		if len(finishers) != 0 {
 			// Finish all the diagnostic collections in concurrently. Otherwise we could be waiting a while.
 			defer func() {
+				log.Println("running finishers")
 				var wg sync.WaitGroup
 				for _, finish := range finishers {
 					finish := finish
@@ -606,6 +662,7 @@ func run(cfg *config) (err error) {
 			}()
 		}
 		// Actually run the benchmark.
+		log.Println("running benchmark")
 		return runBenchmark(d, cfg, instances)
 	}, opts...)
 }
