Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry osquery instance launch faster when we see the stale lockfile issue #2041

Merged
116 changes: 64 additions & 52 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ const (
KolideSaasExtensionName = "kolide_grpc"

// How long to wait before erroring because the osquery process has not started up successfully
osqueryStartupTimeout = 1 * time.Minute
osqueryStartupTimeout = 20 * time.Second
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I think this used to be 10s, and that I made it longer trying to track down the monterey bug. Back when I thought it was a slow startup issue.

https://github.com/kolide/launcher/blob/0.9.6/pkg/osquery/runtime/runtime.go#L295-L300

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ooh, good to know, I could not remember what it had been historically. We could probably cut this down even further to 10 seconds again.


// How often to check whether the osquery process has started up successfully
osqueryStartupRecheckInterval = 1 * time.Second
Expand Down Expand Up @@ -331,12 +331,6 @@ func (i *OsqueryInstance) Launch() error {
// Assign a PGID that matches the PID. This lets us kill the entire process group later.
i.cmd.SysProcAttr = setpgid()

i.slogger.Log(ctx, slog.LevelInfo,
"launching osqueryd",
"path", i.cmd.Path,
"args", strings.Join(i.cmd.Args, " "),
)

// remove any socket already at the extension socket path to ensure
// that it's not left over from a previous instance
if err := os.RemoveAll(paths.extensionSocketPath); err != nil {
Expand All @@ -348,53 +342,10 @@ func (i *OsqueryInstance) Launch() error {
}

// Launch osquery process (async)
err = i.startFunc(i.cmd)
if err != nil {
// Failure here is indicative of a failure to exec. A missing
// binary? Bad permissions? TODO: Consider catching errors in the
// update system and falling back to an earlier version.
msgPairs := append(
getOsqueryInfoForLog(i.cmd.Path),
"err", err,
)

i.slogger.Log(ctx, slog.LevelWarn,
"fatal error starting osquery -- could not exec.",
msgPairs...,
)
traces.SetError(span, fmt.Errorf("fatal error starting osqueryd process: %w", err))
return fmt.Errorf("fatal error starting osqueryd process: %w", err)
}

span.AddEvent("launched_osqueryd")
i.slogger.Log(ctx, slog.LevelInfo,
"launched osquery process",
"osqueryd_pid", i.cmd.Process.Pid,
)

// wait for osquery to create the socket before moving on,
// this is intended to serve as a kind of health check
// for osquery, if it's started successfully it will create
// a socket
if err := backoff.WaitFor(func() error {
_, err := os.Stat(paths.extensionSocketPath)
if err != nil {
i.slogger.Log(ctx, slog.LevelDebug,
"osquery extension socket not created yet ... will retry",
"path", paths.extensionSocketPath,
)
}
return err
}, osqueryStartupTimeout, osqueryStartupRecheckInterval); err != nil {
traces.SetError(span, fmt.Errorf("timeout waiting for osqueryd to create socket at %s: %w", paths.extensionSocketPath, err))
return fmt.Errorf("timeout waiting for osqueryd to create socket at %s: %w", paths.extensionSocketPath, err)
if err := i.startOsquerydProcess(ctx, paths); err != nil {
return fmt.Errorf("starting osqueryd process: %w", err)
}

span.AddEvent("socket_created")
i.slogger.Log(ctx, slog.LevelDebug,
"osquery socket created",
)

stats, err := history.NewInstance(i.registrationId, i.runId)
if err != nil {
i.slogger.Log(ctx, slog.LevelWarn,
Expand Down Expand Up @@ -483,6 +434,67 @@ func (i *OsqueryInstance) Launch() error {
return nil
}

// startOsquerydProcess starts the osquery instance's `cmd` and waits for the osqueryd process
// to create a socket file, indicating it's started up successfully.
func (i *OsqueryInstance) startOsquerydProcess(ctx context.Context, paths *osqueryFilePaths) error {
ctx, span := traces.StartSpan(ctx)
defer span.End()

i.slogger.Log(ctx, slog.LevelInfo,
"launching osqueryd",
"path", i.cmd.Path,
"args", strings.Join(i.cmd.Args, " "),
)

if err := i.startFunc(i.cmd); err != nil {
// Failure here is indicative of a failure to exec. A missing
// binary? Bad permissions? TODO: Consider catching errors in the
// update system and falling back to an earlier version.
msgPairs := append(
getOsqueryInfoForLog(i.cmd.Path),
"err", err,
)

i.slogger.Log(ctx, slog.LevelWarn,
"fatal error starting osquery -- could not exec.",
msgPairs...,
)
traces.SetError(span, fmt.Errorf("fatal error starting osqueryd process: %w", err))
return fmt.Errorf("fatal error starting osqueryd process: %w", err)
}

span.AddEvent("launched_osqueryd")
i.slogger.Log(ctx, slog.LevelInfo,
"launched osquery process",
"osqueryd_pid", i.cmd.Process.Pid,
)

// wait for osquery to create the socket before moving on,
// this is intended to serve as a kind of health check
// for osquery, if it's started successfully it will create
// a socket
if err := backoff.WaitFor(func() error {
_, err := os.Stat(paths.extensionSocketPath)
if err != nil {
i.slogger.Log(ctx, slog.LevelDebug,
"osquery extension socket not created yet ... will retry",
"path", paths.extensionSocketPath,
)
}
return err
}, osqueryStartupTimeout, osqueryStartupRecheckInterval); err != nil {
traces.SetError(span, fmt.Errorf("timeout waiting for osqueryd to create socket at %s: %w", paths.extensionSocketPath, err))
return fmt.Errorf("timeout waiting for osqueryd to create socket at %s: %w", paths.extensionSocketPath, err)
}

span.AddEvent("socket_created")
i.slogger.Log(ctx, slog.LevelDebug,
"osquery socket created",
)

return nil
}

// healthcheckWithRetries returns an error if it cannot get a non-error response from
// `Healthy` within `maxHealthChecks` attempts.
func (i *OsqueryInstance) healthcheckWithRetries(ctx context.Context, maxHealthChecks int, retryInterval time.Duration) error {
Expand Down
2 changes: 1 addition & 1 deletion pkg/osquery/runtime/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
)

const (
launchRetryDelay = 30 * time.Second
launchRetryDelay = 10 * time.Second
)

type Runner struct {
Expand Down
Loading