From 92a2f6f6422d4518a632157166640a41fb204418 Mon Sep 17 00:00:00 2001
From: dongdong
Date: Tue, 3 Sep 2024 14:29:40 +0800
Subject: [PATCH] feat: operator supports the httpsd service discovery mechanism (#515)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../compressor.go => gzip/gzip.go}            |  27 +-
 .../compressor_test.go => gzip/gzip_test.go}  |   2 +-
 pkg/operator/common/k8sutils/client.go        |   2 +-
 pkg/operator/common/logconf/hook.go           |  26 +-
 pkg/operator/common/logconf/logger.go         |  35 +-
 .../common/{utils => stringx}/utils.go        |   2 +-
 pkg/operator/common/tasks/tasks.go            |  16 +-
 pkg/operator/config/hook.go                   |   2 -
 pkg/operator/go.mod                           |   2 +-
 .../operator/dataidwatcher/watcher.go         |   3 +-
 pkg/operator/operator/discover/base.go        | 684 +++++++++++++++
 .../discover/{pod.go => childconfig.go}       |  60 +-
 pkg/operator/operator/discover/discover.go    | 780 +-----------------
 pkg/operator/operator/discover/endpoint.go    |  64 --
 .../discover/{cache.go => hashcache.go}       |  16 +-
 pkg/operator/operator/discover/httpd/http.go  |  91 ++
 .../discover/kubernetesd/kubernetes.go        | 237 ++++++
 .../discover/{ => shareddiscovery}/metrics.go |  24 +-
 .../{ => shareddiscovery}/shared_discovery.go | 194 ++---
 pkg/operator/operator/hook.go                 | 131 ++-
 .../{operator_test.go => kubelet_test.go}     |   0
 .../operator/objectsref/controller.go         |  18 +-
 pkg/operator/operator/objectsref/metrics.go   |  54 +-
 pkg/operator/operator/operator.go             | 466 +----------
 pkg/operator/operator/podmonitor.go           | 217 +++
 pkg/operator/operator/promrule.go             |  46 ++
 pkg/operator/operator/promsd.go               | 208 +++
 pkg/operator/operator/promsli/promsli.go      |   4 +-
 pkg/operator/operator/recorder.go             |   5 +
 pkg/operator/operator/relabel.go              |  42 +-
 pkg/operator/operator/relabel_test.go         |  12 +-
 pkg/operator/operator/secret.go               |  34 +-
 pkg/operator/operator/server.go               | 141 ++--
 pkg/operator/operator/servicemonitor.go       | 223 +++
 pkg/operator/operator/target/hook.go          |   4 -
 pkg/operator/operator/target/metric.go        |   4 +-
 pkg/operator/reloader/reloader.go             |   4 +-
 37 files changed, 2182 insertions(+), 1698 deletions(-)
 rename pkg/operator/common/{compressor/compressor.go => gzip/gzip.go} (71%)
 rename pkg/operator/common/{compressor/compressor_test.go => gzip/gzip_test.go} (98%)
 rename pkg/operator/common/{utils => stringx}/utils.go (98%)
 create mode 100644 pkg/operator/operator/discover/base.go
 rename pkg/operator/operator/discover/{pod.go => childconfig.go} (50%)
 delete mode 100644 pkg/operator/operator/discover/endpoint.go
 rename pkg/operator/operator/discover/{cache.go => hashcache.go} (86%)
 create mode 100644 pkg/operator/operator/discover/httpd/http.go
 create mode 100644 pkg/operator/operator/discover/kubernetesd/kubernetes.go
 rename pkg/operator/operator/discover/{ => shareddiscovery}/metrics.go (85%)
 rename pkg/operator/operator/discover/{ => shareddiscovery}/shared_discovery.go (51%)
 rename pkg/operator/operator/{operator_test.go => kubelet_test.go} (100%)
 create mode 100644 pkg/operator/operator/podmonitor.go
 create mode 100644 pkg/operator/operator/promrule.go
 create mode 100644 pkg/operator/operator/promsd.go
 create mode 100644 pkg/operator/operator/servicemonitor.go

diff --git a/pkg/operator/common/compressor/compressor.go b/pkg/operator/common/gzip/gzip.go
similarity index 71%
rename from pkg/operator/common/compressor/compressor.go
rename to pkg/operator/common/gzip/gzip.go
index 9c4b87628..9c749d9af 100644
--- a/pkg/operator/common/compressor/compressor.go
+++ 
b/pkg/operator/common/gzip/gzip.go @@ -7,7 +7,7 @@ // an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -package compressor +package gzip import ( "bytes" @@ -15,36 +15,19 @@ import ( "io" ) -type Compressor interface { - Compress(b []byte) ([]byte, error) - Uncompress(b []byte) ([]byte, error) -} - func Compress(b []byte) ([]byte, error) { - return defaultCompressor.Compress(b) -} - -func Uncompress(b []byte) ([]byte, error) { - return defaultCompressor.Uncompress(b) -} - -var defaultCompressor = gzipCompressor{} - -type gzipCompressor struct{} - -func (gzipCompressor) Compress(b []byte) ([]byte, error) { buf := &bytes.Buffer{} w := gzip.NewWriter(buf) if _, err := w.Write(b); err != nil { - w.Close() + _ = w.Close() return nil, err } - w.Close() + _ = w.Close() return buf.Bytes(), nil } -func (gzipCompressor) Uncompress(conf []byte) ([]byte, error) { - reader := bytes.NewReader(conf) +func Uncompress(b []byte) ([]byte, error) { + reader := bytes.NewReader(b) r, err := gzip.NewReader(reader) if err != nil { return nil, err diff --git a/pkg/operator/common/compressor/compressor_test.go b/pkg/operator/common/gzip/gzip_test.go similarity index 98% rename from pkg/operator/common/compressor/compressor_test.go rename to pkg/operator/common/gzip/gzip_test.go index 720538eb7..bfe8d735a 100644 --- a/pkg/operator/common/compressor/compressor_test.go +++ b/pkg/operator/common/gzip/gzip_test.go @@ -7,7 +7,7 @@ // an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -package compressor +package gzip import ( "testing" diff --git a/pkg/operator/common/k8sutils/client.go b/pkg/operator/common/k8sutils/client.go index 8ebb77e3d..57c024970 100644 --- a/pkg/operator/common/k8sutils/client.go +++ b/pkg/operator/common/k8sutils/client.go @@ -77,7 +77,7 @@ func NewTkexClient(host string, tlsConfig *rest.TLSClientConfig) (tkexversiond.I } func WaitForNamedCacheSync(ctx context.Context, controllerName string, inf cache.SharedIndexInformer) bool { - return operator.WaitForNamedCacheSync(ctx, controllerName, new(logconf.Logger), inf) + return operator.WaitForNamedCacheSync(ctx, controllerName, logconf.New(controllerName), inf) } func CreateOrUpdateSecret(ctx context.Context, secretClient clientv1.SecretInterface, desired *corev1.Secret) error { diff --git a/pkg/operator/common/logconf/hook.go b/pkg/operator/common/logconf/hook.go index fce1d7456..1d2b173b5 100644 --- a/pkg/operator/common/logconf/hook.go +++ b/pkg/operator/common/logconf/hook.go @@ -19,34 +19,18 @@ import ( ) const ( - confStdoutPath = "log.stdout" - confFormatPath = "log.format" - confFileNamePath = "log.filename" - confMaxAgePath = "log.max_age" - confMaxSizePath = "log.max_size" - confMaxBackupPath = "log.max_backup" - confLogLevelPath = "log.level" + confLoggerLevelPath = "logger.level" ) func initConfig() { - viper.SetDefault(confStdoutPath, false) - viper.SetDefault(confFormatPath, "logfmt") - viper.SetDefault(confFileNamePath, "bkmonitor-operator.log") - viper.SetDefault(confMaxAgePath, 3) - viper.SetDefault(confMaxSizePath, 512) - viper.SetDefault(confMaxBackupPath, 5) - viper.SetDefault(confLogLevelPath, "error") + viper.SetDefault(confLoggerLevelPath, "info") } func updateConfig() { logger.SetOptions(logger.Options{ - Stdout: viper.GetBool(confStdoutPath), - 
Format: viper.GetString(confFormatPath), - Filename: viper.GetString(confFileNamePath), - MaxAge: viper.GetInt(confMaxAgePath), - MaxSize: viper.GetInt(confMaxSizePath), - MaxBackups: viper.GetInt(confMaxBackupPath), - Level: viper.GetString(confLogLevelPath), + Stdout: true, + Format: "logfmt", + Level: viper.GetString(confLoggerLevelPath), }) } diff --git a/pkg/operator/common/logconf/logger.go b/pkg/operator/common/logconf/logger.go index aadcb3850..f55bb0d06 100644 --- a/pkg/operator/common/logconf/logger.go +++ b/pkg/operator/common/logconf/logger.go @@ -9,11 +9,38 @@ package logconf -import "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +import ( + "bytes" -type Logger struct{} + "github.com/go-kit/log" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +) + +type Logger struct { + prefix string + w log.Logger +} + +func New(prefix string) *Logger { + l := &Logger{ + w: log.NewLogfmtLogger(writer{ + prefix: prefix, + }), + } + return l +} func (l *Logger) Log(keyvals ...interface{}) error { - logger.Debug(keyvals...) - return nil + return l.w.Log(keyvals...) +} + +type writer struct { + prefix string +} + +func (w writer) Write(b []byte) (int, error) { + s := string(bytes.TrimSpace(b)) + logger.Infof("%s\t%s", w.prefix, s) + return len(s), nil } diff --git a/pkg/operator/common/utils/utils.go b/pkg/operator/common/stringx/utils.go similarity index 98% rename from pkg/operator/common/utils/utils.go rename to pkg/operator/common/stringx/utils.go index 647be8a26..6f9da9c97 100644 --- a/pkg/operator/common/utils/utils.go +++ b/pkg/operator/common/stringx/utils.go @@ -7,7 +7,7 @@ // an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-package utils +package stringx import "strings" diff --git a/pkg/operator/common/tasks/tasks.go b/pkg/operator/common/tasks/tasks.go index 48694b623..1ac340cb2 100644 --- a/pkg/operator/common/tasks/tasks.go +++ b/pkg/operator/common/tasks/tasks.go @@ -13,15 +13,13 @@ import ( "fmt" ) -const ( - StatefulSetTaskSecretPrefix = "statefulset-worker" - DaemonSetTaskSecretPrefix = "daemonset-worker" - EventTaskSecretPrefix = "event-worker" -) - const ( LabelTaskType = "taskType" + PrefixStatefulSetTaskSecret = "statefulset-worker" + PrefixDaemonSetTaskSecret = "daemonset-worker" + PrefixEventTaskSecret = "event-worker" + TaskTypeDaemonSet = "daemonset" TaskTypeEvent = "event" TaskTypeStatefulSet = "statefulset" @@ -36,15 +34,15 @@ func ValidateTaskType(t string) bool { } func GetDaemonSetTaskSecretName(s string) string { - return fmt.Sprintf("%s-%s", DaemonSetTaskSecretPrefix, s) + return fmt.Sprintf("%s-%s", PrefixDaemonSetTaskSecret, s) } func GetStatefulSetTaskSecretName(i int) string { - return fmt.Sprintf("%s-%d", StatefulSetTaskSecretPrefix, i) + return fmt.Sprintf("%s-%d", PrefixStatefulSetTaskSecret, i) } func GetEventTaskSecretName() string { - return fmt.Sprintf("%s-0", EventTaskSecretPrefix) + return fmt.Sprintf("%s-0", PrefixEventTaskSecret) } func GetTaskLabelSelector(s string) string { diff --git a/pkg/operator/config/hook.go b/pkg/operator/config/hook.go index bcfc06eaa..ba1535b5c 100644 --- a/pkg/operator/config/hook.go +++ b/pkg/operator/config/hook.go @@ -45,8 +45,6 @@ func InitConfig() error { return err } - fmt.Printf("using config file: %s\n", viper.ConfigFileUsed()) - fmt.Printf("settings: %+v\n", viper.AllSettings()) EventBus.Publish(EventConfigPostParse) return nil } diff --git a/pkg/operator/go.mod b/pkg/operator/go.mod index 6ca79145d..259249f62 100644 --- a/pkg/operator/go.mod +++ b/pkg/operator/go.mod @@ -34,6 +34,7 @@ require ( github.com/blang/semver/v4 v4.0.0 github.com/cespare/xxhash/v2 v2.2.0 github.com/ghodss/yaml v1.0.0 + github.com/go-kit/log v0.2.1 github.com/valyala/bytebufferpool v1.0.0 ) @@ -74,7 +75,6 @@ require ( github.com/fatih/color v1.13.0 // indirect github.com/fsnotify/fsnotify v1.5.4 // indirect github.com/garyburd/redigo v1.6.2 // indirect - github.com/go-kit/log v0.2.1 // indirect github.com/go-logfmt/logfmt v0.5.1 // indirect github.com/go-logr/logr v1.2.4 // indirect github.com/go-logr/stdr v1.2.2 // indirect diff --git a/pkg/operator/operator/dataidwatcher/watcher.go b/pkg/operator/operator/dataidwatcher/watcher.go index ede9359cd..f1adae4d2 100644 --- a/pkg/operator/operator/dataidwatcher/watcher.go +++ b/pkg/operator/operator/dataidwatcher/watcher.go @@ -257,8 +257,7 @@ func (w *dataIDWatcher) deleteDataID(dataID *bkv1beta1.DataID) { return } - logger.Infof("delete DataID, name=%v, id=%v, labels=%v", dataID.Name, dataID.Spec.DataID, dataID.Labels) - Publish() + Publish() // 发布信号 } func (w *dataIDWatcher) deleteMetricDataID(dataID *bkv1beta1.DataID) { diff --git a/pkg/operator/operator/discover/base.go b/pkg/operator/operator/discover/base.go new file mode 100644 index 000000000..b5e734180 --- /dev/null +++ b/pkg/operator/operator/discover/base.go @@ -0,0 +1,684 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +package discover + +import ( + "context" + "fmt" + "net" + "net/url" + "regexp" + "sort" + "strings" + "sync" + "time" + + "github.com/elastic/beats/libbeat/common/transport/tlscommon" + "github.com/goware/urlx" + "github.com/pkg/errors" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/config" + "github.com/prometheus/prometheus/discovery/targetgroup" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "gopkg.in/yaml.v2" + + bkv1beta1 "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/apis/crd/v1beta1" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/labelspool" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/tasks" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/shareddiscovery" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/target" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +) + +// Helper 一系列执行函数 由 discover 自行实现 +type Helper struct { + AccessBasicAuth func() (string, string, error) + AccessBearerToken func() (string, error) + AccessTlsConfig func() (*tlscommon.Config, error) + MatchNodeName func(labels.Labels) string +} + +// CommonOptions baseDiscover 通用的 Options +type CommonOptions struct { + MonitorMeta define.MonitorMeta + UniqueKey string + RelabelRule string + RelabelIndex string + NormalizeMetricName bool + AntiAffinity bool + Name string + Path string + Scheme string + ProxyURL string + Period string + Timeout string + ForwardLocalhost bool + DisableCustomTimestamp bool + DataID *bkv1beta1.DataID + Relabels []*relabel.Config + BearerTokenFile string + ExtraLabels map[string]string + System bool + UrlValues url.Values + MetricRelabelConfigs []yaml.MapSlice + MatchSelector map[string]string + DropSelector map[string]string + LabelJoinMatcher *feature.LabelJoinMatcherSpec +} + +type BaseDiscover struct { + opts *CommonOptions + parentCtx context.Context + ctx context.Context + cancel context.CancelFunc + wg sync.WaitGroup + monitorMeta define.MonitorMeta + mm *shareddiscovery.MetricMonitor + checkNodeFunc define.CheckFunc + fetched bool + cache *hashCache + helper Helper + + // 任务配置文件信息 通过 source 进行分组 使用 hash 进行唯一校验 + childConfigMut sync.RWMutex + childConfigGroups map[string]map[uint64]*ChildConfig // map[targetGroup.Source]map[hash]*ChildConfig +} + +func NewBaseDiscover(ctx context.Context, checkFn define.CheckFunc, opts *CommonOptions) *BaseDiscover { + return &BaseDiscover{ + parentCtx: ctx, + opts: opts, + checkNodeFunc: checkFn, + monitorMeta: opts.MonitorMeta, + mm: shareddiscovery.NewMetricMonitor(opts.Name), + } +} + +func (d *BaseDiscover) getUrlValues() url.Values { + if d.opts.UrlValues == nil { + return nil + } + values := make(map[string][]string) + for k, items := range d.opts.UrlValues { + for _, item := range items { + values[k] = append(values[k], item) + } + } + return values +} + +func (d *BaseDiscover) SetUK(s 
string) { + d.opts.UniqueKey = s +} + +func (d *BaseDiscover) SetHelper(helper Helper) { + d.helper = helper +} + +func (d *BaseDiscover) UK() string { + return d.opts.UniqueKey +} + +func (d *BaseDiscover) Type() string { + return "base" +} + +func (d *BaseDiscover) Name() string { + return d.opts.Name +} + +func (d *BaseDiscover) IsSystem() bool { + return d.opts.System +} + +func (d *BaseDiscover) DataID() *bkv1beta1.DataID { + return d.opts.DataID +} + +func (d *BaseDiscover) MonitorMeta() define.MonitorMeta { + return d.monitorMeta +} + +func (d *BaseDiscover) PreStart() { + d.mm.IncStartedCounter() + + d.ctx, d.cancel = context.WithCancel(d.parentCtx) + d.childConfigGroups = make(map[string]map[uint64]*ChildConfig) + d.cache = newHashCache(d.opts.Name, time.Minute*10) + logger.Infof("starting discover %s", d.Name()) +} + +func (d *BaseDiscover) SetDataID(dataID *bkv1beta1.DataID) { + d.opts.DataID = dataID + d.opts.ExtraLabels = dataID.Spec.Labels +} + +func (d *BaseDiscover) String() string { + return fmt.Sprintf("Name=%s, Type=%s, System=%v", d.Name(), d.Type(), d.opts.System) +} + +func (d *BaseDiscover) Stop() { + d.cancel() + logger.Infof("waiting discover %s", d.Name()) + + d.wg.Wait() + d.mm.IncStoppedCounter() + d.cache.Clean() + logger.Infof("shutting discover %s", d.Name()) +} + +func (d *BaseDiscover) makeMetricTarget(lbls, origLabels labels.Labels, namespace string) (*target.MetricTarget, error) { + metricTarget := &target.MetricTarget{} + taskType := tasks.TaskTypeStatefulSet + + // model.* 相关 label 有可能会被重写 使用 lbls(保证一定有 __address__ 字段) + for _, label := range lbls { + switch label.Name { + case model.AddressLabel: + metricTarget.Address = label.Value + case model.SchemeLabel: + metricTarget.Scheme = label.Value + case model.MetricsPathLabel: + metricTarget.Path = label.Value + } + } + + if d.helper.MatchNodeName != nil { + metricTarget.NodeName = d.helper.MatchNodeName(origLabels) + } + + if d.checkNodeFunc != nil { + nodeName, exist := d.checkNodeFunc(metricTarget.NodeName) + if exist { + taskType = tasks.TaskTypeDaemonSet + } + // 修正 nodename + metricTarget.NodeName = nodeName + } + + if metricTarget.NodeName == "" { + logger.Debugf("%s no node info from labels: %+v", d.Name(), origLabels) + metricTarget.NodeName = define.UnknownNode + } + + // 初始化参数列表 + metricTarget.Params = d.getUrlValues() + if d.opts.UrlValues == nil { + metricTarget.Params = make(url.Values) + } + + if metricTarget.Scheme == "" { + metricTarget.Scheme = d.opts.Scheme + } + if metricTarget.Path == "" { + metricTarget.Path = d.opts.Path + } + + requestURL, err := url.Parse(metricTarget.Path) + if err != nil { + return nil, errors.Wrap(err, "parse request path failed") + } + metricTarget.Path = requestURL.Path + + params, err := url.ParseQuery(requestURL.RawQuery) + if err != nil { + return nil, errors.Wrap(err, "parse request query failed") + } + for key := range params { + metricTarget.Params[key] = append(metricTarget.Params[key], params[key]...) 
+ } + + if d.helper.AccessBasicAuth != nil { + username, password, err := d.helper.AccessBasicAuth() + if err != nil { + return nil, err + } + metricTarget.Username = username + metricTarget.Password = password + } + + if d.helper.AccessBearerToken != nil { + bearerToken, err := d.helper.AccessBearerToken() + if err != nil { + return nil, err + } + metricTarget.BearerToken = bearerToken + } + + if d.helper.AccessTlsConfig != nil { + tlsConfig, err := d.helper.AccessTlsConfig() + if err != nil { + return nil, err + } + metricTarget.TLSConfig = tlsConfig + } + + if len(lbls) == 0 { + metricTarget.Labels = origLabels + } else { + metricTarget.Labels = lbls + } + + period := d.opts.Period + if period == "" { + period = ConfDefaultPeriod + } + timeout := d.opts.Timeout + if timeout == "" { + timeout = period + } + + metricTarget.Meta = d.monitorMeta + metricTarget.ExtraLabels = d.opts.ExtraLabels + metricTarget.Namespace = namespace // 采集目标的 namespace + metricTarget.DataID = d.DataID().Spec.DataID + metricTarget.DimensionReplace = d.DataID().Spec.DimensionReplace + metricTarget.MetricReplace = d.DataID().Spec.MetricReplace + metricTarget.MetricRelabelConfigs = d.opts.MetricRelabelConfigs + metricTarget.Period = period + metricTarget.Timeout = timeout + metricTarget.BearerTokenFile = d.opts.BearerTokenFile + metricTarget.ProxyURL = d.opts.ProxyURL + metricTarget.Mask = d.Mask() + metricTarget.TaskType = taskType + metricTarget.RelabelRule = d.opts.RelabelRule + metricTarget.RelabelIndex = d.opts.RelabelIndex + metricTarget.NormalizeMetricName = d.opts.NormalizeMetricName + metricTarget.LabelJoinMatcher = d.opts.LabelJoinMatcher + + return metricTarget, nil +} + +func (d *BaseDiscover) StatefulSetChildConfigs() []*ChildConfig { + d.childConfigMut.RLock() + defer d.childConfigMut.RUnlock() + + configs := make([]*ChildConfig, 0) + for _, group := range d.childConfigGroups { + for _, cfg := range group { + if cfg.TaskType == tasks.TaskTypeStatefulSet { + configs = append(configs, cfg) + } + } + } + return configs +} + +func (d *BaseDiscover) DaemonSetChildConfigs() []*ChildConfig { + d.childConfigMut.RLock() + defer d.childConfigMut.RUnlock() + + configs := make([]*ChildConfig, 0) + for _, group := range d.childConfigGroups { + for _, cfg := range group { + if cfg.TaskType == tasks.TaskTypeDaemonSet { + configs = append(configs, cfg) + } + } + } + return configs +} + +func (d *BaseDiscover) Mask() string { + var mask string + conv := func(b bool) string { + if b { + return "1" + } + return "0" + } + + mask += conv(d.opts.System) + return mask +} + +func (d *BaseDiscover) LoopHandle() { + d.wg.Add(1) + defer d.wg.Done() + + d.loopHandleTargetGroup() +} + +// loopHandleTargetGroup 持续处理来自 k8s 的 targets +func (d *BaseDiscover) loopHandleTargetGroup() { + defer Publish() + + const duration = 10 + const resync = 100 // 避免事件丢失 + + ticker := time.NewTicker(time.Second * duration) + defer ticker.Stop() + + counter := 0 + for { + select { + case <-d.ctx.Done(): + return + + case <-ticker.C: + counter++ + tgList, updatedAt := shareddiscovery.FetchTargetGroups(d.UK()) + logger.Debugf("%s updated at: %v", d.Name(), time.Unix(updatedAt, 0)) + if time.Now().Unix()-updatedAt > duration*2 && counter%resync != 0 && d.fetched { + logger.Debugf("%s found nothing changed, skip targetgourps handled", d.Name()) + continue + } + d.fetched = true + + for _, tg := range tgList { + if tg == nil { + continue + } + logger.Debugf("%s get targets source: %s, targets: %+v, labels: %+v", d.Name(), tg.Source, tg.Targets, tg.Labels) 
+ d.handleTargetGroup(tg) + } + } + } +} + +func forwardAddress(addr string) (string, error) { + withSchema := strings.HasPrefix(addr, "https") || strings.HasPrefix(addr, "http") + + u, err := urlx.Parse(addr) + if err != nil { + return "", err + } + + port := u.Port() + if port != "" { + u.Host = "127.0.0.1:" + port + } else { + u.Host = "127.0.0.1" + } + if !withSchema { + u.Scheme = "" + return u.String()[2:], nil + } + + return u.String(), nil +} + +func tgSourceNamespace(s string) string { + parts := strings.Split(s, "/") + if len(parts) == 3 && parts[1] != "" { + return parts[1] + } + return "-" +} + +func matchSelector(labels []labels.Label, selector map[string]string) bool { + var count int + for k, v := range selector { + re, err := regexp.Compile(v) + if err != nil { + logger.Errorf("failed to compile expr '%s', err: %v", v, err) + continue + } + for _, lbs := range labels { + if lbs.Name == k { + if !re.MatchString(lbs.Value) { + return false + } + count++ + break + } + } + } + return count == len(selector) +} + +func (d *BaseDiscover) handleTarget(namespace string, tlset, tglbs model.LabelSet) (*ChildConfig, error) { + lbls := labelspool.Get() + defer labelspool.Put(lbls) + + for ln, lv := range tlset { + lbls = append(lbls, labels.Label{ + Name: string(ln), + Value: string(lv), + }) + } + for ln, lv := range tglbs { + if _, ok := tlset[ln]; !ok { + lbls = append(lbls, labels.Label{ + Name: string(ln), + Value: string(lv), + }) + } + } + + // annotations 白名单过滤 + if len(d.opts.MatchSelector) > 0 { + if !matchSelector(lbls, d.opts.MatchSelector) { + logger.Debugf("%s annotation selector not match: %v", d.Name(), d.opts.MatchSelector) + return nil, nil + } + } + + // annotations 黑名单过滤 + if len(d.opts.DropSelector) > 0 { + if matchSelector(lbls, d.opts.DropSelector) { + logger.Debugf("%s annotation selector drop: %v", d.Name(), d.opts.DropSelector) + return nil, nil + } + } + + sort.Sort(lbls) + res, orig, err := d.populateLabels(lbls) + if err != nil { + return nil, errors.Wrap(err, "populate labels failed") + } + if len(res) == 0 { + return nil, nil + } + + logger.Debugf("%s populate labels %+v", d.Name(), res) + metricTarget, err := d.makeMetricTarget(res, orig, namespace) + if err != nil { + return nil, errors.Wrap(err, "make metric target failed") + } + + interval, _ := time.ParseDuration(metricTarget.Period) + d.mm.SetMonitorScrapeInterval(interval.Seconds()) + + if d.opts.ForwardLocalhost { + metricTarget.Address, err = forwardAddress(metricTarget.Address) + if err != nil { + return nil, errors.Wrapf(err, "forward address failed, address=%s", metricTarget.Address) + } + } + + metricTarget.DisableCustomTimestamp = d.opts.DisableCustomTimestamp + data, err := metricTarget.YamlBytes() + if err != nil { + return nil, errors.Wrap(err, "marshal target failed") + } + + childConfig := &ChildConfig{ + Node: metricTarget.NodeName, + FileName: metricTarget.FileName(), + Address: metricTarget.Address, + Data: data, + Scheme: metricTarget.Scheme, + Path: metricTarget.Path, + Mask: metricTarget.Mask, + Meta: metricTarget.Meta, + Namespace: metricTarget.Namespace, + TaskType: metricTarget.TaskType, + AntiAffinity: d.opts.AntiAffinity, + } + logger.Debugf("%s create child config: %+v", d.Name(), childConfig) + return childConfig, nil +} + +// handleTargetGroup 遍历自身的所有 target group 计算得到活跃的 target 并删除消失的 target +func (d *BaseDiscover) handleTargetGroup(targetGroup *targetgroup.Group) { + d.mm.IncHandledTgCounter() + + namespace := tgSourceNamespace(targetGroup.Source) + sourceName := 
targetGroup.Source + childConfigs := make([]*ChildConfig, 0) + + for _, tlset := range targetGroup.Targets { + skipped := d.cache.Check(namespace, tlset, targetGroup.Labels) + if skipped { + d.mm.IncCreatedChildConfigCachedCounter() + continue + } + + childConfig, err := d.handleTarget(namespace, tlset, targetGroup.Labels) + if err != nil { + logger.Errorf("%s handle target failed: %v", d.Name(), err) + d.mm.IncCreatedChildConfigFailedCounter() + continue + } + if childConfig == nil { + d.cache.Set(namespace, tlset, targetGroup.Labels) + continue + } + + d.mm.IncCreatedChildConfigSuccessCounter() + childConfigs = append(childConfigs, childConfig) + } + + d.notify(sourceName, childConfigs) +} + +// notify 判断是否刷新文件配置 需要则要发送通知信号 +func (d *BaseDiscover) notify(source string, childConfigs []*ChildConfig) { + d.childConfigMut.Lock() + defer d.childConfigMut.Unlock() + + if _, ok := d.childConfigGroups[source]; !ok { + d.childConfigGroups[source] = make(map[uint64]*ChildConfig) + } + + added := make(map[uint64]struct{}) + var changed bool + + // 增加新出现的配置 + for _, cfg := range childConfigs { + hash := cfg.Hash() + if _, ok := d.childConfigGroups[source][hash]; !ok { + logger.Infof("%s adds file, node=%s, filename=%s", d.Name(), cfg.Node, cfg.FileName) + d.childConfigGroups[source][hash] = cfg + changed = true + } + added[hash] = struct{}{} + } + + // 删除已经消失的配置 + removed := make([]uint64, 0) + for key := range d.childConfigGroups[source] { + if _, ok := added[key]; !ok { + removed = append(removed, key) + changed = true + } + } + + for _, key := range removed { + cfg := d.childConfigGroups[source][key] + logger.Infof("%s deletes file, node=%s, filename=%s", d.Name(), cfg.Node, cfg.FileName) + delete(d.childConfigGroups[source], key) + } + + // 如果文件有变更则发送通知 + if changed { + logger.Infof("%s found targetgroup.source changed", source) + Publish() + } +} + +// populateLabels builds a label set from the given label set and scrape configuration. +// It returns a label set before relabeling was applied as the second return value. +// Returns the original discovered label set found before relabelling was applied if the target is dropped during relabeling. +func (d *BaseDiscover) populateLabels(lset labels.Labels) (res, orig labels.Labels, err error) { + // Copy labels into the labelset for the target if they are not set already. + scrapeLabels := []labels.Label{ + {Name: model.JobLabel, Value: d.Name()}, + {Name: model.MetricsPathLabel, Value: d.opts.Path}, + {Name: model.SchemeLabel, Value: d.opts.Scheme}, + } + lb := labels.NewBuilder(lset) + + for _, l := range scrapeLabels { + if lv := lset.Get(l.Name); lv == "" { + lb.Set(l.Name, l.Value) + } + } + + preRelabelLabels := lb.Labels() + lset = relabel.Process(preRelabelLabels, d.opts.Relabels...) + + // Check if the target was dropped. + if lset == nil { + return nil, preRelabelLabels, nil + } + if v := lset.Get(model.AddressLabel); v == "" { + return nil, nil, errors.New("no address") + } + + lb = labels.NewBuilder(lset) + + // addPort checks whether we should add a default port to the address. + // If the address is not valid, we don't append a port either. + addPort := func(s string) bool { + // If we can split, a port exists and we don't have to add one. + if _, _, err := net.SplitHostPort(s); err == nil { + return false + } + // If adding a port makes it valid, the previous error + // was not due to an invalid address and we can append a port. 
+ _, _, err := net.SplitHostPort(s + ":1234") + return err == nil + } + addr := lset.Get(model.AddressLabel) + // If it's an address with no trailing port, infer it based on the used scheme. + if addPort(addr) { + // Addresses reaching this point are already wrapped in [] if necessary. + switch lset.Get(model.SchemeLabel) { + case "http", "": + addr = addr + ":80" + case "https": + addr = addr + ":443" + default: + return nil, nil, errors.Errorf("invalid scheme: %q", d.opts.Scheme) + } + lb.Set(model.AddressLabel, addr) + } + + if err := config.CheckTargetAddress(model.LabelValue(addr)); err != nil { + return nil, nil, err + } + + // Meta labels are deleted after relabelling. Other internal labels propagate to + // the target which decides whether they will be part of their label set. + for _, l := range lset { + if strings.HasPrefix(l.Name, model.MetaLabelPrefix) { + lb.Del(l.Name) + } + } + + // Default the instance label to the target address. + if v := lset.Get(model.InstanceLabel); v == "" { + lb.Set(model.InstanceLabel, addr) + } + + res = lb.Labels() + for _, l := range res { + // Check label values are valid, drop the target if not. + if !model.LabelValue(l.Value).IsValid() { + return nil, nil, errors.Errorf("invalid label value for %q: %q", l.Name, l.Value) + } + } + return res, preRelabelLabels, nil +} diff --git a/pkg/operator/operator/discover/pod.go b/pkg/operator/operator/discover/childconfig.go similarity index 50% rename from pkg/operator/operator/discover/pod.go rename to pkg/operator/operator/discover/childconfig.go index 332a464c2..5fcc8d5d2 100644 --- a/pkg/operator/operator/discover/pod.go +++ b/pkg/operator/operator/discover/childconfig.go @@ -10,49 +10,35 @@ package discover import ( - "context" - - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "fmt" + "hash/fnv" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" ) -const ( - discoverTypePod = "pod" -) - -type PodParams struct { - *BaseParams - TLSConfig *promv1.PodMetricsEndpointTLSConfig -} - -type Pod struct { - *BaseDiscover +// ChildConfig 子任务配置文件信息 +type ChildConfig struct { + Meta define.MonitorMeta + Node string + FileName string + Address string + Data []byte + Scheme string + Path string + Mask string + TaskType string + Namespace string + AntiAffinity bool } -func NewPodDiscover(ctx context.Context, meta define.MonitorMeta, checkFn define.CheckFunc, params *PodParams) Discover { - return &Pod{ - BaseDiscover: NewBaseDiscover(ctx, discoverTypePod, meta, checkFn, params.BaseParams), - } +func (c ChildConfig) String() string { + return fmt.Sprintf("Node=%s, FileName=%s, Address=%s", c.Node, c.FileName, c.Address) } -func (d *Pod) Type() string { - return discoverTypePod -} - -func (d *Pod) Reload() error { - d.Stop() - return d.Start() -} - -func (d *Pod) Start() error { - d.PreStart() - RegisterSharedDiscover(discoverTypePod, d.KubeConfig, d.getNamespaces()) - - d.wg.Add(1) - go func() { - defer d.wg.Done() - d.loopHandleTargetGroup() - }() - return nil +func (c ChildConfig) Hash() uint64 { + h := fnv.New64a() + h.Write([]byte(c.Node)) + h.Write(c.Data) + h.Write([]byte(c.Mask)) + return h.Sum64() } diff --git a/pkg/operator/operator/discover/discover.go b/pkg/operator/operator/discover/discover.go index 0a95872fd..179aa007c 100644 --- a/pkg/operator/operator/discover/discover.go +++ b/pkg/operator/operator/discover/discover.go @@ -10,59 +10,33 @@ package discover import ( - "context" - "encoding/base64" - "fmt" - "hash/fnv" - "net" - "net/url" - 
"regexp" - "sort" - "strings" - "sync" - "time" - - "github.com/elastic/beats/libbeat/common/transport/tlscommon" - "github.com/goware/urlx" - "github.com/pkg/errors" - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/config" - "github.com/prometheus/prometheus/discovery/targetgroup" - "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/model/relabel" - "gopkg.in/yaml.v2" - corev1 "k8s.io/api/core/v1" - "k8s.io/client-go/kubernetes" - bkv1beta1 "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/apis/crd/v1beta1" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/eplabels" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/k8sutils" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/labelspool" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/notifier" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/tasks" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/target" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" -) - -const ( - Base64Protocol = "base64://" ) var bus = notifier.NewDefaultRateBus() -func Publish() { bus.Publish() } +// Publish 发布 discover 变更信号 +func Publish() { + bus.Publish() +} -func Notify() <-chan struct{} { return bus.Subscribe() } +// Notify 接收 discover 变更信号 +func Notify() <-chan struct{} { + return bus.Subscribe() +} // Discover 是监控资源监视器 +// discover 负责启动对各类监控资源的 watch 操作并处理来自 prometheus discovery 的 targetgroups type Discover interface { - // Name 实例名称 discover 唯一标识 + // Name 实例名称 Name() string - // Type 实例类型 目前有 endpoints、pod,ingress + // UK 唯一标识 格式为 $Kind:.. 
+ UK() string + + // Type 实例类型 Type() string // IsSystem 是否为系统内置资源 @@ -92,729 +66,3 @@ type Discover interface { // StatefulSetChildConfigs 获取 statafulset 类型子配置信息 StatefulSetChildConfigs() []*ChildConfig } - -// ChildConfig 子任务配置文件信息 -type ChildConfig struct { - Meta define.MonitorMeta - Node string - FileName string - Address string - Data []byte - Scheme string - Path string - Mask string - TaskType string - Namespace string - AntiAffinity bool -} - -func (c ChildConfig) String() string { - return fmt.Sprintf("Node=%s, FileName=%s, Address=%s", c.Node, c.FileName, c.Address) -} - -func (c ChildConfig) Hash() uint64 { - h := fnv.New64a() - h.Write([]byte(c.Node)) - h.Write(c.Data) - h.Write([]byte(c.Mask)) - return h.Sum64() -} - -func EncodeBase64(s string) string { - return Base64Protocol + base64.StdEncoding.EncodeToString([]byte(s)) -} - -type BaseParams struct { - Client kubernetes.Interface - RelabelRule string - RelabelIndex string - NormalizeMetricName bool - AntiAffinity bool - Name string - KubeConfig string - Namespaces []string - Path string - Scheme string - ProxyURL string - Period string - Timeout string - ForwardLocalhost bool - DisableCustomTimestamp bool - DataID *bkv1beta1.DataID - Relabels []*relabel.Config - BasicAuth *promv1.BasicAuth - TLSConfig *promv1.TLSConfig - BearerTokenFile string - BearerTokenSecret *corev1.SecretKeySelector - ExtraLabels map[string]string - System bool - UrlValues url.Values - MetricRelabelConfigs []yaml.MapSlice - MatchSelector map[string]string - DropSelector map[string]string - LabelJoinMatcher *feature.LabelJoinMatcherSpec - UseEndpointSlice bool -} - -type BaseDiscover struct { - *BaseParams - parentCtx context.Context - ctx context.Context - cancel context.CancelFunc - wg sync.WaitGroup - role string - monitorMeta define.MonitorMeta - mm *metricMonitor - checkIfNodeExists define.CheckFunc - fetched bool - cache *Cache - - // 任务配置文件信息 通过 source 进行分组 使用 hash 进行唯一校验 - childConfigMut sync.RWMutex - childConfigGroups map[string]map[uint64]*ChildConfig // map[targetGroup.Source]map[hash]*ChildConfig -} - -func NewBaseDiscover(ctx context.Context, role string, monitorMeta define.MonitorMeta, checkFn define.CheckFunc, params *BaseParams) *BaseDiscover { - return &BaseDiscover{ - parentCtx: ctx, - role: role, - BaseParams: params, - checkIfNodeExists: checkFn, - monitorMeta: monitorMeta, - mm: newMetricMonitor(params.Name), - cache: NewCache(params.Name, time.Minute*10), - } -} - -func (d *BaseDiscover) getUrlValues() url.Values { - if d.UrlValues == nil { - return nil - } - values := make(map[string][]string) - for k, items := range d.UrlValues { - for _, item := range items { - values[k] = append(values[k], item) - } - } - return values -} - -func (d *BaseDiscover) getNamespaces() []string { - namespaces := d.Namespaces - if len(namespaces) == 0 { - namespaces = []string{corev1.NamespaceAll} - } - return namespaces -} - -func (d *BaseDiscover) Type() string { - return "base" -} - -func (d *BaseDiscover) Name() string { - return d.BaseParams.Name -} - -func (d *BaseDiscover) IsSystem() bool { - return d.System -} - -func (d *BaseDiscover) DataID() *bkv1beta1.DataID { - return d.BaseParams.DataID -} - -func (d *BaseDiscover) MonitorMeta() define.MonitorMeta { - return d.monitorMeta -} - -func (d *BaseDiscover) PreStart() { - d.mm.IncStartedCounter() - d.ctx, d.cancel = context.WithCancel(d.parentCtx) - d.childConfigGroups = make(map[string]map[uint64]*ChildConfig) - logger.Infof("starting discover %s", d.Name()) -} - -func (d *BaseDiscover) 
SetDataID(dataID *bkv1beta1.DataID) { - d.BaseParams.DataID = dataID - d.BaseParams.ExtraLabels = dataID.Spec.Labels -} - -func (d *BaseDiscover) String() string { - return fmt.Sprintf("Name=%s, Type=%s, Namespace=%v, System=%v", d.Name(), d.Type(), d.getNamespaces(), d.System) -} - -func (d *BaseDiscover) Stop() { - d.cancel() - logger.Infof("waiting discover %s", d.Name()) - - d.wg.Wait() - d.mm.IncStoppedCounter() - d.cache.Clean() - logger.Infof("shutting discover %s", d.Name()) -} - -func (d *BaseDiscover) makeMetricTarget(lbls, origLabels labels.Labels, namespace string) (*target.MetricTarget, error) { - metricTarget := &target.MetricTarget{} - var isNodeType bool - var targetName string - taskType := tasks.TaskTypeStatefulSet - - // model.* 相关 label 有可能会被重写 使用 lbls(保证一定有 __address__ 字段) - for _, label := range lbls { - switch label.Name { - case model.AddressLabel: - metricTarget.Address = label.Value - case model.SchemeLabel: - metricTarget.Scheme = label.Value - case model.MetricsPathLabel: - metricTarget.Path = label.Value - } - } - - // 这里是通过原始 label 查找固定字段,所以使用的还是 combinedlabels - for _, label := range origLabels { - switch label.Name { - // 补充 NodeName - case eplabels.EndpointNodeName(d.UseEndpointSlice), labelPodNodeName: - metricTarget.NodeName = label.Value - - // 如果 target 类型是 node,则需要特殊处理,此时 endpointNodeName 对应 label 会为空 - case eplabels.EndpointAddressTargetKind(d.UseEndpointSlice), labelPodAddressTargetKind: - if label.Value == "Node" { - isNodeType = true - } - case eplabels.EndpointAddressTargetName(d.UseEndpointSlice), labelPodAddressTargetName: - targetName = label.Value - } - } - - if isNodeType { - metricTarget.NodeName = targetName - } - - if d.checkIfNodeExists != nil { - nodeName, exist := d.checkIfNodeExists(metricTarget.NodeName) - if exist { - taskType = tasks.TaskTypeDaemonSet - } - // 修正 nodename - metricTarget.NodeName = nodeName - } - - if metricTarget.NodeName == "" { - logger.Debugf("%s no node info from labels: %+v", d.Name(), origLabels) - metricTarget.NodeName = define.UnknownNode - } - - // 初始化参数列表 - metricTarget.Params = d.getUrlValues() - if d.UrlValues == nil { - metricTarget.Params = make(url.Values) - } - - if metricTarget.Scheme == "" { - metricTarget.Scheme = d.Scheme - } - if metricTarget.Path == "" { - metricTarget.Path = d.Path - } - - requestURL, err := url.Parse(metricTarget.Path) - if err != nil { - return nil, errors.Wrap(err, "parse request path failed") - } - metricTarget.Path = requestURL.Path - - params, err := url.ParseQuery(requestURL.RawQuery) - if err != nil { - return nil, errors.Wrap(err, "parse request query failed") - } - for key := range params { - metricTarget.Params[key] = append(metricTarget.Params[key], params[key]...) 
- } - - if d.BasicAuth != nil && d.BasicAuth.Username.String() != "" && d.BasicAuth.Password.String() != "" { - secretClient := d.Client.CoreV1().Secrets(d.monitorMeta.Namespace) - username, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, d.BasicAuth.Username) - if err != nil { - return nil, errors.Wrap(err, "get username from secret failed") - } - - password, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, d.BasicAuth.Password) - if err != nil { - return nil, errors.Wrap(err, "get password from secret failed") - } - - metricTarget.Username = username - metricTarget.Password = password - } - - metricTarget.BearerTokenFile = d.BearerTokenFile - if d.BearerTokenSecret != nil && d.BearerTokenSecret.Name != "" && d.BearerTokenSecret.Key != "" { - secretClient := d.Client.CoreV1().Secrets(d.monitorMeta.Namespace) - bearerToken, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.BearerTokenSecret) - if err != nil { - return nil, errors.Wrap(err, "get bearer token from secret failed") - } - metricTarget.BearerToken = bearerToken - } - - if d.TLSConfig != nil { - metricTarget.TLSConfig = &tlscommon.Config{} - secretClient := d.Client.CoreV1().Secrets(d.monitorMeta.Namespace) - if d.TLSConfig.CAFile != "" { - metricTarget.TLSConfig.CAs = []string{d.TLSConfig.CAFile} - } - if d.TLSConfig.CA.Secret != nil { - ca, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.TLSConfig.CA.Secret) - if err != nil { - return nil, errors.Wrap(err, "get TLS CA from secret failed") - } - metricTarget.TLSConfig.CAs = []string{EncodeBase64(ca)} - } - - if d.TLSConfig.CertFile != "" { - metricTarget.TLSConfig.Certificate.Certificate = d.TLSConfig.CertFile - } - if d.TLSConfig.Cert.Secret != nil { - cert, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.TLSConfig.Cert.Secret) - if err != nil { - return nil, errors.Wrap(err, "get TLS Cert from secret failed") - } - metricTarget.TLSConfig.Certificate.Certificate = EncodeBase64(cert) - } - - if d.TLSConfig.KeyFile != "" { - metricTarget.TLSConfig.Certificate.Key = d.TLSConfig.KeyFile - } - if d.TLSConfig.KeySecret != nil { - key, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.TLSConfig.KeySecret) - if err != nil { - return nil, errors.Wrap(err, "get TLS Key from secret failed") - } - metricTarget.TLSConfig.Certificate.Key = EncodeBase64(key) - } - } - - if len(lbls) == 0 { - metricTarget.Labels = origLabels - } else { - metricTarget.Labels = lbls - } - - period := d.Period - if period == "" { - period = ConfDefaultPeriod - } - timeout := d.Timeout - if timeout == "" { - timeout = period - } - - metricTarget.Meta = d.monitorMeta - metricTarget.ExtraLabels = d.ExtraLabels - metricTarget.Namespace = namespace // 采集目标的 namespace - metricTarget.DataID = d.DataID().Spec.DataID - metricTarget.DimensionReplace = d.DataID().Spec.DimensionReplace - metricTarget.MetricReplace = d.DataID().Spec.MetricReplace - metricTarget.MetricRelabelConfigs = d.MetricRelabelConfigs - metricTarget.Period = period - metricTarget.Timeout = timeout - metricTarget.ProxyURL = d.ProxyURL - metricTarget.Mask = d.Mask() - metricTarget.TaskType = taskType - metricTarget.RelabelRule = d.RelabelRule - metricTarget.RelabelIndex = d.RelabelIndex - metricTarget.NormalizeMetricName = d.NormalizeMetricName - metricTarget.LabelJoinMatcher = d.LabelJoinMatcher - - return metricTarget, nil -} - -func (d *BaseDiscover) StatefulSetChildConfigs() []*ChildConfig { - 
d.childConfigMut.RLock() - defer d.childConfigMut.RUnlock() - - configs := make([]*ChildConfig, 0) - for _, group := range d.childConfigGroups { - for _, cfg := range group { - if cfg.TaskType == tasks.TaskTypeStatefulSet { - configs = append(configs, cfg) - } - } - } - return configs -} - -func (d *BaseDiscover) DaemonSetChildConfigs() []*ChildConfig { - d.childConfigMut.RLock() - defer d.childConfigMut.RUnlock() - - configs := make([]*ChildConfig, 0) - for _, group := range d.childConfigGroups { - for _, cfg := range group { - if cfg.TaskType == tasks.TaskTypeDaemonSet { - configs = append(configs, cfg) - } - } - } - return configs -} - -func (d *BaseDiscover) Mask() string { - var mask string - conv := func(b bool) string { - if b { - return "1" - } - return "0" - } - - mask += conv(d.System) - return mask -} - -// loopHandleTargetGroup 持续处理来自 k8s 的 targets -func (d *BaseDiscover) loopHandleTargetGroup() { - defer Publish() - - const duration = 10 - const resync = 100 // 避免事件丢失 - - ticker := time.NewTicker(time.Second * duration) - defer ticker.Stop() - - counter := 0 - for { - select { - case <-d.ctx.Done(): - return - - case <-ticker.C: - counter++ - tgList, updatedAt := GetTargetGroups(d.role, d.getNamespaces()) - logger.Debugf("%s updated at: %v", d.Name(), time.Unix(updatedAt, 0)) - if time.Now().Unix()-updatedAt > duration*2 && counter%resync != 0 && d.fetched { - logger.Debugf("%s found nothing changed, skip targetgourps handled", d.Name()) - continue - } - d.fetched = true - - for _, tg := range tgList { - if tg == nil { - continue - } - logger.Debugf("%s get targets source: %s, targets: %+v, labels: %+v", d.Name(), tg.Source, tg.Targets, tg.Labels) - d.handleTargetGroup(tg) - } - } - } -} - -func forwardAddress(addr string) (string, error) { - withSchema := strings.HasPrefix(addr, "https") || strings.HasPrefix(addr, "http") - - u, err := urlx.Parse(addr) - if err != nil { - return "", err - } - - port := u.Port() - if port != "" { - u.Host = "127.0.0.1:" + port - } else { - u.Host = "127.0.0.1" - } - if !withSchema { - u.Scheme = "" - return u.String()[2:], nil - } - - return u.String(), nil -} - -func metaFromSource(s string) (string, string, error) { - parts := strings.Split(s, "/") - if len(parts) != 3 { - return "", "", errors.Errorf("invalid source: %v", s) - } - return parts[1], parts[2], nil -} - -func matchSelector(labels []labels.Label, selector map[string]string) bool { - var count int - for k, v := range selector { - re, err := regexp.Compile(v) - if err != nil { - logger.Errorf("failed to compile expr '%s', err: %v", v, err) - continue - } - for _, lbs := range labels { - if lbs.Name == k { - if !re.MatchString(lbs.Value) { - return false - } - count++ - break - } - } - } - return count == len(selector) -} - -func (d *BaseDiscover) handleTarget(namespace string, tlset, tglbs model.LabelSet) (*ChildConfig, error) { - lbls := labelspool.Get() - defer labelspool.Put(lbls) - - for ln, lv := range tlset { - lbls = append(lbls, labels.Label{ - Name: string(ln), - Value: string(lv), - }) - } - for ln, lv := range tglbs { - if _, ok := tlset[ln]; !ok { - lbls = append(lbls, labels.Label{ - Name: string(ln), - Value: string(lv), - }) - } - } - - // annotations 白名单过滤 - if len(d.MatchSelector) > 0 { - if !matchSelector(lbls, d.MatchSelector) { - logger.Debugf("%s annotation selector not match: %v", d.Name(), d.MatchSelector) - return nil, nil - } - } - - // annotations 黑名单过滤 - if len(d.DropSelector) > 0 { - if matchSelector(lbls, d.DropSelector) { - logger.Debugf("%s 
annotation selector drop: %v", d.Name(), d.DropSelector) - return nil, nil - } - } - - sort.Sort(lbls) - res, orig, err := d.populateLabels(lbls) - if err != nil { - return nil, errors.Wrap(err, "populate labels failed") - } - if len(res) == 0 { - return nil, nil - } - - logger.Debugf("%s populate labels %+v", d.Name(), res) - metricTarget, err := d.makeMetricTarget(res, orig, namespace) - if err != nil { - return nil, errors.Wrap(err, "make metric target failed") - } - - interval, _ := time.ParseDuration(metricTarget.Period) - d.mm.SetMonitorScrapeInterval(interval.Seconds()) - - if d.ForwardLocalhost { - metricTarget.Address, err = forwardAddress(metricTarget.Address) - if err != nil { - return nil, errors.Wrapf(err, "forward address failed, address=%s", metricTarget.Address) - } - } - - metricTarget.DisableCustomTimestamp = d.DisableCustomTimestamp - data, err := metricTarget.YamlBytes() - if err != nil { - return nil, errors.Wrap(err, "marshal target failed") - } - - childConfig := &ChildConfig{ - Node: metricTarget.NodeName, - FileName: metricTarget.FileName(), - Address: metricTarget.Address, - Data: data, - Scheme: metricTarget.Scheme, - Path: metricTarget.Path, - Mask: metricTarget.Mask, - Meta: metricTarget.Meta, - Namespace: metricTarget.Namespace, - TaskType: metricTarget.TaskType, - AntiAffinity: d.AntiAffinity, - } - logger.Debugf("%s create child config: %+v", d.Name(), childConfig) - return childConfig, nil -} - -// handleTargetGroup 遍历自身的所有 target group 计算得到活跃的 target 并删除消失的 target -func (d *BaseDiscover) handleTargetGroup(targetGroup *targetgroup.Group) { - d.mm.IncHandledTgCounter() - - namespace, _, err := metaFromSource(targetGroup.Source) - if err != nil { - logger.Errorf("%s failed to parse source: %v", d.Name(), err) - return - } - - sourceName := targetGroup.Source - childConfigs := make([]*ChildConfig, 0) - - for _, tlset := range targetGroup.Targets { - skipped := d.cache.Check(namespace, tlset, targetGroup.Labels) - if skipped { - d.mm.IncCreatedChildConfigCachedCounter() - continue - } - - childConfig, err := d.handleTarget(namespace, tlset, targetGroup.Labels) - if err != nil { - logger.Errorf("%s handle target failed: %v", d.Name(), err) - d.mm.IncCreatedChildConfigFailedCounter() - continue - } - if childConfig == nil { - d.cache.Set(namespace, tlset, targetGroup.Labels) - continue - } - - d.mm.IncCreatedChildConfigSuccessCounter() - childConfigs = append(childConfigs, childConfig) - } - - d.notify(sourceName, childConfigs) -} - -// notify 判断是否刷新文件配置 需要则要发送通知信号 -func (d *BaseDiscover) notify(source string, childConfigs []*ChildConfig) { - d.childConfigMut.Lock() - defer d.childConfigMut.Unlock() - - if _, ok := d.childConfigGroups[source]; !ok { - d.childConfigGroups[source] = make(map[uint64]*ChildConfig) - } - - added := make(map[uint64]struct{}) - var changed bool - - // 增加新出现的配置 - for _, cfg := range childConfigs { - hash := cfg.Hash() - if _, ok := d.childConfigGroups[source][hash]; !ok { - logger.Infof("%s adds file, node=%s, filename=%s", d.Name(), cfg.Node, cfg.FileName) - d.childConfigGroups[source][hash] = cfg - changed = true - } - added[hash] = struct{}{} - } - - // 删除已经消失的配置 - removed := make([]uint64, 0) - for key := range d.childConfigGroups[source] { - if _, ok := added[key]; !ok { - removed = append(removed, key) - changed = true - } - } - - for _, key := range removed { - cfg := d.childConfigGroups[source][key] - logger.Infof("%s deletes file, node=%s, filename=%s", d.Name(), cfg.Node, cfg.FileName) - delete(d.childConfigGroups[source], key) 
- } - - // 如果文件有变更则发送通知 - if changed { - logger.Infof("%s found targetgroup.source changed", source) - Publish() - } -} - -// populateLabels builds a label set from the given label set and scrape configuration. -// It returns a label set before relabeling was applied as the second return value. -// Returns the original discovered label set found before relabelling was applied if the target is dropped during relabeling. -func (d *BaseDiscover) populateLabels(lset labels.Labels) (res, orig labels.Labels, err error) { - // Copy labels into the labelset for the target if they are not set already. - scrapeLabels := []labels.Label{ - {Name: model.JobLabel, Value: d.Name()}, - {Name: model.MetricsPathLabel, Value: d.Path}, - {Name: model.SchemeLabel, Value: d.Scheme}, - } - lb := labels.NewBuilder(lset) - - for _, l := range scrapeLabels { - if lv := lset.Get(l.Name); lv == "" { - lb.Set(l.Name, l.Value) - } - } - // Encode scrape query parameters as labels. - // for k, v := range d.UrlValues { - // if len(v) > 0 { - // lb.Set(model.ParamLabelPrefix+k, v[0]) - // } - // } - - preRelabelLabels := lb.Labels() - lset = relabel.Process(preRelabelLabels, d.Relabels...) - - // Check if the target was dropped. - if lset == nil { - return nil, preRelabelLabels, nil - } - if v := lset.Get(model.AddressLabel); v == "" { - return nil, nil, errors.New("no address") - } - - lb = labels.NewBuilder(lset) - - // addPort checks whether we should add a default port to the address. - // If the address is not valid, we don't append a port either. - addPort := func(s string) bool { - // If we can split, a port exists and we don't have to add one. - if _, _, err := net.SplitHostPort(s); err == nil { - return false - } - // If adding a port makes it valid, the previous error - // was not due to an invalid address and we can append a port. - _, _, err := net.SplitHostPort(s + ":1234") - return err == nil - } - addr := lset.Get(model.AddressLabel) - // If it's an address with no trailing port, infer it based on the used scheme. - if addPort(addr) { - // Addresses reaching this point are already wrapped in [] if necessary. - switch lset.Get(model.SchemeLabel) { - case "http", "": - addr = addr + ":80" - case "https": - addr = addr + ":443" - default: - return nil, nil, errors.Errorf("invalid scheme: %q", d.Scheme) - } - lb.Set(model.AddressLabel, addr) - } - - if err := config.CheckTargetAddress(model.LabelValue(addr)); err != nil { - return nil, nil, err - } - - // Meta labels are deleted after relabelling. Other internal labels propagate to - // the target which decides whether they will be part of their label set. - for _, l := range lset { - if strings.HasPrefix(l.Name, model.MetaLabelPrefix) { - lb.Del(l.Name) - } - } - - // Default the instance label to the target address. - if v := lset.Get(model.InstanceLabel); v == "" { - lb.Set(model.InstanceLabel, addr) - } - - res = lb.Labels() - for _, l := range res { - // Check label values are valid, drop the target if not. - if !model.LabelValue(l.Value).IsValid() { - return nil, nil, errors.Errorf("invalid label value for %q: %q", l.Name, l.Value) - } - } - return res, preRelabelLabels, nil -} diff --git a/pkg/operator/operator/discover/endpoint.go b/pkg/operator/operator/discover/endpoint.go deleted file mode 100644 index 4df1f4a0d..000000000 --- a/pkg/operator/operator/discover/endpoint.go +++ /dev/null @@ -1,64 +0,0 @@ -// Tencent is pleased to support the open source community by making -// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. 
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. -// You may obtain a copy of the License at http://opensource.org/licenses/MIT -// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -package discover - -import ( - "context" - - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" -) - -const ( - labelPodNodeName = "__meta_kubernetes_pod_node_name" - labelPodAddressTargetKind = "__meta_kubernetes_pod_address_target_kind" - labelPodAddressTargetName = "__meta_kubernetes_pod_address_target_name" -) - -func discoverTypeEndpoints(endpointslice bool) string { - if endpointslice { - return "endpointslice" - } - return "endpoints" -} - -type EndpointParams struct { - *BaseParams -} - -type Endpoint struct { - *BaseDiscover -} - -func NewEndpointDiscover(ctx context.Context, meta define.MonitorMeta, checkFn define.CheckFunc, params *EndpointParams) Discover { - return &Endpoint{ - BaseDiscover: NewBaseDiscover(ctx, discoverTypeEndpoints(params.UseEndpointSlice), meta, checkFn, params.BaseParams), - } -} - -func (d *Endpoint) Type() string { - return discoverTypeEndpoints(d.UseEndpointSlice) -} - -func (d *Endpoint) Reload() error { - d.Stop() - return d.Start() -} - -func (d *Endpoint) Start() error { - d.PreStart() - RegisterSharedDiscover(discoverTypeEndpoints(d.UseEndpointSlice), d.KubeConfig, d.getNamespaces()) - - d.wg.Add(1) - go func() { - defer d.wg.Done() - d.loopHandleTargetGroup() - }() - return nil -} diff --git a/pkg/operator/operator/discover/cache.go b/pkg/operator/operator/discover/hashcache.go similarity index 86% rename from pkg/operator/operator/discover/cache.go rename to pkg/operator/operator/discover/hashcache.go index edc416c32..241d3e6ab 100644 --- a/pkg/operator/operator/discover/cache.go +++ b/pkg/operator/operator/discover/hashcache.go @@ -25,7 +25,7 @@ import ( var seps = []byte{'\xff'} -type Cache struct { +type hashCache struct { name string mut sync.Mutex cache map[uint64]int64 @@ -33,8 +33,8 @@ type Cache struct { done chan struct{} } -func NewCache(name string, expired time.Duration) *Cache { - c := &Cache{ +func newHashCache(name string, expired time.Duration) *hashCache { + c := &hashCache{ name: name, cache: make(map[uint64]int64), expired: expired, @@ -45,11 +45,11 @@ func NewCache(name string, expired time.Duration) *Cache { return c } -func (c *Cache) Clean() { +func (c *hashCache) Clean() { close(c.done) } -func (c *Cache) gc() { +func (c *hashCache) gc() { ticker := time.NewTicker(time.Minute) defer ticker.Stop() @@ -77,7 +77,7 @@ func (c *Cache) gc() { } } -func (c *Cache) Check(namespace string, tlset, tglbs model.LabelSet) bool { +func (c *hashCache) Check(namespace string, tlset, tglbs model.LabelSet) bool { h := c.hash(namespace, tlset, tglbs) c.mut.Lock() @@ -90,7 +90,7 @@ func (c *Cache) Check(namespace string, tlset, tglbs model.LabelSet) bool { return ok } -func (c *Cache) Set(namespace string, tlset, tglbs model.LabelSet) { +func (c *hashCache) Set(namespace string, tlset, tglbs model.LabelSet) { h := c.hash(namespace, tlset, tglbs) c.mut.Lock() @@ -99,7 +99,7 @@ func (c *Cache) Set(namespace string, tlset, tglbs 
model.LabelSet) { c.cache[h] = time.Now().Unix() } -func (c *Cache) hash(namespace string, tlset, tglbs model.LabelSet) uint64 { +func (c *hashCache) hash(namespace string, tlset, tglbs model.LabelSet) uint64 { lbs := labelspool.Get() defer labelspool.Put(lbs) diff --git a/pkg/operator/operator/discover/httpd/http.go b/pkg/operator/operator/discover/httpd/http.go new file mode 100644 index 000000000..bf5f37fd2 --- /dev/null +++ b/pkg/operator/operator/discover/httpd/http.go @@ -0,0 +1,91 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +package httpd + +import ( + "context" + "fmt" + + "github.com/pkg/errors" + promconfig "github.com/prometheus/common/config" + promhttpsd "github.com/prometheus/prometheus/discovery/http" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/logconf" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/shareddiscovery" +) + +const ( + TypeHttpSd = "httpsd" +) + +type Options struct { + *discover.CommonOptions + + SDConfig *promhttpsd.SDConfig + HTTPClientConfig promconfig.HTTPClientConfig +} + +type Discover struct { + *discover.BaseDiscover + + opts *Options +} + +var _ discover.Discover = (*Discover)(nil) + +func New(ctx context.Context, checkFn define.CheckFunc, opts *Options) *Discover { + d := &Discover{ + BaseDiscover: discover.NewBaseDiscover(ctx, checkFn, opts.CommonOptions), + opts: opts, + } + + d.SetUK(fmt.Sprintf("%s:%s", d.Type(), opts.Name)) + d.SetHelper(discover.Helper{ + AccessBasicAuth: d.accessBasicAuth, + }) + return d +} + +func (d *Discover) Type() string { + return TypeHttpSd +} + +func (d *Discover) Reload() error { + d.Stop() + return d.Start() +} + +func (d *Discover) Start() error { + d.PreStart() + + err := shareddiscovery.Register(d.UK(), func() (*shareddiscovery.SharedDiscovery, error) { + discovery, err := promhttpsd.NewDiscovery(d.opts.SDConfig, logconf.New(TypeHttpSd), nil) + if err != nil { + return nil, errors.Wrap(err, d.Type()) + } + return shareddiscovery.New(d.UK(), discovery), nil + }) + if err != nil { + return err + } + + go d.LoopHandle() + return nil +} + +func (d *Discover) accessBasicAuth() (string, string, error) { + auth := d.opts.HTTPClientConfig.BasicAuth + if auth != nil { + return auth.Username, string(auth.Password), nil + } + return "", "", nil +} diff --git a/pkg/operator/operator/discover/kubernetesd/kubernetes.go b/pkg/operator/operator/discover/kubernetesd/kubernetes.go new file mode 100644 index 000000000..362effbf7 --- /dev/null +++ b/pkg/operator/operator/discover/kubernetesd/kubernetes.go @@ -0,0 +1,237 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. 
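// A rough sketch of wiring a Prometheus http_sd configuration into the new
// httpd discover, assuming the Options/CommonOptions fields introduced by this
// patch and that shareddiscovery.Activate has already been called by the
// operator. The URL, name and refresh interval are placeholder values, and the
// dataid/relabel wiring the operator normally fills in is omitted.
package example

import (
	"context"
	"time"

	promconfig "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	promhttpsd "github.com/prometheus/prometheus/discovery/http"

	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define"
	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover"
	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/httpd"
)

func newExampleHttpSdDiscover(ctx context.Context, checkFn define.CheckFunc) (*httpd.Discover, error) {
	sdConfig := &promhttpsd.SDConfig{
		URL:             "http://sd-provider:8080/targets", // placeholder target provider
		RefreshInterval: model.Duration(time.Minute),
	}
	d := httpd.New(ctx, checkFn, &httpd.Options{
		CommonOptions: &discover.CommonOptions{
			Name:   "httpsd:example",
			Path:   "/metrics",
			Scheme: "http",
		},
		SDConfig:         sdConfig,
		HTTPClientConfig: promconfig.HTTPClientConfig{},
	})
	// Start registers a SharedDiscovery keyed by d.UK() and begins consuming
	// target groups in LoopHandle.
	return d, d.Start()
}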
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +package kubernetesd + +import ( + "context" + "encoding/base64" + "fmt" + "strings" + + "github.com/elastic/beats/libbeat/common/transport/tlscommon" + "github.com/pkg/errors" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + promk8ssd "github.com/prometheus/prometheus/discovery/kubernetes" + "github.com/prometheus/prometheus/model/labels" + corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/eplabels" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/k8sutils" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/logconf" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/shareddiscovery" +) + +const ( + TypePod = "pod" +) + +func TypeEndpoints(endpointslice bool) string { + if endpointslice { + return "endpointslice" + } + return "endpoints" +} + +const ( + labelPodNodeName = "__meta_kubernetes_pod_node_name" + labelPodAddressTargetKind = "__meta_kubernetes_pod_address_target_kind" + labelPodAddressTargetName = "__meta_kubernetes_pod_address_target_name" +) + +type Options struct { + *discover.CommonOptions + + KubeConfig string + Namespaces []string + Client kubernetes.Interface + BasicAuth *promv1.BasicAuth + TLSConfig *promv1.TLSConfig + BearerTokenSecret *corev1.SecretKeySelector + UseEndpointSlice bool +} + +type Discover struct { + *discover.BaseDiscover + + ctx context.Context + role string + opts *Options +} + +var _ discover.Discover = (*Discover)(nil) + +func New(ctx context.Context, role string, checkFn define.CheckFunc, opts *Options) *Discover { + d := &Discover{ + ctx: ctx, + role: role, + opts: opts, + BaseDiscover: discover.NewBaseDiscover(ctx, checkFn, opts.CommonOptions), + } + + d.SetUK(fmt.Sprintf("%s:%s", role, strings.Join(d.getNamespaces(), "/"))) + d.SetHelper(discover.Helper{ + AccessBasicAuth: d.accessBasicAuth, + AccessBearerToken: d.accessBearerToken, + AccessTlsConfig: d.accessTLSConfig, + MatchNodeName: d.matchNodeName, + }) + return d +} + +func (d *Discover) Type() string { + return d.role +} + +func (d *Discover) Reload() error { + d.Stop() + return d.Start() +} + +func (d *Discover) Start() error { + d.PreStart() + + err := shareddiscovery.Register(d.UK(), func() (*shareddiscovery.SharedDiscovery, error) { + cfg := promk8ssd.DefaultSDConfig + cfg.Role = promk8ssd.Role(d.role) + cfg.NamespaceDiscovery.Names = d.getNamespaces() + cfg.KubeConfig = d.opts.KubeConfig + + discovery, err := promk8ssd.New(logconf.New(d.Type()), &cfg) + if err != nil { + return nil, errors.Wrap(err, d.Type()) + } + return shareddiscovery.New(d.UK(), discovery), nil + }) + if err != nil { + return err + } + + go d.LoopHandle() + return nil +} + +func (d *Discover) 
getNamespaces() []string { + namespaces := d.opts.Namespaces + if len(namespaces) == 0 { + namespaces = []string{corev1.NamespaceAll} + } + return namespaces +} + +func (d *Discover) matchNodeName(lbs labels.Labels) string { + var target string + var isNodeType bool + + // 这里是通过原始 label 查找固定字段,所以使用的还是 combinedlabels + for _, label := range lbs { + switch label.Name { + // 补充 NodeName + case eplabels.EndpointNodeName(d.opts.UseEndpointSlice), labelPodNodeName: + return label.Value + + // 如果 target 类型是 node,则需要特殊处理,此时 endpointNodeName 对应 label 会为空 + case eplabels.EndpointAddressTargetKind(d.opts.UseEndpointSlice), labelPodAddressTargetKind: + if label.Value == "Node" { + isNodeType = true + } + + case eplabels.EndpointAddressTargetName(d.opts.UseEndpointSlice), labelPodAddressTargetName: + target = label.Value + } + } + + if isNodeType { + return target // 仅当为 nodetype 时返回 target + } + return "" +} + +func (d *Discover) accessBasicAuth() (string, string, error) { + auth := d.opts.BasicAuth + if auth == nil || auth.Username.String() == "" || auth.Password.String() == "" { + return "", "", nil + } + + secretClient := d.opts.Client.CoreV1().Secrets(d.opts.MonitorMeta.Namespace) + username, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, auth.Username) + if err != nil { + return "", "", errors.Wrap(err, "get username from secret failed") + } + password, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, auth.Password) + if err != nil { + return "", "", errors.Wrap(err, "get password from secret failed") + } + + return username, password, nil +} + +func (d *Discover) accessBearerToken() (string, error) { + secret := d.opts.BearerTokenSecret + if secret == nil || secret.Name == "" || secret.Key == "" { + return "", nil + } + + secretClient := d.opts.Client.CoreV1().Secrets(d.opts.MonitorMeta.Namespace) + bearerToken, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *secret) + if err != nil { + return "", errors.Wrap(err, "get bearer token from secret failed") + } + return bearerToken, nil +} + +func (d *Discover) accessTLSConfig() (*tlscommon.Config, error) { + if d.opts.TLSConfig == nil { + return nil, nil + } + + tlsConfig := &tlscommon.Config{} + secretClient := d.opts.Client.CoreV1().Secrets(d.opts.MonitorMeta.Namespace) + if d.opts.TLSConfig.CAFile != "" { + tlsConfig.CAs = []string{d.opts.TLSConfig.CAFile} + } + if d.opts.TLSConfig.CA.Secret != nil { + ca, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.opts.TLSConfig.CA.Secret) + if err != nil { + return nil, errors.Wrap(err, "get TLS CA from secret failed") + } + tlsConfig.CAs = []string{encodeBase64(ca)} + } + + if d.opts.TLSConfig.CertFile != "" { + tlsConfig.Certificate.Certificate = d.opts.TLSConfig.CertFile + } + if d.opts.TLSConfig.Cert.Secret != nil { + cert, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.opts.TLSConfig.Cert.Secret) + if err != nil { + return nil, errors.Wrap(err, "get TLS Cert from secret failed") + } + tlsConfig.Certificate.Certificate = encodeBase64(cert) + } + + if d.opts.TLSConfig.KeyFile != "" { + tlsConfig.Certificate.Key = d.opts.TLSConfig.KeyFile + } + if d.opts.TLSConfig.KeySecret != nil { + key, err := k8sutils.GetSecretDataBySecretKeySelector(d.ctx, secretClient, *d.opts.TLSConfig.KeySecret) + if err != nil { + return nil, errors.Wrap(err, "get TLS Key from secret failed") + } + tlsConfig.Certificate.Key = encodeBase64(key) + } + + return tlsConfig, nil +} + +func encodeBase64(s string) string 
{ + return "base64://" + base64.StdEncoding.EncodeToString([]byte(s)) +} diff --git a/pkg/operator/operator/discover/metrics.go b/pkg/operator/operator/discover/shareddiscovery/metrics.go similarity index 85% rename from pkg/operator/operator/discover/metrics.go rename to pkg/operator/operator/discover/shareddiscovery/metrics.go index f673dfbc7..586b5b8a0 100644 --- a/pkg/operator/operator/discover/metrics.go +++ b/pkg/operator/operator/discover/shareddiscovery/metrics.go @@ -7,7 +7,7 @@ // an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -package discover +package shareddiscovery import ( "github.com/prometheus/client_golang/prometheus" @@ -90,42 +90,42 @@ var ( ) ) -func newMetricMonitor(name string) *metricMonitor { - return &metricMonitor{name: name} +func NewMetricMonitor(name string) *MetricMonitor { + return &MetricMonitor{name: name} } -type metricMonitor struct { +type MetricMonitor struct { name string } -func (m *metricMonitor) IncStartedCounter() { +func (m *MetricMonitor) IncStartedCounter() { discoverStartedTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) IncStoppedCounter() { +func (m *MetricMonitor) IncStoppedCounter() { discoverStoppedTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) IncCreatedChildConfigSuccessCounter() { +func (m *MetricMonitor) IncCreatedChildConfigSuccessCounter() { discoverCreatedChildConfigSuccessTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) IncCreatedChildConfigFailedCounter() { +func (m *MetricMonitor) IncCreatedChildConfigFailedCounter() { discoverCreatedChildConfigFailedTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) IncCreatedChildConfigCachedCounter() { +func (m *MetricMonitor) IncCreatedChildConfigCachedCounter() { discoverCreatedChildConfigCachedTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) IncHandledTgCounter() { +func (m *MetricMonitor) IncHandledTgCounter() { discoverHandledTgTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) IncDeletedTgSourceCounter() { +func (m *MetricMonitor) IncDeletedTgSourceCounter() { discoverDeletedTgSourceTotal.WithLabelValues(m.name).Inc() } -func (m *metricMonitor) SetMonitorScrapeInterval(v float64) { +func (m *MetricMonitor) SetMonitorScrapeInterval(v float64) { monitorScrapeIntervalSeconds.WithLabelValues(m.name).Set(v) } diff --git a/pkg/operator/operator/discover/shared_discovery.go b/pkg/operator/operator/discover/shareddiscovery/shared_discovery.go similarity index 51% rename from pkg/operator/operator/discover/shared_discovery.go rename to pkg/operator/operator/discover/shareddiscovery/shared_discovery.go index c0b4e885a..17a361d91 100644 --- a/pkg/operator/operator/discover/shared_discovery.go +++ b/pkg/operator/operator/discover/shareddiscovery/shared_discovery.go @@ -7,20 +7,17 @@ // an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
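// A comparable sketch for the kubernetesd discover defined above, here for the
// endpoints role. The namespace list is a placeholder and the dataid/relabel
// wiring is again omitted; in practice the operator derives these options from
// ServiceMonitor/PodMonitor objects.
package example

import (
	"context"

	"k8s.io/client-go/kubernetes"

	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define"
	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover"
	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/kubernetesd"
)

func newExampleEndpointsDiscover(ctx context.Context, client kubernetes.Interface, checkFn define.CheckFunc) (*kubernetesd.Discover, error) {
	role := kubernetesd.TypeEndpoints(false) // "endpoints"; pass true for "endpointslice"
	d := kubernetesd.New(ctx, role, checkFn, &kubernetesd.Options{
		CommonOptions: &discover.CommonOptions{
			Name:   "endpoints:example",
			Path:   "/metrics",
			Scheme: "http",
		},
		Client:     client,
		KubeConfig: "",                  // empty falls back to in-cluster configuration
		Namespaces: []string{"default"}, // an empty slice means all namespaces
	})
	// Discovers that share the same role/namespaces key reuse one SharedDiscovery,
	// which keeps the number of API Server watches down.
	return d, d.Start()
}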
-package discover +package shareddiscovery import ( "context" - "fmt" "math" - "strings" + "sort" "sync" "time" - promdiscover "github.com/prometheus/prometheus/discovery/kubernetes" "github.com/prometheus/prometheus/discovery/targetgroup" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/logconf" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" ) @@ -30,30 +27,46 @@ var ( gCancel context.CancelFunc sharedDiscoveryLock sync.Mutex - sharedDiscoveryMap map[string]*sharedDiscovery + sharedDiscoveryMap map[string]*SharedDiscovery ) -// Activate 初始化全局 sharedDiscovery +// Activate 初始化全局 SharedDiscovery func Activate() { gCtx, gCancel = context.WithCancel(context.Background()) - sharedDiscoveryMap = map[string]*sharedDiscovery{} + sharedDiscoveryMap = map[string]*SharedDiscovery{} } -// Deactivate 清理全局 sharedDiscovery +// Deactivate 清理全局 SharedDiscovery func Deactivate() { gCancel() gWg.Wait() } -type sharedDiscovery struct { - id string - namespaces []string - ctx context.Context - discovery *promdiscover.Discovery - ch chan []*targetgroup.Group - mut sync.RWMutex - store map[string]*tgWithTime - mm *metricMonitor +// AllDiscovery 返回全局注册的 shared discovery 名称 +func AllDiscovery() []string { + sharedDiscoveryLock.Lock() + defer sharedDiscoveryLock.Unlock() + + names := make([]string, 0, len(sharedDiscoveryMap)) + for k := range sharedDiscoveryMap { + names = append(names, k) + } + sort.Strings(names) + return names +} + +type Discovery interface { + Run(ctx context.Context, ch chan<- []*targetgroup.Group) +} + +type SharedDiscovery struct { + uk string + ctx context.Context + discovery Discovery + ch chan []*targetgroup.Group + mut sync.RWMutex + store map[string]*tgWithTime + mm *MetricMonitor } type tgWithTime struct { @@ -61,23 +74,61 @@ type tgWithTime struct { updatedAt int64 } -func newSharedDiscovery(ctx context.Context, id string, namespaces []string, discovery *promdiscover.Discovery) *sharedDiscovery { - return &sharedDiscovery{ - ctx: ctx, - id: id, - namespaces: namespaces, - discovery: discovery, - ch: make(chan []*targetgroup.Group), - store: map[string]*tgWithTime{}, - mm: newMetricMonitor(id), +// FetchTargetGroups 获取缓存 targetgroups 以及最新更新时间 +func FetchTargetGroups(uk string) ([]*targetgroup.Group, int64) { + sharedDiscoveryLock.Lock() + defer sharedDiscoveryLock.Unlock() + + if d, ok := sharedDiscoveryMap[uk]; ok { + return d.fetch() } + + return nil, 0 } -func (sd *sharedDiscovery) watch() { +// Register 注册 shared discovery +// 共享 Discovery 实例可以减少获取 tgs 通信压力 减少内存使用 +func Register(uk string, createFunc func() (*SharedDiscovery, error)) error { + sharedDiscoveryLock.Lock() + defer sharedDiscoveryLock.Unlock() + + if _, ok := sharedDiscoveryMap[uk]; !ok { + sd, err := createFunc() + if err != nil { + logger.Errorf("failed to create shared discovery(%s): %v", uk, err) + return err + } + gWg.Add(2) + go func() { + defer gWg.Done() + sd.watch() + }() + go func() { + defer gWg.Done() + sd.start() + }() + sharedDiscoveryMap[uk] = sd + } + + return nil +} + +func New(uk string, discovery Discovery) *SharedDiscovery { + return &SharedDiscovery{ + ctx: gCtx, // 生命周期由全局管理 + uk: uk, + discovery: discovery, + ch: make(chan []*targetgroup.Group), + store: map[string]*tgWithTime{}, + mm: NewMetricMonitor(uk), + } +} + +func (sd *SharedDiscovery) watch() { sd.discovery.Run(sd.ctx, sd.ch) } -func (sd *sharedDiscovery) start() { +func (sd *SharedDiscovery) start() { ticker := time.NewTicker(time.Minute) defer ticker.Stop() @@ -95,7 +146,7 @@ func (sd 
*sharedDiscovery) start() { if !ok { // 第一次记录且没有 targets 则跳过 if tg == nil || len(tg.Targets) == 0 { - logger.Infof("sharedDiscovery %s skip tg source '%s'", sd.id, tg.Source) + logger.Infof("sharedDiscovery %s skip tg source '%s'", sd.uk, tg.Source) continue } } @@ -112,7 +163,7 @@ func (sd *sharedDiscovery) start() { if tg.tg == nil || len(tg.tg.Targets) == 0 { delete(sd.store, source) sd.mm.IncDeletedTgSourceCounter() - logger.Infof("sharedDiscovery %s delete tg source '%s'", sd.id, source) + logger.Infof("sharedDiscovery %s delete tg source '%s'", sd.uk, source) } } } @@ -121,7 +172,7 @@ func (sd *sharedDiscovery) start() { } } -func (sd *sharedDiscovery) fetch() ([]*targetgroup.Group, int64) { +func (sd *SharedDiscovery) fetch() ([]*targetgroup.Group, int64) { sd.mut.RLock() defer sd.mut.RUnlock() @@ -135,82 +186,3 @@ func (sd *sharedDiscovery) fetch() ([]*targetgroup.Group, int64) { } return ret, maxTs } - -type SharedDiscoveryInfo struct { - Role string `json:"role"` - Namespaces []string `json:"namespaces"` -} - -func (si SharedDiscoveryInfo) ID() string { - return fmt.Sprintf("%s/%s", si.Role, strings.Join(si.Namespaces, "/")) -} - -func GetActiveSharedDiscovery() []SharedDiscoveryInfo { - sharedDiscoveryLock.Lock() - defer sharedDiscoveryLock.Unlock() - - info := make([]SharedDiscoveryInfo, 0) - for k := range sharedDiscoveryMap { - parts := strings.Split(k, "/") - info = append(info, SharedDiscoveryInfo{ - Role: parts[0], - Namespaces: parts[1:], - }) - } - return info -} - -func GetSharedDiscoveryCount() int { - sharedDiscoveryLock.Lock() - defer sharedDiscoveryLock.Unlock() - - return len(sharedDiscoveryMap) -} - -func getUniqueKey(role string, namespaces []string) string { - return fmt.Sprintf("%s/%s", role, strings.Join(namespaces, "/")) -} - -// RegisterSharedDiscover 注册 sharedDiscovery -// 共享 Discovery 实例可以减少 API Server 请求压力 同时也可以减少进程内存开销 -func RegisterSharedDiscover(role, kubeConfig string, namespaces []string) { - sharedDiscoveryLock.Lock() - defer sharedDiscoveryLock.Unlock() - - uniqueKey := getUniqueKey(role, namespaces) - if _, ok := sharedDiscoveryMap[uniqueKey]; !ok { - cfg := promdiscover.DefaultSDConfig - cfg.Role = promdiscover.Role(role) - cfg.NamespaceDiscovery.Names = namespaces - cfg.KubeConfig = kubeConfig - - discovery, err := promdiscover.New(new(logconf.Logger), &cfg) - if err != nil { - logger.Errorf("failed to create promdiscover: %v", err) - return - } - sd := newSharedDiscovery(gCtx, uniqueKey, namespaces, discovery) - gWg.Add(2) - go func() { - defer gWg.Done() - sd.watch() - }() - go func() { - defer gWg.Done() - sd.start() - }() - sharedDiscoveryMap[uniqueKey] = sd - } -} - -func GetTargetGroups(role string, namespaces []string) ([]*targetgroup.Group, int64) { - sharedDiscoveryLock.Lock() - defer sharedDiscoveryLock.Unlock() - - uniqueKey := getUniqueKey(role, namespaces) - if d, ok := sharedDiscoveryMap[uniqueKey]; ok { - return d.fetch() - } - - return nil, 0 -} diff --git a/pkg/operator/operator/hook.go b/pkg/operator/operator/hook.go index 58b189f90..fd23661b1 100644 --- a/pkg/operator/operator/hook.go +++ b/pkg/operator/operator/hook.go @@ -32,10 +32,8 @@ const ( confMonitorNamespacePath = "operator.monitor_namespace" confDenyTargetNamespacesPath = "operator.deny_target_namespaces" confTargetNamespacesPath = "operator.target_namespaces" - confTargetLabelSelectorPath = "operator.target_label_selector" confEnableServiceMonitorPath = "operator.enable_service_monitor" confEnablePodMonitorPath = "operator.enable_pod_monitor" - confEnableProbePath = 
"operator.enable_probe" // TODO(mando): 待支持 confEnablePromRulePath = "operator.enable_prometheus_rule" confEnableStatefulSetWorkerPath = "operator.enable_statefulset_worker" confEnableDaemonSetWorkerPath = "operator.enable_daemonset_worker" @@ -53,6 +51,9 @@ const ( confStatefulSetWorkerRegexPath = "operator.statefulset_worker_regex" confMonitorBlacklistMatchRulesPath = "operator.monitor_blacklist_match_rules" confHttpPortPath = "operator.http.port" + confPromSdConfigsPath = "operator.prom_sd_configs" + + // confEnableProbePath = "operator.enable_probe" // TODO(mando): 待支持 ) const ( @@ -70,6 +71,13 @@ type StatefulSetMatchRule struct { Namespace string `mapstructure:"namespace"` } +// PromSDConfig prometheus 提供的 sdconfigs +// 需要同时指定 namespace 以及 name +type PromSDConfig struct { + Namespace string `mapstructure:"namespace"` + Name string `mapstructure:"name"` +} + // MonitorBlacklistMatchRule monitor 黑名单匹配规则 // 在 monitor namespace 黑名单机制外再提供一种 name 级别的屏蔽机制 // 要求 kind/name/namespace 三者同时不为空 且此配置项优先级最高 @@ -84,33 +92,86 @@ func (r MonitorBlacklistMatchRule) Validate() bool { } var ( - ConfDryRun bool - ConfKubeConfig string // operator 连接 k8s 使用的 kubeconfig 文件路径 - ConfMonitorNamespace string // operator 所处 namespace - ConfTargetNamespaces []string - ConfDenyTargetNamespaces []string - ConfTargetLabelsSelector string - ConfAPIServerHost string - ConfTLSConfig *rest.TLSClientConfig - ConfEnableServiceMonitor bool - ConfEnablePodMonitor bool - ConfEnablePromRule bool - ConfEnableStatefulSetWorker bool - ConfEnableDaemonSetWorker bool - ConfEnableEndpointslice bool - ConfKubeletNamespace string - ConfKubeletName string - ConfKubeletEnable bool - ConfMaxNodeSecretRatio float64 - ConfStatefulSetWorkerHpa bool - ConfStatefulSetWorkerFactor int - ConfStatefulSetReplicas int - ConfStatefulSetMaxReplicas int - ConfStatefulSetMatchRules []StatefulSetMatchRule - ConfStatefulSetDispatchType string - ConfStatefulSetWorkerRegex string + // ConfDryRun 是否使用 dryrun 模式 该模式只匹配 不执行真实的调度逻辑 + ConfDryRun bool + + // ConfKubeConfig 连接 kubernetes 使用的 kubeconfig 文件路径 + ConfKubeConfig string + + // ConfAPIServerHost 链接 kubernetes 使用的 API host + ConfAPIServerHost string + + // ConfTLSConfig 链接 kubernetes 的 TLS 配置 + ConfTLSConfig *rest.TLSClientConfig + + // ConfMonitorNamespace 程序运行所处 namespace + ConfMonitorNamespace string + + // ConfTargetNamespaces namespace 匹配白名单 + ConfTargetNamespaces []string + + // ConfDenyTargetNamespaces namespace 匹配黑名单 + ConfDenyTargetNamespaces []string + + // ConfEnableServiceMonitor 是否启用 servicemonitor + ConfEnableServiceMonitor bool + + // ConfEnablePodMonitor 是否启用 podmonitor + ConfEnablePodMonitor bool + + // ConfEnablePromRule 是否启用 promrules 自监控专用 + ConfEnablePromRule bool + + // ConfEnableStatefulSetWorker 是否启用 statefulset worker 调度 + ConfEnableStatefulSetWorker bool + + // ConfEnableDaemonSetWorker 是否启用 daemonset worker 调度 + ConfEnableDaemonSetWorker bool + + // ConfEnableEndpointslice 是否启用 endpointslice 特性(kubernetes 版本要求 >= 1.22 + ConfEnableEndpointslice bool + + // ConfKubeletNamespace kubelet 组件所在 namespace + ConfKubeletNamespace string + + // ConfKubeletName kubelet 组件 endpoints 名称 + ConfKubeletName string + + // ConfKubeletEnable 是否启用 kubelet 特性 + ConfKubeletEnable bool + + // ConfMaxNodeSecretRatio 最大支持的 secrets 数量 maxSecrets = node x ratio + ConfMaxNodeSecretRatio float64 + + // ConfStatefulSetWorkerHpa 是否开启 statefulset worker HPA 特性 + ConfStatefulSetWorkerHpa bool + + // ConfStatefulSetWorkerFactor statefulset worker 调度因子 即单 worker 最多支持的 secrets 数量 + ConfStatefulSetWorkerFactor 
int + + // ConfStatefulSetReplicas statefulset worker 最小副本数 + ConfStatefulSetReplicas int + + // ConfStatefulSetMaxReplicas statefulset worker 最大副本数 + ConfStatefulSetMaxReplicas int + + // ConfStatefulSetMatchRules statefulset worker 匹配规则 + ConfStatefulSetMatchRules []StatefulSetMatchRule + + // ConfStatefulSetDispatchType statefulset worker 调度算法 + ConfStatefulSetDispatchType string + + // ConfStatefulSetWorkerRegex statefulset worker 名称匹配规则 用于锁定具体 worker 索引 + ConfStatefulSetWorkerRegex string + + // ConfMonitorBlacklistMatchRules monitor 黑名单匹配规则 ConfMonitorBlacklistMatchRules []MonitorBlacklistMatchRule - ConfHttpPort int + + // ConfHttpPort http 服务监听端口 + ConfHttpPort int + + // ConfPromSdConfigs promethues sdconfigs secrets 资源 + ConfPromSdConfigs []PromSDConfig ) // IfRejectServiceMonitor 判断是否拒绝 serviceMonitor @@ -174,7 +235,6 @@ func updateConfig() { ConfMonitorNamespace = viper.GetString(confMonitorNamespacePath) ConfDenyTargetNamespaces = viper.GetStringSlice(confDenyTargetNamespacesPath) ConfTargetNamespaces = viper.GetStringSlice(confTargetNamespacesPath) - ConfTargetLabelsSelector = viper.GetString(confTargetLabelSelectorPath) ConfEnableServiceMonitor = viper.GetBool(confEnableServiceMonitorPath) ConfEnablePodMonitor = viper.GetBool(confEnablePodMonitorPath) ConfEnablePromRule = viper.GetBool(confEnablePromRulePath) @@ -202,20 +262,23 @@ func updateConfig() { target.ConfServicePort = ConfHttpPort // reload 时状态需要置空 + ConfStatefulSetMatchRules = []StatefulSetMatchRule{} if viper.IsSet(confStatefulSetMatchRulesPath) { if err := viper.UnmarshalKey(confStatefulSetMatchRulesPath, &ConfStatefulSetMatchRules); err != nil { logger.Errorf("failed to unmarshal ConfStatefulSetMatchRules, err: %v", err) } - } else { - ConfStatefulSetMatchRules = []StatefulSetMatchRule{} } - + ConfMonitorBlacklistMatchRules = []MonitorBlacklistMatchRule{} if viper.IsSet(confMonitorBlacklistMatchRulesPath) { if err := viper.UnmarshalKey(confMonitorBlacklistMatchRulesPath, &ConfMonitorBlacklistMatchRules); err != nil { logger.Errorf("failed to unmarshal ConfMonitorBlacklistMatchRules, err: %v", err) } - } else { - ConfMonitorBlacklistMatchRules = []MonitorBlacklistMatchRule{} + } + ConfPromSdConfigs = []PromSDConfig{} + if viper.IsSet(confPromSdConfigsPath) { + if err := viper.UnmarshalKey(confPromSdConfigsPath, &ConfPromSdConfigs); err != nil { + logger.Errorf("failed to unmarshal ConfPromSdConfigs, err: %v", err) + } } } diff --git a/pkg/operator/operator/operator_test.go b/pkg/operator/operator/kubelet_test.go similarity index 100% rename from pkg/operator/operator/operator_test.go rename to pkg/operator/operator/kubelet_test.go diff --git a/pkg/operator/operator/objectsref/controller.go b/pkg/operator/operator/objectsref/controller.go index 34d6759c6..4ea5bc05f 100644 --- a/pkg/operator/operator/objectsref/controller.go +++ b/pkg/operator/operator/objectsref/controller.go @@ -45,8 +45,10 @@ type Object struct { OwnerRefs []OwnerRef // Pod 属性 - NodeName string - PodIP string + NodeName string + PodIP string + + // Metadata 属性 Labels map[string]string Annotations map[string]string @@ -198,7 +200,6 @@ type ObjectsController struct { cancel context.CancelFunc client kubernetes.Interface - mm *metricMonitor podObjs *Objects replicaSetObjs *Objects @@ -289,7 +290,6 @@ func NewController(ctx context.Context, client kubernetes.Interface, tkexClient controller.gameStatefulSetObjs = tkexObjs.gamestatefulset controller.gameDeploymentsObjs = tkexObjs.gamedeployment - controller.mm = newMetricMonitor() go 
controller.recordMetrics() return controller, nil @@ -321,14 +321,16 @@ func (oc *ObjectsController) recordMetrics() { return case <-ticker.C: - for ns, count := range oc.podObjs.Counter() { - oc.mm.SetWorkloadCount(count, ns, kindPod) + stats := make(map[string]int) + for _, count := range oc.podObjs.Counter() { + stats[kindPod] += count } for kind, objs := range oc.objsMap() { - for ns, count := range objs.Counter() { - oc.mm.SetWorkloadCount(count, ns, kind) + for _, count := range objs.Counter() { + stats[kind] += count } } + setWorkloadCount(stats) } } } diff --git a/pkg/operator/operator/objectsref/metrics.go b/pkg/operator/operator/objectsref/metrics.go index 32d07a197..452d2b50f 100644 --- a/pkg/operator/operator/objectsref/metrics.go +++ b/pkg/operator/operator/objectsref/metrics.go @@ -11,58 +11,44 @@ package objectsref import ( "sync" - "time" + "sync/atomic" ) -type namespaceKind struct { - namespace string - kind string -} - var ( - nsUpdated time.Time - nkWorkloadMut sync.Mutex - nkWorkload = map[namespaceKind]int{} + workloadMapMut sync.Mutex + workloadMap map[string]int ) -func GetWorkloadInfo() (map[string]int, time.Time) { - ret := make(map[string]int) - nkWorkloadMut.Lock() - for k, v := range nkWorkload { - ret[k.kind] += v +func GetWorkloadCount() map[string]int { + workloadMapMut.Lock() + defer workloadMapMut.Unlock() + + counts := make(map[string]int) + for k, v := range workloadMap { + counts[k] = v } - nkWorkloadMut.Unlock() - return ret, nsUpdated + return counts } -type metricMonitor struct{} - -func newMetricMonitor() *metricMonitor { - return &metricMonitor{} -} +func setWorkloadCount(counts map[string]int) { + workloadMapMut.Lock() + defer workloadMapMut.Unlock() -func (mm *metricMonitor) SetWorkloadCount(v int, namespace, kind string) { - nkWorkloadMut.Lock() - nsUpdated = time.Now() - nkWorkload[namespaceKind{namespace: namespace, kind: kind}] = v - nkWorkloadMut.Unlock() + workloadMap = counts } var ( - clusterNode int - clusterNodeUpdatedAt time.Time + clusterNode atomic.Int64 ) -func GetClusterNodeInfo() (int, time.Time) { - return clusterNode, clusterNodeUpdatedAt +func GetClusterNodeCount() int { + return int(clusterNode.Load()) } func incClusterNodeCount() { - clusterNode++ - clusterNodeUpdatedAt = time.Now() + clusterNode.Add(1) } func decClusterNodeCount() { - clusterNode-- - clusterNodeUpdatedAt = time.Now() + clusterNode.Add(-1) } diff --git a/pkg/operator/operator/operator.go b/pkg/operator/operator/operator.go index 6596c89b3..60118d5c3 100644 --- a/pkg/operator/operator/operator.go +++ b/pkg/operator/operator/operator.go @@ -22,19 +22,16 @@ import ( promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promversioned "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" prominformers "github.com/prometheus-operator/prometheus-operator/pkg/informers" - "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" bkversioned "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/client/clientset/versioned" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/k8sutils" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/tasks" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/dataidwatcher" 
"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/shareddiscovery" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/objectsref" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/promsli" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" @@ -43,6 +40,7 @@ import ( const ( monitorKindServiceMonitor = "ServiceMonitor" monitorKindPodMonitor = "PodMonitor" + monitorKindHttpSd = "HttpSd" ) var ( @@ -86,6 +84,8 @@ type Operator struct { statefulSetTaskCache map[int]map[string]struct{} eventTaskCache string scrapeUpdated time.Time + + promSdConfigsBytes map[string][]byte // 无并发读写 } func NewOperator(ctx context.Context, buildInfo BuildInfo) (*Operator, error) { @@ -161,9 +161,7 @@ func NewOperator(ctx context.Context, buildInfo BuildInfo) (*Operator, error) { denyTargetNamespaces, operator.promclient, define.ReSyncPeriod, - func(options *metav1.ListOptions) { - options.LabelSelector = ConfTargetLabelsSelector - }, + nil, ), promv1.SchemeGroupVersion.WithResource(promv1.ServiceMonitorName), ) @@ -179,9 +177,7 @@ func NewOperator(ctx context.Context, buildInfo BuildInfo) (*Operator, error) { denyTargetNamespaces, operator.promclient, define.ReSyncPeriod, - func(options *metav1.ListOptions) { - options.LabelSelector = ConfTargetLabelsSelector - }, + nil, ), promv1.SchemeGroupVersion.WithResource(promv1.PodMonitorName), ) @@ -277,7 +273,7 @@ func (c *Operator) recordMetrics() { } func (c *Operator) updateSharedDiscoveryMetrics() { - c.mm.SetSharedDiscoveryCount(discover.GetSharedDiscoveryCount()) + c.mm.SetSharedDiscoveryCount(len(shareddiscovery.AllDiscovery())) c.mm.SetDiscoverCount(len(c.getAllDiscover())) } @@ -301,19 +297,19 @@ func (c *Operator) updateMonitorEndpointMetrics() { } func (c *Operator) updateWorkloadMetrics() { - workloads, _ := objectsref.GetWorkloadInfo() + workloads := objectsref.GetWorkloadCount() for resource, count := range workloads { c.mm.SetWorkloadCount(resource, count) } } func (c *Operator) updateNodeMetrics() { - nodes, _ := objectsref.GetClusterNodeInfo() + nodes := objectsref.GetClusterNodeCount() c.mm.SetNodeCount(nodes) } func (c *Operator) Run() error { - discover.Activate() + shareddiscovery.Activate() errChan := make(chan error, 2) c.wg.Add(1) go func() { @@ -402,34 +398,11 @@ func (c *Operator) Run() error { go c.reconcileNodeEndpoints(c.ctx) } + go c.loopHandlePromSdConfigs() c.cleanupInvalidSecrets() return nil } -func (c *Operator) cleanupInvalidSecrets() { - secretClient := c.client.CoreV1().Secrets(ConfMonitorNamespace) - secrets, err := secretClient.List(c.ctx, metav1.ListOptions{ - LabelSelector: "createdBy=bkmonitor-operator", - }) - if err != nil { - logger.Errorf("failed to list secrets, err: %v", err) - return - } - - // 清理不合法的 secrets - for _, secret := range secrets.Items { - if _, ok := secret.Labels[tasks.LabelTaskType]; !ok { - if err := secretClient.Delete(c.ctx, secret.Name, metav1.DeleteOptions{}); err != nil { - c.mm.IncHandledSecretFailedCounter(secret.Name, define.ActionDelete) - logger.Errorf("failed to delete secret %s, err: %v", secret.Name, err) - continue - } - c.mm.IncHandledSecretSuccessCounter(secret.Name, define.ActionDelete) - logger.Infof("remove invalid secret %s", secret.Name) - } - } -} - func (c *Operator) Stop() { c.cancel() if err := c.srv.Shutdown(context.Background()); err != nil { @@ -439,7 +412,7 @@ func (c *Operator) Stop() { c.dw.Stop() 
c.objectsController.Stop() - discover.Deactivate() + shareddiscovery.Deactivate() } // waitForCacheSync waits for the informers' caches to be synced. @@ -501,20 +474,6 @@ func (c *Operator) deleteDiscoverByName(name string) { } } -func (c *Operator) getServiceMonitorDiscoversName(serviceMonitor *promv1.ServiceMonitor) []string { - var names []string - for index := range serviceMonitor.Spec.Endpoints { - monitorMeta := define.MonitorMeta{ - Name: serviceMonitor.Name, - Kind: monitorKindServiceMonitor, - Namespace: serviceMonitor.Namespace, - Index: index, - } - names = append(names, monitorMeta.ID()) - } - return names -} - func ifHonorTimestamps(b *bool) bool { if b == nil { return true @@ -522,407 +481,6 @@ func ifHonorTimestamps(b *bool) bool { return *b } -func (c *Operator) createServiceMonitorDiscovers(serviceMonitor *promv1.ServiceMonitor) []discover.Discover { - var ( - namespaces []string - discovers []discover.Discover - ) - - systemResource := feature.IfSystemResource(serviceMonitor.Annotations) - meta := define.MonitorMeta{ - Name: serviceMonitor.Name, - Kind: monitorKindServiceMonitor, - Namespace: serviceMonitor.Namespace, - } - dataID, err := c.dw.MatchMetricDataID(meta, systemResource) - if err != nil { - logger.Errorf("meta=%v found no dataid", meta) - return discovers - } - specLabels := dataID.Spec.Labels - - if serviceMonitor.Spec.NamespaceSelector.Any { - namespaces = []string{} - } else if len(serviceMonitor.Spec.NamespaceSelector.MatchNames) == 0 { - namespaces = []string{serviceMonitor.Namespace} - } else { - namespaces = serviceMonitor.Spec.NamespaceSelector.MatchNames - } - - logger.Infof("get serviceMonitor, name=%s, namespace=%s", serviceMonitor.Name, serviceMonitor.Namespace) - for index, endpoint := range serviceMonitor.Spec.Endpoints { - if endpoint.Path == "" { - endpoint.Path = "/metrics" - } - if endpoint.Scheme == "" { - endpoint.Scheme = "http" - } - - relabels := getServiceMonitorRelabels(serviceMonitor, &endpoint) - resultLabels, err := convertYamlRelabels(relabels) - if err != nil { - logger.Errorf("failed to convert relabels, err: %s", err) - continue - } - - metricRelabelings := make([]yaml.MapSlice, 0) - if len(endpoint.MetricRelabelConfigs) != 0 { - for _, cfg := range endpoint.MetricRelabelConfigs { - relabeling := generateRelabelConfig(cfg) - metricRelabelings = append(metricRelabelings, relabeling) - } - } - logger.Debugf("serviceMonitor %s get relabels config: %+v", serviceMonitor.Name, relabels) - - monitorMeta := meta - monitorMeta.Index = index - - var proxyURL string - if endpoint.ProxyURL != nil { - proxyURL = *endpoint.ProxyURL - } - - endpointDiscover := discover.NewEndpointDiscover(c.ctx, monitorMeta, c.objectsController.NodeNameExists, &discover.EndpointParams{ - BaseParams: &discover.BaseParams{ - Client: c.client, - RelabelRule: feature.RelabelRule(serviceMonitor.Annotations), - RelabelIndex: feature.RelabelIndex(serviceMonitor.Annotations), - NormalizeMetricName: feature.IfNormalizeMetricName(serviceMonitor.Annotations), - AntiAffinity: feature.IfAntiAffinity(serviceMonitor.Annotations), - MatchSelector: feature.MonitorMatchSelector(serviceMonitor.Annotations), - DropSelector: feature.MonitorDropSelector(serviceMonitor.Annotations), - LabelJoinMatcher: feature.LabelJoinMatcher(serviceMonitor.Annotations), - UseEndpointSlice: useEndpointslice, - Name: monitorMeta.ID(), - DataID: dataID, - KubeConfig: ConfKubeConfig, - Namespaces: namespaces, - Relabels: resultLabels, - Path: endpoint.Path, - Scheme: endpoint.Scheme, - TLSConfig: 
endpoint.TLSConfig.DeepCopy(), - BasicAuth: endpoint.BasicAuth.DeepCopy(), - BearerTokenFile: endpoint.BearerTokenFile, - BearerTokenSecret: endpoint.BearerTokenSecret.DeepCopy(), - Period: string(endpoint.Interval), - ProxyURL: proxyURL, - Timeout: string(endpoint.ScrapeTimeout), - ExtraLabels: specLabels, - ForwardLocalhost: feature.IfForwardLocalhost(serviceMonitor.Annotations), - DisableCustomTimestamp: !ifHonorTimestamps(endpoint.HonorTimestamps), - System: systemResource, - UrlValues: endpoint.Params, - MetricRelabelConfigs: metricRelabelings, - }, - }) - - logger.Infof("get new endpoint discover %s", endpointDiscover) - discovers = append(discovers, endpointDiscover) - } - return discovers -} - -func (c *Operator) handleServiceMonitorAdd(obj interface{}) { - serviceMonitor, ok := obj.(*promv1.ServiceMonitor) - if !ok { - logger.Errorf("expected ServiceMonitor type, got %T", obj) - return - } - - if ConfEnablePromRule { - c.promsliController.UpdateServiceMonitor(serviceMonitor) - } - - // 新增的 servicemonitor 命中黑名单则流程终止 - if IfRejectServiceMonitor(serviceMonitor) { - logger.Infof("add action match the blacklist rules, serviceMonitor=%+v", serviceMonitor) - return - } - - discovers := c.createServiceMonitorDiscovers(serviceMonitor) - for _, dis := range discovers { - if err := c.addOrUpdateDiscover(dis); err != nil { - logger.Errorf("add or update serviceMonitor discover %s failed, err: %s", dis, err) - } - } -} - -func (c *Operator) handleServiceMonitorUpdate(oldObj interface{}, newObj interface{}) { - old, ok := oldObj.(*promv1.ServiceMonitor) - if !ok { - logger.Errorf("expected ServiceMonitor type, got %T", oldObj) - return - } - cur, ok := newObj.(*promv1.ServiceMonitor) - if !ok { - logger.Errorf("expected ServiceMonitor type, got %T", newObj) - return - } - - if ConfEnablePromRule { - c.promsliController.UpdateServiceMonitor(cur) - } - - if old.ResourceVersion == cur.ResourceVersion { - logger.Debugf("serviceMonitor '%s/%s' does not change", old.Namespace, old.Name) - return - } - - // 对于更新的 servicemonitor 如果新的 spec 命中黑名单 则需要将原有的 servicemonitor 移除 - if IfRejectServiceMonitor(cur) { - logger.Infof("update action match the blacklist rules, serviceMonitor=%+v", cur) - for _, name := range c.getServiceMonitorDiscoversName(cur) { - c.deleteDiscoverByName(name) - } - return - } - - for _, name := range c.getServiceMonitorDiscoversName(old) { - c.deleteDiscoverByName(name) - } - for _, dis := range c.createServiceMonitorDiscovers(cur) { - if err := c.addOrUpdateDiscover(dis); err != nil { - logger.Errorf("add or update serviceMonitor discover %s failed, err: %s", dis, err) - } - } -} - -func (c *Operator) handleServiceMonitorDelete(obj interface{}) { - serviceMonitor, ok := obj.(*promv1.ServiceMonitor) - if !ok { - logger.Errorf("expected ServiceMonitor type, got %T", obj) - return - } - - if ConfEnablePromRule { - c.promsliController.DeleteServiceMonitor(serviceMonitor) - } - - for _, name := range c.getServiceMonitorDiscoversName(serviceMonitor) { - c.deleteDiscoverByName(name) - } -} - -func (c *Operator) getPodMonitorDiscoversName(podMonitor *promv1.PodMonitor) []string { - var names []string - for index := range podMonitor.Spec.PodMetricsEndpoints { - monitorMeta := define.MonitorMeta{ - Name: podMonitor.Name, - Kind: monitorKindPodMonitor, - Namespace: podMonitor.Namespace, - Index: index, - } - names = append(names, monitorMeta.ID()) - } - return names -} - -func (c *Operator) createPodMonitorDiscovers(podMonitor *promv1.PodMonitor) []discover.Discover { - var ( - namespaces 
[]string - discovers []discover.Discover - ) - - systemResource := feature.IfSystemResource(podMonitor.Annotations) - meta := define.MonitorMeta{ - Name: podMonitor.Name, - Kind: monitorKindPodMonitor, - Namespace: podMonitor.Namespace, - } - dataID, err := c.dw.MatchMetricDataID(meta, systemResource) - if err != nil { - logger.Errorf("meta=%v no dataid found", meta) - return discovers - } - specLabels := dataID.Spec.Labels - - if podMonitor.Spec.NamespaceSelector.Any { - namespaces = []string{} - } else if len(podMonitor.Spec.NamespaceSelector.MatchNames) == 0 { - namespaces = []string{podMonitor.Namespace} - } else { - namespaces = podMonitor.Spec.NamespaceSelector.MatchNames - } - - logger.Infof("get podMonitor, name=%s, namespace=%s", podMonitor.Name, podMonitor.Namespace) - for index, endpoint := range podMonitor.Spec.PodMetricsEndpoints { - if endpoint.Path == "" { - endpoint.Path = "/metrics" - } - if endpoint.Scheme == "" { - endpoint.Scheme = "http" - } - - relabels := getPodMonitorRelabels(podMonitor, &endpoint) - resultLabels, err := convertYamlRelabels(relabels) - if err != nil { - logger.Errorf("failed to convert relabels, err: %s", err) - continue - } - - metricRelabelings := make([]yaml.MapSlice, 0) - if len(endpoint.MetricRelabelConfigs) != 0 { - for _, cfg := range endpoint.MetricRelabelConfigs { - relabeling := generateRelabelConfig(cfg) - metricRelabelings = append(metricRelabelings, relabeling) - } - } - - logger.Debugf("podMonitor %s get relabels: %v", podMonitor.Name, relabels) - - monitorMeta := meta - monitorMeta.Index = index - - var proxyURL string - if endpoint.ProxyURL != nil { - proxyURL = *endpoint.ProxyURL - } - - var safeTlsConfig promv1.SafeTLSConfig - tlsConfig := endpoint.TLSConfig.DeepCopy() - if tlsConfig != nil { - safeTlsConfig = tlsConfig.SafeTLSConfig - } - - podDiscover := discover.NewPodDiscover(c.ctx, monitorMeta, c.objectsController.NodeNameExists, &discover.PodParams{ - BaseParams: &discover.BaseParams{ - Client: c.client, - RelabelRule: feature.RelabelRule(podMonitor.Annotations), - RelabelIndex: feature.RelabelIndex(podMonitor.Annotations), - NormalizeMetricName: feature.IfNormalizeMetricName(podMonitor.Annotations), - AntiAffinity: feature.IfAntiAffinity(podMonitor.Annotations), - MatchSelector: feature.MonitorMatchSelector(podMonitor.Annotations), - DropSelector: feature.MonitorDropSelector(podMonitor.Annotations), - LabelJoinMatcher: feature.LabelJoinMatcher(podMonitor.Annotations), - UseEndpointSlice: useEndpointslice, - Name: monitorMeta.ID(), - DataID: dataID, - KubeConfig: ConfKubeConfig, - Namespaces: namespaces, - Relabels: resultLabels, - Path: endpoint.Path, - Scheme: endpoint.Scheme, - BasicAuth: endpoint.BasicAuth.DeepCopy(), - BearerTokenSecret: endpoint.BearerTokenSecret.DeepCopy(), - TLSConfig: &promv1.TLSConfig{SafeTLSConfig: safeTlsConfig}, - Period: string(endpoint.Interval), - Timeout: string(endpoint.ScrapeTimeout), - ProxyURL: proxyURL, - ExtraLabels: specLabels, - ForwardLocalhost: feature.IfForwardLocalhost(podMonitor.Annotations), - DisableCustomTimestamp: !ifHonorTimestamps(endpoint.HonorTimestamps), - System: systemResource, - UrlValues: endpoint.Params, - MetricRelabelConfigs: metricRelabelings, - }, - TLSConfig: endpoint.TLSConfig, - }) - - logger.Infof("get new pod discover %s", podDiscover) - discovers = append(discovers, podDiscover) - } - return discovers -} - -func (c *Operator) handlePrometheusRuleAdd(obj interface{}) { - promRule, ok := obj.(*promv1.PrometheusRule) - if !ok { - logger.Errorf("expected 
PrometheusRule type, got %T", obj) - return - } - - c.promsliController.UpdatePrometheusRule(promRule) -} - -func (c *Operator) handlePrometheusRuleUpdate(_ interface{}, obj interface{}) { - promRule, ok := obj.(*promv1.PrometheusRule) - if !ok { - logger.Errorf("expected PrometheusRule type, got %T", obj) - return - } - - c.promsliController.UpdatePrometheusRule(promRule) -} - -func (c *Operator) handlePrometheusRuleDelete(obj interface{}) { - promRule, ok := obj.(*promv1.PrometheusRule) - if !ok { - logger.Errorf("expected PrometheusRule type, got %T", obj) - return - } - - c.promsliController.DeletePrometheusRule(promRule) -} - -func (c *Operator) handlePodMonitorAdd(obj interface{}) { - podMonitor, ok := obj.(*promv1.PodMonitor) - if !ok { - logger.Errorf("expected PodMonitor type, got %T", obj) - return - } - - // 新增的 podmonitor 命中黑名单则流程终止 - if IfRejectPodMonitor(podMonitor) { - logger.Infof("add action match the blacklist rules, podMonitor=%+v", podMonitor) - return - } - - discovers := c.createPodMonitorDiscovers(podMonitor) - for _, dis := range discovers { - if err := c.addOrUpdateDiscover(dis); err != nil { - logger.Errorf("add or update podMonitor discover %s failed, err: %s", dis, err) - } - } -} - -func (c *Operator) handlePodMonitorUpdate(oldObj interface{}, newObj interface{}) { - old, ok := oldObj.(*promv1.PodMonitor) - if !ok { - logger.Errorf("expected PodMonitor type, got %T", oldObj) - return - } - cur, ok := newObj.(*promv1.PodMonitor) - if !ok { - logger.Errorf("expected PodMonitor type, got %T", newObj) - return - } - - if old.ResourceVersion == cur.ResourceVersion { - logger.Debugf("podMonitor '%s/%s' does not change", old.Namespace, old.Name) - return - } - - // 对于更新的 podmonitor 如果新的 spec 命中黑名单 则需要将原有的 podmonitor 移除 - if IfRejectPodMonitor(cur) { - logger.Infof("update action match the blacklist rules, podMonitor=%+v", cur) - for _, name := range c.getPodMonitorDiscoversName(cur) { - c.deleteDiscoverByName(name) - } - return - } - - for _, name := range c.getPodMonitorDiscoversName(old) { - c.deleteDiscoverByName(name) - } - for _, dis := range c.createPodMonitorDiscovers(cur) { - if err := c.addOrUpdateDiscover(dis); err != nil { - logger.Errorf("add or update podMonitor discover %s failed, err: %s", dis, err) - } - } -} - -func (c *Operator) handlePodMonitorDelete(obj interface{}) { - podMonitor, ok := obj.(*promv1.PodMonitor) - if !ok { - logger.Errorf("expected PodMonitor type, got %T", obj) - return - } - - for _, name := range c.getPodMonitorDiscoversName(podMonitor) { - c.deleteDiscoverByName(name) - } -} - func (c *Operator) handleDiscoverNotify() { c.wg.Add(1) defer c.wg.Done() diff --git a/pkg/operator/operator/podmonitor.go b/pkg/operator/operator/podmonitor.go new file mode 100644 index 000000000..002331a6f --- /dev/null +++ b/pkg/operator/operator/podmonitor.go @@ -0,0 +1,217 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +package operator + +import ( + "fmt" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "gopkg.in/yaml.v2" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/kubernetesd" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +) + +func podMonitorID(obj *promv1.PodMonitor) string { + return fmt.Sprintf("%s/%s", obj.Namespace, obj.Name) +} + +func (c *Operator) handlePodMonitorAdd(obj interface{}) { + podMonitor, ok := obj.(*promv1.PodMonitor) + if !ok { + logger.Errorf("expected PodMonitor type, got %T", obj) + return + } + + // 新增的 podmonitor 命中黑名单则流程终止 + if IfRejectPodMonitor(podMonitor) { + logger.Infof("add action match blacklist rules, podMonitor=%s", podMonitorID(podMonitor)) + return + } + + discovers := c.createPodMonitorDiscovers(podMonitor) + for _, dis := range discovers { + if err := c.addOrUpdateDiscover(dis); err != nil { + logger.Errorf("add or update podMonitor discover %s failed, err: %s", dis, err) + } + } +} + +func (c *Operator) handlePodMonitorUpdate(oldObj interface{}, newObj interface{}) { + old, ok := oldObj.(*promv1.PodMonitor) + if !ok { + logger.Errorf("expected PodMonitor type, got %T", oldObj) + return + } + cur, ok := newObj.(*promv1.PodMonitor) + if !ok { + logger.Errorf("expected PodMonitor type, got %T", newObj) + return + } + + if old.ResourceVersion == cur.ResourceVersion { + logger.Debugf("podMonitor '%s' does not change", podMonitorID(old)) + return + } + + // 对于更新的 podmonitor 如果新的 spec 命中黑名单 则需要将原有的 podmonitor 移除 + if IfRejectPodMonitor(cur) { + logger.Infof("update action match blacklist rules, podMonitor=%s", podMonitorID(cur)) + for _, name := range c.getPodMonitorDiscoversName(cur) { + c.deleteDiscoverByName(name) + } + return + } + + for _, name := range c.getPodMonitorDiscoversName(old) { + c.deleteDiscoverByName(name) + } + for _, dis := range c.createPodMonitorDiscovers(cur) { + if err := c.addOrUpdateDiscover(dis); err != nil { + logger.Errorf("add or update podMonitor discover %s failed, err: %s", dis, err) + } + } +} + +func (c *Operator) handlePodMonitorDelete(obj interface{}) { + podMonitor, ok := obj.(*promv1.PodMonitor) + if !ok { + logger.Errorf("expected PodMonitor type, got %T", obj) + return + } + + for _, name := range c.getPodMonitorDiscoversName(podMonitor) { + c.deleteDiscoverByName(name) + } +} + +func (c *Operator) getPodMonitorDiscoversName(podMonitor *promv1.PodMonitor) []string { + var names []string + for index := range podMonitor.Spec.PodMetricsEndpoints { + monitorMeta := define.MonitorMeta{ + Name: podMonitor.Name, + Kind: monitorKindPodMonitor, + Namespace: podMonitor.Namespace, + Index: index, + } + names = append(names, monitorMeta.ID()) + } + return names +} + +func (c *Operator) createPodMonitorDiscovers(podMonitor *promv1.PodMonitor) []discover.Discover { + var ( + namespaces []string + discovers []discover.Discover + ) + + systemResource := feature.IfSystemResource(podMonitor.Annotations) + meta := define.MonitorMeta{ + Name: podMonitor.Name, + Kind: monitorKindPodMonitor, + Namespace: podMonitor.Namespace, + } + dataID, err := c.dw.MatchMetricDataID(meta, systemResource) + if err != nil { + 
logger.Errorf("podmonitor(%+v) no dataid matched", meta) + return discovers + } + specLabels := dataID.Spec.Labels + + if podMonitor.Spec.NamespaceSelector.Any { + namespaces = []string{} + } else if len(podMonitor.Spec.NamespaceSelector.MatchNames) == 0 { + namespaces = []string{podMonitor.Namespace} + } else { + namespaces = podMonitor.Spec.NamespaceSelector.MatchNames + } + + logger.Infof("found new podMonitor '%s'", podMonitorID(podMonitor)) + for index, endpoint := range podMonitor.Spec.PodMetricsEndpoints { + if endpoint.Path == "" { + endpoint.Path = "/metrics" + } + if endpoint.Scheme == "" { + endpoint.Scheme = "http" + } + + relabels := getPodMonitorRelabels(podMonitor, &endpoint) + resultLabels, err := yamlToRelabels(relabels) + if err != nil { + logger.Errorf("failed to convert relabels, err: %s", err) + continue + } + + metricRelabelings := make([]yaml.MapSlice, 0) + if len(endpoint.MetricRelabelConfigs) != 0 { + for _, cfg := range endpoint.MetricRelabelConfigs { + relabeling := generatePromv1RelabelConfig(cfg) + metricRelabelings = append(metricRelabelings, relabeling) + } + } + + logger.Debugf("podMonitor '%s' get relabels: %v", podMonitorID(podMonitor), relabels) + + monitorMeta := meta + monitorMeta.Index = index + + var proxyURL string + if endpoint.ProxyURL != nil { + proxyURL = *endpoint.ProxyURL + } + + var safeTlsConfig promv1.SafeTLSConfig + tlsConfig := endpoint.TLSConfig.DeepCopy() + if tlsConfig != nil { + safeTlsConfig = tlsConfig.SafeTLSConfig + } + + dis := kubernetesd.New(c.ctx, kubernetesd.TypePod, c.objectsController.NodeNameExists, &kubernetesd.Options{ + CommonOptions: &discover.CommonOptions{ + MonitorMeta: monitorMeta, + RelabelRule: feature.RelabelRule(podMonitor.Annotations), + RelabelIndex: feature.RelabelIndex(podMonitor.Annotations), + NormalizeMetricName: feature.IfNormalizeMetricName(podMonitor.Annotations), + AntiAffinity: feature.IfAntiAffinity(podMonitor.Annotations), + MatchSelector: feature.MonitorMatchSelector(podMonitor.Annotations), + DropSelector: feature.MonitorDropSelector(podMonitor.Annotations), + LabelJoinMatcher: feature.LabelJoinMatcher(podMonitor.Annotations), + ForwardLocalhost: feature.IfForwardLocalhost(podMonitor.Annotations), + Name: monitorMeta.ID(), + DataID: dataID, + Relabels: resultLabels, + Path: endpoint.Path, + Scheme: endpoint.Scheme, + Period: string(endpoint.Interval), + Timeout: string(endpoint.ScrapeTimeout), + ProxyURL: proxyURL, + ExtraLabels: specLabels, + DisableCustomTimestamp: !ifHonorTimestamps(endpoint.HonorTimestamps), + System: systemResource, + UrlValues: endpoint.Params, + MetricRelabelConfigs: metricRelabelings, + }, + Client: c.client, + Namespaces: namespaces, + KubeConfig: ConfKubeConfig, + BasicAuth: endpoint.BasicAuth.DeepCopy(), + BearerTokenSecret: endpoint.BearerTokenSecret.DeepCopy(), + TLSConfig: &promv1.TLSConfig{SafeTLSConfig: safeTlsConfig}, + UseEndpointSlice: useEndpointslice, + }) + + logger.Infof("create new pod discover: %s", dis.Name()) + discovers = append(discovers, dis) + } + return discovers +} diff --git a/pkg/operator/operator/promrule.go b/pkg/operator/operator/promrule.go new file mode 100644 index 000000000..8a7129f4d --- /dev/null +++ b/pkg/operator/operator/promrule.go @@ -0,0 +1,46 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +package operator + +import ( + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +) + +func (c *Operator) handlePrometheusRuleAdd(obj interface{}) { + promRule, ok := obj.(*promv1.PrometheusRule) + if !ok { + logger.Errorf("expected PrometheusRule type, got %T", obj) + return + } + + c.promsliController.UpdatePrometheusRule(promRule) +} + +func (c *Operator) handlePrometheusRuleUpdate(_ interface{}, obj interface{}) { + promRule, ok := obj.(*promv1.PrometheusRule) + if !ok { + logger.Errorf("expected PrometheusRule type, got %T", obj) + return + } + + c.promsliController.UpdatePrometheusRule(promRule) +} + +func (c *Operator) handlePrometheusRuleDelete(obj interface{}) { + promRule, ok := obj.(*promv1.PrometheusRule) + if !ok { + logger.Errorf("expected PrometheusRule type, got %T", obj) + return + } + + c.promsliController.DeletePrometheusRule(promRule) +} diff --git a/pkg/operator/operator/promsd.go b/pkg/operator/operator/promsd.go new file mode 100644 index 000000000..9e499b35f --- /dev/null +++ b/pkg/operator/operator/promsd.go @@ -0,0 +1,208 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
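// A sketch of the scrape-config payload the prom_sd_configs mechanism defined
// below expects to find in each referenced Secret: a YAML list of Prometheus
// scrape_config entries, of which only http_sd_configs blocks are currently
// turned into discovers. The job name, URL and intervals are placeholders, and
// the snippet assumes it sits alongside the helpers in this file.
var exampleSdSecretData = []byte(`
- job_name: example-httpsd
  metrics_path: /metrics
  scrape_interval: 60s
  http_sd_configs:
    - url: http://sd-provider:8080/targets
      refresh_interval: 60s
`)

func exampleUnmarshalSdSecret() {
	scrapeConfigs, err := unmarshalPromSdConfigs(exampleSdSecretData)
	if err != nil {
		logger.Errorf("unmarshal example sd configs failed: %v", err)
		return
	}
	logger.Infof("parsed %d scrape config(s)", len(scrapeConfigs))
}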
+ +package operator + +import ( + "fmt" + "reflect" + "time" + + "github.com/pkg/errors" + "github.com/prometheus/prometheus/config" + promhttpsd "github.com/prometheus/prometheus/discovery/http" + "gopkg.in/yaml.v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/httpd" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +) + +func (c *Operator) getPromScrapeConfigs() ([]config.ScrapeConfig, bool) { + if len(ConfPromSdConfigs) == 0 { + return nil, false + } + + var configs []config.ScrapeConfig + round := make(map[string][]byte) // 本轮获取到的数据 + for _, conf := range ConfPromSdConfigs { + m, err := c.getPromSdConfigs(conf) + if err != nil { + logger.Errorf("get secrets resource failed, config=(%+v), err: %v", conf, err) + continue + } + + for k, v := range m { + sdc, err := unmarshalPromSdConfigs(v) + if err != nil { + logger.Errorf("unmarshal prom sdconfigs failed, resource=(%s), err: %v", k, err) + continue + } + + round[k] = v + configs = append(configs, sdc...) + } + } + + eq := reflect.DeepEqual(c.promSdConfigsBytes, round) // 对比是否需要更新操作 + c.promSdConfigsBytes = round + return configs, !eq // changed +} + +func (c *Operator) getPromSdConfigs(sdConfig PromSDConfig) (map[string][]byte, error) { + // 需要同时指定 namespace/name + if sdConfig.Namespace == "" || sdConfig.Name == "" { + return nil, errors.New("empty sdconfig namespace/name") + } + secretClient := c.client.CoreV1().Secrets(sdConfig.Namespace) + secret, err := secretClient.Get(c.ctx, sdConfig.Name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + + ret := make(map[string][]byte) + for file, data := range secret.Data { + ret[sdConfigsKeyFunc(sdConfig, file)] = data + } + return ret, nil +} + +func sdConfigsKeyFunc(sdConfig PromSDConfig, file string) string { + return fmt.Sprintf("%s/%s/%s", sdConfig.Namespace, sdConfig.Name, file) +} + +func unmarshalPromSdConfigs(b []byte) ([]config.ScrapeConfig, error) { + var objs []interface{} + if err := yaml.Unmarshal(b, &objs); err != nil { + return nil, err + } + + var ret []config.ScrapeConfig + for i := 0; i < len(objs); i++ { + obj := objs[i] + var sc config.ScrapeConfig + + bs, err := yaml.Marshal(obj) + if err != nil { + return nil, err + } + if err := yaml.Unmarshal(bs, &sc); err != nil { + return nil, err + } + ret = append(ret, sc) + } + + return ret, nil +} + +func (c *Operator) createHttpSdDiscover(scrapeConfig config.ScrapeConfig, sdConfig *promhttpsd.SDConfig, index int) (discover.Discover, error) { + metricRelabelings := make([]yaml.MapSlice, 0) + if len(scrapeConfig.MetricRelabelConfigs) != 0 { + for _, cfg := range scrapeConfig.MetricRelabelConfigs { + relabeling := generatePromRelabelConfig(cfg) + metricRelabelings = append(metricRelabelings, relabeling) + } + } + + monitorMeta := define.MonitorMeta{ + Name: scrapeConfig.JobName, + Kind: monitorKindHttpSd, + Namespace: "-", // 不标记 namespace + Index: index, + } + // 默认使用 custommetric dataid + dataID, err := c.dw.MatchMetricDataID(monitorMeta, false) + if err != nil { + return nil, err + } + + specLabels := dataID.Spec.Labels + httpClientConfig := scrapeConfig.HTTPClientConfig + + var proxyUrl string + if httpClientConfig.ProxyURL.URL != nil { + proxyUrl = httpClientConfig.ProxyURL.String() + } + dis := httpd.New(c.ctx, c.objectsController.NodeNameExists, 
&httpd.Options{ + CommonOptions: &discover.CommonOptions{ + MonitorMeta: monitorMeta, + Name: monitorMeta.ID(), + DataID: dataID, + Relabels: scrapeConfig.RelabelConfigs, + Path: scrapeConfig.MetricsPath, + Scheme: scrapeConfig.Scheme, + BearerTokenFile: httpClientConfig.BearerTokenFile, + ProxyURL: proxyUrl, + Period: scrapeConfig.ScrapeInterval.String(), + Timeout: scrapeConfig.ScrapeTimeout.String(), + DisableCustomTimestamp: !ifHonorTimestamps(&scrapeConfig.HonorTimestamps), + UrlValues: scrapeConfig.Params, + ExtraLabels: specLabels, + MetricRelabelConfigs: metricRelabelings, + }, + SDConfig: sdConfig, + HTTPClientConfig: scrapeConfig.HTTPClientConfig, + }) + + logger.Infof("create httpsd discover: %v", dis.Name()) + return dis, nil +} + +func (c *Operator) createPromScrapeConfigDiscovers() []discover.Discover { + scrapeConfigs, ok := c.getPromScrapeConfigs() + if !ok { + return nil + } + + logger.Infof("got prom scrapeConfigs, count=%d", len(scrapeConfigs)) + var discovers []discover.Discover + for i := 0; i < len(scrapeConfigs); i++ { + scrapeConfig := scrapeConfigs[i] + for idx, rc := range scrapeConfig.ServiceDiscoveryConfigs { + switch obj := rc.(type) { + case *promhttpsd.SDConfig: // TODO(mando): 目前仅支持 httpsd + httpSdDiscover, err := c.createHttpSdDiscover(scrapeConfig, obj, idx) + if err != nil { + logger.Errorf("failed to create httpsd discover: %v", err) + continue + } + discovers = append(discovers, httpSdDiscover) + } + } + } + return discovers +} + +func (c *Operator) loopHandlePromSdConfigs() { + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + + fn := func() { + discovers := c.createPromScrapeConfigDiscovers() + for _, dis := range discovers { + if err := c.addOrUpdateDiscover(dis); err != nil { + logger.Errorf("add or update prom scrapeConfigs discover %s failed, err: %s", dis, err) + } + } + } + + fn() // 启动即执行 + + for { + select { + case <-c.ctx.Done(): + return + + case <-ticker.C: + fn() + } + } +} diff --git a/pkg/operator/operator/promsli/promsli.go b/pkg/operator/operator/promsli/promsli.go index 5316055c9..b8a4d98d8 100644 --- a/pkg/operator/operator/promsli/promsli.go +++ b/pkg/operator/operator/promsli/promsli.go @@ -31,9 +31,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/compressor" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/eplabels" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/gzip" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/k8sutils" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/notifier" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" @@ -302,7 +302,7 @@ func (c *Controller) CreateOrUpdatePromScrapeSecret() error { return errors.Wrap(err, "yaml unmarshal failed") } - compressed, err := compressor.Compress(b) + compressed, err := gzip.Compress(b) if err != nil { return errors.Wrap(err, "compress data failed") } diff --git a/pkg/operator/operator/recorder.go b/pkg/operator/operator/recorder.go index ccbc4fedb..8be9198e1 100644 --- a/pkg/operator/operator/recorder.go +++ b/pkg/operator/operator/recorder.go @@ -11,6 +11,7 @@ package operator import ( "fmt" + "sort" "sync" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" @@ -95,6 +96,10 @@ func (r *Recorder) getActiveConfigFiles() []ConfigFileRecord { for _, cfg := range 
r.activeConfigFile { cfgs = append(cfgs, cfg) } + + sort.Slice(cfgs, func(i, j int) bool { + return cfgs[i].Meta.ID() < cfgs[j].Meta.ID() + }) return cfgs } diff --git a/pkg/operator/operator/relabel.go b/pkg/operator/operator/relabel.go index 937464d2e..e913c98df 100644 --- a/pkg/operator/operator/relabel.go +++ b/pkg/operator/operator/relabel.go @@ -28,7 +28,7 @@ func sanitizeLabelName(name string) string { return regex.ReplaceAllString(name, "_") } -func convertYamlRelabels(relabels []yaml.MapSlice) ([]*relabel.Config, error) { +func yamlToRelabels(relabels []yaml.MapSlice) ([]*relabel.Config, error) { var confs []*relabel.Config data, err := yaml.Marshal(relabels) if err != nil { @@ -63,7 +63,7 @@ func enforceNamespaceLabel(relabelings []yaml.MapSlice, namespace, enforcedNames }) } -func generateRelabelConfig(c *promv1.RelabelConfig) yaml.MapSlice { +func generatePromv1RelabelConfig(c *promv1.RelabelConfig) yaml.MapSlice { relabeling := yaml.MapSlice{} if len(c.SourceLabels) > 0 { @@ -97,6 +97,40 @@ func generateRelabelConfig(c *promv1.RelabelConfig) yaml.MapSlice { return relabeling } +func generatePromRelabelConfig(c *relabel.Config) yaml.MapSlice { + relabeling := yaml.MapSlice{} + + if len(c.SourceLabels) > 0 { + relabeling = append(relabeling, yaml.MapItem{Key: "source_labels", Value: c.SourceLabels}) + } + + if c.Separator != "" { + relabeling = append(relabeling, yaml.MapItem{Key: "separator", Value: c.Separator}) + } + + if c.TargetLabel != "" { + relabeling = append(relabeling, yaml.MapItem{Key: "target_label", Value: c.TargetLabel}) + } + + if c.Regex.String() != "" { + relabeling = append(relabeling, yaml.MapItem{Key: "regex", Value: c.Regex.String()}) + } + + if c.Modulus != uint64(0) { + relabeling = append(relabeling, yaml.MapItem{Key: "modulus", Value: c.Modulus}) + } + + if c.Replacement != "" { + relabeling = append(relabeling, yaml.MapItem{Key: "replacement", Value: c.Replacement}) + } + + if c.Action != "" { + relabeling = append(relabeling, yaml.MapItem{Key: "action", Value: c.Action}) + } + + return relabeling +} + func getServiceMonitorRelabels(m *promv1.ServiceMonitor, ep *promv1.Endpoint) []yaml.MapSlice { relabelings := initRelabelings() @@ -257,7 +291,7 @@ func getServiceMonitorRelabels(m *promv1.ServiceMonitor, ep *promv1.Endpoint) [] if ep.RelabelConfigs != nil { for _, c := range ep.RelabelConfigs { - relabelings = append(relabelings, generateRelabelConfig(c)) + relabelings = append(relabelings, generatePromv1RelabelConfig(c)) } } // Because of security risks, whenever enforcedNamespaceLabel is set, we want to append it to the @@ -399,7 +433,7 @@ func getPodMonitorRelabels(m *promv1.PodMonitor, ep *promv1.PodMetricsEndpoint) if ep.RelabelConfigs != nil { for _, c := range ep.RelabelConfigs { - relabelings = append(relabelings, generateRelabelConfig(c)) + relabelings = append(relabelings, generatePromv1RelabelConfig(c)) } } // Because of security risks, whenever enforcedNamespaceLabel is set, we want to append it to the diff --git a/pkg/operator/operator/relabel_test.go b/pkg/operator/operator/relabel_test.go index f0687e5d7..2239e4b75 100644 --- a/pkg/operator/operator/relabel_test.go +++ b/pkg/operator/operator/relabel_test.go @@ -50,11 +50,11 @@ func TestServiceMonitorRelabel(t *testing.T) { content := "- source_labels:\n - job\n target_label: monitor_type\n regex: (.+?)/.*\n replacement: ${1}\n- action: keep\n source_labels:\n - __meta_kubernetes_service_label_testa\n regex: a\n- action: keep\n source_labels:\n - __meta_kubernetes_service_label_testb\n 
regex: b\n- action: keep\n source_labels:\n - __meta_kubernetes_endpoint_port_name\n regex: http\n- source_labels:\n - __meta_kubernetes_endpoint_address_target_kind\n - __meta_kubernetes_endpoint_address_target_name\n separator: ;\n regex: Node;(.*)\n replacement: ${1}\n target_label: node\n- source_labels:\n - __meta_kubernetes_endpoint_address_target_kind\n - __meta_kubernetes_endpoint_address_target_name\n separator: ;\n regex: Pod;(.*)\n replacement: ${1}\n target_label: pod\n- source_labels:\n - __meta_kubernetes_namespace\n target_label: namespace\n- source_labels:\n - __meta_kubernetes_service_name\n target_label: service\n- source_labels:\n - __meta_kubernetes_pod_name\n target_label: pod\n- source_labels:\n - __meta_kubernetes_pod_container_name\n target_label: container\n- source_labels:\n - __meta_kubernetes_service_label_a\n target_label: a\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_service_label_b\n target_label: b\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_service_label_c\n target_label: c\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_pod_label_e\n target_label: e\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_pod_label_f\n target_label: f\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_pod_label_g\n target_label: g\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_service_name\n target_label: job\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_service_label_job\n target_label: job\n regex: (.+)\n replacement: ${1}\n- target_label: endpoint\n replacement: http\n- source_labels:\n - from\n target_label: to\n" yamlSlice := getServiceMonitorRelabels(m, ep) data, err := yaml.Marshal(yamlSlice) - assert.Nil(t, err) + assert.NoError(t, err) assert.Equal(t, content, string(data)) - _, err = convertYamlRelabels(yamlSlice) - assert.Nil(t, err) + _, err = yamlToRelabels(yamlSlice) + assert.NoError(t, err) } func TestPodMonitorRelabel(t *testing.T) { @@ -79,9 +79,9 @@ func TestPodMonitorRelabel(t *testing.T) { content := "- source_labels:\n - job\n target_label: monitor_type\n regex: (.+?)/.*\n replacement: ${1}\n- action: keep\n source_labels:\n - __meta_kubernetes_pod_label_testa\n regex: a\n- action: keep\n source_labels:\n - __meta_kubernetes_pod_label_testb\n regex: b\n- action: keep\n source_labels:\n - __meta_kubernetes_pod_container_port_name\n regex: http\n- source_labels:\n - __meta_kubernetes_namespace\n target_label: namespace\n- source_labels:\n - __meta_kubernetes_pod_container_name\n target_label: container\n- source_labels:\n - __meta_kubernetes_pod_name\n target_label: pod\n- source_labels:\n - __meta_kubernetes_pod_label_e\n target_label: e\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_pod_label_f\n target_label: f\n regex: (.+)\n replacement: ${1}\n- source_labels:\n - __meta_kubernetes_pod_label_g\n target_label: g\n regex: (.+)\n replacement: ${1}\n- target_label: job\n replacement: testnamespace/test\n- source_labels:\n - __meta_kubernetes_pod_label_job\n target_label: job\n regex: (.+)\n replacement: ${1}\n- target_label: endpoint\n replacement: http\n" yamlSlice := getPodMonitorRelabels(m, ep) data, err := yaml.Marshal(yamlSlice) - assert.Nil(t, err) + assert.NoError(t, err) assert.Equal(t, content, string(data)) - _, err = convertYamlRelabels(yamlSlice) - assert.Nil(t, err) + _, err = yamlToRelabels(yamlSlice) + assert.NoError(t, err) } diff --git 
a/pkg/operator/operator/secret.go b/pkg/operator/operator/secret.go index ee2775079..931e6cbef 100644 --- a/pkg/operator/operator/secret.go +++ b/pkg/operator/operator/secret.go @@ -20,8 +20,8 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/compressor" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/gzip" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/k8sutils" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/notifier" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/tasks" @@ -155,7 +155,7 @@ func (c *Operator) createOrUpdateEventTaskSecrets() { c.eventTaskCache = string(b) secret := newSecret(secretName, tasks.TaskTypeEvent) - compressed, err := compressor.Compress(b) + compressed, err := gzip.Compress(b) if err != nil { logger.Errorf("failed to compress config content, err: %v", err) return @@ -221,7 +221,7 @@ func (c *Operator) createOrUpdateDaemonSetTaskSecrets(childConfigs []*discover.C bytesTotal := 0 secret := newSecret(secretName, tasks.TaskTypeDaemonSet) for _, config := range configs { - compressed, err := compressor.Compress(config.Data) + compressed, err := gzip.Compress(config.Data) if err != nil { logger.Errorf("failed to compress config content, addr=%s, err: %v", config.Address, err) continue @@ -294,7 +294,7 @@ func (c *Operator) cleanupDaemonSetChildSecret(childConfigs []*discover.ChildCon // 如果 node 已经不存在了 也需要删除采集配置 for secret := range existSecrets { // 只处理 daemonset secrets - if !strings.HasPrefix(secret, tasks.DaemonSetTaskSecretPrefix) { + if !strings.HasPrefix(secret, tasks.PrefixDaemonSetTaskSecret) { continue } @@ -444,7 +444,7 @@ func (c *Operator) createOrUpdateStatefulSetTaskSecrets(childConfigs []*discover bytesTotal := 0 secret := newSecret(tasks.GetStatefulSetTaskSecretName(idx), tasks.TaskTypeStatefulSet) for _, config := range configs { - compressed, err := compressor.Compress(config.Data) + compressed, err := gzip.Compress(config.Data) if err != nil { logger.Errorf("failed to compress config content, addr=%s, err: %v", config.Address, err) continue @@ -531,6 +531,30 @@ func (c *Operator) collectChildConfigs() ([]*discover.ChildConfig, []*discover.C return statefulset, daemonset } +func (c *Operator) cleanupInvalidSecrets() { + secretClient := c.client.CoreV1().Secrets(ConfMonitorNamespace) + secrets, err := secretClient.List(c.ctx, metav1.ListOptions{ + LabelSelector: "createdBy=bkmonitor-operator", + }) + if err != nil { + logger.Errorf("failed to list secrets, err: %v", err) + return + } + + // 清理不合法的 secrets + for _, secret := range secrets.Items { + if _, ok := secret.Labels[tasks.LabelTaskType]; !ok { + if err := secretClient.Delete(c.ctx, secret.Name, metav1.DeleteOptions{}); err != nil { + c.mm.IncHandledSecretFailedCounter(secret.Name, define.ActionDelete) + logger.Errorf("failed to delete secret %s, err: %v", secret.Name, err) + continue + } + c.mm.IncHandledSecretSuccessCounter(secret.Name, define.ActionDelete) + logger.Infof("remove invalid secret %s", secret.Name) + } + } +} + func (c *Operator) dispatchTasks() { if ConfDryRun { logger.Info("dryrun mode, skip dispatch") diff --git a/pkg/operator/operator/server.go b/pkg/operator/operator/server.go index f19f0f9c3..6f7887513 100644 --- a/pkg/operator/operator/server.go +++ b/pkg/operator/operator/server.go @@ -10,7 +10,6 @@ package 
operator import ( - "bytes" "encoding/json" "fmt" "net/http" @@ -24,8 +23,9 @@ import ( "github.com/valyala/bytebufferpool" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/libgse/beat" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/utils" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/stringx" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/shareddiscovery" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/objectsref" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/pprofsnapshot" @@ -138,7 +138,7 @@ func (c *Operator) CheckActiveChildConfigRoute(w http.ResponseWriter, _ *http.Re } func (c *Operator) CheckActiveSharedDiscoveryRoute(w http.ResponseWriter, _ *http.Request) { - writeResponse(w, discover.GetActiveSharedDiscovery()) + writeResponse(w, shareddiscovery.AllDiscovery()) } func (c *Operator) CheckMonitorResourceRoute(w http.ResponseWriter, _ *http.Request) { @@ -146,72 +146,70 @@ func (c *Operator) CheckMonitorResourceRoute(w http.ResponseWriter, _ *http.Requ } const ( - formatOperatorVersionMsg = ` + formatOperatorVersion = ` [√] check operator version - Description: bkmonitor-operator 版本信息 %s ` - formatKubernetesVersionSuccessMsg = ` + formatKubernetesVersionSuccess = ` [√] check kubernetes version - Description: kubernetes 集群版本为 %s ` - formatKubernetesVersionFailedMsg = ` + formatKubernetesVersionFailed = ` [x] check kubernetes version - Description: 无法正确获取 kubernetes 集群版本 ` - formatClusterInfoSuccessMsg = ` + formatClusterInfoSuccess = ` [√] check cluster information - Description: 集群信息 %s ` - formatClusterInfoFailedMsg = ` + formatClusterInfoFailed = ` [x] check cluster information - Description: 无法正确获取集群信息,错误信息 %s ` - formatCheckDataIDFailedMsg = ` + formatCheckDataIDFailed = ` [x] check dataids - Description: 期待 dataids 数量应大于等于 3 个,目前发现 %d 个 -- Suggestion: dataid 由 metadata 组件注入,请确定接入流程是否规范。同时检查 metadata 日志,确定是否出现异常。 - * operator 从启动到监听 dataids 资源可能存在约 20s 的延迟 - * 监控后台为传统部署,日志路径为 /data/bkee/logs/bkmonitorv3/kernel_metadata.log - * 监控后台为容器部署,请查看 bkmonitor-alarm-cron-worker pod 的日志 +- Suggestion: dataid 由 metadata 组件注入,请确定接入流程是否规范。 + * operator 从启动到监听 dataids 资源可能存在约 30s 的延迟 ` - formatCheckDataIDSuccessMsg = ` + formatCheckDataIDSuccess = ` [√] check dataids - Description: 期待 dataids 数量应大于等于 3 个,目前发现 %d 个 %s ` - formatDryRunMsg = ` + formatCheckDryRun = ` [√] check dryrun - Description: %s ` - formatCheckNamespaceMsg = ` + formatCheckNamespaceSuccess = ` [√] check namespaces - Description: 监测 namespace 白名单列表 %v,namespace 黑名单列表 %v - Suggestion: 请检查所需监控资源是否位于监测命名空间列表下,黑名单只在白名单列表为空时生效 - * 如若发现所需命名空间没有在监测列表中,请修改 values.yaml 中的 denyTargetNamespaces 或者 targetNamespaces,并更新到集群中 + * 如若发现所需命名空间没有在监测列表中,请更新 targetNamespaces 配置字段 ` - formatCheckNamespaceFailedMsg = ` + formatCheckNamespaceFailed = ` [x] check namespaces - Description: 监测 namespace 白名单列表 %v,namespace 黑名单列表 %v - Suggestion: 黑名单列表只在白名单列表为空时生效 ` - formatCheckMonitorBlacklistMsg = ` + formatCheckMonitorBlacklist = ` [√] check monitor blacklist rules - Description: monitor name 黑名单匹配规则,此规则优先级最高 %s ` - formatWorkloadMsg = ` + formatWorkload = ` [√] check workload -- Description: 集群各类型工作负载数量如下,最近一次更新时间 %v +- Description: 集群各类型工作负载数量 %s ` - formatMonitorEndpointMsg = ` + formatMonitorEndpoint = ` [√] check endpoint - Description: operator 监听 monitor endpoints 数量,共 %d 个 %s ` - 
formatScrapeMsg = ` + formatScrapeStats = ` [√] check scrape stats - Description: 总共发现 %d 个 monitor 资源,抓取数据行数为 %d,采集共出现 %d 次错误,更新时间 %s - Suggestion: 错误可能由 forwardLocal 导致(可忽略),可过滤 'scrape error' 关键字查看详细错误信息。 @@ -219,20 +217,16 @@ const ( * TOP%d 数据量如下,详细情况可访问 /check/scrape 路由。%s %s ` - formatListNodeMsg = ` -[√] check nodes -- Description: 获取集群节点列表成功,节点数量为 %d,最近一次更新时间 %v -` - formatHandledSecretFailedMsg = ` -[x] check kubernetes secrets handled + formatHandleSecretFailed = ` +[x] check kubernetes secrets operation - Description: 操作 secrets 资源出现错误 -- Suggestion: 请检查 apiserver 是否处于异常状态,最近一次操作时间 %v,考虑重启/删除 ${bkm-operator-pod} +- Suggestion: 请检查 apiserver 是否处于异常状态,最近一次操作时间 %v,考虑重启 ${bkm-operator-pod} ` - formatHandledSecretSuccessMsg = ` -[√] check kubernetes secrets handled + formatHandleSecretSuccess = ` +[√] check kubernetes secrets operation - Description: 操作 secrets 资源未出现错误,最近一次操作时间 %v ` - formatMonitorResourcesMsg = ` + formatMonitorResources = ` [√] check monitor resources - Description: 通过 '%s' 关键字匹配到以下监控资源。 * 监测到 ServiceMonitor/PodMonitor/Probe 资源以及对应的采集目标,请检查资源数量是否一致 @@ -240,23 +234,13 @@ const ( * 生成的 bkmonitorbeat 采集配置文件 %s ` - formatCheckNamespaceWithoutKeywordMsg = ` + formatMonitorResourceNoKeyword = ` [√] check monitor resources - Description: 无 'monitor' 请求参数,无资源匹配。 ` - formatOperatorLogMsg = ` -[o] bkmonitor-operator logs -- Description: 使用 'kubectl logs -n ${.Namespace} ${bkm-operator-pod}' 查看是否有 ERROR 信息。 -` - formatBkmonitorbeatTroubleshootingMsg = ` -[o] bkmonitorbeat troubleshooting -- Description: 如若上述检查无发现异常问题,则考虑排查 bkmonitorbeat 本身的采集是否出现异常 -- Suggestion: 优先检查采集器日志是否有异常,采集器会在每次采集记录日志流水 - 1)根据上述检查得到采集任务所在节点,并使用 'kubectl get pods -n ${.Namespace} -owide' 确定对应的采集器 pod - 2)使用 'kubectl exec it -n ${.Namespace} ${worker-pod}' 命令查看 bkmonitorbeat 所在进程 pid - 3)使用 'kubectl exec' 执行 'strace -p ${pid} -s 1024000 -f -e write 2>&1 > /tmp/strace' 等待一分钟导出 strace 数据 - 4)过滤 *.strace 文件查看是否有采集任务指标对应的关键字,判断数据是否有写入到 gse sockets,如若有写到 gse 则说采集工作正常,需要排查链路问题 - 5)链路排查可按照二进制部署排查思路 kafka -> transfer -> influxdb-proxy -> influxdb + formatLogContent = ` +[-] bkmonitor-operator logs +- Description: 使用 'kubectl logs -n ${.Namespace} ${bkm-operator-pod}' 查看是否有关键 ERROR 信息。 ` ) @@ -271,67 +255,67 @@ const ( // 检查黑名单匹配规则 // 检查集群负载情况 // 检查采集指标数据量 -// 检查节点列表 // 检查处理 secrets 是否有问题 // 检查给定关键字监测资源 func (c *Operator) CheckRoute(w http.ResponseWriter, r *http.Request) { - buf := &bytes.Buffer{} - var b []byte + buf := bytebufferpool.Get() + defer bytebufferpool.Put(buf) // 检查 kubernetes 版本信息 if kubernetesVersion == "" { - buf.WriteString(formatKubernetesVersionFailedMsg) + buf.WriteString(formatKubernetesVersionFailed) } else { - buf.WriteString(fmt.Sprintf(formatKubernetesVersionSuccessMsg, kubernetesVersion)) + buf.WriteString(fmt.Sprintf(formatKubernetesVersionSuccess, kubernetesVersion)) } // 检查 bkmonitor-operator 版本信息 - b, _ = json.MarshalIndent(c.buildInfo, "", " ") - buf.WriteString(fmt.Sprintf(formatOperatorVersionMsg, string(b))) + b, _ := json.MarshalIndent(c.buildInfo, "", " ") + buf.WriteString(fmt.Sprintf(formatOperatorVersion, string(b))) // 检查 dataids 是否符合预期 dataids := c.checkDataIdRoute() n := len(dataids) if n < 3 { - w.Write([]byte(fmt.Sprintf(formatCheckDataIDFailedMsg, n))) + w.Write([]byte(fmt.Sprintf(formatCheckDataIDFailed, n))) return } b, _ = json.MarshalIndent(dataids, "", " ") - buf.WriteString(fmt.Sprintf(formatCheckDataIDSuccessMsg, n, string(b))) + buf.WriteString(fmt.Sprintf(formatCheckDataIDSuccess, n, string(b))) // 检查集群信息 clusterInfo, err := c.dw.GetClusterInfo() if err != 
nil { - w.Write([]byte(fmt.Sprintf(formatClusterInfoFailedMsg, err.Error()))) + w.Write([]byte(fmt.Sprintf(formatClusterInfoFailed, err.Error()))) return } b, _ = json.MarshalIndent(clusterInfo, "", " ") - buf.WriteString(fmt.Sprintf(formatClusterInfoSuccessMsg, string(b))) + buf.WriteString(fmt.Sprintf(formatClusterInfoSuccess, string(b))) // 检查 dryrun 标识是否打开 if ConfDryRun { - buf.WriteString(fmt.Sprintf(formatDryRunMsg, "dryrun 模式,operator 不会调度采集任务")) + buf.WriteString(fmt.Sprintf(formatCheckDryRun, "dryrun 模式,operator 不会调度采集任务")) } else { - buf.WriteString(fmt.Sprintf(formatDryRunMsg, "非 dryrun 模式,operator 正常调度采集任务")) + buf.WriteString(fmt.Sprintf(formatCheckDryRun, "非 dryrun 模式,operator 正常调度采集任务")) } // 检查监测命名空间是否符合预期 namespaces := c.checkNamespaceRoute() if len(namespaces.DenyNamespaces) > 0 && len(namespaces.AllowNamespaces) > 0 { - buf.WriteString(fmt.Sprintf(formatCheckNamespaceFailedMsg, namespaces.AllowNamespaces, namespaces.DenyNamespaces)) + buf.WriteString(fmt.Sprintf(formatCheckNamespaceFailed, namespaces.AllowNamespaces, namespaces.DenyNamespaces)) } else { - buf.WriteString(fmt.Sprintf(formatCheckNamespaceMsg, namespaces.AllowNamespaces, namespaces.DenyNamespaces)) + buf.WriteString(fmt.Sprintf(formatCheckNamespaceSuccess, namespaces.AllowNamespaces, namespaces.DenyNamespaces)) } // 检查黑名单匹配规则 blacklist := c.checkMonitorBlacklistRoute() b, _ = json.MarshalIndent(blacklist, "", " ") - buf.WriteString(fmt.Sprintf(formatCheckMonitorBlacklistMsg, string(b))) + buf.WriteString(fmt.Sprintf(formatCheckMonitorBlacklist, string(b))) // 检查集群工作负载数量 - workloadInfo, workloadUpdated := objectsref.GetWorkloadInfo() + workloadInfo := objectsref.GetWorkloadCount() + workloadInfo["Node"] = objectsref.GetClusterNodeCount() // 顺便补充 node 数量 b, _ = json.MarshalIndent(workloadInfo, "", " ") - buf.WriteString(fmt.Sprintf(formatWorkloadMsg, workloadUpdated.Format(time.RFC3339), string(b))) + buf.WriteString(fmt.Sprintf(formatWorkload, string(b))) // 检查 Endpoint 数量 endpoints := c.recorder.getActiveEndpoints() @@ -340,7 +324,7 @@ func (c *Operator) CheckRoute(w http.ResponseWriter, r *http.Request) { for _, v := range endpoints { total += v } - buf.WriteString(fmt.Sprintf(formatMonitorEndpointMsg, total, string(b))) + buf.WriteString(fmt.Sprintf(formatMonitorEndpoint, total, string(b))) // 检查采集指标数据量 onScrape := r.URL.Query().Get("scrape") @@ -359,18 +343,14 @@ func (c *Operator) CheckRoute(w http.ResponseWriter, r *http.Request) { warning = "数据行数已超过 300w 警戒线,请重点关注数据库负载!" 
} scrapeUpdated := c.scrapeUpdated.Format(time.RFC3339) - buf.WriteString(fmt.Sprintf(formatScrapeMsg, stats.MonitorCount, stats.LinesTotal, stats.ErrorsTotal, scrapeUpdated, n, warning, string(b))) + buf.WriteString(fmt.Sprintf(formatScrapeStats, stats.MonitorCount, stats.LinesTotal, stats.ErrorsTotal, scrapeUpdated, n, warning, string(b))) } - // 检查节点列表 - nodeCount, nodeUpdated := objectsref.GetClusterNodeInfo() - buf.WriteString(fmt.Sprintf(formatListNodeMsg, nodeCount, nodeUpdated.Format(time.RFC3339))) - // 检查处理 secrets 是否有问题 if c.mm.handledSecretFailed <= 0 || c.mm.handledSecretSuccessTime.After(c.mm.handledSecretFailedTime) { - buf.WriteString(fmt.Sprintf(formatHandledSecretSuccessMsg, c.mm.handledSecretSuccessTime.Format(time.RFC3339))) + buf.WriteString(fmt.Sprintf(formatHandleSecretSuccess, c.mm.handledSecretSuccessTime.Format(time.RFC3339))) } else { - buf.WriteString(fmt.Sprintf(formatHandledSecretFailedMsg, c.mm.handledSecretFailedTime.Format(time.RFC3339))) + buf.WriteString(fmt.Sprintf(formatHandleSecretFailed, c.mm.handledSecretFailedTime.Format(time.RFC3339))) } // 检查给定关键字监测资源 @@ -391,13 +371,12 @@ func (c *Operator) CheckRoute(w http.ResponseWriter, r *http.Request) { } } childConfigsContent, _ := json.MarshalIndent(childConfigs, "", " ") - buf.WriteString(fmt.Sprintf(formatMonitorResourcesMsg, monitorKeyword, monitorResourcesContent, childConfigsContent)) + buf.WriteString(fmt.Sprintf(formatMonitorResources, monitorKeyword, monitorResourcesContent, childConfigsContent)) } else { - buf.WriteString(formatCheckNamespaceWithoutKeywordMsg) + buf.WriteString(formatMonitorResourceNoKeyword) } - buf.WriteString(formatOperatorLogMsg) - buf.WriteString(formatBkmonitorbeatTroubleshootingMsg) + buf.WriteString(formatLogContent) w.Write(buf.Bytes()) } @@ -469,8 +448,8 @@ func (c *Operator) WorkloadNodeRoute(w http.ResponseWriter, r *http.Request) { query := r.URL.Query() podName := query.Get("podName") - annotations := utils.SplitTrim(query.Get("annotations"), ",") - labels := utils.SplitTrim(query.Get("labels"), ",") + annotations := stringx.SplitTrim(query.Get("annotations"), ",") + labels := stringx.SplitTrim(query.Get("labels"), ",") var configs []objectsref.RelabelConfig configs = append(configs, c.objectsController.WorkloadsRelabelConfigsByPodName(nodeName, podName, annotations, labels)...) @@ -491,8 +470,8 @@ func (c *Operator) WorkloadNodeRoute(w http.ResponseWriter, r *http.Request) { func (c *Operator) LabelJoinRoute(w http.ResponseWriter, r *http.Request) { query := r.URL.Query() kind := query.Get("kind") - annotations := utils.SplitTrim(query.Get("annotations"), ",") - labels := utils.SplitTrim(query.Get("labels"), ",") + annotations := stringx.SplitTrim(query.Get("annotations"), ",") + labels := stringx.SplitTrim(query.Get("labels"), ",") switch kind { case "Pod": diff --git a/pkg/operator/operator/servicemonitor.go b/pkg/operator/operator/servicemonitor.go new file mode 100644 index 000000000..5df8db4fa --- /dev/null +++ b/pkg/operator/operator/servicemonitor.go @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making +// 蓝鲸智云 - 监控平台 (BlueKing - Monitor) available. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at http://opensource.org/licenses/MIT +// Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +// an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +package operator + +import ( + "fmt" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "gopkg.in/yaml.v2" + + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/operator/discover/kubernetesd" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/utils/logger" +) + +func serviceMonitorID(obj *promv1.ServiceMonitor) string { + return fmt.Sprintf("%s/%s", obj.Namespace, obj.Name) +} + +func (c *Operator) handleServiceMonitorAdd(obj interface{}) { + serviceMonitor, ok := obj.(*promv1.ServiceMonitor) + if !ok { + logger.Errorf("expected ServiceMonitor type, got %T", obj) + return + } + + if ConfEnablePromRule { + c.promsliController.UpdateServiceMonitor(serviceMonitor) + } + + // 新增的 servicemonitor 命中黑名单则流程终止 + if IfRejectServiceMonitor(serviceMonitor) { + logger.Infof("add action match blacklist rules, serviceMonitor=%s", serviceMonitorID(serviceMonitor)) + return + } + + discovers := c.createServiceMonitorDiscovers(serviceMonitor) + for _, dis := range discovers { + if err := c.addOrUpdateDiscover(dis); err != nil { + logger.Errorf("add or update serviceMonitor discover %s failed, err: %s", dis, err) + } + } +} + +func (c *Operator) handleServiceMonitorUpdate(oldObj interface{}, newObj interface{}) { + old, ok := oldObj.(*promv1.ServiceMonitor) + if !ok { + logger.Errorf("expected ServiceMonitor type, got %T", oldObj) + return + } + cur, ok := newObj.(*promv1.ServiceMonitor) + if !ok { + logger.Errorf("expected ServiceMonitor type, got %T", newObj) + return + } + + if ConfEnablePromRule { + c.promsliController.UpdateServiceMonitor(cur) + } + + if old.ResourceVersion == cur.ResourceVersion { + logger.Debugf("serviceMonitor '%s' does not change", serviceMonitorID(old)) + return + } + + // 对于更新的 servicemonitor 如果新的 spec 命中黑名单 则需要将原有的 servicemonitor 移除 + if IfRejectServiceMonitor(cur) { + logger.Infof("update action match blacklist rules, serviceMonitor=%s", serviceMonitorID(cur)) + for _, name := range c.getServiceMonitorDiscoversName(cur) { + c.deleteDiscoverByName(name) + } + return + } + + for _, name := range c.getServiceMonitorDiscoversName(old) { + c.deleteDiscoverByName(name) + } + for _, dis := range c.createServiceMonitorDiscovers(cur) { + if err := c.addOrUpdateDiscover(dis); err != nil { + logger.Errorf("add or update serviceMonitor discover %s failed, err: %s", dis, err) + } + } +} + +func (c *Operator) handleServiceMonitorDelete(obj interface{}) { + serviceMonitor, ok := obj.(*promv1.ServiceMonitor) + if !ok { + logger.Errorf("expected ServiceMonitor type, got %T", obj) + return + } + + if ConfEnablePromRule { + c.promsliController.DeleteServiceMonitor(serviceMonitor) + } + + for _, name := range c.getServiceMonitorDiscoversName(serviceMonitor) { + c.deleteDiscoverByName(name) + } +} + +func (c *Operator) getServiceMonitorDiscoversName(serviceMonitor *promv1.ServiceMonitor) []string { + var names []string + for 
index := range serviceMonitor.Spec.Endpoints { + monitorMeta := define.MonitorMeta{ + Name: serviceMonitor.Name, + Kind: monitorKindServiceMonitor, + Namespace: serviceMonitor.Namespace, + Index: index, + } + names = append(names, monitorMeta.ID()) + } + return names +} + +func (c *Operator) createServiceMonitorDiscovers(serviceMonitor *promv1.ServiceMonitor) []discover.Discover { + var ( + namespaces []string + discovers []discover.Discover + ) + + systemResource := feature.IfSystemResource(serviceMonitor.Annotations) + meta := define.MonitorMeta{ + Name: serviceMonitor.Name, + Kind: monitorKindServiceMonitor, + Namespace: serviceMonitor.Namespace, + } + dataID, err := c.dw.MatchMetricDataID(meta, systemResource) + if err != nil { + logger.Errorf("servicemonitor(%+v) no dataid matched", meta) + return discovers + } + specLabels := dataID.Spec.Labels + + if serviceMonitor.Spec.NamespaceSelector.Any { + namespaces = []string{} + } else if len(serviceMonitor.Spec.NamespaceSelector.MatchNames) == 0 { + namespaces = []string{serviceMonitor.Namespace} + } else { + namespaces = serviceMonitor.Spec.NamespaceSelector.MatchNames + } + + logger.Infof("found new serviceMonitor '%s'", serviceMonitorID(serviceMonitor)) + for index, endpoint := range serviceMonitor.Spec.Endpoints { + if endpoint.Path == "" { + endpoint.Path = "/metrics" + } + if endpoint.Scheme == "" { + endpoint.Scheme = "http" + } + + relabels := getServiceMonitorRelabels(serviceMonitor, &endpoint) + resultLabels, err := yamlToRelabels(relabels) + if err != nil { + logger.Errorf("failed to convert relabels, err: %s", err) + continue + } + + metricRelabelings := make([]yaml.MapSlice, 0) + if len(endpoint.MetricRelabelConfigs) != 0 { + for _, cfg := range endpoint.MetricRelabelConfigs { + relabeling := generatePromv1RelabelConfig(cfg) + metricRelabelings = append(metricRelabelings, relabeling) + } + } + logger.Debugf("serviceMonitor '%s' get relabels config: %+v", serviceMonitorID(serviceMonitor), relabels) + + monitorMeta := meta + monitorMeta.Index = index + + var proxyURL string + if endpoint.ProxyURL != nil { + proxyURL = *endpoint.ProxyURL + } + + dis := kubernetesd.New(c.ctx, kubernetesd.TypeEndpoints(useEndpointslice), c.objectsController.NodeNameExists, &kubernetesd.Options{ + CommonOptions: &discover.CommonOptions{ + MonitorMeta: monitorMeta, + RelabelRule: feature.RelabelRule(serviceMonitor.Annotations), + RelabelIndex: feature.RelabelIndex(serviceMonitor.Annotations), + NormalizeMetricName: feature.IfNormalizeMetricName(serviceMonitor.Annotations), + AntiAffinity: feature.IfAntiAffinity(serviceMonitor.Annotations), + MatchSelector: feature.MonitorMatchSelector(serviceMonitor.Annotations), + DropSelector: feature.MonitorDropSelector(serviceMonitor.Annotations), + LabelJoinMatcher: feature.LabelJoinMatcher(serviceMonitor.Annotations), + ForwardLocalhost: feature.IfForwardLocalhost(serviceMonitor.Annotations), + Name: monitorMeta.ID(), + DataID: dataID, + Relabels: resultLabels, + Path: endpoint.Path, + Scheme: endpoint.Scheme, + BearerTokenFile: endpoint.BearerTokenFile, + Period: string(endpoint.Interval), + ProxyURL: proxyURL, + Timeout: string(endpoint.ScrapeTimeout), + ExtraLabels: specLabels, + DisableCustomTimestamp: !ifHonorTimestamps(endpoint.HonorTimestamps), + System: systemResource, + UrlValues: endpoint.Params, + MetricRelabelConfigs: metricRelabelings, + }, + Client: c.client, + Namespaces: namespaces, + KubeConfig: ConfKubeConfig, + TLSConfig: endpoint.TLSConfig.DeepCopy(), + BasicAuth: 
endpoint.BasicAuth.DeepCopy(), + BearerTokenSecret: endpoint.BearerTokenSecret.DeepCopy(), + UseEndpointSlice: useEndpointslice, + }) + + logger.Infof("create new endpoint discover: %s", dis.Name()) + discovers = append(discovers, dis) + } + return discovers +} diff --git a/pkg/operator/operator/target/hook.go b/pkg/operator/operator/target/hook.go index fcf6c92a9..b7c76fa81 100644 --- a/pkg/operator/operator/target/hook.go +++ b/pkg/operator/operator/target/hook.go @@ -19,7 +19,6 @@ import ( const ( confMaxTimeoutPath = "discover.scrape.max_timeout" confMinPeriodPath = "discover.scrape.min_period" - confDefaultPeriodPath = "discover.scrape.default_period" confEventScrapeMaxSpanPath = "operator.event.max_span" confEventScrapeIntervalPath = "operator.event.scrape_interval" confEventScrapeFilesPath = "operator.event.scrape_path" @@ -30,7 +29,6 @@ const ( var ( ConfMaxTimeout string ConfMinPeriod string - ConfDefaultPeriod string ConfEventScrapeInterval string ConfEventScrapeFiles []string ConfEventMaxSpan string @@ -42,7 +40,6 @@ var ( func initConfig() { viper.SetDefault(confMaxTimeoutPath, "100s") viper.SetDefault(confMinPeriodPath, "3s") - viper.SetDefault(confDefaultPeriodPath, "60s") viper.SetDefault(confEventScrapeMaxSpanPath, "2h") viper.SetDefault(confEventScrapeIntervalPath, "60s") viper.SetDefault(confEventScrapeFilesPath, []string{"/var/log/gse/events.log"}) @@ -52,7 +49,6 @@ func initConfig() { func updateConfig() { ConfMaxTimeout = viper.GetString(confMaxTimeoutPath) ConfMinPeriod = viper.GetString(confMinPeriodPath) - ConfDefaultPeriod = viper.GetString(confDefaultPeriodPath) ConfEventMaxSpan = viper.GetString(confEventScrapeMaxSpanPath) ConfEventScrapeInterval = viper.GetString(confEventScrapeIntervalPath) ConfEventScrapeFiles = viper.GetStringSlice(confEventScrapeFilesPath) diff --git a/pkg/operator/operator/target/metric.go b/pkg/operator/operator/target/metric.go index 311014558..a1622e515 100644 --- a/pkg/operator/operator/target/metric.go +++ b/pkg/operator/operator/target/metric.go @@ -26,7 +26,7 @@ import ( "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/feature" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/utils" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/stringx" ) const ( @@ -113,7 +113,7 @@ func (t *MetricTarget) RemoteRelabelConfig() *yaml.MapItem { host := fmt.Sprintf("http://%s:%d", ConfServiceName, ConfServicePort) params := map[string]string{} - rules := utils.SplitTrim(t.RelabelRule, ",") + rules := stringx.SplitTrim(t.RelabelRule, ",") for _, rule := range rules { switch rule { case relabelV1RuleWorkload: diff --git a/pkg/operator/reloader/reloader.go b/pkg/operator/reloader/reloader.go index 6bc338658..ae76b1e1d 100644 --- a/pkg/operator/reloader/reloader.go +++ b/pkg/operator/reloader/reloader.go @@ -27,9 +27,9 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" - "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/compressor" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/define" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/filewatcher" + "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/gzip" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/k8sutils" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/notifier" "github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/tasks" @@ 
-217,7 +217,7 @@ func (r *Reloader) syncSecretToFiles(secret *corev1.Secret) error { logger.Infof("start add or update file '%s'", filePath) // 如果存在无法解压缩的数据则直接使用原始数据 - uncompressed, err := compressor.Uncompress(data) + uncompressed, err := gzip.Uncompress(data) if err != nil { logger.Errorf("failed to uncompress config content: %v", err) continue
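A minimal sketch of the gzip helper round-trip this patch standardizes on: the operator compresses each rendered child config before writing it into a task secret (secret.go), and the reloader uncompresses the secret payload when syncing it back to files (reloader.go). The snippet assumes only the Compress/Uncompress signatures visible at those call sites and is an illustration, not part of the patch.

package main

import (
	"fmt"

	"github.com/TencentBlueKing/bkmonitor-datalink/pkg/operator/common/gzip"
)

func main() {
	// Operator side: compress the rendered child config before storing it in a secret.
	raw := []byte("scrape_configs:\n- job_name: example\n")
	compressed, err := gzip.Compress(raw)
	if err != nil {
		fmt.Println("compress failed:", err)
		return
	}

	// Reloader side: uncompress the secret data before writing it to disk.
	restored, err := gzip.Uncompress(compressed)
	if err != nil {
		fmt.Println("uncompress failed:", err)
		return
	}
	fmt.Println(string(restored) == string(raw)) // prints: true
}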