From 31f4c79d8dcb8a01b45936da6ed9bb067ed3a720 Mon Sep 17 00:00:00 2001
From: Kevin Sheldrake
Date: Tue, 12 Mar 2024 14:15:34 +0000
Subject: [PATCH 01/11] CRD: Add IsUserspaceData flag for args

When we swap all probe_read()s to probe_read_kernel() or
probe_read_user() (ditto for _str), users will need to be able to
specify if an argument is userspace data or not.

Signed-off-by: Kevin Sheldrake
---
 .../v1alpha1/cilium.io_tracingpolicies.yaml   | 20 +++++++++++++++++++
 .../cilium.io_tracingpoliciesnamespaced.yaml  | 20 +++++++++++++++++++
 pkg/k8s/apis/cilium.io/v1alpha1/types.go      |  4 ++++
 pkg/k8s/apis/cilium.io/v1alpha1/version.go    |  2 +-
 .../v1alpha1/cilium.io_tracingpolicies.yaml   | 20 +++++++++++++++++++
 .../cilium.io_tracingpoliciesnamespaced.yaml  | 20 +++++++++++++++++++
 .../pkg/k8s/apis/cilium.io/v1alpha1/types.go  |  4 ++++
 .../k8s/apis/cilium.io/v1alpha1/version.go    |  2 +-
 8 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml b/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml
index f8699a4f451..ccb9d1bbb27 100644
--- a/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml
+++ b/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml
@@ -118,6 +118,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -217,6 +222,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -882,6 +892,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -1454,6 +1469,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
diff --git a/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml b/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml
index 19b141f0b64..26eb9e31016 100644
--- a/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml
+++ b/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml
@@ -118,6 +118,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -217,6 +222,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -882,6 +892,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -1454,6 +1469,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
diff --git a/pkg/k8s/apis/cilium.io/v1alpha1/types.go b/pkg/k8s/apis/cilium.io/v1alpha1/types.go
index e491c37df3e..7bb15508307 100644
--- a/pkg/k8s/apis/cilium.io/v1alpha1/types.go
+++ b/pkg/k8s/apis/cilium.io/v1alpha1/types.go
@@ -65,6 +65,10 @@ type KProbeArg struct {
 	// Argument type.
 	Type string `json:"type"`
 	// +kubebuilder:validation:Optional
+	// Specifies if the argument is userspace data or not. This defaults to true
+	// for uprobes and syscalls, and false for non-syscall tracepoints and kprobes.
+	IsUserspaceData *bool `json:"isUserspaceData,omitempty"`
+	// +kubebuilder:validation:Optional
 	// +kubebuilder:validation:Minimum=0
 	// Specifies the position of the corresponding size argument for this argument.
 	// This field is used only for char_buf and char_iovec types.
diff --git a/pkg/k8s/apis/cilium.io/v1alpha1/version.go b/pkg/k8s/apis/cilium.io/v1alpha1/version.go
index dc8b58a087f..8ecbbf04ed2 100644
--- a/pkg/k8s/apis/cilium.io/v1alpha1/version.go
+++ b/pkg/k8s/apis/cilium.io/v1alpha1/version.go
@@ -7,4 +7,4 @@ package v1alpha1
 // Used to determine if CRD needs to be updated in cluster
 //
 // Developers: Bump patch for each change in the CRD schema.
-const CustomResourceDefinitionSchemaVersion = "1.1.10"
+const CustomResourceDefinitionSchemaVersion = "1.1.11"
diff --git a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml
index f8699a4f451..ccb9d1bbb27 100644
--- a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml
+++ b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpolicies.yaml
@@ -118,6 +118,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -217,6 +222,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -882,6 +892,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -1454,6 +1469,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
diff --git a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml
index 19b141f0b64..26eb9e31016 100644
--- a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml
+++ b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/client/crds/v1alpha1/cilium.io_tracingpoliciesnamespaced.yaml
@@ -118,6 +118,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -217,6 +222,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -882,6 +892,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
@@ -1454,6 +1469,11 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  isUserspaceData:
+                    description: Specifies if the argument is userspace data
+                      or not. This defaults to true for uprobes and syscalls,
+                      and false for non-syscall tracepoints and kprobes.
+                    type: boolean
                   label:
                     description: Label to output in the JSON
                     type: string
diff --git a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/types.go b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/types.go
index e491c37df3e..7bb15508307 100644
--- a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/types.go
+++ b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/types.go
@@ -65,6 +65,10 @@ type KProbeArg struct {
 	// Argument type.
 	Type string `json:"type"`
 	// +kubebuilder:validation:Optional
+	// Specifies if the argument is userspace data or not. This defaults to true
+	// for uprobes and syscalls, and false for non-syscall tracepoints and kprobes.
+	IsUserspaceData *bool `json:"isUserspaceData,omitempty"`
+	// +kubebuilder:validation:Optional
 	// +kubebuilder:validation:Minimum=0
 	// Specifies the position of the corresponding size argument for this argument.
 	// This field is used only for char_buf and char_iovec types.
diff --git a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/version.go b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/version.go
index dc8b58a087f..8ecbbf04ed2 100644
--- a/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/version.go
+++ b/vendor/github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1/version.go
@@ -7,4 +7,4 @@ package v1alpha1
 // Used to determine if CRD needs to be updated in cluster
 //
 // Developers: Bump patch for each change in the CRD schema.
-const CustomResourceDefinitionSchemaVersion = "1.1.10"
+const CustomResourceDefinitionSchemaVersion = "1.1.11"
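[Note] For illustration, a hypothetical TracingPolicy fragment using the new field might look as follows. This is a sketch only, not part of the patch: the policy name and the choice of kprobe/arguments are invented, while isUserspaceData and its default behaviour come from the schema above.

    apiVersion: cilium.io/v1alpha1
    kind: TracingPolicy
    metadata:
      name: write-buffer-example   # hypothetical policy name
    spec:
      kprobes:
      - call: "sys_write"
        syscall: true              # syscall args default to isUserspaceData: true
        args:
        - index: 0
          type: "int"
        - index: 1
          type: "char_buf"
          sizeArgIndex: 3
          # Redundant for a syscall (the default is already true); shown
          # only to illustrate where the new field sits:
          isUserspaceData: true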
From 5fed43cdfd17b1fba255649174ebb71bc3a14bbe Mon Sep 17 00:00:00 2001
From: Kevin Sheldrake
Date: Tue, 12 Mar 2024 16:31:15 +0000
Subject: [PATCH 02/11] Tracing: Add UserspaceData flag to arg meta

Add the UserspaceData flag to the meta value, taking into account the
default value if it is missing from the arg in the policy. Modify how
tracepoints use the meta value to permit adding this flag.

Signed-off-by: Kevin Sheldrake
---
 bpf/process/types/basic.h                | 21 +++++++----
 pkg/sensors/tracing/args.go              | 44 +++++++++++++-----------
 pkg/sensors/tracing/generickprobe.go     |  3 +-
 pkg/sensors/tracing/generictracepoint.go | 31 ++++++++++++-----
 pkg/sensors/tracing/genericuprobe.go     |  3 +-
 5 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/bpf/process/types/basic.h b/bpf/process/types/basic.h
index ab42100897a..c44c11886aa 100644
--- a/bpf/process/types/basic.h
+++ b/bpf/process/types/basic.h
@@ -47,7 +47,7 @@ enum {
 	fd_ty = 17,
 
 	/* const_buf_type is a type for buffers with static size that is passed
-	 * in the meta argument
+	 * in the meta argument's upper 16 bits
 	 */
 	const_buf_type = 18,
 	bpf_attr_type = 19,
@@ -636,9 +636,10 @@ copy_kernel_module(char *args, unsigned long arg)
 	return sizeof(struct tg_kernel_module);
 }
 
-#define ARGM_INDEX_MASK	 0xf
-#define ARGM_RETURN_COPY BIT(4)
-#define ARGM_MAX_DATA	 BIT(5)
+#define ARGM_INDEX_MASK	    0xf
+#define ARGM_RETURN_COPY    BIT(4)
+#define ARGM_MAX_DATA	    BIT(5)
+#define ARGM_USERSPACE_DATA BIT(6)
 
 static inline __attribute__((always_inline)) bool
 hasReturnCopy(unsigned long argm)
@@ -652,6 +653,12 @@ has_max_data(unsigned long argm)
 	return (argm & ARGM_MAX_DATA) != 0;
 }
 
+static inline __attribute__((always_inline)) bool
+is_userspace_data(unsigned long argm)
+{
+	return (argm & ARGM_USERSPACE_DATA) != 0;
+}
+
 static inline __attribute__((always_inline)) unsigned long
 get_arg_meta(int meta, struct msg_generic_kprobe *e)
 {
@@ -1614,7 +1621,8 @@ static inline __attribute__((always_inline)) size_t type_to_min_size(int type,
 	case char_iovec:
 		return 4;
 	case const_buf_type:
		// For const_buf_type, the size is in the upper 16 bits of the meta argument.
+		return argm >> 16;
 	case bpf_attr_type:
 		return sizeof(struct bpf_info_type);
 	case perf_event_type:
@@ -2663,8 +2671,9 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type,
 		size = copy_char_iovec(ctx, orig_off, arg, argm, e);
 		break;
 	case const_buf_type: {
+		// for const_buf_type the size is in the upper 16 bits of the meta argument
 		// bound size to 1023 to help the verifier out
-		size = argm & 0x03ff;
+		size = (argm >> 16) & 0x03ff;
 		probe_read(args, size, (char *)arg);
 		break;
 	}
diff --git a/pkg/sensors/tracing/args.go b/pkg/sensors/tracing/args.go
index f9a673cf596..bf1d54332fa 100644
--- a/pkg/sensors/tracing/args.go
+++ b/pkg/sensors/tracing/args.go
@@ -30,8 +30,10 @@ type argPrinter struct {
 }
 
 const (
-	argReturnCopyBit = 1 << 4
-	argMaxDataBit    = 1 << 5
+	argSizeArgIndexMask = int(0xf)
+	argReturnCopyBit    = 1 << 4
+	argMaxDataBit       = 1 << 5
+	argUserspaceDataBit = 1 << 6
 )
 
 func argReturnCopy(meta int) bool {
@@ -41,17 +43,20 @@ func argReturnCopy(meta int) bool {
 // meta value format:
 // bits
 //
-//	0-3 : SizeArgIndex
-//	4   : ReturnCopy
-//	5   : MaxData
-func getMetaValue(arg *v1alpha1.KProbeArg) (int, error) {
-	var meta int
+//	 0-3  : SizeArgIndex
+//	 4    : ReturnCopy
+//	 5    : MaxData
+//	 6    : UserspaceData
+//	 7-15 : reserved
+//	16-31 : size for const_buf
+func getMetaValue(arg *v1alpha1.KProbeArg, userspaceDataDefault bool) (int, error) {
+	meta := 0
 
 	if arg.SizeArgIndex > 0 {
 		if arg.SizeArgIndex > 15 {
 			return 0, fmt.Errorf("invalid SizeArgIndex value (>15): %v", arg.SizeArgIndex)
 		}
-		meta = int(arg.SizeArgIndex)
+		meta = meta | int(arg.SizeArgIndex)
 	}
 	if arg.ReturnCopy {
 		meta = meta | argReturnCopyBit
@@ -59,19 +64,18 @@ func getMetaValue(arg *v1alpha1.KProbeArg) (int, error) {
 	if arg.MaxData {
 		meta = meta | argMaxDataBit
 	}
-	return meta, nil
-}
-
-// getTracepointMetaArg is a temporary helper to find meta values while tracepoint
-// converts into new CRD and config formats.
-func getTracepointMetaValue(arg *v1alpha1.KProbeArg) int {
-	if arg.SizeArgIndex > 0 {
-		return int(arg.SizeArgIndex)
-	}
-	if arg.ReturnCopy {
-		return -1
+	if arg.IsUserspaceData == nil {
+		// If not set in policy, use the default.
+		if userspaceDataDefault {
+			meta = meta | argUserspaceDataBit
+		}
+	} else {
+		// Otherwise, use the provided value.
+		if *arg.IsUserspaceData {
+			meta = meta | argUserspaceDataBit
+		}
 	}
-	return 0
+	return meta, nil
 }
 
 func getArg(r *bytes.Reader, a argPrinter) api.MsgGenericKprobeArg {
diff --git a/pkg/sensors/tracing/generickprobe.go b/pkg/sensors/tracing/generickprobe.go
index bf120e99876..4f40eb36315 100644
--- a/pkg/sensors/tracing/generickprobe.go
+++ b/pkg/sensors/tracing/generickprobe.go
@@ -670,7 +670,8 @@ func addKprobe(funcName string, f *v1alpha1.KProbeSpec, in *addKprobeIn) (id idt
 			logger.GetLogger().Warnf("maxData flag is ignored (supported from large programs)")
 		}
 	}
-	argMValue, err := getMetaValue(&a)
+	// For kprobes, args default to userspace memory for syscalls, and kernel memory otherwise.
+	argMValue, err := getMetaValue(&a, f.Syscall)
 	if err != nil {
 		return errFn(err)
 	}
diff --git a/pkg/sensors/tracing/generictracepoint.go b/pkg/sensors/tracing/generictracepoint.go
index 2dcd72a9759..ffd5bc70f7d 100644
--- a/pkg/sensors/tracing/generictracepoint.go
+++ b/pkg/sensors/tracing/generictracepoint.go
@@ -222,10 +222,10 @@ func (out *genericTracepointArg) getGenericTypeId() (int, error) {
 		if err != nil {
 			return gt.GenericInvalidType, fmt.Errorf("failed to get size of array type %w", err)
 		}
-		if out.MetaArg == 0 {
-			// set MetaArg equal to the number of bytes we need to copy
-			out.MetaArg = nbytes
-		}
+		// set MetaArg's upper half-word equal to the number of bytes we need to copy
+		out.MetaArg = out.MetaArg & 0xffff
+		out.MetaArg = out.MetaArg | (nbytes << 16)
+
 		return gt.GenericConstBuffer, nil
 
 	case tracepoint.SizeTy:
@@ -238,6 +238,7 @@ func buildGenericTracepointArgs(info *tracepoint.Tracepoint, specArgs []v1alpha1
 	ret := make([]genericTracepointArg, 0, len(specArgs))
 	nfields := uint32(len(info.Format.Fields))
+	syscall := info.Subsys == "syscalls" || info.Subsys == "raw_syscalls"
 
 	for argIdx := range specArgs {
 		specArg := &specArgs[argIdx]
 			return nil, fmt.Errorf("tracepoint %s/%s has %d fields but field %d was requested", info.Subsys, info.Event, nfields, specArg.Index)
 		}
 		field := info.Format.Fields[specArg.Index]
+		// Syscall tracepoint arguments are in userspace memory.
+		metaTp, err := getMetaValue(specArg, syscall)
+		if err != nil {
+			return nil, fmt.Errorf("tracepoint %s/%s getMetaValue error: %w", info.Subsys, info.Event, err)
+		}
 		ret = append(ret, genericTracepointArg{
 			CtxOffset:     int(field.Offset),
 			ArgIdx:        uint32(argIdx),
 			TpIdx:         int(specArg.Index),
-			MetaTp:        getTracepointMetaValue(specArg),
+			MetaTp:        metaTp,
 			nopTy:         false,
 			format:        &field,
 			genericTypeId: gt.GenericInvalidType,
@@ -275,12 +281,16 @@ func buildGenericTracepointArgs(info *tracepoint.Tracepoint, specArgs []v1alpha1
 		}
 		field := info.Format.Fields[tpIdx]
 		argIdx := uint32(len(ret))
+		metaArg := 0
+		if syscall {
+			metaArg = argUserspaceDataBit
+		}
 		ret = append(ret, genericTracepointArg{
 			CtxOffset:     int(field.Offset),
 			ArgIdx:        argIdx,
 			TpIdx:         tpIdx,
 			MetaTp:        0,
-			MetaArg:       0,
+			MetaArg:       metaArg,
 			nopTy:         true,
 			format:        &field,
 			genericTypeId: gt.GenericInvalidType,
 		})
@@ -290,15 +300,18 @@ func buildGenericTracepointArgs(info *tracepoint.Tracepoint, specArgs []v1alpha1
 
 	for idx := 0; idx < len(ret); idx++ {
 		meta := ret[idx].MetaTp
-		if meta == 0 || meta == -1 {
+		metaArgIndex := meta & argSizeArgIndexMask
+
+		if metaArgIndex == 0 || (meta&argReturnCopyBit != 0) {
 			ret[idx].MetaArg = meta
 			continue
 		}
-		a, err := getOrAppendMeta(meta)
+		a, err := getOrAppendMeta(metaArgIndex)
 		if err != nil {
 			return nil, err
 		}
-		ret[idx].MetaArg = int(a.ArgIdx) + 1
+		meta = meta & ^argSizeArgIndexMask
+		ret[idx].MetaArg = meta | (int(a.ArgIdx) + 1)
 	}
 	return ret, nil
 }
diff --git a/pkg/sensors/tracing/genericuprobe.go b/pkg/sensors/tracing/genericuprobe.go
index 20e927f6af1..e12198cb25c 100644
--- a/pkg/sensors/tracing/genericuprobe.go
+++ b/pkg/sensors/tracing/genericuprobe.go
@@ -338,7 +338,8 @@ func addUprobe(spec *v1alpha1.UProbeSpec, ids []idtable.EntryID, in *addUprobeIn
 		if argType == gt.GenericInvalidType {
 			return nil, fmt.Errorf("Arg(%d) type '%s' unsupported", i, a.Type)
 		}
-		argMValue, err := getMetaValue(&a)
+		// For uprobes, args default to userspace memory.
+		argMValue, err := getMetaValue(&a, true)
 		if err != nil {
 			return nil, err
 		}
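[Note] Taken together, patches 01 and 02 pack everything the BPF side needs into a single meta integer per argument. A minimal userspace sketch of how that value decodes (illustrative only: it mirrors the ARGM_* defines and the upper-16-bit const_buf size added above; describe_argm() itself is an invented name):

    #include <stdbool.h>
    #include <stdio.h>

    #define ARGM_INDEX_MASK     0xf
    #define ARGM_RETURN_COPY    (1 << 4)
    #define ARGM_MAX_DATA       (1 << 5)
    #define ARGM_USERSPACE_DATA (1 << 6)

    /* Hypothetical helper: unpack a meta value as produced by getMetaValue(). */
    static void describe_argm(unsigned long argm)
    {
            unsigned int size_arg_index = argm & ARGM_INDEX_MASK; /* bits 0-3 */
            bool return_copy = argm & ARGM_RETURN_COPY;           /* bit 4 */
            bool max_data = argm & ARGM_MAX_DATA;                 /* bit 5 */
            bool userspace = argm & ARGM_USERSPACE_DATA;          /* bit 6 */
            unsigned int const_buf_size = argm >> 16;             /* bits 16-31 */

            printf("size_arg_index=%u return_copy=%d max_data=%d userspace=%d const_buf_size=%u\n",
                   size_arg_index, return_copy, max_data, userspace, const_buf_size);
    }

    int main(void)
    {
            /* e.g. a 64-byte const_buf argument that lives in userspace memory */
            describe_argm((64UL << 16) | ARGM_USERSPACE_DATA);
            return 0;
    }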
From 7d5190419adb4019958b75ac3a828ffa12d7e9e9 Mon Sep 17 00:00:00 2001
From: Kevin Sheldrake
Date: Tue, 9 Apr 2024 17:11:48 +0100
Subject: [PATCH 03/11] Memory: Swap probe_read to kernel or user version

We should always use the probe_read_kernel or probe_read_user helpers
over the probe_read helper (ditto for _str versions). This commit
changes most probe_read to either probe_read_kernel or probe_read_user
(ditto for _str versions).

Signed-off-by: Kevin Sheldrake
---
 bpf/cgroup/bpf_cgroup_events.h       |   2 +-
 bpf/include/api.h                    |   3 +
 bpf/lib/bpf_cgroup.h                 |  18 ++--
 bpf/lib/bpf_task.h                   |  27 +++---
 bpf/lib/process.h                    |   2 +-
 bpf/libbpf/bpf_tracing.h             |   2 +
 bpf/process/bpf_execve_event.c       |  18 ++--
 bpf/process/bpf_exit.h               |   4 +-
 bpf/process/bpf_generic_tracepoint.c |  16 +--
 bpf/process/bpf_loader.c             |   6 +-
 bpf/process/bpf_process_event.h      | 140 +++++++++++++--------------
 bpf/process/generic_calls.h          |   2 +-
 bpf/process/types/basic.h            |  58 ++++++-----
 13 files changed, 150 insertions(+), 148 deletions(-)

diff --git a/bpf/cgroup/bpf_cgroup_events.h b/bpf/cgroup/bpf_cgroup_events.h
index 026ac358617..9227eaf85e1 100644
--- a/bpf/cgroup/bpf_cgroup_events.h
+++ b/bpf/cgroup/bpf_cgroup_events.h
@@ -49,7 +49,7 @@ send_cgrp_event(struct bpf_raw_tracepoint_args *ctx,
 	msg->cgrp_data.level = cgrp_track->level;
 	msg->cgrp_data.hierarchy_id = cgrp_track->hierarchy_id;
 	memcpy(&msg->cgrp_data.name, &cgrp_track->name, KN_NAME_LENGTH);
-	probe_read_str(&msg->path, PATH_MAP_SIZE - 1, path);
+	probe_read_kernel_str(&msg->path, PATH_MAP_SIZE - 1, path);
 
 	perf_event_output_metric(ctx, MSG_OP_CGROUP, &tcpmon_map,
 				 BPF_F_CURRENT_CPU, msg, size);
diff --git a/bpf/include/api.h b/bpf/include/api.h
index 9c272ab738b..7cfaa944c53 100644
--- a/bpf/include/api.h
+++ b/bpf/include/api.h
@@ -204,6 +204,9 @@ static int BPF_FUNC(fib_lookup, void *ctx, struct bpf_fib_lookup *params, uint32
 static int BPF_FUNC(probe_read, void *dst, uint32_t size, const void *src);
 static int BPF_FUNC(probe_read_str, void *dst, int size, const void *src);
 static int BPF_FUNC(probe_read_kernel, void *dst, uint32_t size, const void *src);
+static int BPF_FUNC(probe_read_kernel_str, void *dst, int size, const void *src);
+static int BPF_FUNC(probe_read_user, void *dst, uint32_t size, const void *src);
+static int BPF_FUNC(probe_read_user_str, void *dst, int size, const void *src);
 
 static uint64_t BPF_FUNC(get_current_task);
diff --git a/bpf/lib/bpf_cgroup.h b/bpf/lib/bpf_cgroup.h
index 94ccb106d7c..ffe01e170e2 100644
--- a/bpf/lib/bpf_cgroup.h
+++ b/bpf/lib/bpf_cgroup.h
@@ -109,7 +109,7 @@ __get_cgroup_kn_name(const struct kernfs_node *kn)
 	const char *name = NULL;
 
 	if (kn)
-		probe_read(&name, sizeof(name), _(&kn->name));
+		probe_read_kernel(&name, sizeof(name), _(&kn->name));
 
 	return name;
 }
@@ -139,7 +139,7 @@ __get_cgroup_kn_id(const struct kernfs_node *kn)
 		if (BPF_CORE_READ_INTO(&id, old_kn, id.id) != 0)
 			return 0;
 	} else {
-		probe_read(&id, sizeof(id), _(&kn->id));
+		probe_read_kernel(&id, sizeof(id), _(&kn->id));
 	}
 
 	return id;
@@ -157,7 +157,7 @@ __get_cgroup_kn(const struct cgroup *cgrp)
 	struct kernfs_node *kn = NULL;
 
 	if (cgrp)
-		probe_read(&kn, sizeof(cgrp->kn), _(&cgrp->kn));
+		probe_read_kernel(&kn, sizeof(cgrp->kn), _(&cgrp->kn));
 
 	return kn;
 }
@@ -187,7 +187,7 @@ get_cgroup_hierarchy_id(const struct cgroup *cgrp)
  * @cgrp: target cgroup
  *
  * Returns a pointer to the cgroup node name on success that can
- * be read with probe_read(). NULL on failures.
+ * be read with probe_read_kernel(). NULL on failures.
 */
 static inline __attribute__((always_inline)) const char *
 get_cgroup_name(const struct cgroup *cgrp)
@@ -214,7 +214,7 @@ get_cgroup_level(const struct cgroup *cgrp)
 {
 	__u32 level = 0;
 
-	probe_read(&level, sizeof(level), _(&cgrp->level));
+	probe_read_kernel(&level, sizeof(level), _(&cgrp->level));
 	return level;
 }
@@ -264,7 +264,7 @@ get_task_cgroup(struct task_struct *task, __u32 subsys_idx, __u32 *error_flags)
 	struct css_set *cgroups;
 	struct cgroup *cgrp = NULL;
 
-	probe_read(&cgroups, sizeof(cgroups), _(&task->cgroups));
+	probe_read_kernel(&cgroups, sizeof(cgroups), _(&task->cgroups));
 	if (unlikely(!cgroups)) {
 		*error_flags |= EVENT_ERROR_CGROUPS;
 		return cgrp;
@@ -297,13 +297,13 @@ get_task_cgroup(struct task_struct *task, __u32 subsys_idx, __u32 *error_flags)
 	 * support as much as workload as possible. It also reduces errors
 	 * in a significant way.
 	 */
-	probe_read(&subsys, sizeof(subsys), _(&cgroups->subsys[subsys_idx]));
+	probe_read_kernel(&subsys, sizeof(subsys), _(&cgroups->subsys[subsys_idx]));
 	if (unlikely(!subsys)) {
 		*error_flags |= EVENT_ERROR_CGROUP_SUBSYS;
 		return cgrp;
 	}
 
-	probe_read(&cgrp, sizeof(cgrp), _(&subsys->cgroup));
+	probe_read_kernel(&cgrp, sizeof(cgrp), _(&subsys->cgroup));
 	if (!cgrp)
 		*error_flags |= EVENT_ERROR_CGROUP_SUBSYSCGRP;
@@ -426,7 +426,7 @@ __init_cgrp_tracking_val_heap(struct cgroup *cgrp, cgroup_state state)
 	kn = __get_cgroup_kn(cgrp);
 	name = __get_cgroup_kn_name(kn);
 	if (name)
-		probe_read_str(&heap->name, KN_NAME_LENGTH - 1, name);
+		probe_read_kernel_str(&heap->name, KN_NAME_LENGTH - 1, name);
 
 	return heap;
 }
diff --git a/bpf/lib/bpf_task.h b/bpf/lib/bpf_task.h
index f4e2e9bc13a..c1e2713642d 100644
--- a/bpf/lib/bpf_task.h
+++ b/bpf/lib/bpf_task.h
@@ -7,6 +7,7 @@
 #include "bpf_event.h"
 #include "bpf_helpers.h"
 #include "generic.h"
+#include "bpf_tracing.h"
 
 /* __d_path_local flags */
 // #define UNRESOLVED_MOUNT_POINTS 0x01 // (deprecated)
@@ -27,7 +28,7 @@ get_parent(struct task_struct *t)
 	struct task_struct *task;
 
 	/* Read the real parent */
-	probe_read(&task, sizeof(task), _(&t->real_parent));
+	probe_read_kernel(&task, sizeof(task), _(&t->real_parent));
 	if (!task)
 		return 0;
 	return task;
@@ -47,7 +48,7 @@ get_task_from_pid(__u32 pid)
 			i = TASK_PID_LOOP;
 			continue;
 		}
-		probe_read(&cpid, sizeof(cpid), _(&task->tgid));
+		probe_read_kernel(&cpid, sizeof(cpid), _(&task->tgid));
 		if (cpid == pid) {
 			i = TASK_PID_LOOP;
 			continue;
@@ -70,7 +71,7 @@ static inline __attribute__((always_inline)) __u32 get_task_pid_vnr(void)
 
 	thread_pid_exists = bpf_core_field_exists(task->thread_pid);
 	if (thread_pid_exists) {
-		probe_read(&pid, sizeof(pid), _(&task->thread_pid));
+		probe_read_kernel(&pid, sizeof(pid), _(&task->thread_pid));
 		if (!pid)
 			return 0;
 	} else {
@@ -85,16 +86,16 @@ static inline __attribute__((always_inline)) __u32 get_task_pid_vnr(void)
 		if (!thread_pid_exists)
 			link_sz = 24; // voodoo magic, hard-code 24 to init stack
-		probe_read(&link, link_sz,
-			   (void *)_(&task->pids) + (PIDTYPE_PID * link_sz));
+		probe_read_kernel(&link, link_sz,
+				  (void *)_(&task->pids) + (PIDTYPE_PID * link_sz));
 		pid = link.pid;
 	}
 	upid_sz = bpf_core_field_size(pid->numbers[0]);
-	probe_read(&level, sizeof(level), _(&pid->level));
+	probe_read_kernel(&level, sizeof(level), _(&pid->level));
 	if (level < 1)
 		return 0;
-	probe_read(&upid, upid_sz,
-		   (void *)_(&pid->numbers) + (level * upid_sz));
+	probe_read_kernel(&upid, upid_sz,
+			  (void *)_(&pid->numbers) + (level * upid_sz));
 	return upid.nr;
 }
@@ -106,7 +107,7 @@ event_find_parent_pid(struct task_struct *t)
 	if (!task)
 		return 0;
-	probe_read(&pid, sizeof(pid), _(&task->tgid));
+	probe_read_kernel(&pid, sizeof(pid), _(&task->tgid));
 	return pid;
 }
@@ -119,10 +120,10 @@ __event_find_parent(struct task_struct *task)
 
#pragma unroll
 	for (i = 0; i < 4; i++) {
-		probe_read(&task, sizeof(task), _(&task->real_parent));
+		probe_read_kernel(&task, sizeof(task), _(&task->real_parent));
 		if (!task)
 			break;
-		probe_read(&pid, sizeof(pid), _(&task->tgid));
+		probe_read_kernel(&pid, sizeof(pid), _(&task->tgid));
 		value = execve_map_get_noinit(pid);
 		if (value && value->key.ktime != 0)
 			return value;
@@ -164,13 +165,13 @@ event_find_curr(__u32 *ppid, bool *walked)
 
#pragma unroll
 	for (i = 0; i < 4; i++) {
-		probe_read(&pid, sizeof(pid), _(&task->tgid));
+		probe_read_kernel(&pid, sizeof(pid), _(&task->tgid));
 		value = execve_map_get_noinit(pid);
 		if (value && value->key.ktime != 0)
 			break;
 		value = 0;
 		*walked = 1;
-		probe_read(&task, sizeof(task), _(&task->real_parent));
+		probe_read_kernel(&task, sizeof(task), _(&task->real_parent));
 		if (!task)
 			break;
 	}
diff --git a/bpf/lib/process.h b/bpf/lib/process.h
index 4dc7aac4b05..ee8f67e0989 100644
--- a/bpf/lib/process.h
+++ b/bpf/lib/process.h
@@ -51,7 +51,7 @@
 * Now we want to read this with call 45 aka probe_read_str as follows,
 * where 'kernel_struct_arg' is the kernel data struct we are reading.
 *
- *   probe_read_str(args[offset], size, kernel_struct_arg)
+ *   probe_read_kernel_str(args[offset], size, kernel_struct_arg)
 *
 * But we have a bit of a problem determining if 'size' is out of array
 * range. The math would be,
diff --git a/bpf/libbpf/bpf_tracing.h b/bpf/libbpf/bpf_tracing.h
index 789556811e2..42129a99344 100644
--- a/bpf/libbpf/bpf_tracing.h
+++ b/bpf/libbpf/bpf_tracing.h
@@ -2,6 +2,8 @@
 #ifndef __BPF_TRACING_H__
 #define __BPF_TRACING_H__
 
+#include "bpf_core_read.h"
+
 /* Scan the ARCH passed in from ARCH env variable (see Makefile) */
 #if defined(__TARGET_ARCH_x86)
 #define bpf_target_x86
diff --git a/bpf/process/bpf_execve_event.c b/bpf/process/bpf_execve_event.c
index 85600f8d80a..c05bc9667e7 100644
--- a/bpf/process/bpf_execve_event.c
+++ b/bpf/process/bpf_execve_event.c
@@ -41,13 +41,13 @@ read_args(void *ctx, struct msg_execve_event *event)
 	long off;
 	int err;
 
-	probe_read(&mm, sizeof(mm), _(&task->mm));
+	probe_read_kernel(&mm, sizeof(mm), _(&task->mm));
 	if (!mm)
 		return 0;
 
-	probe_read(&start_stack, sizeof(start_stack),
-		   _(&mm->arg_start));
-	probe_read(&end_stack, sizeof(start_stack), _(&mm->arg_end));
+	probe_read_kernel(&start_stack, sizeof(start_stack),
+			  _(&mm->arg_start));
+	probe_read_kernel(&end_stack, sizeof(start_stack), _(&mm->arg_end));
 	if (!start_stack || !end_stack)
 		return 0;
@@ -58,7 +58,7 @@ read_args(void *ctx, struct msg_execve_event *event)
 		return 0;
 
 	/* poor man's strlen */
-	off = probe_read_str(&heap->maxpath, 4096, (char *)start_stack);
+	off = probe_read_user_str(&heap->maxpath, 4096, (char *)start_stack);
 	if (off < 0)
 		return 0;
 
@@ -78,7 +78,7 @@ read_args(void *ctx, struct msg_execve_event *event)
 	if (args_size < BUFFER && args_size < free_size) {
 		size = args_size & 0x3ff /* BUFFER - 1 */;
-		err = probe_read(args, size, (char *)start_stack);
+		err = probe_read_user(args, size, (char *)start_stack);
 		if (err < 0) {
 			p->flags |= EVENT_ERROR_ARGS;
 			size = 0;
@@ -104,7 +104,7 @@ read_path(void *ctx, struct msg_execve_event *event, void *filename)
 
 	earg = (void *)p + offsetof(struct msg_process, args);
 
-	size = probe_read_str(earg, MAXARGLENGTH - 1, filename);
+	size = probe_read_kernel_str(earg, MAXARGLENGTH - 1, filename);
 	if (size < 0) {
 		flags |= EVENT_ERROR_FILENAME;
 		size = 0;
@@ -305,7 +305,7 @@ execve_send(struct sched_execve_args *ctx)
#ifdef __LARGE_BPF_PROG
 	// read from proc exe stored at execve time
 	if (event->exe.len <= BINARY_PATH_MAX_LEN) {
-		curr->bin.path_length = probe_read(curr->bin.path, event->exe.len, event->exe.off);
+		curr->bin.path_length = probe_read_kernel(curr->bin.path, event->exe.len, event->exe.off);
 		if (curr->bin.path_length == 0)
 			curr->bin.path_length = event->exe.len;
 	}
@@ -313,7 +313,7 @@ execve_send(struct sched_execve_args *ctx)
 	// reuse p->args first string that contains the filename, this can't be
 	// above 256 in size (otherwise the complete will be send via data msg)
 	// which is okay because we need the 256 first bytes.
-	curr->bin.path_length = probe_read_str(curr->bin.path, BINARY_PATH_MAX_LEN, &p->args);
+	curr->bin.path_length = probe_read_kernel_str(curr->bin.path, BINARY_PATH_MAX_LEN, &p->args);
 	if (curr->bin.path_length > 1) {
 		// don't include the NULL byte in the length
 		curr->bin.path_length--;
diff --git a/bpf/process/bpf_exit.h b/bpf/process/bpf_exit.h
index b683bac7a5d..7df5bdd84af 100644
--- a/bpf/process/bpf_exit.h
+++ b/bpf/process/bpf_exit.h
@@ -63,8 +63,8 @@ static inline __attribute__((always_inline)) void event_exit_send(void *ctx, __u
 	 * entry from the execve_map anyway and explicitly set it to the to tgid.
 	 */
 	exit->info.tid = tgid;
-	probe_read(&exit->info.code, sizeof(exit->info.code),
-		   _(&task->exit_code));
+	probe_read_kernel(&exit->info.code, sizeof(exit->info.code),
+			  _(&task->exit_code));
 
 	perf_event_output_metric(ctx, MSG_OP_EXIT, &tcpmon_map, BPF_F_CURRENT_CPU, exit, size);
 }
diff --git a/bpf/process/bpf_generic_tracepoint.c b/bpf/process/bpf_generic_tracepoint.c
index f84367b9b23..1456784fa22 100644
--- a/bpf/process/bpf_generic_tracepoint.c
+++ b/bpf/process/bpf_generic_tracepoint.c
@@ -69,14 +69,14 @@ static inline __attribute__((always_inline)) unsigned long get_ctx_ul(void *src,
 	case u64_ty: {
 		u64 ret;
 
-		probe_read(&ret, sizeof(u64), src);
+		probe_read_kernel(&ret, sizeof(u64), src);
 		return ret;
 	}
 
 	case size_type: {
 		size_t ret;
 
-		probe_read(&ret, sizeof(size_t), src);
+		probe_read_kernel(&ret, sizeof(size_t), src);
 		return (unsigned long)ret;
 	}
 
@@ -84,7 +84,7 @@ static inline __attribute__((always_inline)) unsigned long get_ctx_ul(void *src,
 	case s32_ty: {
 		s32 ret;
 
-		probe_read(&ret, sizeof(u32), src);
+		probe_read_kernel(&ret, sizeof(u32), src);
 		return ret;
 	}
 
@@ -92,21 +92,21 @@ static inline __attribute__((always_inline)) unsigned long get_ctx_ul(void *src,
 	case u32_ty: {
 		u32 ret;
 
-		probe_read(&ret, sizeof(u32), src);
+		probe_read_kernel(&ret, sizeof(u32), src);
 		return ret;
 	}
 
 	case char_buf:
 	case string_type: {
 		char *buff;
 
-		probe_read(&buff, sizeof(char *), src);
+		probe_read_kernel(&buff, sizeof(char *), src);
 		return (unsigned long)buff;
 	}
 
 	case data_loc_type: {
 		u32 ret;
 
-		probe_read(&ret, sizeof(ret), src);
+		probe_read_kernel(&ret, sizeof(ret), src);
 		return ret;
 	}
 
@@ -117,14 +117,14 @@ static inline __attribute__((always_inline)) unsigned long get_ctx_ul(void *src,
 	case skb_type: {
 		struct sk_buff *skb;
 
-		probe_read(&skb, sizeof(struct sk_buff *), src);
+		probe_read_kernel(&skb, sizeof(struct sk_buff *), src);
 		return (unsigned long)skb;
 	}
 
 	case sock_type: {
 		struct sock *sk;
 
-		probe_read(&sk, sizeof(struct sock *), src);
+		probe_read_kernel(&sk, sizeof(struct sock *), src);
 		return (unsigned long)sk;
 	}
diff --git a/bpf/process/bpf_loader.c b/bpf/process/bpf_loader.c
index 332fc289bfd..7eca0630b3c 100644
--- a/bpf/process/bpf_loader.c
+++ b/bpf/process/bpf_loader.c
@@ -115,11 +115,11 @@ loader_kprobe(struct pt_regs *ctx)
 	if (!msg->buildid_size)
 		return 0;
 
-	probe_read(&msg->buildid[0], sizeof(msg->buildid),
-		   _(&mmap_event->build_id[0]));
+	probe_read_kernel(&msg->buildid[0], sizeof(msg->buildid),
+			  _(&mmap_event->build_id[0]));
 
 	path = BPF_CORE_READ(mmap_event, file_name);
-	len = probe_read_str(&msg->path, sizeof(msg->path), path);
+	len = probe_read_kernel_str(&msg->path, sizeof(msg->path), path);
 	msg->path_size = (__u32)len;
 
 	msg->pid = tgid;
diff --git a/bpf/process/bpf_process_event.h b/bpf/process/bpf_process_event.h
index 2b381b2f6f7..4b02209db13 100644
--- a/bpf/process/bpf_process_event.h
+++ b/bpf/process/bpf_process_event.h
@@ -39,15 +39,15 @@ __get_auid(struct task_struct *task)
 		return auid;
 
 	if (bpf_core_field_exists(task->loginuid)) {
-		probe_read(&auid, sizeof(auid), _(&task->loginuid.val));
+		probe_read_kernel(&auid, sizeof(auid), _(&task->loginuid.val));
 	} else {
 		struct audit_task_info *audit;
 
 		if (bpf_core_field_exists(task->audit)) {
-			probe_read(&audit, sizeof(audit), _(&task->audit));
+			probe_read_kernel(&audit, sizeof(audit), _(&task->audit));
 			if (audit) {
-				probe_read(&auid, sizeof(__u32),
-					   _(&audit->loginuid));
+				probe_read_kernel(&auid, sizeof(__u32),
+						  _(&audit->loginuid));
 			}
 		}
 	}
@@ -88,7 +88,7 @@ static inline __attribute__((always_inline)) bool IS_ROOT(struct dentry *dentry)
 {
 	struct dentry *d_parent;
 
-	probe_read(&d_parent, sizeof(d_parent), _(&dentry->d_parent));
+	probe_read_kernel(&d_parent, sizeof(d_parent), _(&dentry->d_parent));
 	return (dentry == d_parent);
 }
 
@@ -97,7 +97,7 @@ hlist_bl_unhashed(const struct hlist_bl_node *h)
 {
 	struct hlist_bl_node **pprev;
 
-	probe_read(&pprev, sizeof(pprev), _(&h->pprev));
+	probe_read_kernel(&pprev, sizeof(pprev), _(&h->pprev));
 	return !pprev;
 }
 
@@ -153,7 +153,7 @@ prepend_name(char *buf, char **bufptr, int *buflen, const char *name, u32 namele
 	// Needed to bound that for probe_read call.
 	asm volatile("%[namelen] &= 0xff;\n" ::[namelen] "+r"(namelen)
 		     :);
-	probe_read(buf + buffer_offset + write_slash, namelen * sizeof(char), name);
+	probe_read_kernel(buf + buffer_offset + write_slash, namelen * sizeof(char), name);
 
 	*bufptr = buf + buffer_offset;
 	return write_slash ? 0 : -ENAMETOOLONG;
@@ -204,28 +204,28 @@ cwd_read(struct cwd_read_data *data)
 		return 1;
 	}
 
-	probe_read(&vfsmnt_mnt_root, sizeof(vfsmnt_mnt_root),
-		   _(&vfsmnt->mnt_root));
+	probe_read_kernel(&vfsmnt_mnt_root, sizeof(vfsmnt_mnt_root),
+			  _(&vfsmnt->mnt_root));
 	if (dentry == vfsmnt_mnt_root || IS_ROOT(dentry)) {
 		struct mount *parent;
 
-		probe_read(&parent, sizeof(parent), _(&mnt->mnt_parent));
+		probe_read_kernel(&parent, sizeof(parent), _(&mnt->mnt_parent));
 
 		/* Global root? */
 		if (data->mnt != parent) {
-			probe_read(&data->dentry, sizeof(data->dentry),
-				   _(&mnt->mnt_mountpoint));
+			probe_read_kernel(&data->dentry, sizeof(data->dentry),
+					  _(&mnt->mnt_mountpoint));
 			data->mnt = parent;
-			probe_read(&data->vfsmnt, sizeof(data->vfsmnt),
-				   _(&mnt->mnt));
+			probe_read_kernel(&data->vfsmnt, sizeof(data->vfsmnt),
+					  _(&mnt->mnt));
 			return 0;
 		}
 		// resolved all path components successfully
 		data->resolved = true;
 		return 1;
 	}
-	probe_read(&parent, sizeof(parent), _(&dentry->d_parent));
-	probe_read(&d_name, sizeof(d_name), _(&dentry->d_name));
+	probe_read_kernel(&parent, sizeof(parent), _(&dentry->d_parent));
+	probe_read_kernel(&d_name, sizeof(d_name), _(&dentry->d_name));
 	error = prepend_name(data->bf, &data->bptr, &data->blen,
 			     (const char *)d_name.name, d_name.len);
 	// This will happen where the dentry name does not fit in the buffer.
@@ -256,11 +256,11 @@ prepend_path(const struct path *path, const struct path *root, char *bf,
 	};
 	int error = 0;
 
-	probe_read(&data.root_dentry, sizeof(data.root_dentry),
-		   _(&root->dentry));
-	probe_read(&data.root_mnt, sizeof(data.root_mnt), _(&root->mnt));
-	probe_read(&data.dentry, sizeof(data.dentry), _(&path->dentry));
-	probe_read(&data.vfsmnt, sizeof(data.vfsmnt), _(&path->mnt));
+	probe_read_kernel(&data.root_dentry, sizeof(data.root_dentry),
+			  _(&root->dentry));
+	probe_read_kernel(&data.root_mnt, sizeof(data.root_mnt), _(&root->mnt));
+	probe_read_kernel(&data.dentry, sizeof(data.dentry), _(&path->dentry));
+	probe_read_kernel(&data.vfsmnt, sizeof(data.vfsmnt), _(&path->mnt));
 	data.mnt = real_mount(data.vfsmnt);
 
#ifndef __V61_BPF_PROG
@@ -290,7 +290,7 @@ path_with_deleted(const struct path *path, const struct path *root, char *bf,
 {
 	struct dentry *dentry;
 
-	probe_read(&dentry, sizeof(dentry), _(&path->dentry));
+	probe_read_kernel(&dentry, sizeof(dentry), _(&path->dentry));
 	if (d_unlinked(dentry)) {
 		int error = prepend(buf, buflen, " (deleted)", 10);
 		if (error) // will never happen as prepend will never return a value != 0
@@ -342,7 +342,7 @@ __d_path_local(const struct path *path, char *buf, int *buflen, int *error)
 	struct fs_struct *fs;
 
 	task = (struct task_struct *)get_current_task();
-	probe_read(&fs, sizeof(fs), _(&task->fs));
+	probe_read_kernel(&fs, sizeof(fs), _(&task->fs));
 	*error = path_with_deleted(path, _(&fs->root), buf, &res, buflen);
 	return res;
 }
@@ -386,7 +386,7 @@ getcwd(struct msg_process *curr, __u32 offset, __u32 proc_pid)
 	int flags = 0, size;
 	char *buffer;
 
-	probe_read(&fs, sizeof(fs), _(&task->fs));
+	probe_read_kernel(&fs, sizeof(fs), _(&task->fs));
 	if (!fs) {
 		curr->flags |= EVENT_ERROR_CWD;
 		return 0;
@@ -400,7 +400,7 @@ getcwd(struct msg_process *curr, __u32 offset, __u32 proc_pid)
 		     :);
 	asm volatile("%[size] &= 0xff;\n" ::[size] "+r"(size)
 		     :);
-	probe_read((char *)curr + offset, size, buffer);
+	probe_read_kernel((char *)curr + offset, size, buffer);
 
 	// Unfortunate special case for '/' where nothing was added we need
 	// to truncate with '\n' for parser.
@@ -421,9 +421,9 @@ event_set_clone(struct msg_process *pid)
 static inline __attribute__((always_inline)) void
 __get_caps(struct msg_capabilities *msg, const struct cred *cred)
 {
-	probe_read(&msg->effective, sizeof(__u64), _(&cred->cap_effective));
-	probe_read(&msg->inheritable, sizeof(__u64), _(&cred->cap_inheritable));
-	probe_read(&msg->permitted, sizeof(__u64), _(&cred->cap_permitted));
+	probe_read_kernel(&msg->effective, sizeof(__u64), _(&cred->cap_effective));
+	probe_read_kernel(&msg->inheritable, sizeof(__u64), _(&cred->cap_inheritable));
+	probe_read_kernel(&msg->permitted, sizeof(__u64), _(&cred->cap_permitted));
 }
 
 /* @get_current_subj_caps:
@@ -463,7 +463,7 @@ get_current_subj_caps(struct msg_capabilities *msg, struct task_struct *task)
 	const struct cred *cred;
 
 	/* Get the task's subjective creds */
-	probe_read(&cred, sizeof(cred), _(&task->cred));
+	probe_read_kernel(&cred, sizeof(cred), _(&task->cred));
 	__get_caps(msg, cred);
 }
@@ -473,17 +473,17 @@ get_current_subj_creds(struct msg_cred *info, struct task_struct *task)
 	const struct cred *cred;
 
 	/* Get the task's subjective creds */
-	probe_read(&cred, sizeof(cred), _(&task->cred));
-
-	probe_read(&info->uid, sizeof(__u32), _(&cred->uid));
-	probe_read(&info->gid, sizeof(__u32), _(&cred->gid));
-	probe_read(&info->euid, sizeof(__u32), _(&cred->euid));
-	probe_read(&info->egid, sizeof(__u32), _(&cred->egid));
-	probe_read(&info->suid, sizeof(__u32), _(&cred->suid));
-	probe_read(&info->sgid, sizeof(__u32), _(&cred->sgid));
-	probe_read(&info->fsuid, sizeof(__u32), _(&cred->fsuid));
-	probe_read(&info->fsgid, sizeof(__u32), _(&cred->fsgid));
-	probe_read(&info->securebits, sizeof(__u32), _(&cred->securebits));
+	probe_read_kernel(&cred, sizeof(cred), _(&task->cred));
+
+	probe_read_kernel(&info->uid, sizeof(__u32), _(&cred->uid));
+	probe_read_kernel(&info->gid, sizeof(__u32), _(&cred->gid));
+	probe_read_kernel(&info->euid, sizeof(__u32), _(&cred->euid));
+	probe_read_kernel(&info->egid, sizeof(__u32), _(&cred->egid));
+	probe_read_kernel(&info->suid, sizeof(__u32), _(&cred->suid));
+	probe_read_kernel(&info->sgid, sizeof(__u32), _(&cred->sgid));
+	probe_read_kernel(&info->fsuid, sizeof(__u32), _(&cred->fsuid));
+	probe_read_kernel(&info->fsgid, sizeof(__u32), _(&cred->fsgid));
+	probe_read_kernel(&info->securebits, sizeof(__u32), _(&cred->securebits));
 
 	/* Get capabilities */
 	__get_caps(&info->caps, cred);
@@ -495,55 +495,55 @@ get_namespaces(struct msg_ns *msg, struct task_struct *task)
 	struct nsproxy *nsproxy;
 	struct nsproxy nsp;
 
-	probe_read(&nsproxy, sizeof(nsproxy), _(&task->nsproxy));
-	probe_read(&nsp, sizeof(nsp), _(nsproxy));
+	probe_read_kernel(&nsproxy, sizeof(nsproxy), _(&task->nsproxy));
+	probe_read_kernel(&nsp, sizeof(nsp), _(nsproxy));
 
-	probe_read(&msg->uts_inum, sizeof(msg->uts_inum),
-		   _(&nsp.uts_ns->ns.inum));
-	probe_read(&msg->ipc_inum, sizeof(msg->ipc_inum),
-		   _(&nsp.ipc_ns->ns.inum));
-	probe_read(&msg->mnt_inum, sizeof(msg->mnt_inum),
-		   _(&nsp.mnt_ns->ns.inum));
+	probe_read_kernel(&msg->uts_inum, sizeof(msg->uts_inum),
+			  _(&nsp.uts_ns->ns.inum));
+	probe_read_kernel(&msg->ipc_inum, sizeof(msg->ipc_inum),
+			  _(&nsp.ipc_ns->ns.inum));
+	probe_read_kernel(&msg->mnt_inum, sizeof(msg->mnt_inum),
+			  _(&nsp.mnt_ns->ns.inum));
 	{
 		struct pid *p = 0;
 
-		probe_read(&p, sizeof(p), _(&task->thread_pid));
+		probe_read_kernel(&p, sizeof(p), _(&task->thread_pid));
 		if (p) {
 			int level = 0;
 			struct upid up;
 
-			probe_read(&level, sizeof(level), _(&p->level));
-			probe_read(&up, sizeof(up), _(&p->numbers[level]));
-			probe_read(&msg->pid_inum, sizeof(msg->pid_inum),
-				   _(&up.ns->ns.inum));
+			probe_read_kernel(&level, sizeof(level), _(&p->level));
+			probe_read_kernel(&up, sizeof(up), _(&p->numbers[level]));
+			probe_read_kernel(&msg->pid_inum, sizeof(msg->pid_inum),
+					  _(&up.ns->ns.inum));
 		} else
 			msg->pid_inum = 0;
 	}
-	probe_read(&msg->pid_for_children_inum,
-		   sizeof(msg->pid_for_children_inum),
-		   _(&nsp.pid_ns_for_children->ns.inum));
-	probe_read(&msg->net_inum, sizeof(msg->net_inum),
-		   _(&nsp.net_ns->ns.inum));
+	probe_read_kernel(&msg->pid_for_children_inum,
+			  sizeof(msg->pid_for_children_inum),
+			  _(&nsp.pid_ns_for_children->ns.inum));
+	probe_read_kernel(&msg->net_inum, sizeof(msg->net_inum),
+			  _(&nsp.net_ns->ns.inum));
 
 	// this also includes time_ns_for_children
 	if (bpf_core_field_exists(nsproxy->time_ns)) {
-		probe_read(&msg->time_inum, sizeof(msg->time_inum),
-			   _(&nsp.time_ns->ns.inum));
-		probe_read(&msg->time_for_children_inum,
-			   sizeof(msg->time_for_children_inum),
-			   _(&nsp.time_ns_for_children->ns.inum));
+		probe_read_kernel(&msg->time_inum, sizeof(msg->time_inum),
+				  _(&nsp.time_ns->ns.inum));
+		probe_read_kernel(&msg->time_for_children_inum,
+				  sizeof(msg->time_for_children_inum),
+				  _(&nsp.time_ns_for_children->ns.inum));
 	}
 
-	probe_read(&msg->cgroup_inum, sizeof(msg->cgroup_inum),
-		   _(&nsp.cgroup_ns->ns.inum));
+	probe_read_kernel(&msg->cgroup_inum, sizeof(msg->cgroup_inum),
+			  _(&nsp.cgroup_ns->ns.inum));
 	{
 		struct mm_struct *mm;
 		struct user_namespace *user_ns;
 
-		probe_read(&mm, sizeof(mm), _(&task->mm));
-		probe_read(&user_ns, sizeof(user_ns), _(&mm->user_ns));
-		probe_read(&msg->user_inum, sizeof(msg->user_inum),
-			   _(&user_ns->ns.inum));
+		probe_read_kernel(&mm, sizeof(mm), _(&task->mm));
+		probe_read_kernel(&user_ns, sizeof(user_ns), _(&mm->user_ns));
+		probe_read_kernel(&msg->user_inum, sizeof(msg->user_inum),
+				  _(&user_ns->ns.inum));
 	}
 }
@@ -565,7 +565,7 @@ __event_get_current_cgroup_name(struct cgroup *cgrp, struct msg_k8s *kube)
 
 	name = get_cgroup_name(cgrp);
 	if (name)
-		probe_read_str(kube->docker_id, KN_NAME_LENGTH, name);
+		probe_read_kernel_str(kube->docker_id, KN_NAME_LENGTH, name);
 
 	return name ? 0 : EVENT_ERROR_CGROUP_NAME;
 }
diff --git a/bpf/process/generic_calls.h b/bpf/process/generic_calls.h
index d23ac848fbc..fc649df5265 100644
--- a/bpf/process/generic_calls.h
+++ b/bpf/process/generic_calls.h
@@ -84,7 +84,7 @@ generic_setup_32bit_syscall(struct msg_generic_kprobe *e, u8 op)
 	case MSG_OP_GENERIC_TRACEPOINT:
 	case MSG_OP_GENERIC_KPROBE:
 		info = (struct thread_info *)get_current_task();
-		probe_read(&status, sizeof(status), _(&info->status));
+		probe_read_kernel(&status, sizeof(status), _(&info->status));
 		e->sel.is32BitSyscall = status & TS_COMPAT;
 	default:
 		break;
diff --git a/bpf/process/types/basic.h b/bpf/process/types/basic.h
index c44c11886aa..0d9745f05a2 100644
--- a/bpf/process/types/basic.h
+++ b/bpf/process/types/basic.h
@@ -453,7 +453,7 @@ copy_path(char *args, const struct path *arg)
 	asm volatile("%[size] &= 0xff;\n" ::[size] "+r"(size)
 		     :);
-	probe_read(curr, size, buffer);
+	probe_read_kernel(curr, size, buffer);
 	*s = size;
 	size += 4;
@@ -885,11 +885,11 @@ filter_char_buf_equal(struct selector_arg_filter *filter, char *arg_str, uint or
 		     : "i"(STRING_MAPS_HEAP_MASK));
#ifdef __LARGE_BPF_PROG
 	if (index <= 5)
-		probe_read(&heap[1], len, arg_str);
+		probe_read_kernel(&heap[1], len, arg_str);
 	else
-		probe_read(&heap[2], len, arg_str);
+		probe_read_kernel(&heap[2], len, arg_str);
#else
-	probe_read(&heap[1], len, arg_str);
+	probe_read_kernel(&heap[1], len, arg_str);
#endif
 
 	// Pad string to multiple of key increment size
@@ -899,11 +899,11 @@ filter_char_buf_equal(struct selector_arg_filter *filter, char *arg_str, uint or
 		     : "i"(STRING_MAPS_HEAP_MASK));
#ifdef __LARGE_BPF_PROG
 		if (index <= 5)
-			probe_read(heap + len + 1, (padded_len - len) & STRING_MAPS_COPY_MASK, zero_heap);
+			probe_read_kernel(heap + len + 1, (padded_len - len) & STRING_MAPS_COPY_MASK, zero_heap);
 		else
-			probe_read(heap + len + 2, (padded_len - len) & STRING_MAPS_COPY_MASK, zero_heap);
+			probe_read_kernel(heap + len + 2, (padded_len - len) & STRING_MAPS_COPY_MASK, zero_heap);
#else
-		probe_read(heap + len + 1, (padded_len - len) & STRING_MAPS_COPY_MASK, zero_heap);
+		probe_read_kernel(heap + len + 1, (padded_len - len) & STRING_MAPS_COPY_MASK, zero_heap);
#endif
 	}
 
@@ -945,7 +945,7 @@ filter_char_buf_prefix(struct selector_arg_filter *filter, char *arg_str, uint a
 		     : [arg_len] "+r"(arg_len)
 		     : [mask] "i"(STRING_PREFIX_MAX_LENGTH - 1));
 
-	probe_read(arg->data, arg_len & (STRING_PREFIX_MAX_LENGTH - 1), arg_str);
+	probe_read_kernel(arg->data, arg_len & (STRING_PREFIX_MAX_LENGTH - 1), arg_str);
 
 	__u8 *pass = map_lookup_elem(addrmap, arg);
 
@@ -1726,7 +1726,7 @@ static inline __attribute__((always_inline)) int match_binaries(__u32 selidx)
 		// prepare the key on the stack to perform lookup in the LPM_TRIE
 		memset(&prefix_key, 0, sizeof(prefix_key));
 		prefix_key.prefixlen = current->bin.path_length * 8; // prefixlen is in bits
-		ret = probe_read(prefix_key.data, current->bin.path_length & (STRING_PREFIX_MAX_LENGTH - 1), current->bin.path);
+		ret = probe_read_kernel(prefix_key.data, current->bin.path_length & (STRING_PREFIX_MAX_LENGTH - 1), current->bin.path);
 		if (ret < 0)
 			return 0;
 		found_key = map_lookup_elem(path_map, &prefix_key);
@@ -1957,8 +1957,8 @@ installfd(struct msg_generic_kprobe *e, int fd, int name, bool follow)
 			     : [size] "+r"(size)
 			     :);
 
-		probe_read(&val.file[0], size + 4 /* size */ + 4 /* flags */,
-			   &e->args[nameoff]);
+		probe_read_kernel(&val.file[0], size + 4 /* size */ + 4 /* flags */,
+				  &e->args[nameoff]);
 		map_update_elem(&fdinstall_map, &key, &val, BPF_ANY);
 	} else {
 		err = map_delete_elem(&fdinstall_map, &key);
@@ -2105,7 +2105,7 @@ rate_limit(__u64 ratelimit_interval, __u64 ratelimit_scope, struct msg_generic_k
 	}
 
 	// Clean the heap
-	probe_read(key->data, MAX_POSSIBLE_ARGS * KEY_BYTES_PER_ARG, ro_heap);
+	probe_read_kernel(key->data, MAX_POSSIBLE_ARGS * KEY_BYTES_PER_ARG, ro_heap);
 	dst = key->data;
 
 	for (i = 0; i < MAX_POSSIBLE_ARGS; i++) {
@@ -2122,7 +2122,7 @@ rate_limit(__u64 ratelimit_interval, __u64 ratelimit_scope, struct msg_generic_k
 			asm volatile("%[arg_size] &= 0x3f;\n" // ensure this mask is greater than KEY_BYTES_PER_ARG
 				     : [arg_size] "+r"(arg_size)
 				     :);
-			probe_read(&dst[index], arg_size, &e->args[key_index]);
+			probe_read_kernel(&dst[index], arg_size, &e->args[key_index]);
 			index += arg_size;
 		}
 	}
@@ -2527,9 +2527,9 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type,
 	      struct bpf_map_def *data_heap)
 {
 	size_t min_size = type_to_min_size(type, argm);
+	const struct path *path_arg = 0;
 	char *args = e->args;
 	long size = -1;
-	const struct path *path_arg = 0;
 
 	if (orig_off >= 16383 - min_size) {
 		return 0;
@@ -2554,38 +2554,34 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type,
 	}
 	// fallthrough to file_ty
 	case file_ty: {
-		struct file *file;
-		probe_read(&file, sizeof(file), &arg);
+		struct file *file = (struct file *)arg;
 		path_arg = _(&file->f_path);
 		goto do_copy_path;
 	}
 	case path_ty: {
-		probe_read(&path_arg, sizeof(path_arg), &arg);
+		path_arg = (struct path *)arg;
 		goto do_copy_path;
 	}
 	case fd_ty: {
 		struct fdinstall_key key = { 0 };
 		struct fdinstall_value *val;
-		__u32 fd;
 
 		key.tid = get_current_pid_tgid() >> 32;
-		probe_read(&fd, sizeof(__u32), &arg);
-		key.fd = fd;
-
+		key.fd = arg;
 		val = map_lookup_elem(&fdinstall_map, &key);
 		if (val) {
 			__u32 bytes = (__u32)val->file[0];
 
-			probe_read(&args[0], sizeof(__u32), &fd);
+			*(__u32 *)args = key.fd;
 			asm volatile("%[bytes] &= 0xff;\n"
 				     : [bytes] "+r"(bytes)
 				     :);
-			probe_read(&args[4], bytes + 4, (char *)&val->file[0]);
+			probe_read_kernel(&args[4], bytes + 4, (char *)&val->file[0]);
 			size = bytes + 4 + 4;
 
 			// flags
-			probe_read(&args[size], 4,
-				   (char *)&val->file[size - 4]);
+			probe_read_kernel(&args[size], 4,
+					  (char *)&val->file[size - 4]);
 			size += 4;
 		} else {
 			/* If filter specification is fd type then we
@@ -2607,8 +2603,8 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type,
 	}
 	break;
#endif
 	case filename_ty: {
-		struct filename *file;
-		probe_read(&file, sizeof(file), &arg);
+		struct filename *file = (struct filename *)arg;
+
 		probe_read(&arg, sizeof(arg), &file->name);
 	}
 	// fallthrough to copy_string
@@ -2631,26 +2627,26 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type,
 	case size_type:
 	case s64_ty:
 	case u64_ty:
-		probe_read(args, sizeof(__u64), &arg);
+		*(__u64 *)args = arg;
 		size = sizeof(__u64);
 		break;
 	/* Consolidate all the types to save instructions */
 	case int_type:
 	case s32_ty:
 	case u32_ty:
-		probe_read(args, sizeof(__u32), &arg);
+		*(__u32 *)args = arg;
 		size = sizeof(__u32);
 		break;
 	case s16_ty:
 	case u16_ty:
 		/* read 2 bytes, but send 4 to keep alignment */
-		probe_read(args, sizeof(__u16), &arg);
+		*(__u16 *)args = arg;
 		size = sizeof(__u32);
 		break;
 	case s8_ty:
 	case u8_ty:
 		/* read 1 byte, but send 4 to keep alignment */
-		probe_read(args, sizeof(__u8), &arg);
+		*(__u8 *)args = arg;
 		size = sizeof(__u32);
 		break;
 	case skb_type:
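[Note] The kernel/user split above follows where each pointer's target lives; read_args() is the clearest case: the mm_struct fields are kernel memory, while the argv bytes they point to are user memory. A stripped-down sketch of that pattern (illustrative only, assuming the Tetragon BPF headers above; the function name is invented and error handling is elided):

    static inline __attribute__((always_inline)) long
    read_user_args_sketch(struct task_struct *task, char *dst, int len)
    {
            struct mm_struct *mm;
            unsigned long arg_start;

            /* task->mm is a kernel pointer: kernel read */
            probe_read_kernel(&mm, sizeof(mm), _(&task->mm));
            if (!mm)
                    return -1;
            /* mm->arg_start is a kernel field holding a *user* address */
            probe_read_kernel(&arg_start, sizeof(arg_start), _(&mm->arg_start));
            /* the bytes at arg_start live in userspace: user read */
            return probe_read_user_str(dst, len, (char *)arg_start);
    }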
From 0cb1f703a32e6191456e1e1e385795cd7a01809e Mon Sep 17 00:00:00 2001
From: Kevin Sheldrake
Date: Tue, 9 Apr 2024 17:17:51 +0100
Subject: [PATCH 04/11] Memory: Add probe_read_kernel_or_user helpers

Some calls to probe_read could be reading either kernel or user
memory. Add some helpers that take a userspace boolean to determine
which type of memory to read.

Also introduce a masked version that masks the size variable with a
given mask immediately before the probe_read_* call (using inline
asm). This prevents the branch on the userspace boolean from spilling
registers after the masking, so the size variable retains its bounds
for the verifier.

Signed-off-by: Kevin Sheldrake
---
 bpf/process/types/probe_read_kernel_or_user.h | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 bpf/process/types/probe_read_kernel_or_user.h

diff --git a/bpf/process/types/probe_read_kernel_or_user.h b/bpf/process/types/probe_read_kernel_or_user.h
new file mode 100644
index 00000000000..105a625a564
--- /dev/null
+++ b/bpf/process/types/probe_read_kernel_or_user.h
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright Authors of Cilium */
+
+#ifndef __PROBE_READ_KERNEL_OR_USER_H__
+#define __PROBE_READ_KERNEL_OR_USER_H__
+
+#include "bpf_core_read.h"
+
+#define bpf_probe_read_kernel probe_read_kernel
+#define bpf_probe_read_user   probe_read_user
+
+#ifdef __LARGE_BPF_PROG
+static inline __attribute__((always_inline)) int
+probe_read_kernel_or_user(void *dst, uint32_t size, const void *src, bool userspace)
+{
+	if (userspace)
+		return probe_read_user(dst, size, src);
+	return probe_read_kernel(dst, size, src);
+}
+
+static inline __attribute__((always_inline)) int
+probe_read_kernel_or_user_masked(void *dst, uint32_t size, uint32_t size_mask, const void *src, bool userspace)
+{
+	if (userspace) {
+		asm volatile("%[size] &= %1;\n"
+			     : [size] "+r"(size)
+			     : "i"(size_mask));
+		return probe_read_user(dst, size, src);
+	}
+	asm volatile("%[size] &= %1;\n"
+		     : [size] "+r"(size)
+		     : "i"(size_mask));
+	return probe_read_kernel(dst, size, src);
+}
+
+static inline __attribute__((always_inline)) int
+probe_read_kernel_or_user_str(void *dst, int size, const void *src, bool userspace)
+{
+	if (userspace)
+		return probe_read_user_str(dst, size, src);
+	return probe_read_kernel_str(dst, size, src);
+}
+#else
+static inline __attribute__((always_inline)) int
+probe_read_kernel_or_user(void *dst, uint32_t size, const void *src, bool userspace)
+{
+	return probe_read(dst, size, src);
+}
+
+static inline __attribute__((always_inline)) int
+probe_read_kernel_or_user_masked(void *dst, uint32_t size, uint32_t size_mask, const void *src, bool userspace)
+{
+	asm volatile("%[size] &= %1;\n"
+		     : [size] "+r"(size)
+		     : "i"(size_mask));
+	return probe_read(dst, size, src);
+}
+
+static inline __attribute__((always_inline)) int
+probe_read_kernel_or_user_str(void *dst, int size, const void *src, bool userspace)
+{
+	return probe_read_str(dst, size, src);
+}
+#endif // __LARGE_BPF_PROG
+
+/*
+ * bpf_core_read_kernel_or_user() abstracts away bpf_probe_read_kernel_or_user() call and captures offset
+ * relocation for source address using __builtin_preserve_access_index()
+ * built-in, provided by Clang.
+ */
+#define bpf_core_read_kernel_or_user(userspace, dst, sz, sz_mask, src)       \
+	probe_read_kernel_or_user(dst, sz, sz_mask,                           \
+				  (const void *)__builtin_preserve_access_index(src), \
+				  userspace)
+
+/*
+ * bpf_core_read_kernel_or_user_str() is a thin wrapper around bpf_probe_read_kernel_or_user_str()
+ * additionally emitting BPF CO-RE field relocation for specified source
+ * argument.
+ */
+#define bpf_core_read_kernel_or_user_str(userspace, dst, sz, src)            \
+	probe_read_kernel_or_user_str(dst, sz,                                \
+				      (const void *)__builtin_preserve_access_index(src), \
+				      userspace)
+
+/*
+ * BPF_CORE_READ_KERNEL_OR_USER_INTO() is a more performance-conscious variant of
+ * BPF_CORE_READ_KERNEL_OR_USER(), in which final field is read into user-provided storage.
+ * See BPF_CORE_READ_KERNEL_OR_USER() below for more details on general usage.
+ */
+#define BPF_CORE_READ_KERNEL_OR_USER_INTO(userspace, dst, src, a, ...)       \
+	({                                                                    \
+		typeof(dst) dst_x = dst;                                      \
+		typeof(src) src_x = src;                                      \
+		typeof(a) a_x = a;                                            \
+		((userspace) ? (___core_read(bpf_core_read_user, dst_x,      \
+					     src_x, a_x, ##__VA_ARGS__))      \
+			     : (___core_read(bpf_core_read, dst_x, src_x, a_x, \
+					     ##__VA_ARGS__)))                 \
+	})
+
+/*
+ * BPF_CORE_READ_KERNEL_OR_USER_STR_INTO() does same "pointer chasing" as
+ * BPF_CORE_READ_KERNEL_OR_USER_STR() for intermediate pointers, but then executes (and returns
+ * corresponding error code) bpf_core_read_kernel_or_user_str() for final string read.
+ */
+#define BPF_CORE_READ_KERNEL_OR_USER_STR_INTO(userspace, dst, src, a, ...)   \
+	({                                                                    \
+		typeof(dst) dst_x = dst;                                      \
+		typeof(src) src_x = src;                                      \
+		typeof(a) a_x = a;                                            \
+		if (userspace)                                                \
+			___core_read(bpf_core_read_user_str, dst_x, src_x,    \
+				     a_x, ##__VA_ARGS__) else ___core_read(bpf_core_read_str, dst_x, src_x, a_x, \
+									   ##__VA_ARGS__) \
+	})
+
+/*
+ * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially
+ * when there are few pointer chasing steps.
+ * E.g., what in non-BPF world (or in BPF w/ BCC) would be something like:
+ *	int x = s->a.b.c->d.e->f->g;
+ * can be succinctly achieved using BPF_CORE_READ as:
+ *	int x = BPF_CORE_READ(s, a.b.c, d.e, f, g);
+ *
+ * BPF_CORE_READ will decompose above statement into 4 bpf_core_read (BPF
+ * CO-RE relocatable bpf_probe_read() wrapper) calls, logically equivalent to:
+ * 1. const void *__t = s->a.b.c;
+ * 2. __t = __t->d.e;
+ * 3. __t = __t->f;
+ * 4. return __t->g;
+ *
+ * Equivalence is logical, because there is a heavy type casting/preservation
+ * involved, as well as all the reads are happening through bpf_probe_read()
+ * calls using __builtin_preserve_access_index() to emit CO-RE relocations.
+ *
+ * N.B. Only up to 9 "field accessors" are supported, which should be more
+ * than enough for any practical purpose.
+ */
+#define BPF_CORE_READ_KERNEL_OR_USER(userspace, src, a, ...)                 \
+	({                                                                    \
+		typeof(src) src_x = src;                                      \
+		typeof(a) a_x = a;                                            \
+		___type(src_x, a_x, ##__VA_ARGS__) __r;                       \
+		BPF_CORE_READ_KERNEL_OR_USER_INTO(userspace, &__r, src_x,     \
+						  a_x, ##__VA_ARGS__);        \
+		__r;                                                          \
+	})
+
+#endif // __PROBE_READ_KERNEL_OR_USER_H__
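[Note] A sketch of how these helpers combine with the ARGM_USERSPACE_DATA bit from patch 02 (illustrative; only the helpers and is_userspace_data() come from the patches, the surrounding function is invented):

    static inline __attribute__((always_inline)) long
    copy_const_buf_sketch(char *args, unsigned long arg, unsigned long argm)
    {
            /* const_buf size travels in the upper 16 bits of the meta value */
            uint32_t size = argm >> 16;

            /* the mask is applied immediately before the read, inside the
             * helper, so the verifier keeps the size bound across the
             * kernel/user branch
             */
            return probe_read_kernel_or_user_masked(args, size, 0x3ff, (char *)arg,
                                                    is_userspace_data(argm));
    }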
These were introduced in the previous commit in the new probe_read_kernel_or_user.h. This commit adds that include to source files that require access to these macros. Signed-off-by: Kevin Sheldrake --- bpf/lib/bpf_cgroup.h | 2 + bpf/lib/bpf_helpers.h | 11 +- bpf/libbpf/bpf_core_read.h | 382 +++++++++++++++++---- bpf/process/bpf_execve_bprm_commit_creds.c | 1 + bpf/process/bpf_exit.c | 1 + bpf/process/bpf_loader.c | 1 + bpf/process/data_event.h | 1 + bpf/process/retprobe_map.h | 3 + bpf/process/types/basic.h | 1 + 9 files changed, 327 insertions(+), 76 deletions(-) diff --git a/bpf/lib/bpf_cgroup.h b/bpf/lib/bpf_cgroup.h index ffe01e170e2..b07c8cd8ef8 100644 --- a/bpf/lib/bpf_cgroup.h +++ b/bpf/lib/bpf_cgroup.h @@ -9,6 +9,8 @@ #include "environ_conf.h" #include "common.h" #include "process.h" +#include "../process/types/probe_read_kernel_or_user.h" +#include "bpf_tracing.h" #define NULL ((void *)0) diff --git a/bpf/lib/bpf_helpers.h b/bpf/lib/bpf_helpers.h index f57410880af..91b5691aac5 100644 --- a/bpf/lib/bpf_helpers.h +++ b/bpf/lib/bpf_helpers.h @@ -43,7 +43,7 @@ * Following define is to assist VSCode Intellisense so that it treats * __builtin_preserve_access_index() as a const void * instead of a * simple void (because it doesn't have a definition for it). This stops - * Intellisense marking all _(P) macros (used in probe_read()) as errors. + * Intellisense marking all _(P) macros (used in probe_read_kernel()) as errors. * To use this, just define VSCODE in 'C/C++: Edit Configurations (JSON)' * in the Command Palette in VSCODE (F1 or View->Command Palette...): * "defines": ["VSCODE"] @@ -54,15 +54,6 @@ const void *__builtin_preserve_access_index(void *); #endif #define _(P) (__builtin_preserve_access_index(P)) -/* - * Convenience macro to check that field actually exists in target kernel's. - * Returns: - * 1, if matching field is present in target kernel; - * 0, if no matching field found. 
- */ -#define bpf_core_field_exists(field) \ - __builtin_preserve_field_info(field, BPF_FIELD_EXISTS) - /* second argument to __builtin_preserve_enum_value() built-in */ enum bpf_enum_value_kind { BPF_ENUMVAL_EXISTS = 0, /* enum value existence in kernel */ diff --git a/bpf/libbpf/bpf_core_read.h b/bpf/libbpf/bpf_core_read.h index 27634770a94..e03ecaa356f 100644 --- a/bpf/libbpf/bpf_core_read.h +++ b/bpf/libbpf/bpf_core_read.h @@ -19,6 +19,12 @@ enum bpf_field_info_kind { BPF_FIELD_RSHIFT_U64 = 5, }; +/* second argument to __builtin_btf_type_id() built-in */ +enum bpf_type_id_kind { + BPF_TYPE_ID_LOCAL = 0, /* BTF type ID in local program */ + BPF_TYPE_ID_TARGET = 1, /* BTF type ID in target kernel */ +}; + /* second argument to __builtin_preserve_type_info() built-in */ enum bpf_type_info_kind { BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ @@ -29,29 +35,31 @@ enum bpf_type_info_kind { #define __CORE_RELO(src, field, info) \ __builtin_preserve_field_info((src)->field, BPF_FIELD_##info) -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ - bpf_probe_read((void *)dst, \ - __CORE_RELO(src, fld, BYTE_SIZE), \ - (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) + bpf_probe_read_kernel( \ + (void *)dst, \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) #else /* semantics of LSHIFT_64 assumes loading values into low-ordered bytes, so * for big-endian we need to adjust destination pointer accordingly, based on * field byte size */ #define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ - bpf_probe_read((void *)dst + (8 - __CORE_RELO(src, fld, BYTE_SIZE)), \ - __CORE_RELO(src, fld, BYTE_SIZE), \ - (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) + bpf_probe_read_kernel( \ + (void *)dst + (8 - __CORE_RELO(src, fld, BYTE_SIZE)), \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) #endif /* * Extract bitfield, identified by s->field, and return its value as u64. * All this is done in relocatable manner, so bitfield changes such as * signedness, bit size, offset changes, this will be handled automatically. - * This version of macro is using bpf_probe_read() to read underlying integer - * storage. Macro functions as an expression and its return type is - * bpf_probe_read()'s return value: 0, on success, <0 on error. + * This version of macro is using bpf_probe_read_kernel() to read underlying + * integer storage. Macro functions as an expression and its return type is + * bpf_probe_read_kernel()'s return value: 0, on success, <0 on error. */ #define BPF_CORE_READ_BITFIELD_PROBED(s, field) ({ \ unsigned long long val = 0; \ @@ -75,11 +83,19 @@ enum bpf_type_info_kind { const void *p = (const void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ unsigned long long val; \ \ + /* This is a so-called barrier_var() operation that makes specified \ + * variable "a black box" for optimizing compiler. \ + * It forces compiler to perform BYTE_OFFSET relocation on p and use \ + * its calculated value in the switch below, instead of applying \ + * the same relocation 4 times for each individual memory load. 
\ + */ \ + asm volatile("" : "=r"(p) : "0"(p)); \ + \ switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ - case 1: val = *(const unsigned char *)p; \ - case 2: val = *(const unsigned short *)p; \ - case 4: val = *(const unsigned int *)p; \ - case 8: val = *(const unsigned long long *)p; \ + case 1: val = *(const unsigned char *)p; break; \ + case 2: val = *(const unsigned short *)p; break; \ + case 4: val = *(const unsigned int *)p; break; \ + case 8: val = *(const unsigned long long *)p; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ @@ -89,21 +105,121 @@ enum bpf_type_info_kind { val; \ }) +/* + * Write to a bitfield, identified by s->field. + * This is the inverse of BPF_CORE_WRITE_BITFIELD(). + */ +#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \ + void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ + unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \ + unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \ + unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \ + unsigned long long mask, val, nval = new_val; \ + unsigned int rpad = rshift - lshift; \ + \ + asm volatile("" : "+r"(p)); \ + \ + switch (byte_size) { \ + case 1: val = *(unsigned char *)p; break; \ + case 2: val = *(unsigned short *)p; break; \ + case 4: val = *(unsigned int *)p; break; \ + case 8: val = *(unsigned long long *)p; break; \ + } \ + \ + mask = (~0ULL << rshift) >> lshift; \ + val = (val & ~mask) | ((nval << rpad) & mask); \ + \ + switch (byte_size) { \ + case 1: *(unsigned char *)p = val; break; \ + case 2: *(unsigned short *)p = val; break; \ + case 4: *(unsigned int *)p = val; break; \ + case 8: *(unsigned long long *)p = val; break; \ + } \ +}) + +/* Differentiator between compilers builtin implementations. This is a + * requirement due to the compiler parsing differences where GCC optimizes + * early in parsing those constructs of type pointers to the builtin specific + * type, resulting in not being possible to collect the required type + * information in the builtin expansion. + */ +#ifdef __clang__ +#define ___bpf_typeof(type) ((typeof(type) *) 0) +#else +#define ___bpf_typeof1(type, NR) ({ \ + extern typeof(type) *___concat(bpf_type_tmp_, NR); \ + ___concat(bpf_type_tmp_, NR); \ +}) +#define ___bpf_typeof(type) ___bpf_typeof1(type, __COUNTER__) +#endif + +#ifdef __clang__ +#define ___bpf_field_ref1(field) (field) +#define ___bpf_field_ref2(type, field) (___bpf_typeof(type)->field) +#else +#define ___bpf_field_ref1(field) (&(field)) +#define ___bpf_field_ref2(type, field) (&(___bpf_typeof(type)->field)) +#endif +#define ___bpf_field_ref(args...) \ + ___bpf_apply(___bpf_field_ref, ___bpf_narg(args))(args) + /* * Convenience macro to check that field actually exists in target kernel's. * Returns: * 1, if matching field is present in target kernel; * 0, if no matching field found. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_exists(p->my_field); + * - field reference through type and field names: + * bpf_core_field_exists(struct my_type, my_field). */ -#define bpf_core_field_exists(field) \ - __builtin_preserve_field_info(field, BPF_FIELD_EXISTS) +#define bpf_core_field_exists(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_EXISTS) /* - * Convenience macro to get byte size of a field. Works for integers, + * Convenience macro to get the byte size of a field. Works for integers, * struct/unions, pointers, arrays, and enums. 
+ * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_size(p->my_field); + * - field reference through type and field names: + * bpf_core_field_size(struct my_type, my_field). + */ +#define bpf_core_field_size(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_SIZE) + +/* + * Convenience macro to get field's byte offset. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_offset(p->my_field); + * - field reference through type and field names: + * bpf_core_field_offset(struct my_type, my_field). + */ +#define bpf_core_field_offset(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_OFFSET) + +/* + * Convenience macro to get BTF type ID of a specified type, using a local BTF + * information. Return 32-bit unsigned integer with type ID from program's own + * BTF. Always succeeds. + */ +#define bpf_core_type_id_local(type) \ + __builtin_btf_type_id(*___bpf_typeof(type), BPF_TYPE_ID_LOCAL) + +/* + * Convenience macro to get BTF type ID of a target kernel's type that matches + * specified local type. + * Returns: + * - valid 32-bit unsigned type ID in kernel BTF; + * - 0, if no matching type was found in a target kernel BTF. */ -#define bpf_core_field_size(field) \ - __builtin_preserve_field_info(field, BPF_FIELD_BYTE_SIZE) +#define bpf_core_type_id_kernel(type) \ + __builtin_btf_type_id(*___bpf_typeof(type), BPF_TYPE_ID_TARGET) /* * Convenience macro to check that provided named type @@ -113,7 +229,27 @@ enum bpf_type_info_kind { * 0, if no matching type is found. */ #define bpf_core_type_exists(type) \ - __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_EXISTS) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_MATCHES) + +/* + * Convenience macro to get the byte size of a provided named type + * (struct/union/enum/typedef) in a target kernel. + * Returns: + * >= 0 size (in bytes), if type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_size(type) \ + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_SIZE) /* * Convenience macro to check that provided enumerator value is defined in @@ -123,8 +259,13 @@ enum bpf_type_info_kind { * kernel's BTF; * 0, if no matching enum and/or enum value within that enum is found. */ +#ifdef __clang__ #define bpf_core_enum_value_exists(enum_type, enum_value) \ __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_EXISTS) +#else +#define bpf_core_enum_value_exists(enum_type, enum_value) \ + __builtin_preserve_enum_value(___bpf_typeof(enum_type), enum_value, BPF_ENUMVAL_EXISTS) +#endif /* * Convenience macro to get the integer value of an enumerator value in @@ -134,12 +275,17 @@ enum bpf_type_info_kind { * present in target kernel's BTF; * 0, if no matching enum and/or enum value within that enum is found. 
*/ -#define bpf_core_enum_value(enum_type, enum_value) \ +#ifdef __clang__ +#define bpf_core_enum_value(enum_type, enum_value) \ __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_VALUE) +#else +#define bpf_core_enum_value(enum_type, enum_value) \ + __builtin_preserve_enum_value(___bpf_typeof(enum_type), enum_value, BPF_ENUMVAL_VALUE) +#endif /* - * bpf_core_read() abstracts away bpf_probe_read() call and captures offset - * relocation for source address using __builtin_preserve_access_index() + * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures + * offset relocation for source address using __builtin_preserve_access_index() * built-in, provided by Clang. * * __builtin_preserve_access_index() takes as an argument an expression of @@ -147,24 +293,40 @@ enum bpf_type_info_kind { * a relocation, which records BTF type ID describing root struct/union and an * accessor string which describes exact embedded field that was used to take * an address. See detailed description of this relocation format and - * semantics in comments to struct bpf_field_reloc in libbpf_internal.h. + * semantics in comments to struct bpf_core_relo in include/uapi/linux/bpf.h. * * This relocation allows libbpf to adjust BPF instruction to use correct * actual field offset, based on target kernel BTF type that matches original * (local) BTF, used to record relocation. */ #define bpf_core_read(dst, sz, src) \ - probe_read(dst, sz, \ - (const void *)__builtin_preserve_access_index(src)) + bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src)) +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ +#define bpf_core_read_user(dst, sz, src) \ + bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src)) /* * bpf_core_read_str() is a thin wrapper around bpf_probe_read_str() * additionally emitting BPF CO-RE field relocation for specified source * argument. */ #define bpf_core_read_str(dst, sz, src) \ - bpf_probe_read_str(dst, sz, \ - (const void *)__builtin_preserve_access_index(src)) + bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ +#define bpf_core_read_user_str(dst, sz, src) \ + bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id); + +/* + * Cast provided pointer *ptr* into a pointer to a specified *type* in such + * a way that BPF verifier will become aware of associated kernel-side BTF + * type. This allows to access members of kernel types directly without the + * need to use BPF_CORE_READ() macros. + */ +#define bpf_core_cast(ptr, type) \ + ((typeof(type) *)bpf_rdonly_cast((ptr), bpf_core_type_id_kernel(type))) #define ___concat(a, b) a ## b #define ___apply(fn, n) ___concat(fn, n) @@ -223,30 +385,29 @@ enum bpf_type_info_kind { read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor) /* "recursively" read a sequence of inner pointers using local __t var */ -#define ___rd_first(src, a) ___read(bpf_core_read, &__t, ___type(src), src, a); -#define ___rd_last(...) \ - ___read(bpf_core_read, &__t, \ - ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); -#define ___rd_p1(...) const void *__t; ___rd_first(__VA_ARGS__) -#define ___rd_p2(...) ___rd_p1(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p3(...) 
___rd_p2(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p4(...) ___rd_p3(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p5(...) ___rd_p4(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p6(...) ___rd_p5(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p7(...) ___rd_p6(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p8(...) ___rd_p7(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___rd_p9(...) ___rd_p8(___nolast(__VA_ARGS__)) ___rd_last(__VA_ARGS__) -#define ___read_ptrs(src, ...) \ - ___apply(___rd_p, ___narg(__VA_ARGS__))(src, __VA_ARGS__) - -#define ___core_read0(fn, dst, src, a) \ +#define ___rd_first(fn, src, a) ___read(fn, &__t, ___type(src), src, a); +#define ___rd_last(fn, ...) \ + ___read(fn, &__t, ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); +#define ___rd_p1(fn, ...) const void *__t; ___rd_first(fn, __VA_ARGS__) +#define ___rd_p2(fn, ...) ___rd_p1(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p3(fn, ...) ___rd_p2(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p4(fn, ...) ___rd_p3(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p5(fn, ...) ___rd_p4(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p6(fn, ...) ___rd_p5(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p7(fn, ...) ___rd_p6(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p8(fn, ...) ___rd_p7(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p9(fn, ...) ___rd_p8(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___read_ptrs(fn, src, ...) \ + ___apply(___rd_p, ___narg(__VA_ARGS__))(fn, src, __VA_ARGS__) + +#define ___core_read0(fn, fn_ptr, dst, src, a) \ ___read(fn, dst, ___type(src), src, a); -#define ___core_readN(fn, dst, src, ...) \ - ___read_ptrs(src, ___nolast(__VA_ARGS__)) \ +#define ___core_readN(fn, fn_ptr, dst, src, ...) \ + ___read_ptrs(fn_ptr, src, ___nolast(__VA_ARGS__)) \ ___read(fn, dst, ___type(src, ___nolast(__VA_ARGS__)), __t, \ ___last(__VA_ARGS__)); -#define ___core_read(fn, dst, src, a, ...) \ - ___apply(___core_read, ___empty(__VA_ARGS__))(fn, dst, \ +#define ___core_read(fn, fn_ptr, dst, src, a, ...) \ + ___apply(___core_read, ___empty(__VA_ARGS__))(fn, fn_ptr, dst, \ src, a, ##__VA_ARGS__) /* @@ -254,20 +415,73 @@ enum bpf_type_info_kind { * BPF_CORE_READ(), in which final field is read into user-provided storage. * See BPF_CORE_READ() below for more details on general usage. */ -#define BPF_CORE_READ_INTO(dst, src, a, ...) \ - ({ \ - ___core_read(bpf_core_read, dst, src, a, ##__VA_ARGS__) \ - }) +#define BPF_CORE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_INTO() */ +#define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_kernel, bpf_probe_read_kernel, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. 
+ */ +#define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) /* * BPF_CORE_READ_STR_INTO() does same "pointer chasing" as * BPF_CORE_READ() for intermediate pointers, but then executes (and returns * corresponding error code) bpf_core_read_str() for final string read. */ -#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) \ - ({ \ - ___core_read(bpf_core_read_str, dst, src, a, ##__VA_ARGS__) \ - }) +#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_str, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user_str, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ +#define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_kernel_str, bpf_probe_read_kernel, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user_str, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) /* * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially @@ -278,25 +492,61 @@ enum bpf_type_info_kind { * int x = BPF_CORE_READ(s, a.b.c, d.e, f, g); * * BPF_CORE_READ will decompose above statement into 4 bpf_core_read (BPF - * CO-RE relocatable bpf_probe_read() wrapper) calls, logically equivalent to: + * CO-RE relocatable bpf_probe_read_kernel() wrapper) calls, logically + * equivalent to: * 1. const void *__t = s->a.b.c; * 2. __t = __t->d.e; * 3. __t = __t->f; * 4. return __t->g; * * Equivalence is logical, because there is a heavy type casting/preservation - * involved, as well as all the reads are happening through bpf_probe_read() - * calls using __builtin_preserve_access_index() to emit CO-RE relocations. + * involved, as well as all the reads are happening through + * bpf_probe_read_kernel() calls using __builtin_preserve_access_index() to + * emit CO-RE relocations. * * N.B. Only up to 9 "field accessors" are supported, which should be more * than enough for any practical purpose. */ -#define BPF_CORE_READ(src, a, ...) \ - ({ \ - ___type(src, a, ##__VA_ARGS__) __r; \ - BPF_CORE_READ_INTO(&__r, src, a, ##__VA_ARGS__); \ - __r; \ - }) +#define BPF_CORE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Variant of BPF_CORE_READ() for reading from user-space memory. + * + * NOTE: all the source types involved are still *kernel types* and need to + * exist in kernel (or kernel module) BTF, otherwise CO-RE relocation will + * fail. Custom user types are not relocatable with CO-RE. + * The typical situation in which BPF_CORE_READ_USER() might be used is to + * read kernel UAPI types from the user-space memory passed in as a syscall + * input argument. + */ +#define BPF_CORE_READ_USER(src, a, ...) 
({ \
+	___type((src), a, ##__VA_ARGS__) __r;				    \
+	BPF_CORE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__);	    \
+	__r;								    \
+})
+
+/* Non-CO-RE variant of BPF_CORE_READ() */
+#define BPF_PROBE_READ(src, a, ...) ({					    \
+	___type((src), a, ##__VA_ARGS__) __r;				    \
+	BPF_PROBE_READ_INTO(&__r, (src), a, ##__VA_ARGS__);		    \
+	__r;								    \
+})
+
+/*
+ * Non-CO-RE variant of BPF_CORE_READ_USER().
+ *
+ * As no CO-RE relocations are emitted, source types can be arbitrary and are
+ * not restricted to kernel types only.
+ */
+#define BPF_PROBE_READ_USER(src, a, ...) ({				    \
+	___type((src), a, ##__VA_ARGS__) __r;				    \
+	BPF_PROBE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__);	    \
+	__r;								    \
+})
 
 #endif
diff --git a/bpf/process/bpf_execve_bprm_commit_creds.c b/bpf/process/bpf_execve_bprm_commit_creds.c
index ef75a5945fe..e109022551b 100644
--- a/bpf/process/bpf_execve_bprm_commit_creds.c
+++ b/bpf/process/bpf_execve_bprm_commit_creds.c
@@ -3,6 +3,7 @@
 
 #include "vmlinux.h"
 #include "api.h"
+#include "types/probe_read_kernel_or_user.h"
 
 #include "bpf_tracing.h"
 #include "common.h"
diff --git a/bpf/process/bpf_exit.c b/bpf/process/bpf_exit.c
index baadae713f5..355431abc0c 100644
--- a/bpf/process/bpf_exit.c
+++ b/bpf/process/bpf_exit.c
@@ -3,6 +3,7 @@
 
 #include "vmlinux.h"
 #include "bpf_exit.h"
+#include "types/probe_read_kernel_or_user.h"
 #include "bpf_tracing.h"
 
 char _license[] __attribute__((section("license"), used)) = "Dual BSD/GPL";
diff --git a/bpf/process/bpf_loader.c b/bpf/process/bpf_loader.c
index 7eca0630b3c..0a0a2ddb7c6 100644
--- a/bpf/process/bpf_loader.c
+++ b/bpf/process/bpf_loader.c
@@ -3,6 +3,7 @@
 
 #include "vmlinux.h"
 #include "api.h"
+#include "types/probe_read_kernel_or_user.h"
 
 #include "bpf_tracing.h"
 #include "bpf_helpers.h"
 #include "bpf_event.h"
diff --git a/bpf/process/data_event.h b/bpf/process/data_event.h
index 974fee26b2c..821c86d6174 100644
--- a/bpf/process/data_event.h
+++ b/bpf/process/data_event.h
@@ -6,6 +6,7 @@
 
 #include "bpf_tracing.h"
 #include "data_msg.h"
+#include "types/probe_read_kernel_or_user.h"
 
 static inline __attribute__((always_inline)) long
 __do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t bytes)
diff --git a/bpf/process/retprobe_map.h b/bpf/process/retprobe_map.h
index b1f4b81e3bc..f8fc13953c4 100644
--- a/bpf/process/retprobe_map.h
+++ b/bpf/process/retprobe_map.h
@@ -4,6 +4,9 @@
 #ifndef __RETPROBE_MAP_H__
 #define __RETPROBE_MAP_H__
 
+#include "vmlinux.h"
+#include "api.h"
+#include "types/probe_read_kernel_or_user.h"
 #include "bpf_tracing.h"
 
 struct retprobe_key {
diff --git a/bpf/process/types/basic.h b/bpf/process/types/basic.h
index 0d9745f05a2..7e400486021 100644
--- a/bpf/process/types/basic.h
+++ b/bpf/process/types/basic.h
@@ -22,6 +22,7 @@
 #include "common.h"
 #include "process/data_event.h"
 #include "process/bpf_enforcer.h"
+#include "probe_read_kernel_or_user.h"
 
 /* Type IDs form API with user space generickprobe.go */
 enum {

From 4a2fe971200b87a0494042f189635ebf1b6b17f7 Mon Sep 17 00:00:00 2001
From: Kevin Sheldrake
Date: Wed, 10 Apr 2024 11:32:54 +0100
Subject: [PATCH 06/11] Memory: Add raw_syscall flag on arg meta

The argument meta value indicates (among other things) whether an
argument is in userspace memory or in kernel memory. This generally
works for kprobes, tracepoints and uprobes, but raw_syscall tracepoints
are slightly different: the raw_syscall argument (an array of uint64)
lives in kernel memory, but the buffers its pointers reference might be
in userspace memory.
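As an illustrative sketch (not part of the patch; 'args', 'i', 'buf'
and 'len' are hypothetical), reading such an argument therefore takes
two reads with a different address space at each step:

	__u64 uptr;

	/* the raw_syscalls argument array itself is kernel memory */
	probe_read_kernel_or_user(&uptr, sizeof(uptr), &args[i], false);
	/* but the buffer an entry points to may be userspace memory */
	probe_read_kernel_or_user(buf, len, (const void *)uptr, true);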
This commit adds a raw_syscall flag to indicate to the BPF programs that the pointers are in kernel memory even if the buffers they point to are in userspace. Signed-off-by: Kevin Sheldrake --- pkg/sensors/tracing/args.go | 9 +++++++-- pkg/sensors/tracing/generickprobe.go | 2 +- pkg/sensors/tracing/generictracepoint.go | 5 +++-- pkg/sensors/tracing/genericuprobe.go | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pkg/sensors/tracing/args.go b/pkg/sensors/tracing/args.go index bf1d54332fa..7eb9de8a9c2 100644 --- a/pkg/sensors/tracing/args.go +++ b/pkg/sensors/tracing/args.go @@ -34,6 +34,7 @@ const ( argReturnCopyBit = 1 << 4 argMaxDataBit = 1 << 5 argUserspaceDataBit = 1 << 6 + argRawSyscallsBit = 1 << 7 ) func argReturnCopy(meta int) bool { @@ -47,9 +48,10 @@ func argReturnCopy(meta int) bool { // 4 : ReturnCopy // 5 : MaxData // 6 : UserspaceData -// 7-15 : reserved +// 7 : RawSyscalls +// 8-15 : reserved // 16-31 : size for const_buf -func getMetaValue(arg *v1alpha1.KProbeArg, userspaceDataDefault bool) (int, error) { +func getMetaValue(arg *v1alpha1.KProbeArg, userspaceDataDefault bool, rawSyscalls bool) (int, error) { meta := 0 if arg.SizeArgIndex > 0 { @@ -75,6 +77,9 @@ func getMetaValue(arg *v1alpha1.KProbeArg, userspaceDataDefault bool) (int, erro meta = meta | argUserspaceDataBit } } + if rawSyscalls { + meta = meta | argRawSyscallsBit + } return meta, nil } diff --git a/pkg/sensors/tracing/generickprobe.go b/pkg/sensors/tracing/generickprobe.go index 4f40eb36315..914ff1051b1 100644 --- a/pkg/sensors/tracing/generickprobe.go +++ b/pkg/sensors/tracing/generickprobe.go @@ -671,7 +671,7 @@ func addKprobe(funcName string, f *v1alpha1.KProbeSpec, in *addKprobeIn) (id idt } } // For kprobes, args default to userspace memory for syscalls, and kernel memory otherwise. - argMValue, err := getMetaValue(&a, f.Syscall) + argMValue, err := getMetaValue(&a, f.Syscall, false) if err != nil { return errFn(err) } diff --git a/pkg/sensors/tracing/generictracepoint.go b/pkg/sensors/tracing/generictracepoint.go index ffd5bc70f7d..c5f361582bd 100644 --- a/pkg/sensors/tracing/generictracepoint.go +++ b/pkg/sensors/tracing/generictracepoint.go @@ -238,7 +238,8 @@ func (out *genericTracepointArg) getGenericTypeId() (int, error) { func buildGenericTracepointArgs(info *tracepoint.Tracepoint, specArgs []v1alpha1.KProbeArg) ([]genericTracepointArg, error) { ret := make([]genericTracepointArg, 0, len(specArgs)) nfields := uint32(len(info.Format.Fields)) - syscall := info.Subsys == "syscalls" || info.Subsys == "raw_syscalls" + rawSyscalls := info.Subsys == "raw_syscalls" + syscall := rawSyscalls || info.Subsys == "syscalls" for argIdx := range specArgs { specArg := &specArgs[argIdx] @@ -247,7 +248,7 @@ func buildGenericTracepointArgs(info *tracepoint.Tracepoint, specArgs []v1alpha1 } field := info.Format.Fields[specArg.Index] // Syscall tracepoint arguments are in userspace memory. 
- metaTp, err := getMetaValue(specArg, syscall) + metaTp, err := getMetaValue(specArg, syscall, rawSyscalls && specArg.Index == 5 && (specArg.Type == "")) if err != nil { return nil, fmt.Errorf("tracepoint %s/%s getMetaValue error: %w", info.Subsys, info.Event, err) } diff --git a/pkg/sensors/tracing/genericuprobe.go b/pkg/sensors/tracing/genericuprobe.go index e12198cb25c..9b1edd7c3d4 100644 --- a/pkg/sensors/tracing/genericuprobe.go +++ b/pkg/sensors/tracing/genericuprobe.go @@ -339,7 +339,7 @@ func addUprobe(spec *v1alpha1.UProbeSpec, ids []idtable.EntryID, in *addUprobeIn return nil, fmt.Errorf("Arg(%d) type '%s' unsupported", i, a.Type) } // For uprobes, args default to userspace memory. - argMValue, err := getMetaValue(&a, true) + argMValue, err := getMetaValue(&a, true, false) if err != nil { return nil, err } From 9c4d85eb3eb14d4ace9ac02c90a3bdf7a06d366c Mon Sep 17 00:00:00 2001 From: Kevin Sheldrake Date: Wed, 10 Apr 2024 12:12:27 +0100 Subject: [PATCH 07/11] Memory: Use probe_read_kernel_or_user helpers Having introduced helpers that can read kernel or user memory depending on a boolean, and having also provided information over location of buffers, including in raw_syscalls, to the BPF programs, this commit uses the probe_read_kernel_or_user helpers to read the memory from the correct location. Signed-off-by: Kevin Sheldrake --- bpf/process/bpf_execve_event.c | 4 +- bpf/process/bpf_generic_retkprobe.c | 4 +- bpf/process/data_event.h | 54 +++--- bpf/process/types/basic.h | 247 +++++++++++++++------------- bpf/process/types/skb.h | 65 ++++---- bpf/process/types/sock.h | 35 ++-- 6 files changed, 220 insertions(+), 189 deletions(-) diff --git a/bpf/process/bpf_execve_event.c b/bpf/process/bpf_execve_event.c index c05bc9667e7..f2fd0a64c2a 100644 --- a/bpf/process/bpf_execve_event.c +++ b/bpf/process/bpf_execve_event.c @@ -87,7 +87,7 @@ read_args(void *ctx, struct msg_execve_event *event) size = data_event_bytes(ctx, (struct data_event_desc *)args, (unsigned long)start_stack, args_size, - (struct bpf_map_def *)&data_heap); + (struct bpf_map_def *)&data_heap, true); if (size > 0) p->flags |= EVENT_DATA_ARGS; } @@ -111,7 +111,7 @@ read_path(void *ctx, struct msg_execve_event *event, void *filename) } else if (size == MAXARGLENGTH - 1) { size = data_event_str(ctx, (struct data_event_desc *)earg, (unsigned long)filename, - (struct bpf_map_def *)&data_heap); + (struct bpf_map_def *)&data_heap, false); if (size == 0) flags |= EVENT_ERROR_FILENAME; else diff --git a/bpf/process/bpf_generic_retkprobe.c b/bpf/process/bpf_generic_retkprobe.c index f5b9bca0b9a..c557a5ee0a0 100644 --- a/bpf/process/bpf_generic_retkprobe.c +++ b/bpf/process/bpf_generic_retkprobe.c @@ -130,10 +130,10 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) switch (do_copy) { case char_buf: - size += __copy_char_buf(ctx, size, info.ptr, ret, false, e, (struct bpf_map_def *)data_heap_ptr); + size += __copy_char_buf(ctx, size, info.ptr, ret, false, e, (struct bpf_map_def *)data_heap_ptr, false); break; case char_iovec: - size += __copy_char_iovec(size, info.ptr, info.cnt, ret, e); + size += __copy_char_iovec(size, info.ptr, info.cnt, ret, e, false); default: break; } diff --git a/bpf/process/data_event.h b/bpf/process/data_event.h index 821c86d6174..09bdfc5f01a 100644 --- a/bpf/process/data_event.h +++ b/bpf/process/data_event.h @@ -9,7 +9,7 @@ #include "types/probe_read_kernel_or_user.h" static inline __attribute__((always_inline)) long -__do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t 
bytes) +__do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t bytes, bool userspace) { int err; @@ -23,31 +23,33 @@ __do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t bytes) a: // < 5.3 verifier still requires value masking like 'val &= xxx' #ifndef __LARGE_BPF_PROG - asm volatile("%[bytes] &= 0x3fff;\n" - : - : [bytes] "+r"(bytes) - :); + err = probe_read_kernel_or_user_masked(&msg->arg[0], bytes, 0x3fff, (char *)uptr, userspace); +#else + err = probe_read_kernel_or_user_masked(&msg->arg[0], bytes, 0x7fff, (char *)uptr, userspace); #endif - err = probe_read(&msg->arg[0], bytes, (char *)uptr); if (err < 0) return err; msg->common.size = offsetof(struct msg_data, arg) + bytes; - perf_event_output_metric(ctx, MSG_OP_DATA, &tcpmon_map, BPF_F_CURRENT_CPU, msg, msg->common.size); +#ifndef __LARGE_BPF_PROG + perf_event_output_metric(ctx, MSG_OP_DATA, &tcpmon_map, BPF_F_CURRENT_CPU, msg, msg->common.size & 0x7fff); +#else + perf_event_output_metric(ctx, MSG_OP_DATA, &tcpmon_map, BPF_F_CURRENT_CPU, msg, msg->common.size & 0xffff); +#endif return bytes; b: return -1; } static inline __attribute__((always_inline)) long -do_bytes(void *ctx, struct msg_data *msg, unsigned long arg, size_t bytes) +do_bytes(void *ctx, struct msg_data *msg, unsigned long arg, size_t bytes, bool userspace) { size_t rd_bytes = 0; int err, i __maybe_unused; #ifdef __LARGE_BPF_PROG for (i = 0; i < 10; i++) { - err = __do_bytes(ctx, msg, arg + rd_bytes, bytes - rd_bytes); + err = __do_bytes(ctx, msg, arg + rd_bytes, bytes - rd_bytes, userspace); if (err < 0) return err; rd_bytes += err; @@ -55,12 +57,12 @@ do_bytes(void *ctx, struct msg_data *msg, unsigned long arg, size_t bytes) return rd_bytes; } #else -#define BYTES_COPY \ - err = __do_bytes(ctx, msg, arg + rd_bytes, bytes - rd_bytes); \ - if (err < 0) \ - return err; \ - rd_bytes += err; \ - if (rd_bytes == bytes) \ +#define BYTES_COPY \ + err = __do_bytes(ctx, msg, arg + rd_bytes, bytes - rd_bytes, userspace); \ + if (err < 0) \ + return err; \ + rd_bytes += err; \ + if (rd_bytes == bytes) \ return rd_bytes; #define BYTES_COPY_5 BYTES_COPY BYTES_COPY BYTES_COPY BYTES_COPY BYTES_COPY @@ -76,7 +78,7 @@ do_bytes(void *ctx, struct msg_data *msg, unsigned long arg, size_t bytes) } static inline __attribute__((always_inline)) long -__do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done) +__do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done, bool userspace) { size_t size, max = sizeof(msg->arg) - 1; long ret; @@ -89,7 +91,8 @@ __do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done) : [max] "+r"(max) :); - ret = probe_read_str(&msg->arg[0], max, (char *)arg); + ret = probe_read_kernel_or_user_str(&msg->arg[0], max, (char *)arg, userspace); + if (ret < 0) return ret; @@ -112,7 +115,7 @@ __do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done) static inline __attribute__((always_inline)) long do_str(void *ctx, struct msg_data *msg, unsigned long arg, - size_t bytes __maybe_unused) + size_t bytes __maybe_unused, bool userspace) { size_t rd_bytes = 0; bool done = false; @@ -122,7 +125,7 @@ do_str(void *ctx, struct msg_data *msg, unsigned long arg, #define __CNT 2 #pragma unroll for (i = 0; i < __CNT; i++) { - ret = __do_str(ctx, msg, arg + rd_bytes, &done); + ret = __do_str(ctx, msg, arg + rd_bytes, &done, userspace); if (ret < 0) return ret; rd_bytes += ret; @@ -138,7 +141,8 @@ do_str(void *ctx, struct msg_data *msg, unsigned long arg, static inline 
__attribute__((always_inline)) size_t data_event( void *ctx, struct data_event_desc *desc, unsigned long uptr, size_t size, struct bpf_map_def *heap, - long (*do_data_event)(void *, struct msg_data *, unsigned long, size_t)) + long (*do_data_event)(void *, struct msg_data *, unsigned long, size_t, bool), + bool userspace) { struct msg_data *msg; int zero = 0, err; @@ -166,7 +170,7 @@ static inline __attribute__((always_inline)) size_t data_event( * Leftover for data_event_str is always 0, because we don't know * how much more was there to copy. */ - err = do_data_event(ctx, msg, uptr, size); + err = do_data_event(ctx, msg, uptr, size, userspace); if (err < 0) { desc->error = err; @@ -195,9 +199,9 @@ static inline __attribute__((always_inline)) size_t data_event( */ static inline __attribute__((always_inline)) size_t data_event_bytes(void *ctx, struct data_event_desc *desc, unsigned long uptr, - size_t size, struct bpf_map_def *heap) + size_t size, struct bpf_map_def *heap, bool userspace) { - return data_event(ctx, desc, uptr, size, heap, do_bytes); + return data_event(ctx, desc, uptr, size, heap, do_bytes, userspace); } /** @@ -212,9 +216,9 @@ data_event_bytes(void *ctx, struct data_event_desc *desc, unsigned long uptr, */ static inline __attribute__((always_inline)) size_t data_event_str(void *ctx, struct data_event_desc *desc, unsigned long uptr, - struct bpf_map_def *heap) + struct bpf_map_def *heap, bool userspace) { - return data_event(ctx, desc, uptr, -1, heap, do_str); + return data_event(ctx, desc, uptr, -1, heap, do_str, userspace); } #endif /* __DATA_EVENT_H__ */ diff --git a/bpf/process/types/basic.h b/bpf/process/types/basic.h index 7e400486021..6b3c323c9af 100644 --- a/bpf/process/types/basic.h +++ b/bpf/process/types/basic.h @@ -277,7 +277,7 @@ return_stack_error(char *args, int orig, int err) static inline __attribute__((always_inline)) int parse_iovec_array(long off, unsigned long arg, int i, unsigned long max, - struct msg_generic_kprobe *e) + struct msg_generic_kprobe *e, bool userspace) { struct iovec iov; // limit is 1024 using a hack now. 
For 5.4 kernel we should loop over 1024 @@ -285,7 +285,7 @@ parse_iovec_array(long off, unsigned long arg, int i, unsigned long max, __u64 size; int err; - err = probe_read(&iov, sizeof(iov), (struct iovec *)(arg + index)); + err = probe_read_kernel_or_user(&iov, sizeof(iov), (struct iovec *)(arg + index), userspace); if (err < 0) return char_buf_pagefault; size = iov.iov_len; @@ -293,9 +293,7 @@ parse_iovec_array(long off, unsigned long arg, int i, unsigned long max, size = max; if (size > 4094) return char_buf_toolarge; - asm volatile("%[size] &= 0xfff;\n" ::[size] "+r"(size) - :); - err = probe_read(args_off(e, off), size, (char *)iov.iov_base); + err = probe_read_kernel_or_user_masked(args_off(e, off), size, 0xfff, (char *)iov.iov_base, userspace); if (err < 0) return char_buf_pagefault; return size; @@ -308,7 +306,7 @@ parse_iovec_array(long off, unsigned long arg, int i, unsigned long max, /* embedding this in the loop counter breaks verifier */ \ if (i >= cnt) \ goto char_iovec_done; \ - c = parse_iovec_array(off, arg, i, max, e); \ + c = parse_iovec_array(off, arg, i, max, e, userspace); \ if (c < 0) { \ char *args = args_off(e, off_orig); \ return return_stack_error(args, 0, c); \ @@ -489,15 +487,15 @@ copy_path(char *args, const struct path *arg) } static inline __attribute__((always_inline)) long -copy_strings(char *args, char *arg, int max_size) +copy_strings(char *args, char *arg, int max_size, bool userspace) { int *s = (int *)args; long size; - // probe_read_str() always nul-terminates the string. + // probe_read_kernel_or_user_str() always nul-terminates the string. // So add one to the length to allow for it. This should // result in us honouring our max_size correctly. - size = probe_read_str(&args[4], max_size + 1, arg); + size = probe_read_kernel_or_user_str(&args[4], max_size + 1, arg, userspace); if (size <= 1) return invalid_ty; // Remove the nul character from end. 
@@ -508,51 +506,54 @@ copy_strings(char *args, char *arg, int max_size) } static inline __attribute__((always_inline)) long copy_skb(char *args, - unsigned long arg) + unsigned long arg, + bool userspace) { struct sk_buff *skb = (struct sk_buff *)arg; struct skb_type *skb_event = (struct skb_type *)args; /* struct values */ - probe_read(&skb_event->hash, sizeof(__u32), _(&skb->hash)); - probe_read(&skb_event->len, sizeof(__u32), _(&skb->len)); - probe_read(&skb_event->priority, sizeof(__u32), _(&skb->priority)); - probe_read(&skb_event->mark, sizeof(__u32), _(&skb->mark)); + probe_read_kernel_or_user(&skb_event->hash, sizeof(__u32), _(&skb->hash), userspace); + probe_read_kernel_or_user(&skb_event->len, sizeof(__u32), _(&skb->len), userspace); + probe_read_kernel_or_user(&skb_event->priority, sizeof(__u32), _(&skb->priority), userspace); + probe_read_kernel_or_user(&skb_event->mark, sizeof(__u32), _(&skb->mark), userspace); /* socket data */ - set_event_from_skb(skb_event, skb); + set_event_from_skb(skb_event, skb, userspace); return sizeof(struct skb_type); } static inline __attribute__((always_inline)) long copy_sock(char *args, - unsigned long arg) + unsigned long arg, + bool userspace) { struct sock *sk = (struct sock *)arg; struct sk_type *sk_event = (struct sk_type *)args; - set_event_from_sock(sk_event, sk); + set_event_from_sock(sk_event, sk, userspace); return sizeof(struct sk_type); } static inline __attribute__((always_inline)) long -copy_user_ns(char *args, unsigned long arg) +copy_user_ns(char *args, unsigned long arg, bool userspace) { struct user_namespace *ns = (struct user_namespace *)arg; struct msg_user_namespace *u_ns_info = (struct msg_user_namespace *)args; - probe_read(&u_ns_info->level, sizeof(__s32), _(&ns->level)); - probe_read(&u_ns_info->uid, sizeof(__u32), _(&ns->owner)); - probe_read(&u_ns_info->gid, sizeof(__u32), _(&ns->group)); - probe_read(&u_ns_info->ns_inum, sizeof(__u32), _(&ns->ns.inum)); + probe_read_kernel_or_user(&u_ns_info->level, sizeof(__s32), _(&ns->level), userspace); + probe_read_kernel_or_user(&u_ns_info->uid, sizeof(__u32), _(&ns->owner), userspace); + probe_read_kernel_or_user(&u_ns_info->gid, sizeof(__u32), _(&ns->group), userspace); + probe_read_kernel_or_user(&u_ns_info->ns_inum, sizeof(__u32), _(&ns->ns.inum), userspace); return sizeof(struct msg_user_namespace); } static inline __attribute__((always_inline)) long copy_cred(char *args, - unsigned long arg) + unsigned long arg, + bool userspace) { struct user_namespace *ns; struct cred *cred = (struct cred *)arg; @@ -560,21 +561,21 @@ static inline __attribute__((always_inline)) long copy_cred(char *args, struct msg_capabilities *caps = &info->caps; struct msg_user_namespace *user_ns_info = &info->user_ns; - probe_read(&info->uid, sizeof(__u32), _(&cred->uid)); - probe_read(&info->gid, sizeof(__u32), _(&cred->gid)); - probe_read(&info->euid, sizeof(__u32), _(&cred->euid)); - probe_read(&info->egid, sizeof(__u32), _(&cred->egid)); - probe_read(&info->suid, sizeof(__u32), _(&cred->suid)); - probe_read(&info->sgid, sizeof(__u32), _(&cred->sgid)); - probe_read(&info->fsuid, sizeof(__u32), _(&cred->fsuid)); - probe_read(&info->fsgid, sizeof(__u32), _(&cred->fsgid)); + probe_read_kernel_or_user(&info->uid, sizeof(__u32), _(&cred->uid), userspace); + probe_read_kernel_or_user(&info->gid, sizeof(__u32), _(&cred->gid), userspace); + probe_read_kernel_or_user(&info->euid, sizeof(__u32), _(&cred->euid), userspace); + probe_read_kernel_or_user(&info->egid, sizeof(__u32), _(&cred->egid), userspace); 
+ probe_read_kernel_or_user(&info->suid, sizeof(__u32), _(&cred->suid), userspace); + probe_read_kernel_or_user(&info->sgid, sizeof(__u32), _(&cred->sgid), userspace); + probe_read_kernel_or_user(&info->fsuid, sizeof(__u32), _(&cred->fsuid), userspace); + probe_read_kernel_or_user(&info->fsgid, sizeof(__u32), _(&cred->fsgid), userspace); info->pad = 0; - probe_read(&info->securebits, sizeof(__u32), _(&cred->securebits)); + probe_read_kernel_or_user(&info->securebits, sizeof(__u32), _(&cred->securebits), userspace); __get_caps(caps, cred); - probe_read(&ns, sizeof(ns), _(&cred->user_ns)); - copy_user_ns((char *)user_ns_info, (unsigned long)ns); + probe_read_kernel_or_user(&ns, sizeof(ns), _(&cred->user_ns), userspace); + copy_user_ns((char *)user_ns_info, (unsigned long)ns, userspace); return sizeof(struct msg_cred); } @@ -592,7 +593,7 @@ copy_capability(char *args, unsigned long arg) } static inline __attribute__((always_inline)) long -copy_load_module(char *args, unsigned long arg) +copy_load_module(char *args, unsigned long arg, bool userspace) { int ok; const char *name; @@ -601,32 +602,44 @@ copy_load_module(char *args, unsigned long arg) memset(info, 0, sizeof(struct tg_kernel_module)); - if (BPF_CORE_READ_INTO(&name, mod, name) != 0) - return 0; - - if (probe_read_str(&info->name, TG_MODULE_NAME_LEN - 1, name) < 0) + if (userspace) { + if (BPF_CORE_READ_USER_INTO(&name, mod, name) != 0) + return 0; + } else { + if (BPF_CORE_READ_INTO(&name, mod, name) != 0) + return 0; + } + if (probe_read_kernel_or_user_str(&info->name, TG_MODULE_NAME_LEN - 1, name, userspace) < 0) return 0; - BPF_CORE_READ_INTO(&info->taints, mod, mod, taints); - - if (BPF_CORE_READ_INTO(&ok, mod, sig_ok) == 0) - info->sig_ok = !!ok; + if (userspace) { + BPF_CORE_READ_USER_INTO(&info->taints, mod, mod, taints); + if (BPF_CORE_READ_USER_INTO(&ok, mod, sig_ok) == 0) + info->sig_ok = !!ok; + } else { + BPF_CORE_READ_INTO(&info->taints, mod, mod, taints); + if (BPF_CORE_READ_INTO(&ok, mod, sig_ok) == 0) + info->sig_ok = !!ok; + } return sizeof(struct tg_kernel_module); } static inline __attribute__((always_inline)) long -copy_kernel_module(char *args, unsigned long arg) +copy_kernel_module(char *args, unsigned long arg, bool userspace) { const struct module *mod = (struct module *)arg; struct tg_kernel_module *info = (struct tg_kernel_module *)args; memset(info, 0, sizeof(struct tg_kernel_module)); - if (probe_read_str(&info->name, TG_MODULE_NAME_LEN - 1, mod->name) < 0) + if (probe_read_kernel_or_user_str(&info->name, TG_MODULE_NAME_LEN - 1, mod->name, userspace) < 0) return 0; - BPF_CORE_READ_INTO(&info->taints, mod, taints); + if (userspace) + BPF_CORE_READ_USER_INTO(&info->taints, mod, taints); + else + BPF_CORE_READ_INTO(&info->taints, mod, taints); /* * Todo: allow to check if module is signed here too. 
@@ -637,10 +650,11 @@ copy_kernel_module(char *args, unsigned long arg) return sizeof(struct tg_kernel_module); } -#define ARGM_INDEX_MASK 0xf +#define ARGM_INDEX_MASK 0xf #define ARGM_RETURN_COPY BIT(4) -#define ARGM_MAX_DATA BIT(5) +#define ARGM_MAX_DATA BIT(5) #define ARGM_USERSPACE_DATA BIT(6) +#define ARGM_RAW_SYSCALLS BIT(7) static inline __attribute__((always_inline)) bool hasReturnCopy(unsigned long argm) @@ -660,6 +674,12 @@ is_userspace_data(unsigned long argm) return (argm & ARGM_USERSPACE_DATA) != 0; } +static inline __attribute__((always_inline)) bool +is_raw_syscalls(unsigned long argm) +{ + return (argm & ARGM_RAW_SYSCALLS) != 0; +} + static inline __attribute__((always_inline)) unsigned long get_arg_meta(int meta, struct msg_generic_kprobe *e) { @@ -681,7 +701,7 @@ get_arg_meta(int meta, struct msg_generic_kprobe *e) static inline __attribute__((always_inline)) long __copy_char_buf(void *ctx, long off, unsigned long arg, unsigned long bytes, bool max_data, struct msg_generic_kprobe *e, - struct bpf_map_def *data_heap) + struct bpf_map_def *data_heap, bool userspace) { int *s = (int *)args_off(e, off); size_t rd_bytes, extra = 8; @@ -696,7 +716,7 @@ __copy_char_buf(void *ctx, long off, unsigned long arg, unsigned long bytes, s[0] = 1; return data_event_bytes(ctx, (struct data_event_desc *)&s[1], - arg, bytes, data_heap) + + arg, bytes, data_heap, userspace) + 4; } s[0] = 0; @@ -707,9 +727,7 @@ __copy_char_buf(void *ctx, long off, unsigned long arg, unsigned long bytes, /* Bound bytes <4095 to ensure bytes does not read past end of buffer */ rd_bytes = bytes < 0x1000 ? bytes : 0xfff; - asm volatile("%[rd_bytes] &= 0xfff;\n" ::[rd_bytes] "+r"(rd_bytes) - :); - err = probe_read(&s[2], rd_bytes, (char *)arg); + err = probe_read_kernel_or_user_masked(&s[2], rd_bytes, 0xfff, (char *)arg, userspace); if (err < 0) return return_error(s, char_buf_pagefault); s[0] = (int)bytes; @@ -720,7 +738,7 @@ __copy_char_buf(void *ctx, long off, unsigned long arg, unsigned long bytes, static inline __attribute__((always_inline)) long copy_char_buf(void *ctx, long off, unsigned long arg, int argm, struct msg_generic_kprobe *e, - struct bpf_map_def *data_heap) + struct bpf_map_def *data_heap, bool userspace) { int *s = (int *)args_off(e, off); unsigned long meta; @@ -733,8 +751,8 @@ copy_char_buf(void *ctx, long off, unsigned long arg, int argm, return return_error(s, char_buf_saved_for_retprobe); } meta = get_arg_meta(argm, e); - probe_read(&bytes, sizeof(bytes), &meta); - return __copy_char_buf(ctx, off, arg, bytes, has_max_data(argm), e, data_heap); + bytes = meta; + return __copy_char_buf(ctx, off, arg, bytes, has_max_data(argm), e, data_heap, userspace); } static inline __attribute__((always_inline)) u16 @@ -1224,7 +1242,8 @@ filter_inet(struct selector_arg_filter *filter, char *args) static inline __attribute__((always_inline)) long __copy_char_iovec(long off, unsigned long arg, unsigned long cnt, - unsigned long max, struct msg_generic_kprobe *e) + unsigned long max, struct msg_generic_kprobe *e, + bool userspace) { long size, off_orig = off; unsigned long i = 0; @@ -1257,25 +1276,25 @@ copy_char_iovec(void *ctx, long off, unsigned long arg, int argm, retprobe_map_set_iovec(e->func_id, retid, e->common.ktime, arg, meta); return return_error(s, char_buf_saved_for_retprobe); } - return __copy_char_iovec(off, arg, meta, 0, e); + return __copy_char_iovec(off, arg, meta, 0, e, is_userspace_data(argm)); } static inline __attribute__((always_inline)) long -copy_bpf_attr(char *args, unsigned long 
arg) +copy_bpf_attr(char *args, unsigned long arg, bool userspace) { union bpf_attr *ba = (union bpf_attr *)arg; struct bpf_info_type *bpf_info = (struct bpf_info_type *)args; /* struct values */ - probe_read(&bpf_info->prog_type, sizeof(__u32), _(&ba->prog_type)); - probe_read(&bpf_info->insn_cnt, sizeof(__u32), _(&ba->insn_cnt)); - probe_read(&bpf_info->prog_name, BPF_OBJ_NAME_LEN, _(&ba->prog_name)); + probe_read_kernel_or_user(&bpf_info->prog_type, sizeof(__u32), _(&ba->prog_type), userspace); + probe_read_kernel_or_user(&bpf_info->insn_cnt, sizeof(__u32), _(&ba->insn_cnt), userspace); + probe_read_kernel_or_user(&bpf_info->prog_name, BPF_OBJ_NAME_LEN, _(&ba->prog_name), userspace); return sizeof(struct bpf_info_type); } static inline __attribute__((always_inline)) long -copy_perf_event(char *args, unsigned long arg) +copy_perf_event(char *args, unsigned long arg, bool userspace) { struct perf_event *p_event = (struct perf_event *)arg; struct perf_event_info_type *event_info = @@ -1284,34 +1303,34 @@ copy_perf_event(char *args, unsigned long arg) /* struct values */ __u64 kprobe_func_addr = 0; - probe_read(&kprobe_func_addr, sizeof(__u64), - _(&p_event->attr.kprobe_func)); - probe_read_str(&event_info->kprobe_func, KSYM_NAME_LEN, - (char *)kprobe_func_addr); + probe_read_kernel_or_user(&kprobe_func_addr, sizeof(__u64), + _(&p_event->attr.kprobe_func), userspace); + probe_read_kernel_or_user_str(&event_info->kprobe_func, KSYM_NAME_LEN, + (char *)kprobe_func_addr, userspace); - probe_read(&event_info->type, sizeof(__u32), _(&p_event->attr.type)); - probe_read(&event_info->config, sizeof(__u64), - _(&p_event->attr.config)); - probe_read(&event_info->probe_offset, sizeof(__u64), - _(&p_event->attr.probe_offset)); + probe_read_kernel_or_user(&event_info->type, sizeof(__u32), _(&p_event->attr.type), userspace); + probe_read_kernel_or_user(&event_info->config, sizeof(__u64), + _(&p_event->attr.config), userspace); + probe_read_kernel_or_user(&event_info->probe_offset, sizeof(__u64), + _(&p_event->attr.probe_offset), userspace); return sizeof(struct perf_event_info_type); } static inline __attribute__((always_inline)) long -copy_bpf_map(char *args, unsigned long arg) +copy_bpf_map(char *args, unsigned long arg, bool userspace) { struct bpf_map *bpfmap = (struct bpf_map *)arg; struct bpf_map_info_type *map_info = (struct bpf_map_info_type *)args; /* struct values */ - probe_read(&map_info->map_type, sizeof(__u32), _(&bpfmap->map_type)); - probe_read(&map_info->key_size, sizeof(__u32), _(&bpfmap->key_size)); - probe_read(&map_info->value_size, sizeof(__u32), - _(&bpfmap->value_size)); - probe_read(&map_info->max_entries, sizeof(__u32), - _(&bpfmap->max_entries)); - probe_read(&map_info->map_name, BPF_OBJ_NAME_LEN, _(&bpfmap->name)); + probe_read_kernel_or_user(&map_info->map_type, sizeof(__u32), _(&bpfmap->map_type), userspace); + probe_read_kernel_or_user(&map_info->key_size, sizeof(__u32), _(&bpfmap->key_size), userspace); + probe_read_kernel_or_user(&map_info->value_size, sizeof(__u32), + _(&bpfmap->value_size), userspace); + probe_read_kernel_or_user(&map_info->max_entries, sizeof(__u32), + _(&bpfmap->max_entries), userspace); + probe_read_kernel_or_user(&map_info->map_name, BPF_OBJ_NAME_LEN, _(&bpfmap->name), userspace); return sizeof(struct bpf_map_info_type); } @@ -1319,7 +1338,7 @@ copy_bpf_map(char *args, unsigned long arg) #ifdef __LARGE_BPF_PROG static inline __attribute__((always_inline)) long copy_iov_iter(void *ctx, long off, unsigned long arg, int argm, struct msg_generic_kprobe 
*e, - struct bpf_map_def *data_heap) + struct bpf_map_def *data_heap, bool userspace) { long iter_iovec = -1, iter_ubuf __maybe_unused = -1; struct iov_iter *iov_iter = (struct iov_iter *)arg; @@ -1334,7 +1353,7 @@ copy_iov_iter(void *ctx, long off, unsigned long arg, int argm, struct msg_gener goto nodata; tmp = _(&iov_iter->iter_type); - probe_read(&iter_type, sizeof(iter_type), tmp); + probe_read_kernel_or_user(&iter_type, sizeof(iter_type), tmp, userspace); if (bpf_core_enum_value_exists(enum iter_type, ITER_IOVEC)) iter_iovec = bpf_core_enum_value(enum iter_type, ITER_IOVEC); @@ -1346,28 +1365,28 @@ copy_iov_iter(void *ctx, long off, unsigned long arg, int argm, struct msg_gener if (iter_type == iter_iovec) { tmp = _(&iov_iter->kvec); - probe_read(&kvec, sizeof(kvec), tmp); + probe_read_kernel_or_user(&kvec, sizeof(kvec), tmp, userspace); tmp = _(&kvec->iov_base); - probe_read(&buf, sizeof(buf), tmp); + probe_read_kernel_or_user(&buf, sizeof(buf), tmp, userspace); tmp = _(&kvec->iov_len); - probe_read(&count, sizeof(count), tmp); + probe_read_kernel_or_user(&count, sizeof(count), tmp, userspace); return __copy_char_buf(ctx, off, (unsigned long)buf, count, - has_max_data(argm), e, data_heap); + has_max_data(argm), e, data_heap, userspace); } #ifdef __V61_BPF_PROG if (iter_type == iter_ubuf) { tmp = _(&iov_iter->ubuf); - probe_read(&buf, sizeof(buf), tmp); + probe_read_kernel_or_user(&buf, sizeof(buf), tmp, userspace); tmp = _(&iov_iter->count); - probe_read(&count, sizeof(count), tmp); + probe_read_kernel_or_user(&count, sizeof(count), tmp, userspace); return __copy_char_buf(ctx, off, (unsigned long)buf, count, - has_max_data(argm), e, data_heap); + has_max_data(argm), e, data_heap, userspace); } #endif @@ -1378,7 +1397,7 @@ copy_iov_iter(void *ctx, long off, unsigned long arg, int argm, struct msg_gener return 8; } #else -#define copy_iov_iter(ctx, orig_off, arg, argm, e, data_heap) 0 +#define copy_iov_iter(ctx, orig_off, arg, argm, e, data_heap, userspace) 0 #endif /* __LARGE_BPF_PROG */ static inline __attribute__((always_inline)) bool is_signed_type(int type) @@ -1422,7 +1441,6 @@ filter_64ty_selector_val(struct selector_arg_filter *filter, char *args) case op_filter_eq: case op_filter_neq: res = (*(u64 *)args == w); - if (filter->op == op_filter_eq && res) return 1; if (filter->op == op_filter_neq && !res) @@ -2528,6 +2546,8 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, struct bpf_map_def *data_heap) { size_t min_size = type_to_min_size(type, argm); + bool raw_syscalls = is_raw_syscalls(argm); + bool userspace = is_userspace_data(argm); const struct path *path_arg = 0; char *args = e->args; long size = -1; @@ -2543,14 +2563,14 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, switch (type) { case iov_iter_type: - size = copy_iov_iter(ctx, orig_off, arg, argm, e, data_heap); + size = copy_iov_iter(ctx, orig_off, arg, argm, e, data_heap, userspace); break; case kiocb_type: { struct kiocb *kiocb = (struct kiocb *)arg; struct file *file; arg = (unsigned long)_(&kiocb->ki_filp); - probe_read(&file, sizeof(file), (const void *)arg); + probe_read_kernel_or_user(&file, sizeof(file), (const void *)arg, userspace); arg = (unsigned long)file; } // fallthrough to file_ty @@ -2598,7 +2618,7 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, struct file *file; arg = (unsigned long)_(&bprm->file); - probe_read(&file, sizeof(file), (const void *)arg); + probe_read_kernel_or_user(&file, sizeof(file), (const 
void *)arg, userspace); path_arg = _(&file->f_path); goto do_copy_path; } break; @@ -2606,23 +2626,23 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, case filename_ty: { struct filename *file = (struct filename *)arg; - probe_read(&arg, sizeof(arg), &file->name); + probe_read_kernel_or_user(&arg, sizeof(arg), &file->name, userspace); } // fallthrough to copy_string case string_type: - size = copy_strings(args, (char *)arg, MAX_STRING); + size = copy_strings(args, (char *)arg, MAX_STRING, userspace); break; case net_dev_ty: { struct net_device *dev = (struct net_device *)arg; - size = copy_strings(args, dev->name, IFNAMSIZ); + size = copy_strings(args, dev->name, IFNAMSIZ, userspace); } break; case data_loc_type: { // data_loc: lower 16 bits is offset from ctx; upper 16 bits is length long dl_len = (arg >> 16) & 0xfff; // masked to 4095 chars char *dl_loc = ctx + (arg & 0xffff); - - size = copy_strings(args, dl_loc, dl_len); + // data_loc will always be a kernel type + size = copy_strings(args, dl_loc, dl_len, false); } break; case syscall64_type: case size_type: @@ -2651,18 +2671,18 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, size = sizeof(__u32); break; case skb_type: - size = copy_skb(args, arg); + size = copy_skb(args, arg, userspace); break; case sock_type: - size = copy_sock(args, arg); + size = copy_sock(args, arg, userspace); // Look up socket in our sock->pid_tgid map update_pid_tid_from_sock(e, arg); break; case cred_type: - size = copy_cred(args, arg); + size = copy_cred(args, arg, userspace); break; case char_buf: - size = copy_char_buf(ctx, orig_off, arg, argm, e, data_heap); + size = copy_char_buf(ctx, orig_off, arg, argm, e, data_heap, userspace); break; case char_iovec: size = copy_char_iovec(ctx, orig_off, arg, argm, e); @@ -2671,23 +2691,28 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, // for const_buf_type the size is in the upper 16 bits of the meta argument // bound size to 1023 to help the verifier out size = (argm >> 16) & 0x03ff; - probe_read(args, size, (char *)arg); + // the const_buf_type that represents an array of arguments for raw_syscalls + // is special, as the array contents are in kernel memory, but they point + // to userspace memory. In this case, they will be marked as userspace, but + // we actually want to read kernel memory. The raw_syscalls bit of the meta + // value indicates when the arg is special in this way. 
+ probe_read_kernel_or_user_masked(args, size, 0x3ff, (char *)arg, userspace && !raw_syscalls); break; } case bpf_attr_type: { - size = copy_bpf_attr(args, arg); + size = copy_bpf_attr(args, arg, userspace); break; } case perf_event_type: { - size = copy_perf_event(args, arg); + size = copy_perf_event(args, arg, userspace); break; } case bpf_map_type: { - size = copy_bpf_map(args, arg); + size = copy_bpf_map(args, arg, userspace); break; } case user_namespace_type: { - size = copy_user_ns(args, arg); + size = copy_user_ns(args, arg, userspace); break; } case capability_type: { @@ -2695,18 +2720,18 @@ read_call_arg(void *ctx, struct msg_generic_kprobe *e, int index, int type, break; } case load_module_type: { - size = copy_load_module(args, arg); + size = copy_load_module(args, arg, userspace); break; } case kernel_module_type: { - size = copy_kernel_module(args, arg); + size = copy_kernel_module(args, arg, userspace); break; } case kernel_cap_ty: case cap_inh_ty: case cap_prm_ty: case cap_eff_ty: - probe_read(args, sizeof(__u64), (char *)arg); + probe_read_kernel_or_user(args, sizeof(__u64), (char *)arg, userspace); size = sizeof(__u64); break; default: diff --git a/bpf/process/types/skb.h b/bpf/process/types/skb.h index e26e6400973..8631d5bd536 100644 --- a/bpf/process/types/skb.h +++ b/bpf/process/types/skb.h @@ -5,6 +5,7 @@ #define __SKB_H__ #include "tuple.h" +#include "probe_read_kernel_or_user.h" struct skb_type { struct tuple_type tuple; @@ -57,7 +58,7 @@ struct { static inline __attribute__((always_inline)) u8 get_ip6_protocol(u16 *payload_off, struct ipv6hdr *ip, u16 network_header_off, - void *skb_head) + void *skb_head, bool userspace) { struct ipv6extension *e; int zero = 0; @@ -70,7 +71,7 @@ get_ip6_protocol(u16 *payload_off, struct ipv6hdr *ip, u16 network_header_off, e->ip_off = network_header_off; e->curr = 255; e->len = 0; - if (probe_read(&e->next, sizeof(e->next), _(&ip->nexthdr)) < 0) + if (probe_read_kernel_or_user(&e->next, sizeof(e->next), _(&ip->nexthdr), userspace) < 0) return 0; // Maximum 7 valid extensions. @@ -105,8 +106,8 @@ get_ip6_protocol(u16 *payload_off, struct ipv6hdr *ip, u16 network_header_off, } e->curr = e->next; // Read next header and current length. - if (probe_read(&e->next, 2, - skb_head + e->ip_off) < 0) { + if (probe_read_kernel_or_user(&e->next, 2, + skb_head + e->ip_off, userspace) < 0) { return 0; } } @@ -120,40 +121,40 @@ get_ip6_protocol(u16 *payload_off, struct ipv6hdr *ip, u16 network_header_off, * only supports IPv4 with TCP/UDP. 
*/ static inline __attribute__((unused)) int -set_event_from_skb(struct skb_type *event, struct sk_buff *skb) +set_event_from_skb(struct skb_type *event, struct sk_buff *skb, bool userspace) { unsigned char *skb_head = 0; u16 l3_off; typeof(skb->transport_header) l4_off; u8 protocol; - probe_read(&skb_head, sizeof(skb_head), _(&skb->head)); - probe_read(&l3_off, sizeof(l3_off), _(&skb->network_header)); + probe_read_kernel_or_user(&skb_head, sizeof(skb_head), _(&skb->head), userspace); + probe_read_kernel_or_user(&l3_off, sizeof(l3_off), _(&skb->network_header), userspace); struct iphdr *ip = (struct iphdr *)(skb_head + l3_off); u8 iphdr_byte0; - probe_read(&iphdr_byte0, 1, _(ip)); + probe_read_kernel_or_user(&iphdr_byte0, 1, _(ip), userspace); u8 ip_ver = iphdr_byte0 >> 4; if (ip_ver == 4) { // IPv4 - probe_read(&protocol, 1, _(&ip->protocol)); + probe_read_kernel_or_user(&protocol, 1, _(&ip->protocol), userspace); event->tuple.protocol = protocol; event->tuple.family = AF_INET; event->tuple.saddr[0] = 0; event->tuple.saddr[1] = 0; event->tuple.daddr[0] = 0; event->tuple.daddr[1] = 0; - probe_read(&event->tuple.saddr, IPV4LEN, _(&ip->saddr)); - probe_read(&event->tuple.daddr, IPV4LEN, _(&ip->daddr)); - probe_read(&l4_off, sizeof(l4_off), _(&skb->transport_header)); + probe_read_kernel_or_user(&event->tuple.saddr, IPV4LEN, _(&ip->saddr), userspace); + probe_read_kernel_or_user(&event->tuple.daddr, IPV4LEN, _(&ip->daddr), userspace); + probe_read_kernel_or_user(&l4_off, sizeof(l4_off), _(&skb->transport_header), userspace); } else if (ip_ver == 6) { struct ipv6hdr *ip6 = (struct ipv6hdr *)(skb_head + l3_off); - protocol = get_ip6_protocol(&l4_off, ip6, l3_off, skb_head); + protocol = get_ip6_protocol(&l4_off, ip6, l3_off, skb_head, userspace); event->tuple.protocol = protocol; event->tuple.family = AF_INET6; - probe_read(&event->tuple.saddr, IPV6LEN, _(&ip6->saddr)); - probe_read(&event->tuple.daddr, IPV6LEN, _(&ip6->daddr)); + probe_read_kernel_or_user(&event->tuple.saddr, IPV6LEN, _(&ip6->saddr), userspace); + probe_read_kernel_or_user(&event->tuple.daddr, IPV6LEN, _(&ip6->daddr), userspace); } else { // This is not IP, so we don't know how to parse further. 
return -22; @@ -162,17 +163,17 @@ set_event_from_skb(struct skb_type *event, struct sk_buff *skb) if (protocol == IPPROTO_TCP) { // TCP struct tcphdr *tcp = (struct tcphdr *)(skb_head + l4_off); - probe_read(&event->tuple.sport, sizeof(event->tuple.sport), - _(&tcp->source)); - probe_read(&event->tuple.dport, sizeof(event->tuple.dport), - _(&tcp->dest)); + probe_read_kernel_or_user(&event->tuple.sport, sizeof(event->tuple.sport), + _(&tcp->source), userspace); + probe_read_kernel_or_user(&event->tuple.dport, sizeof(event->tuple.dport), + _(&tcp->dest), userspace); } else if (protocol == IPPROTO_UDP) { // UDP struct udphdr *udp = (struct udphdr *)(skb_head + l4_off); - probe_read(&event->tuple.sport, sizeof(event->tuple.sport), - _(&udp->source)); - probe_read(&event->tuple.dport, sizeof(event->tuple.dport), - _(&udp->dest)); + probe_read_kernel_or_user(&event->tuple.sport, sizeof(event->tuple.sport), + _(&udp->source), userspace); + probe_read_kernel_or_user(&event->tuple.dport, sizeof(event->tuple.dport), + _(&udp->dest), userspace); } else { event->tuple.sport = 0; event->tuple.dport = 0; @@ -186,18 +187,18 @@ set_event_from_skb(struct skb_type *event, struct sk_buff *skb) u64 offset; #define SKB_EXT_SEC_PATH 1 // TBD do this with BTF - probe_read(&ext, sizeof(ext), _(&skb->extensions)); + probe_read_kernel_or_user(&ext, sizeof(ext), _(&skb->extensions), userspace); if (ext) { - probe_read(&offset, sizeof(offset), - _(&ext->offset[SKB_EXT_SEC_PATH])); + probe_read_kernel_or_user(&offset, sizeof(offset), + _(&ext->offset[SKB_EXT_SEC_PATH]), userspace); sp = (void *)ext + (offset << 3); - probe_read(&event->secpath_len, - sizeof(event->secpath_len), - _(&sp->len)); - probe_read(&event->secpath_olen, - sizeof(event->secpath_olen), - _(&sp->olen)); + probe_read_kernel_or_user(&event->secpath_len, + sizeof(event->secpath_len), + _(&sp->len), userspace); + probe_read_kernel_or_user(&event->secpath_olen, + sizeof(event->secpath_olen), + _(&sp->olen), userspace); } } return 0; diff --git a/bpf/process/types/sock.h b/bpf/process/types/sock.h index d11307189b0..bd8878f9cc3 100644 --- a/bpf/process/types/sock.h +++ b/bpf/process/types/sock.h @@ -5,6 +5,7 @@ #define __SOCK_H__ #include "tuple.h" +#include "probe_read_kernel_or_user.h" // The sockaddr field is specifically a __u64 to deter from trying to dereference it. // If an application needs more fields from the sock then they should be added to @@ -24,28 +25,28 @@ struct sk_type { * Populate the event args with the sock info. 
*/ static inline __attribute__((unused)) void -set_event_from_sock(struct sk_type *event, struct sock *sk) +set_event_from_sock(struct sk_type *event, struct sock *sk, bool userspace) { struct sock_common *common = (struct sock_common *)sk; event->sockaddr = (__u64)sk; - probe_read(&event->tuple.family, sizeof(event->tuple.family), - _(&common->skc_family)); - probe_read(&event->state, sizeof(event->state), - _((const void *)&common->skc_state)); - probe_read(&event->type, sizeof(event->type), _(&sk->sk_type)); - probe_read(&event->tuple.protocol, sizeof(event->tuple.protocol), - _(&sk->sk_protocol)); + probe_read_kernel_or_user(&event->tuple.family, sizeof(event->tuple.family), + _(&common->skc_family), userspace); + probe_read_kernel_or_user(&event->state, sizeof(event->state), + _((const void *)&common->skc_state), userspace); + probe_read_kernel_or_user(&event->type, sizeof(event->type), _(&sk->sk_type), userspace); + probe_read_kernel_or_user(&event->tuple.protocol, sizeof(event->tuple.protocol), + _(&sk->sk_protocol), userspace); if (bpf_core_field_size(sk->sk_protocol) == 4) { // In the BTF, the protocol field in kernels tuple.protocol = event->tuple.protocol >> 8; } - probe_read(&event->mark, sizeof(event->mark), _(&sk->sk_mark)); - probe_read(&event->priority, sizeof(event->priority), - _(&sk->sk_priority)); + probe_read_kernel_or_user(&event->mark, sizeof(event->mark), _(&sk->sk_mark), userspace); + probe_read_kernel_or_user(&event->priority, sizeof(event->priority), + _(&sk->sk_priority), userspace); event->tuple.saddr[0] = 0; event->tuple.saddr[1] = 0; @@ -53,16 +54,16 @@ set_event_from_sock(struct sk_type *event, struct sock *sk) event->tuple.daddr[1] = 0; switch (event->tuple.family) { case AF_INET: - probe_read(&event->tuple.saddr, IPV4LEN, _(&common->skc_rcv_saddr)); - probe_read(&event->tuple.daddr, IPV4LEN, _(&common->skc_daddr)); + probe_read_kernel_or_user(&event->tuple.saddr, IPV4LEN, _(&common->skc_rcv_saddr), userspace); + probe_read_kernel_or_user(&event->tuple.daddr, IPV4LEN, _(&common->skc_daddr), userspace); break; case AF_INET6: - probe_read(&event->tuple.saddr, IPV6LEN, _(&common->skc_v6_rcv_saddr)); - probe_read(&event->tuple.daddr, IPV6LEN, _(&common->skc_v6_daddr)); + probe_read_kernel_or_user(&event->tuple.saddr, IPV6LEN, _(&common->skc_v6_rcv_saddr), userspace); + probe_read_kernel_or_user(&event->tuple.daddr, IPV6LEN, _(&common->skc_v6_daddr), userspace); } - probe_read(&event->tuple.sport, sizeof(event->tuple.sport), _(&common->skc_num)); - probe_read(&event->tuple.dport, sizeof(event->tuple.dport), _(&common->skc_dport)); + probe_read_kernel_or_user(&event->tuple.sport, sizeof(event->tuple.sport), _(&common->skc_num), userspace); + probe_read_kernel_or_user(&event->tuple.dport, sizeof(event->tuple.dport), _(&common->skc_dport), userspace); event->tuple.dport = bpf_ntohs(event->tuple.dport); } #endif // __SOCK_H__ From e2f0091bf269a6a050e03dc180db7b5a19e5b12b Mon Sep 17 00:00:00 2001 From: Kevin Sheldrake Date: Wed, 10 Apr 2024 12:57:31 +0100 Subject: [PATCH 08/11] Retprobe: Use correct key for retprobe_map The key for the retprobe map includes the pid_tgid, but in cases where this is not available, it uses the FP register in its place. The retprobe_map_get_key function is used to get this ID. This commit changes the filter_args_reject function to use retprobe_map_get_key instead of just using the pid_tgid. 
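For background, the key helper referenced in this message has roughly the following shape. This is a sketch only: the exact test for "pid_tgid not available" is an assumption for illustration, and the in-tree retprobe_map_get_key may differ in detail.

/* Sketch: derive the retprobe key from pid_tgid, falling back to the
 * frame pointer when no task context is available, so that the entry
 * and return probes still agree on a key. The -1 sentinel is assumed.
 */
static inline __attribute__((always_inline)) __u64
retprobe_map_get_key(struct pt_regs *ctx)
{
	__u64 key = get_current_pid_tgid();

	if (key == (__u64)-1) /* assumed "unavailable" sentinel */
		key = PT_REGS_FP_CORE(ctx);
	return key;
}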
Signed-off-by: Kevin Sheldrake --- bpf/process/generic_calls.h | 2 +- bpf/process/types/basic.h | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bpf/process/generic_calls.h b/bpf/process/generic_calls.h index fc649df5265..f8ae8ad7aff 100644 --- a/bpf/process/generic_calls.h +++ b/bpf/process/generic_calls.h @@ -56,7 +56,7 @@ generic_process_event(void *ctx, struct bpf_map_def *heap_map, * do it where it makes most sense. */ if (errv < 0) - return filter_args_reject(e->func_id); + return filter_args_reject(ctx, e->func_id); } e->common.size = total; /* Continue to process other arguments. */ diff --git a/bpf/process/types/basic.h b/bpf/process/types/basic.h index 6b3c323c9af..6657aeb4cc9 100644 --- a/bpf/process/types/basic.h +++ b/bpf/process/types/basic.h @@ -1878,15 +1878,16 @@ selector_arg_offset(__u8 *f, struct msg_generic_kprobe *e, __u32 selidx, return pass ? seloff : 0; } -static inline __attribute__((always_inline)) int filter_args_reject(u64 id) +static inline __attribute__((always_inline)) int filter_args_reject(void *ctx, u64 id) { - u64 tid = get_current_pid_tgid(); - retprobe_map_clear(id, tid); + u64 retid = retprobe_map_get_key(ctx); + + retprobe_map_clear(id, retid); return 0; } static inline __attribute__((always_inline)) int -filter_args(struct msg_generic_kprobe *e, int selidx, void *filter_map, +filter_args(void *ctx, struct msg_generic_kprobe *e, int selidx, void *filter_map, bool is_entry) { __u8 *f; @@ -1906,7 +1907,7 @@ filter_args(struct msg_generic_kprobe *e, int selidx, void *filter_map, * have their arg filters run. */ if (selidx > SELECTORS_ACTIVE) - return filter_args_reject(e->func_id); + return filter_args_reject(ctx, e->func_id); if (e->sel.active[selidx]) { int pass = selector_arg_offset(f, e, selidx, is_entry); @@ -2412,7 +2413,7 @@ filter_read_arg(void *ctx, struct bpf_map_def *heap, if (!e) return 0; selidx = e->tailcall_index_selector; - pass = filter_args(e, selidx & MAX_SELECTORS_MASK, filter, is_entry); + pass = filter_args(ctx, e, selidx & MAX_SELECTORS_MASK, filter, is_entry); if (!pass) { selidx++; if (selidx <= MAX_SELECTORS && e->sel.active[selidx & MAX_SELECTORS_MASK]) { @@ -2420,7 +2421,7 @@ filter_read_arg(void *ctx, struct bpf_map_def *heap, tail_call(ctx, tailcalls, TAIL_CALL_ARGS); } // reject if we did not attempt to tailcall, or if tailcall failed. - return filter_args_reject(e->func_id); + return filter_args_reject(ctx, e->func_id); } // If pass >1 then we need to consult the selector actions From 1bc49cd669de540f4d0ace9ea44c5fa370117d80 Mon Sep 17 00:00:00 2001 From: Kevin Sheldrake Date: Wed, 10 Apr 2024 13:26:27 +0100 Subject: [PATCH 09/11] Memory: Read kernel or user for return arguments The return arguments can be buffers and these can reside in kernel or userspace. This commit adds meta information to the return arguments (both function return arg and any params that should be read on return) so that the BPF programs know whether the data resides in kernel or user memory. 
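To make the mechanism concrete, the read wrappers that consume this meta information simply dispatch on a boolean. A minimal sketch, assuming the obvious shape (the real definitions live in probe_read_kernel_or_user.h and may differ):

/* Sketch: pick the user or kernel probe-read variant depending on where
 * the argument data lives; mirrors the call sites in the diffs above.
 */
static inline __attribute__((always_inline)) long
probe_read_kernel_or_user(void *dst, __u32 size, const void *src, bool userspace)
{
	if (userspace)
		return probe_read_user(dst, size, src);
	return probe_read_kernel(dst, size, src);
}

static inline __attribute__((always_inline)) long
probe_read_kernel_or_user_str(void *dst, int size, const void *src, bool userspace)
{
	if (userspace)
		return probe_read_user_str(dst, size, src);
	return probe_read_kernel_str(dst, size, src);
}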
Signed-off-by: Kevin Sheldrake --- bpf/lib/generic.h | 1 + bpf/process/bpf_generic_retkprobe.c | 19 ++++++++++--------- bpf/process/generic_calls.h | 5 +---- bpf/process/retprobe_map.h | 8 ++++++-- bpf/process/types/basic.h | 6 ++++-- pkg/api/tracingapi/client_kprobe.go | 2 ++ pkg/sensors/tracing/generickprobe.go | 10 ++++++++++ 7 files changed, 34 insertions(+), 17 deletions(-) diff --git a/bpf/lib/generic.h b/bpf/lib/generic.h index 38d6e4c3acc..0879f8a3c26 100644 --- a/bpf/lib/generic.h +++ b/bpf/lib/generic.h @@ -49,6 +49,7 @@ struct msg_generic_kprobe { /* anything above is shared with the userspace so it should match structs MsgGenericKprobe and MsgGenericTracepoint in Go */ char args[24000]; unsigned long a0, a1, a2, a3, a4; + unsigned long ret; long argsoff[MAX_POSSIBLE_ARGS]; struct msg_selector_data sel; __u32 idx; // attach cookie index diff --git a/bpf/process/bpf_generic_retkprobe.c b/bpf/process/bpf_generic_retkprobe.c index c557a5ee0a0..fd006ec18c3 100644 --- a/bpf/process/bpf_generic_retkprobe.c +++ b/bpf/process/bpf_generic_retkprobe.c @@ -71,14 +71,15 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) { struct execve_map_value *enter; struct msg_generic_kprobe *e; - struct retprobe_info info; struct event_config *config; + struct retprobe_info info; bool walker = false; + bool userspace; + __u64 pid_tgid; + long size = 0; int zero = 0; + long ty_arg; __u32 ppid; - long size = 0; - long ty_arg, do_copy; - __u64 pid_tgid; e = map_lookup_elem(&process_call_heap, &zero); if (!e) @@ -102,9 +103,8 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) size += sizeof(info.ktime_enter); ty_arg = config->argreturn; - do_copy = config->argreturncopy; if (ty_arg) { - size += read_call_arg(ctx, e, 0, ty_arg, size, ret, 0, (struct bpf_map_def *)data_heap_ptr); + size += read_call_arg(ctx, e, 0, ty_arg, size, ret, config->argmreturn, (struct bpf_map_def *)data_heap_ptr); #ifdef __LARGE_BPF_PROG struct socket_owner owner; switch (config->argreturnaction) { @@ -128,12 +128,13 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) asm volatile("%[size] &= 0x1fff;\n" ::[size] "+r"(size) :); - switch (do_copy) { + userspace = is_userspace_data(info.meta); + switch (config->argreturncopy) { case char_buf: - size += __copy_char_buf(ctx, size, info.ptr, ret, false, e, (struct bpf_map_def *)data_heap_ptr, false); + size += __copy_char_buf(ctx, size, info.ptr, ret, false, e, (struct bpf_map_def *)data_heap_ptr, userspace); break; case char_iovec: - size += __copy_char_iovec(size, info.ptr, info.cnt, ret, e, false); + size += __copy_char_iovec(size, info.ptr, info.cnt, ret, e, userspace); default: break; } diff --git a/bpf/process/generic_calls.h b/bpf/process/generic_calls.h index f8ae8ad7aff..e4830eb40c9 100644 --- a/bpf/process/generic_calls.h +++ b/bpf/process/generic_calls.h @@ -44,9 +44,6 @@ generic_process_event(void *ctx, struct bpf_map_def *heap_map, int am; am = (&config->arg0m)[index]; - asm volatile("%[am] &= 0xffff;\n" ::[am] "+r"(am) - :); - errv = read_call_arg(ctx, e, index, ty, total, a, am, data_heap); if (errv > 0) total += errv; @@ -169,7 +166,7 @@ generic_process_event_and_setup(struct pt_regs *ctx, /* If return arg is needed mark retprobe */ ty = config->argreturn; if (ty > 0) - retprobe_map_set(e->func_id, e->retprobe_id, e->common.ktime, 1); + retprobe_map_set(e->func_id, e->retprobe_id, e->common.ktime, 1, config->argmreturn); #endif #ifdef GENERIC_UPROBE diff --git a/bpf/process/retprobe_map.h b/bpf/process/retprobe_map.h index 
f8fc13953c4..959d7ae8294 100644 --- a/bpf/process/retprobe_map.h +++ b/bpf/process/retprobe_map.h @@ -18,6 +18,8 @@ struct retprobe_info { unsigned long ktime_enter; unsigned long ptr; unsigned long cnt; + unsigned int meta; + unsigned int pad; }; struct { @@ -59,11 +61,12 @@ static inline __attribute__((always_inline)) void retprobe_map_clear(__u64 id, } static inline __attribute__((always_inline)) void -retprobe_map_set(__u64 id, __u64 tid, __u64 ktime, unsigned long ptr) +retprobe_map_set(__u64 id, __u64 tid, __u64 ktime, unsigned long ptr, __u32 meta) { struct retprobe_info info = { .ktime_enter = ktime, .ptr = ptr, + .meta = meta, }; struct retprobe_key key = { .id = id, @@ -75,12 +78,13 @@ retprobe_map_set(__u64 id, __u64 tid, __u64 ktime, unsigned long ptr) static inline __attribute__((always_inline)) void retprobe_map_set_iovec(__u64 id, __u64 tid, __u64 ktime, unsigned long ptr, - unsigned long cnt) + unsigned long cnt, __u32 meta) { struct retprobe_info info = { .ktime_enter = ktime, .ptr = ptr, .cnt = cnt, + .meta = meta, }; struct retprobe_key key = { .id = id, diff --git a/bpf/process/types/basic.h b/bpf/process/types/basic.h index 6657aeb4cc9..af8909865c4 100644 --- a/bpf/process/types/basic.h +++ b/bpf/process/types/basic.h @@ -162,7 +162,9 @@ struct event_config { __u32 t_arg4_ctx_off; __u32 syscall; __s32 argreturncopy; + __u32 argmreturncopy; __s32 argreturn; + __u32 argmreturn; /* arg return action specifies to act on the return value; currently * supported actions include: TrackSock and UntrackSock. */ @@ -747,7 +749,7 @@ copy_char_buf(void *ctx, long off, unsigned long arg, int argm, if (hasReturnCopy(argm)) { u64 retid = retprobe_map_get_key(ctx); - retprobe_map_set(e->func_id, retid, e->common.ktime, arg); + retprobe_map_set(e->func_id, retid, e->common.ktime, arg, argm); return return_error(s, char_buf_saved_for_retprobe); } meta = get_arg_meta(argm, e); @@ -1273,7 +1275,7 @@ copy_char_iovec(void *ctx, long off, unsigned long arg, int argm, if (hasReturnCopy(argm)) { u64 retid = retprobe_map_get_key(ctx); - retprobe_map_set_iovec(e->func_id, retid, e->common.ktime, arg, meta); + retprobe_map_set_iovec(e->func_id, retid, e->common.ktime, arg, meta, argm); return return_error(s, char_buf_saved_for_retprobe); } return __copy_char_iovec(off, arg, meta, 0, e, is_userspace_data(argm)); diff --git a/pkg/api/tracingapi/client_kprobe.go b/pkg/api/tracingapi/client_kprobe.go index d95bb4663f6..9b1c2132a04 100644 --- a/pkg/api/tracingapi/client_kprobe.go +++ b/pkg/api/tracingapi/client_kprobe.go @@ -576,7 +576,9 @@ type EventConfig struct { ArgTpCtxOff [EventConfigMaxArgs]uint32 `align:"t_arg0_ctx_off"` Syscall uint32 `align:"syscall"` ArgReturnCopy int32 `align:"argreturncopy"` + ArgMReturnCopy uint32 `align:"argmreturncopy"` ArgReturn int32 `align:"argreturn"` + ArgMReturn uint32 `align:"argmreturn"` ArgReturnAction int32 `align:"argreturnaction"` PolicyID uint32 `align:"policy_id"` Flags uint32 `align:"flags"` diff --git a/pkg/sensors/tracing/generickprobe.go b/pkg/sensors/tracing/generickprobe.go index 914ff1051b1..bedd16d024f 100644 --- a/pkg/sensors/tracing/generickprobe.go +++ b/pkg/sensors/tracing/generickprobe.go @@ -711,6 +711,11 @@ func addKprobe(funcName string, f *v1alpha1.KProbeSpec, in *addKprobeIn) (id idt return errFn(fmt.Errorf("ReturnArg type '%s' unsupported", f.ReturnArg.Type)) } config.ArgReturn = int32(argType) + argMValue, err := getMetaValue(f.ReturnArg, f.Syscall, false) + if err != nil { + return errFn(err) + } + config.ArgMReturn = uint32(argMValue) 
argsBTFSet[api.ReturnArgIndex] = true argP := argPrinter{index: api.ReturnArgIndex, ty: argType} argReturnPrinters = append(argReturnPrinters, argP) @@ -724,6 +729,11 @@ func addKprobe(funcName string, f *v1alpha1.KProbeSpec, in *addKprobeIn) (id idt argType := gt.GenericTypeFromString(argRetprobe.Type) config.ArgReturnCopy = int32(argType) + argMValue, err := getMetaValue(argRetprobe, f.Syscall, false) + if err != nil { + return errFn(err) + } + config.ArgMReturnCopy = uint32(argMValue) argP := argPrinter{index: int(argRetprobe.Index), ty: argType, label: argRetprobe.Label} argReturnPrinters = append(argReturnPrinters, argP) From 70a327eed871c5109c4f8cff7a9f9c7c0ccc7444 Mon Sep 17 00:00:00 2001 From: Kevin Sheldrake Date: Wed, 10 Apr 2024 13:33:23 +0100 Subject: [PATCH 10/11] Memory: Add tail call to retkprobe The addition of code to optionally read memory from kernel or user addresses caused the retkprobe program to exceed the 4096 instruction limit imposed by some older kernels. This commit divides one retkprobe program into two and connects them together with a tail call. Signed-off-by: Kevin Sheldrake --- bpf/process/bpf_generic_retkprobe.c | 46 +++++++++++++++++++++++------ pkg/sensors/tracing/kprobe_test.go | 23 ++++++++------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/bpf/process/bpf_generic_retkprobe.c b/bpf/process/bpf_generic_retkprobe.c index fd006ec18c3..7f20ea60559 100644 --- a/bpf/process/bpf_generic_retkprobe.c +++ b/bpf/process/bpf_generic_retkprobe.c @@ -69,17 +69,13 @@ struct { __attribute__((section((MAIN)), used)) int BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) { - struct execve_map_value *enter; struct msg_generic_kprobe *e; struct event_config *config; struct retprobe_info info; - bool walker = false; - bool userspace; __u64 pid_tgid; long size = 0; int zero = 0; long ty_arg; - __u32 ppid; e = map_lookup_elem(&process_call_heap, &zero); if (!e) @@ -96,10 +92,6 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) pid_tgid = get_current_pid_tgid(); e->tid = (__u32)pid_tgid; - if (!retprobe_map_get(e->func_id, e->retprobe_id, &info)) - return 0; - - *(unsigned long *)e->args = info.ktime_enter; size += sizeof(info.ktime_enter); ty_arg = config->argreturn; @@ -121,6 +113,43 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) #endif } + e->ret = ret; + e->common.size = size; + e->common.ktime = ktime_get_ns(); + + tail_call(ctx, &retkprobe_calls, TAIL_CALL_FILTER); + return 1; +} + +__attribute__((section("kprobe/2"), used)) int +BPF_KRETPROBE(generic_retkprobe_copy_arg) +{ + struct execve_map_value *enter; + struct msg_generic_kprobe *e; + struct event_config *config; + struct retprobe_info info; + bool walker = false; + unsigned long ret; + bool userspace; + long size = 0; + int zero = 0; + __u32 ppid; + + e = map_lookup_elem(&process_call_heap, &zero); + if (!e) + return 0; + + config = map_lookup_elem(&config_map, &e->idx); + if (!config) + return 0; + + if (!retprobe_map_get(e->func_id, e->retprobe_id, &info)) + return 0; + + *(unsigned long *)e->args = info.ktime_enter; + size = e->common.size; + ret = e->ret; + /* * 0x1000 should be maximum argument length, so masking * with 0x1fff is safe and verifier will be happy.
@@ -147,7 +176,6 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret) e->common.pad[0] = 0; e->common.pad[1] = 0; e->common.size = size; - e->common.ktime = ktime_get_ns(); if (enter) { e->current.pid = enter->key.pid; diff --git a/pkg/sensors/tracing/kprobe_test.go b/pkg/sensors/tracing/kprobe_test.go index 9f11f8754f7..2e2398dfec5 100644 --- a/pkg/sensors/tracing/kprobe_test.go +++ b/pkg/sensors/tracing/kprobe_test.go @@ -4243,20 +4243,21 @@ func TestLoadKprobeSensor(t *testing.T) { 6: tus.SensorProg{Name: "generic_kprobe_output", Type: ebpf.Kprobe}, // retkprobe 7: tus.SensorProg{Name: "generic_retkprobe_event", Type: ebpf.Kprobe}, - 8: tus.SensorProg{Name: "generic_retkprobe_filter_arg", Type: ebpf.Kprobe}, - 9: tus.SensorProg{Name: "generic_retkprobe_actions", Type: ebpf.Kprobe}, - 10: tus.SensorProg{Name: "generic_retkprobe_output", Type: ebpf.Kprobe}, + 8: tus.SensorProg{Name: "generic_retkprobe_copy_arg", Type: ebpf.Kprobe}, + 9: tus.SensorProg{Name: "generic_retkprobe_filter_arg", Type: ebpf.Kprobe}, + 10: tus.SensorProg{Name: "generic_retkprobe_actions", Type: ebpf.Kprobe}, + 11: tus.SensorProg{Name: "generic_retkprobe_output", Type: ebpf.Kprobe}, } var sensorMaps = []tus.SensorMap{ // all kprobe programs - tus.SensorMap{Name: "process_call_heap", Progs: []uint{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}, + tus.SensorMap{Name: "process_call_heap", Progs: []uint{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}}, // all but generic_kprobe_output tus.SensorMap{Name: "kprobe_calls", Progs: []uint{0, 1, 2, 3, 4, 5}}, // generic_retkprobe_event - tus.SensorMap{Name: "retkprobe_calls", Progs: []uint{7, 8, 9}}, + tus.SensorMap{Name: "retkprobe_calls", Progs: []uint{7, 8, 9, 10}}, // generic_kprobe_process_filter,generic_kprobe_filter_arg, // generic_kprobe_actions,generic_kprobe_output @@ -4269,7 +4270,7 @@ func TestLoadKprobeSensor(t *testing.T) { tus.SensorMap{Name: "config_map", Progs: []uint{0, 1, 2}}, // generic_kprobe_process_event*,generic_kprobe_actions,retkprobe - tus.SensorMap{Name: "fdinstall_map", Progs: []uint{1, 2, 5, 7, 9}}, + tus.SensorMap{Name: "fdinstall_map", Progs: []uint{1, 2, 5, 7, 10}}, // generic_kprobe_event tus.SensorMap{Name: "tg_conf_map", Progs: []uint{0}}, @@ -4277,19 +4278,19 @@ func TestLoadKprobeSensor(t *testing.T) { if kernels.EnableLargeProgs() { // shared with base sensor - sensorMaps = append(sensorMaps, tus.SensorMap{Name: "execve_map", Progs: []uint{4, 5, 6, 7, 9}}) + sensorMaps = append(sensorMaps, tus.SensorMap{Name: "execve_map", Progs: []uint{4, 5, 6, 8, 10}}) // generic_kprobe_process_event*,generic_kprobe_output,generic_retkprobe_output - sensorMaps = append(sensorMaps, tus.SensorMap{Name: "tcpmon_map", Progs: []uint{1, 2, 6, 10}}) + sensorMaps = append(sensorMaps, tus.SensorMap{Name: "tcpmon_map", Progs: []uint{1, 2, 6, 7, 11}}) // generic_kprobe_process_event*,generic_kprobe_actions,retkprobe - sensorMaps = append(sensorMaps, tus.SensorMap{Name: "socktrack_map", Progs: []uint{1, 2, 5, 7, 9}}) + sensorMaps = append(sensorMaps, tus.SensorMap{Name: "socktrack_map", Progs: []uint{1, 2, 5, 7, 10}}) } else { // shared with base sensor - sensorMaps = append(sensorMaps, tus.SensorMap{Name: "execve_map", Progs: []uint{4, 7}}) + sensorMaps = append(sensorMaps, tus.SensorMap{Name: "execve_map", Progs: []uint{4, 8}}) // generic_kprobe_output,generic_retkprobe_output - sensorMaps = append(sensorMaps, tus.SensorMap{Name: "tcpmon_map", Progs: []uint{6, 10}}) + sensorMaps = append(sensorMaps, tus.SensorMap{Name: "tcpmon_map", Progs: []uint{6, 11}}) } readHook := ` 
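Background on the split: each program reached through a tail call is verified on its own, so dividing the work across two programs keeps each half under the instruction limit on kernels that enforce it. The first program stashes what the second needs (e->ret, e->common.size) on the shared heap entry before jumping, and tail_call() only returns on failure. A sketch of the program-array plumbing (sizing is illustrative, not the tree's exact value):

/* Sketch: the tail-call targets live in a program array; an index such
 * as TAIL_CALL_FILTER selects the next stage. max_entries is assumed.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 13); /* illustrative */
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} retkprobe_calls SEC(".maps");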
From 179404a1706386cab2e4f6cebb524f24e490393a Mon Sep 17 00:00:00 2001 From: Kevin Sheldrake Date: Wed, 10 Apr 2024 13:42:47 +0100 Subject: [PATCH 11/11] Raw_syscalls: fix selector indices in policies In some raw_syscalls policies the selector indices are set to the list index of the relevant argument instead of the actual index of the relevant argument. This usually doesn't break anything because these are the actual indices that are used in the config with the BPF programs. There is a potential for error, however, as the code tries to match the selector index number with an argument's index number; on a match it rewrites the selector index with the argument's list index. Therefore if the first argument to a function isn't listed in the args section then an explicit list index in the selectors section could reference the wrong argument. In order to discourage the direct use of list indices, the examples and test code have been changed so the selectors use the arg index and not its list index. Signed-off-by: Kevin Sheldrake --- examples/tracingpolicy/killer.yaml | 2 +- examples/tracingpolicy/list-syscalls-tracepoint.yaml | 2 +- pkg/sensors/tracing/enforcer_builder.go | 2 +- pkg/sensors/tracing/enforcer_test.go | 8 ++++---- pkg/sensors/tracing/tracepoint_amd64_test.go | 8 ++++---- pkg/sensors/tracing/tracepoint_test.go | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/tracingpolicy/killer.yaml b/examples/tracingpolicy/killer.yaml index 94a38a603cc..aa0a4230f42 100644 --- a/examples/tracingpolicy/killer.yaml +++ b/examples/tracingpolicy/killer.yaml @@ -21,7 +21,7 @@ spec: type: "syscall64" selectors: - matchArgs: - - index: 0 + - index: 4 operator: "InMap" values: - "list:dups" diff --git a/examples/tracingpolicy/list-syscalls-tracepoint.yaml b/examples/tracingpolicy/list-syscalls-tracepoint.yaml index a35be452132..c6c1d4acb29 100644 --- a/examples/tracingpolicy/list-syscalls-tracepoint.yaml +++ b/examples/tracingpolicy/list-syscalls-tracepoint.yaml @@ -17,7 +17,7 @@ spec: type: "uint64" selectors: - matchArgs: - - index: 0 + - index: 4 operator: "InMap" values: - "list:dups" diff --git a/pkg/sensors/tracing/enforcer_builder.go b/pkg/sensors/tracing/enforcer_builder.go index c01c05363b1..9ec0ec0e74d 100644 --- a/pkg/sensors/tracing/enforcer_builder.go +++ b/pkg/sensors/tracing/enforcer_builder.go @@ -184,7 +184,7 @@ func (ksb *EnforcerSpecBuilder) Build() (*v1alpha1.TracingPolicy, error) { }}, Selectors: []v1alpha1.KProbeSelector{{ MatchArgs: []v1alpha1.ArgSelector{{ - Index: 0, + Index: 4, Operator: operator, Values: listNames, }}, diff --git a/pkg/sensors/tracing/enforcer_test.go b/pkg/sensors/tracing/enforcer_test.go index 1b7943ce75f..2b9816c1b18 100644 --- a/pkg/sensors/tracing/enforcer_test.go +++ b/pkg/sensors/tracing/enforcer_test.go @@ -464,11 +464,11 @@ spec: type: "int64" selectors: - matchArgs: - - index: 0 + - index: 4 operator: "InMap" values: - "list:prctl" - - index: 1 + - index: 5 operator: "Equal" values: - 0xffff @@ -506,11 +506,11 @@ spec: type: "int64" selectors: - matchArgs: - - index: 0 + - index: 4 operator: "InMap" values: - "list:prctl" - - index: 1 + - index: 5 operator: "Equal" values: - 0xfffe diff --git a/pkg/sensors/tracing/tracepoint_amd64_test.go b/pkg/sensors/tracing/tracepoint_amd64_test.go index 82f60e6ff73..09da3c91139 100644 --- a/pkg/sensors/tracing/tracepoint_amd64_test.go +++ b/pkg/sensors/tracing/tracepoint_amd64_test.go @@ -51,9 +51,9 @@ func testListSyscallsDups(t *testing.T, checker *eventchecker.UnorderedEventChec 
assert.NoError(t, err) } -func TestTracepointListSyscallDups(t *testing.T) { +func TestTracepointListSyscallDupsEqual(t *testing.T) { if !kernels.MinKernelVersion("5.3.0") { - t.Skip("TestCopyFd requires at least 5.3.0 version") + t.Skip("TestTracepointListSyscallDupsEqual requires at least 5.3.0 version") } myPid := observertesthelper.GetMyPid() @@ -87,11 +87,11 @@ spec: values: - ` + pidStr + ` matchArgs: - - index: 0 + - index: 4 operator: "InMap" values: - "list:test" - - index: 1 + - index: 5 operator: "Equal" values: - 9999 diff --git a/pkg/sensors/tracing/tracepoint_test.go b/pkg/sensors/tracing/tracepoint_test.go index d49e1462ff7..2b754028382 100644 --- a/pkg/sensors/tracing/tracepoint_test.go +++ b/pkg/sensors/tracing/tracepoint_test.go @@ -824,11 +824,11 @@ spec: values: - ` + pidStr + ` matchArgs: - - index: 0 + - index: 4 operator: "InMap" values: - "list:test" - - index: 1 + - index: 5 operator: "InMap" values: - 9910:9920
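To make the index convention concrete: for the raw_syscalls tracepoints in these examples, the four common trace fields occupy indices 0-3, so the syscall id sits at argument index 4 and the first syscall argument at index 5, and selectors should reference those same numbers. A hedged illustration in the style of the policies above (the field comments are inferred from the diffs, not taken verbatim from the tree):

args:
- index: 4          # syscall id
  type: "syscall64"
- index: 5          # first syscall argument
  type: "uint64"
selectors:
- matchArgs:
  - index: 4        # matches the arg declared with index 4 above
    operator: "InMap"
    values:
    - "list:dups"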