
Commit 908432c

yonghong-song authored and davem330 committed
bpf: add helper bpf_perf_event_read_value for perf event array map
Hardware pmu counters are limited resources. When there are more pmu based perf events opened than available counters, the kernel will multiplex these events so each event gets a certain percentage (but not 100%) of the pmu time. When multiplexing happens, the number of samples or the counter value will not match what would have been observed without multiplexing, which makes comparison between different runs difficult.

Typically, the number of samples or the counter value should be normalized before comparing against other experiments. The usual normalization is:

    normalized_num_samples = num_samples * time_enabled / time_running
    normalized_counter_value = counter_value * time_enabled / time_running

where time_enabled is the time the event was enabled and time_running is the time the event was actually running since the last normalization.

This patch adds the helper bpf_perf_event_read_value for kprobe based perf event array maps, to read the perf counter along with its enabled/running time. The enabled/running time is accumulated since the perf event open. To compute the scaling factor between two bpf invocations, users can use the cpu_id as the key (which is the typical usage model for a perf event array) to remember the previous values and do the calculation inside the bpf program.

Signed-off-by: Yonghong Song <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: Daniel Borkmann <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
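To make the usage model above concrete, here is a minimal, hypothetical kprobe program sketch (not part of this commit) that calls the new helper and keeps the previous readings keyed by cpu_id, so the deltas and the scaling factor can be computed between two invocations. The map names and sizes, the attach point, and the use of current libbpf conventions (bpf_helpers.h, SEC(".maps")) are assumptions for illustration only:

/* Hypothetical usage sketch, not part of this commit. */
#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 64);		/* one slot per possible CPU (assumed <= 64) */
} counters SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);	/* previous reading, keyed by cpu_id */
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, struct bpf_perf_event_value);
} prev SEC(".maps");

SEC("kprobe/some_kernel_function")		/* attach point is an assumption */
int measure(struct pt_regs *ctx)
{
	struct bpf_perf_event_value cur = {};
	struct bpf_perf_event_value *last;
	__u32 cpu = bpf_get_smp_processor_id();

	/* Counter value plus enabled/running time for this CPU's event. */
	if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU, &cur, sizeof(cur)))
		return 0;

	last = bpf_map_lookup_elem(&prev, &cpu);
	if (last && cur.running > last->running) {
		/* Deltas since the previous invocation on this CPU ... */
		__u64 dcounter = cur.counter - last->counter;
		__u64 denabled = cur.enabled - last->enabled;
		__u64 drunning = cur.running - last->running;
		/* ... normalized as counter * time_enabled / time_running. */
		__u64 normalized = dcounter * denabled / drunning;

		bpf_printk("normalized delta %llu\n", normalized);
	}
	if (last)
		*last = cur;
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the new helper is gpl_only */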
1 parent 9756263 commit 908432c

3 files changed, 63 insertions(+), 7 deletions(-)

include/uapi/linux/bpf.h (+19, -2)

@@ -641,6 +641,14 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: An positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -697,7 +705,8 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
-	FN(xdp_adjust_meta),
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +750,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)

-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -934,4 +945,10 @@ enum {
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */

+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
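The new struct and the helper documentation above describe the BPF-program side; for completeness, a user-space loader still has to create the perf events and store their file descriptors in the perf_event_array before the helper can read anything. A hypothetical sketch (not part of this commit), assuming the map fd is already known and using libbpf's bpf_map_update_elem() wrapper:

/* Hypothetical loader-side sketch: open one PERF_COUNT_HW_CPU_CYCLES event
 * per CPU and store its fd at index <cpu> of the BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 * so bpf_perf_event_read_value(map, BPF_F_CURRENT_CPU, ...) finds it.
 */
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <bpf/bpf.h>		/* libbpf's bpf_map_update_elem() wrapper */

static int open_cycles_counter(int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* pid = -1, cpu = <cpu>: count on one CPU, for all tasks */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

/* map_fd is assumed to refer to the perf_event_array used by the BPF program */
static int populate_perf_array(int map_fd, int ncpus)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		int pmu_fd = open_cycles_counter(cpu);

		if (pmu_fd < 0)
			return -1;
		if (bpf_map_update_elem(map_fd, &cpu, &pmu_fd, BPF_ANY))
			return -1;
	}
	return 0;
}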

kernel/bpf/verifier.c (+3, -1)

@@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
-		    func_id != BPF_FUNC_perf_event_output)
+		    func_id != BPF_FUNC_perf_event_output &&
+		    func_id != BPF_FUNC_perf_event_read_value)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_perf_event_read_value:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;

kernel/trace/bpf_trace.c (+41, -4)

@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }

-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+		     u64 *value, u64 *enabled, u64 *running)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
-	u64 value = 0;
-	int err;

 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;

-	err = perf_event_read_local(ee->event, &value, NULL, NULL);
+	return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+	u64 value = 0;
+	int err;
+
+	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };

+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+				   &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+	.func		= bpf_perf_event_read_value,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);

 static __always_inline u64
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_perf_event_read_value:
+		return &bpf_perf_event_read_value_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
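The in-kernel comment above ("this api is ugly since we miss [-22..-2] range of valid counter values") is the motivation for returning the data through a buffer: the new helper keeps the error code and the counter value separate. A small, hypothetical program fragment illustrating the difference; the attach point and the "counters" perf_event_array map (same definition as the earlier sketch) are assumptions:

/* Hypothetical comparison of the two helpers' error reporting. */
SEC("kprobe/some_kernel_function")		/* attach point is an assumption */
int compare_helpers(struct pt_regs *ctx)
{
	struct bpf_perf_event_value v = {};
	__u64 raw;
	int err;

	/* Older helper: counter value and negative errno share the return
	 * value, so errors such as -EINVAL (-22) overlap with valid counters.
	 */
	raw = bpf_perf_event_read(&counters, BPF_F_CURRENT_CPU);
	bpf_printk("raw value or error: %llu\n", raw);

	/* New helper: the return value is only an error code; the data goes
	 * through the buffer, which is zeroed on failure.
	 */
	err = bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU, &v, sizeof(v));
	if (!err)
		bpf_printk("counter=%llu enabled=%llu\n", v.counter, v.enabled);
	return 0;
}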
