Skip to content

Commit de8f3a8

Browse files
borkmann and davem330
authored and committed
bpf: add meta pointer for direct access
This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out such that we first point to data_hard_start, then data_meta directly prepended to data, followed by data_end marking the end of the packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as a DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF; thus, having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be a multiple of 4 bytes, up to 32 bytes in size. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1, as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail.
The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from its original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann <[email protected]> Acked-by: Alexei Starovoitov <[email protected]> Acked-by: John Fastabend <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 6aaae2b commit de8f3a8

File tree

19 files changed

+297
-42
lines changed

19 files changed

+297
-42
lines changed

drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c

+1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
9494

9595
xdp.data_hard_start = *data_ptr - offset;
9696
xdp.data = *data_ptr;
97+
xdp_set_data_meta_invalid(&xdp);
9798
xdp.data_end = *data_ptr + *len;
9899
orig_data = xdp.data;
99100
mapping = rx_buf->mapping - bp->rx_dma_offset;

drivers/net/ethernet/cavium/thunder/nicvf_main.c

+1
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
523523

524524
xdp.data_hard_start = page_address(page);
525525
xdp.data = (void *)cpu_addr;
526+
xdp_set_data_meta_invalid(&xdp);
526527
xdp.data_end = xdp.data + len;
527528
orig_data = xdp.data;
528529

drivers/net/ethernet/intel/i40e/i40e_txrx.c

+1
Original file line numberDiff line numberDiff line change
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
21072107
if (!skb) {
21082108
xdp.data = page_address(rx_buffer->page) +
21092109
rx_buffer->page_offset;
2110+
xdp_set_data_meta_invalid(&xdp);
21102111
xdp.data_hard_start = xdp.data -
21112112
i40e_rx_offset(rx_ring);
21122113
xdp.data_end = xdp.data + size;

drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

+1
Original file line numberDiff line numberDiff line change
@@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
23262326
if (!skb) {
23272327
xdp.data = page_address(rx_buffer->page) +
23282328
rx_buffer->page_offset;
2329+
xdp_set_data_meta_invalid(&xdp);
23292330
xdp.data_hard_start = xdp.data -
23302331
ixgbe_rx_offset(rx_ring);
23312332
xdp.data_end = xdp.data + size;

drivers/net/ethernet/mellanox/mlx4/en_rx.c

+1
Original file line numberDiff line numberDiff line change
@@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
762762

763763
xdp.data_hard_start = va - frags[0].page_offset;
764764
xdp.data = va;
765+
xdp_set_data_meta_invalid(&xdp);
765766
xdp.data_end = xdp.data + length;
766767
orig_data = xdp.data;
767768

drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

+1
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
794794
return false;
795795

796796
xdp.data = va + *rx_headroom;
797+
xdp_set_data_meta_invalid(&xdp);
797798
xdp.data_end = xdp.data + *len;
798799
xdp.data_hard_start = va;
799800

drivers/net/ethernet/netronome/nfp/nfp_net_common.c

+1
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
15831583

15841584
xdp.data_hard_start = hard_start;
15851585
xdp.data = data + *off;
1586+
xdp_set_data_meta_invalid(&xdp);
15861587
xdp.data_end = data + *off + *len;
15871588

15881589
orig_data = xdp.data;

drivers/net/ethernet/qlogic/qede/qede_fp.c

+1
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
10041004

10051005
xdp.data_hard_start = page_address(bd->data);
10061006
xdp.data = xdp.data_hard_start + *data_offset;
1007+
xdp_set_data_meta_invalid(&xdp);
10071008
xdp.data_end = xdp.data + *len;
10081009

10091010
/* Queues always have a full reset currently, so for the time

drivers/net/tun.c

+1
Original file line numberDiff line numberDiff line change
@@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
14681468

14691469
xdp.data_hard_start = buf;
14701470
xdp.data = buf + pad;
1471+
xdp_set_data_meta_invalid(&xdp);
14711472
xdp.data_end = xdp.data + len;
14721473
orig_data = xdp.data;
14731474
act = bpf_prog_run_xdp(xdp_prog, &xdp);

drivers/net/virtio_net.c

+2
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
554554

555555
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
556556
xdp.data = xdp.data_hard_start + xdp_headroom;
557+
xdp_set_data_meta_invalid(&xdp);
557558
xdp.data_end = xdp.data + len;
558559
orig_data = xdp.data;
559560
act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
686687
data = page_address(xdp_page) + offset;
687688
xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
688689
xdp.data = data + vi->hdr_len;
690+
xdp_set_data_meta_invalid(&xdp);
689691
xdp.data_end = xdp.data + (len - vi->hdr_len);
690692
act = bpf_prog_run_xdp(xdp_prog, &xdp);
691693

include/linux/bpf.h

+1
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ enum bpf_reg_type {
137137
PTR_TO_MAP_VALUE, /* reg points to map element value */
138138
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
139139
PTR_TO_STACK, /* reg == frame_pointer + offset */
140+
PTR_TO_PACKET_META, /* skb->data - meta_len */
140141
PTR_TO_PACKET, /* reg points to skb->data */
141142
PTR_TO_PACKET_END, /* skb->data + headlen */
142143
};

include/linux/filter.h

+19-2
Original file line numberDiff line numberDiff line change
@@ -487,12 +487,14 @@ struct sk_filter {
487487

488488
struct bpf_skb_data_end {
489489
struct qdisc_skb_cb qdisc_cb;
490+
void *data_meta;
490491
void *data_end;
491492
};
492493

493494
struct xdp_buff {
494495
void *data;
495496
void *data_end;
497+
void *data_meta;
496498
void *data_hard_start;
497499
};
498500

@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
507509
struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
508510

509511
BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
510-
cb->data_end = skb->data + skb_headlen(skb);
512+
cb->data_meta = skb->data - skb_metadata_len(skb);
513+
cb->data_end = skb->data + skb_headlen(skb);
511514
}
512515

513516
static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
728731
struct bpf_prog *prog);
729732
void xdp_do_flush_map(void);
730733

734+
/* Drivers not supporting XDP metadata can use this helper, which
735+
* rejects any room expansion for metadata as a result.
736+
*/
737+
static __always_inline void
738+
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
739+
{
740+
xdp->data_meta = xdp->data + 1;
741+
}
742+
743+
static __always_inline bool
744+
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
745+
{
746+
return unlikely(xdp->data_meta > xdp->data);
747+
}
748+
731749
void bpf_warn_invalid_xdp_action(u32 act);
732-
void bpf_warn_invalid_xdp_redirect(u32 ifindex);
733750

734751
struct sock *do_sk_redirect_map(void);
735752

include/linux/skbuff.h

+66-2
Original file line numberDiff line numberDiff line change
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
489489
* the end of the header data, ie. at skb->end.
490490
*/
491491
struct skb_shared_info {
492-
unsigned short _unused;
493-
unsigned char nr_frags;
492+
__u8 __unused;
493+
__u8 meta_len;
494+
__u8 nr_frags;
494495
__u8 tx_flags;
495496
unsigned short gso_size;
496497
/* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
34003401
return 0;
34013402
}
34023403

3404+
static inline u8 skb_metadata_len(const struct sk_buff *skb)
3405+
{
3406+
return skb_shinfo(skb)->meta_len;
3407+
}
3408+
3409+
static inline void *skb_metadata_end(const struct sk_buff *skb)
3410+
{
3411+
return skb_mac_header(skb);
3412+
}
3413+
3414+
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
3415+
const struct sk_buff *skb_b,
3416+
u8 meta_len)
3417+
{
3418+
const void *a = skb_metadata_end(skb_a);
3419+
const void *b = skb_metadata_end(skb_b);
3420+
/* Using a more efficient variant than a plain call to memcmp(). */
3421+
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
3422+
u64 diffs = 0;
3423+
3424+
switch (meta_len) {
3425+
#define __it(x, op) (x -= sizeof(u##op))
3426+
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
3427+
case 32: diffs |= __it_diff(a, b, 64);
3428+
case 24: diffs |= __it_diff(a, b, 64);
3429+
case 16: diffs |= __it_diff(a, b, 64);
3430+
case 8: diffs |= __it_diff(a, b, 64);
3431+
break;
3432+
case 28: diffs |= __it_diff(a, b, 64);
3433+
case 20: diffs |= __it_diff(a, b, 64);
3434+
case 12: diffs |= __it_diff(a, b, 64);
3435+
case 4: diffs |= __it_diff(a, b, 32);
3436+
break;
3437+
}
3438+
return diffs;
3439+
#else
3440+
return memcmp(a - meta_len, b - meta_len, meta_len);
3441+
#endif
3442+
}
3443+
3444+
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
3445+
const struct sk_buff *skb_b)
3446+
{
3447+
u8 len_a = skb_metadata_len(skb_a);
3448+
u8 len_b = skb_metadata_len(skb_b);
3449+
3450+
if (!(len_a | len_b))
3451+
return false;
3452+
3453+
return len_a != len_b ?
3454+
true : __skb_metadata_differs(skb_a, skb_b, len_a);
3455+
}
3456+
3457+
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
3458+
{
3459+
skb_shinfo(skb)->meta_len = meta_len;
3460+
}
3461+
3462+
static inline void skb_metadata_clear(struct sk_buff *skb)
3463+
{
3464+
skb_metadata_set(skb, 0);
3465+
}
3466+
34033467
struct sk_buff *skb_clone_sk(struct sk_buff *skb);
34043468

34053469
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

include/uapi/linux/bpf.h

+12-1
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,12 @@ union bpf_attr {
582582
* @map: pointer to sockmap to update
583583
* @key: key to insert/update sock in map
584584
* @flags: same flags as map update elem
585+
*
586+
* int bpf_xdp_adjust_meta(xdp_md, delta)
587+
* Adjust the xdp_md.data_meta by delta
588+
* @xdp_md: pointer to xdp_md
589+
* @delta: A positive/negative integer to be added to xdp_md.data_meta
590+
* Return: 0 on success or negative on error
585591
*/
586592
#define __BPF_FUNC_MAPPER(FN) \
587593
FN(unspec), \
@@ -638,6 +644,7 @@ union bpf_attr {
638644
FN(redirect_map), \
639645
FN(sk_redirect_map), \
640646
FN(sock_map_update), \
647+
FN(xdp_adjust_meta),
641648

642649
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
643650
* function eBPF program intends to call
@@ -715,14 +722,17 @@ struct __sk_buff {
715722
__u32 data_end;
716723
__u32 napi_id;
717724

718-
/* accessed by BPF_PROG_TYPE_sk_skb types */
725+
/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
719726
__u32 family;
720727
__u32 remote_ip4; /* Stored in network byte order */
721728
__u32 local_ip4; /* Stored in network byte order */
722729
__u32 remote_ip6[4]; /* Stored in network byte order */
723730
__u32 local_ip6[4]; /* Stored in network byte order */
724731
__u32 remote_port; /* Stored in network byte order */
725732
__u32 local_port; /* stored in host byte order */
733+
/* ... here. */
734+
735+
__u32 data_meta;
726736
};
727737

728738
struct bpf_tunnel_key {
@@ -783,6 +793,7 @@ enum xdp_action {
783793
struct xdp_md {
784794
__u32 data;
785795
__u32 data_end;
796+
__u32 data_meta;
786797
};
787798

788799
enum sk_action {

0 commit comments

Comments
 (0)