Skip to content

Commit 2dbb9b9

Browse files
iamkafai authored and borkmann committed
bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT
This patch adds BPF_PROG_TYPE_SK_REUSEPORT, a program type which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. Like other non SK_FILTER/CGROUP_SKB programs, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using skb->cb[48]. At SO_REUSEPORT sk lookup time, packet processing is in the middle of transitioning from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in skb->cb[48] to avoid saving and restoring cb[]. Even putting aside the differences between ipv4-vs-ipv6 and udp-vs-tcp, it is not clear whether the lower layer will only be ipv4 or ipv6 in the future, or whether it will leave cb[] untouched before transitioning to the upper layer. For example, udp_gro_receive() uses the 48-byte NAPI_GRO_CB instead of IP[6]CB, and it may still modify cb[] after calling udp[46]_lib_lookup_skb(). For the above reasons, if skb->cb is used for the bpf ctx, saving and restoring is needed, and likely the whole 48-byte cb[] has to be saved and restored. Instead of saving, setting and restoring cb[], this patch opts to create a new "struct sk_reuseport_kern" and set the needed values there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol-specific usage at this point, which is also in line with the current sock_reuseport.c implementation (i.e. no protocol-specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantics similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" field tells the bpf prog that the reuseport group is bound to a local INANY address, which cannot be learned from the skb. 
The new "bind_inany" is added to "struct sock_reuseport", which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport"-enabled sk is being added to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked at run time in "sk_select_reuseport()". Doing the check at verification time is difficult and inflexible (consider the map-in-map use case). The runtime check compares the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. the reuseport_id does not match). The bpf prog can decide whether it wants to do SK_DROP at its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If so, it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau <[email protected]> Acked-by: Alexei Starovoitov <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]>
1 parent 5dc4c4b commit 2dbb9b9

File tree

11 files changed

+365
-13
lines changed

11 files changed

+365
-13
lines changed

include/linux/bpf_types.h

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
2929
#ifdef CONFIG_BPF_LIRC_MODE2
3030
BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
3131
#endif
32+
#ifdef CONFIG_INET
33+
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
34+
#endif
3235

3336
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
3437
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)

include/linux/filter.h

+15
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ struct seccomp_data;
3232
struct bpf_prog_aux;
3333
struct xdp_rxq_info;
3434
struct xdp_buff;
35+
struct sock_reuseport;
3536

3637
/* ArgX, context and stack frame pointer register positions. Note,
3738
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act);
833834
struct sock *do_sk_redirect_map(struct sk_buff *skb);
834835
struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
835836

837+
#ifdef CONFIG_INET
838+
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
839+
struct bpf_prog *prog, struct sk_buff *skb,
840+
u32 hash);
841+
#else
842+
static inline struct sock *
843+
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
844+
struct bpf_prog *prog, struct sk_buff *skb,
845+
u32 hash)
846+
{
847+
return NULL;
848+
}
849+
#endif
850+
836851
#ifdef CONFIG_BPF_JIT
837852
extern int bpf_jit_enable;
838853
extern int bpf_jit_harden;

include/net/addrconf.h

+1
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
108108
u32 banned_flags);
109109
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
110110
bool match_wildcard);
111+
bool inet_rcv_saddr_any(const struct sock *sk);
111112
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
112113
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
113114

include/net/sock_reuseport.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ struct sock_reuseport {
2121
unsigned int synq_overflow_ts;
2222
/* ID stays the same even after the size of socks[] grows. */
2323
unsigned int reuseport_id;
24+
bool bind_inany;
2425
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
2526
struct sock *socks[0]; /* array of sock pointers */
2627
};
2728

28-
extern int reuseport_alloc(struct sock *sk);
29-
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
29+
extern int reuseport_alloc(struct sock *sk, bool bind_inany);
30+
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
31+
bool bind_inany);
3032
extern void reuseport_detach_sock(struct sock *sk);
3133
extern struct sock *reuseport_select_sock(struct sock *sk,
3234
u32 hash,

include/uapi/linux/bpf.h

+35-1
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ enum bpf_prog_type {
151151
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
152152
BPF_PROG_TYPE_LWT_SEG6LOCAL,
153153
BPF_PROG_TYPE_LIRC_MODE2,
154+
BPF_PROG_TYPE_SK_REUSEPORT,
154155
};
155156

156157
enum bpf_attach_type {
@@ -2114,6 +2115,14 @@ union bpf_attr {
21142115
* the shared data.
21152116
* Return
21162117
* Pointer to the local storage area.
2118+
*
2119+
* int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
2120+
* Description
2121+
* Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
2122+
* It checks that the selected sk matches the incoming
2123+
* request in the skb.
2124+
* Return
2125+
* 0 on success, or a negative error in case of failure.
21172126
*/
21182127
#define __BPF_FUNC_MAPPER(FN) \
21192128
FN(unspec), \
@@ -2197,7 +2206,8 @@ union bpf_attr {
21972206
FN(rc_keydown), \
21982207
FN(skb_cgroup_id), \
21992208
FN(get_current_cgroup_id), \
2200-
FN(get_local_storage),
2209+
FN(get_local_storage), \
2210+
FN(sk_select_reuseport),
22012211

22022212
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
22032213
* function eBPF program intends to call
@@ -2414,6 +2424,30 @@ struct sk_msg_md {
24142424
__u32 local_port; /* stored in host byte order */
24152425
};
24162426

2427+
struct sk_reuseport_md {
2428+
/*
2429+
* Start of directly accessible data. It begins from
2430+
* the tcp/udp header.
2431+
*/
2432+
void *data;
2433+
void *data_end; /* End of directly accessible data */
2434+
/*
2435+
* Total length of packet (starting from the tcp/udp header).
2436+
* Note that the directly accessible bytes (data_end - data)
2437+
* could be less than this "len". Those bytes could be
2438+
* indirectly read by a helper "bpf_skb_load_bytes()".
2439+
*/
2440+
__u32 len;
2441+
/*
2442+
* Eth protocol in the mac header (network byte order). e.g.
2443+
* ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
2444+
*/
2445+
__u32 eth_protocol;
2446+
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
2447+
__u32 bind_inany; /* Is sock bound to an INANY address? */
2448+
__u32 hash; /* A hash of the packet 4 tuples */
2449+
};
2450+
24172451
#define BPF_TAG_SIZE 8
24182452

24192453
struct bpf_prog_info {

kernel/bpf/verifier.c

+9
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
13101310
case BPF_PROG_TYPE_LWT_IN:
13111311
case BPF_PROG_TYPE_LWT_OUT:
13121312
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
1313+
case BPF_PROG_TYPE_SK_REUSEPORT:
13131314
/* dst_input() and dst_output() can't write for now */
13141315
if (t == BPF_WRITE)
13151316
return false;
@@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
21662167
func_id != BPF_FUNC_msg_redirect_hash)
21672168
goto error;
21682169
break;
2170+
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
2171+
if (func_id != BPF_FUNC_sk_select_reuseport)
2172+
goto error;
2173+
break;
21692174
default:
21702175
break;
21712176
}
@@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
22172222
if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
22182223
goto error;
22192224
break;
2225+
case BPF_FUNC_sk_select_reuseport:
2226+
if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
2227+
goto error;
2228+
break;
22202229
default:
22212230
break;
22222231
}

0 commit comments

Comments
 (0)