Skip to content

Commit 6225827

Browse files
Alexei Starovoitov authored and davem330 committed
net: filter: x86: internal BPF JIT
Maps all internal BPF instructions into x86_64 instructions. This patch replaces original BPF x64 JIT with internal BPF x64 JIT. sysctl net.core.bpf_jit_enable is reused as on/off switch. Performance: 1. old BPF JIT and internal BPF JIT generate equivalent x86_64 code. No performance difference is observed for filters that were JIT-able before Example assembler code for BPF filter "tcpdump port 22" original BPF -> old JIT: original BPF -> internal BPF -> new JIT: 0: push %rbp 0: push %rbp 1: mov %rsp,%rbp 1: mov %rsp,%rbp 4: sub $0x60,%rsp 4: sub $0x228,%rsp 8: mov %rbx,-0x8(%rbp) b: mov %rbx,-0x228(%rbp) // prologue 12: mov %r13,-0x220(%rbp) 19: mov %r14,-0x218(%rbp) 20: mov %r15,-0x210(%rbp) 27: xor %eax,%eax // clear A c: xor %ebx,%ebx 29: xor %r13,%r13 // clear X e: mov 0x68(%rdi),%r9d 2c: mov 0x68(%rdi),%r9d 12: sub 0x6c(%rdi),%r9d 30: sub 0x6c(%rdi),%r9d 16: mov 0xd8(%rdi),%r8 34: mov 0xd8(%rdi),%r10 3b: mov %rdi,%rbx 1d: mov $0xc,%esi 3e: mov $0xc,%esi 22: callq 0xffffffffe1021e15 43: callq 0xffffffffe102bd75 27: cmp $0x86dd,%eax 48: cmp $0x86dd,%rax 2c: jne 0x0000000000000069 4f: jne 0x000000000000009a 2e: mov $0x14,%esi 51: mov $0x14,%esi 33: callq 0xffffffffe1021e31 56: callq 0xffffffffe102bd91 38: cmp $0x84,%eax 5b: cmp $0x84,%rax 3d: je 0x0000000000000049 62: je 0x0000000000000074 3f: cmp $0x6,%eax 64: cmp $0x6,%rax 42: je 0x0000000000000049 68: je 0x0000000000000074 44: cmp $0x11,%eax 6a: cmp $0x11,%rax 47: jne 0x00000000000000c6 6e: jne 0x0000000000000117 49: mov $0x36,%esi 74: mov $0x36,%esi 4e: callq 0xffffffffe1021e15 79: callq 0xffffffffe102bd75 53: cmp $0x16,%eax 7e: cmp $0x16,%rax 56: je 0x00000000000000bf 82: je 0x0000000000000110 58: mov $0x38,%esi 88: mov $0x38,%esi 5d: callq 0xffffffffe1021e15 8d: callq 0xffffffffe102bd75 62: cmp $0x16,%eax 92: cmp $0x16,%rax 65: je 0x00000000000000bf 96: je 0x0000000000000110 67: jmp 0x00000000000000c6 98: jmp 0x0000000000000117 69: cmp $0x800,%eax 9a: cmp $0x800,%rax 6e: jne 0x00000000000000c6 a1: jne 
0x0000000000000117 70: mov $0x17,%esi a3: mov $0x17,%esi 75: callq 0xffffffffe1021e31 a8: callq 0xffffffffe102bd91 7a: cmp $0x84,%eax ad: cmp $0x84,%rax 7f: je 0x000000000000008b b4: je 0x00000000000000c2 81: cmp $0x6,%eax b6: cmp $0x6,%rax 84: je 0x000000000000008b ba: je 0x00000000000000c2 86: cmp $0x11,%eax bc: cmp $0x11,%rax 89: jne 0x00000000000000c6 c0: jne 0x0000000000000117 8b: mov $0x14,%esi c2: mov $0x14,%esi 90: callq 0xffffffffe1021e15 c7: callq 0xffffffffe102bd75 95: test $0x1fff,%ax cc: test $0x1fff,%rax 99: jne 0x00000000000000c6 d3: jne 0x0000000000000117 d5: mov %rax,%r14 9b: mov $0xe,%esi d8: mov $0xe,%esi a0: callq 0xffffffffe1021e44 dd: callq 0xffffffffe102bd91 // MSH e2: and $0xf,%eax e5: shl $0x2,%eax e8: mov %rax,%r13 eb: mov %r14,%rax ee: mov %r13,%rsi a5: lea 0xe(%rbx),%esi f1: add $0xe,%esi a8: callq 0xffffffffe1021e0d f4: callq 0xffffffffe102bd6d ad: cmp $0x16,%eax f9: cmp $0x16,%rax b0: je 0x00000000000000bf fd: je 0x0000000000000110 ff: mov %r13,%rsi b2: lea 0x10(%rbx),%esi 102: add $0x10,%esi b5: callq 0xffffffffe1021e0d 105: callq 0xffffffffe102bd6d ba: cmp $0x16,%eax 10a: cmp $0x16,%rax bd: jne 0x00000000000000c6 10e: jne 0x0000000000000117 bf: mov $0xffff,%eax 110: mov $0xffff,%eax c4: jmp 0x00000000000000c8 115: jmp 0x000000000000011c c6: xor %eax,%eax 117: mov $0x0,%eax c8: mov -0x8(%rbp),%rbx 11c: mov -0x228(%rbp),%rbx // epilogue cc: leaveq 123: mov -0x220(%rbp),%r13 cd: retq 12a: mov -0x218(%rbp),%r14 131: mov -0x210(%rbp),%r15 138: leaveq 139: retq On fully cached SKBs both JITed functions take 12 nsec to execute. BPF interpreter executes the program in 30 nsec. The difference in generated assembler is due to the following: Old BPF imlements LDX_MSH instruction via sk_load_byte_msh() helper function inside bpf_jit.S. New JIT removes the helper and does it explicitly, so ldx_msh cost is the same for both JITs, but generated code looks longer. 
New JIT has 4 registers to save, so prologue/epilogue are larger, but the cost is within noise on x64. Old JIT checks whether first insn clears A and if not emits 'xor %eax,%eax'. New JIT clears %rax unconditionally. 2. old BPF JIT doesn't support ANC_NLATTR, ANC_PAY_OFFSET, ANC_RANDOM extensions. New JIT supports all BPF extensions. Performance of such filters improves 2-4 times depending on a filter. The longer the filter the higher performance gain. Synthetic benchmarks with many ancillary loads see 20x speedup which seems to be the maximum gain from JIT Notes: . net.core.bpf_jit_enable=2 + tools/net/bpf_jit_disasm is still functional and can be used to see generated assembler . there are two jit_compile() functions and code flow for classic filters is: sk_attach_filter() - load classic BPF bpf_jit_compile() - try to JIT from classic BPF sk_convert_filter() - convert classic to internal bpf_int_jit_compile() - JIT from internal BPF seccomp and tracing filters will just call bpf_int_jit_compile() Signed-off-by: Alexei Starovoitov <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent f3c2af7 commit 6225827

File tree

4 files changed

+748
-655
lines changed

4 files changed

+748
-655
lines changed

arch/x86/net/bpf_jit.S

+18-59
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,16 @@
1212

1313
/*
1414
* Calling convention :
15-
* rdi : skb pointer
15+
* rbx : skb pointer (callee saved)
1616
* esi : offset of byte(s) to fetch in skb (can be scratched)
17-
* r8 : copy of skb->data
17+
* r10 : copy of skb->data
1818
* r9d : hlen = skb->len - skb->data_len
1919
*/
20-
#define SKBDATA %r8
20+
#define SKBDATA %r10
2121
#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */
22+
#define MAX_BPF_STACK (512 /* from filter.h */ + \
23+
32 /* space for rbx,r13,r14,r15 */ + \
24+
8 /* space for skb_copy_bits */)
2225

2326
sk_load_word:
2427
.globl sk_load_word
@@ -68,75 +71,43 @@ sk_load_byte_positive_offset:
6871
movzbl (SKBDATA,%rsi),%eax
6972
ret
7073

71-
/**
72-
* sk_load_byte_msh - BPF_S_LDX_B_MSH helper
73-
*
74-
* Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
75-
* Must preserve A accumulator (%eax)
76-
* Inputs : %esi is the offset value
77-
*/
78-
sk_load_byte_msh:
79-
.globl sk_load_byte_msh
80-
test %esi,%esi
81-
js bpf_slow_path_byte_msh_neg
82-
83-
sk_load_byte_msh_positive_offset:
84-
.globl sk_load_byte_msh_positive_offset
85-
cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
86-
jle bpf_slow_path_byte_msh
87-
movzbl (SKBDATA,%rsi),%ebx
88-
and $15,%bl
89-
shl $2,%bl
90-
ret
91-
9274
/* rsi contains offset and can be scratched */
9375
#define bpf_slow_path_common(LEN) \
94-
push %rdi; /* save skb */ \
76+
mov %rbx, %rdi; /* arg1 == skb */ \
9577
push %r9; \
9678
push SKBDATA; \
9779
/* rsi already has offset */ \
9880
mov $LEN,%ecx; /* len */ \
99-
lea -12(%rbp),%rdx; \
81+
lea - MAX_BPF_STACK + 32(%rbp),%rdx; \
10082
call skb_copy_bits; \
10183
test %eax,%eax; \
10284
pop SKBDATA; \
103-
pop %r9; \
104-
pop %rdi
85+
pop %r9;
10586

10687

10788
bpf_slow_path_word:
10889
bpf_slow_path_common(4)
10990
js bpf_error
110-
mov -12(%rbp),%eax
91+
mov - MAX_BPF_STACK + 32(%rbp),%eax
11192
bswap %eax
11293
ret
11394

11495
bpf_slow_path_half:
11596
bpf_slow_path_common(2)
11697
js bpf_error
117-
mov -12(%rbp),%ax
98+
mov - MAX_BPF_STACK + 32(%rbp),%ax
11899
rol $8,%ax
119100
movzwl %ax,%eax
120101
ret
121102

122103
bpf_slow_path_byte:
123104
bpf_slow_path_common(1)
124105
js bpf_error
125-
movzbl -12(%rbp),%eax
126-
ret
127-
128-
bpf_slow_path_byte_msh:
129-
xchg %eax,%ebx /* dont lose A , X is about to be scratched */
130-
bpf_slow_path_common(1)
131-
js bpf_error
132-
movzbl -12(%rbp),%eax
133-
and $15,%al
134-
shl $2,%al
135-
xchg %eax,%ebx
106+
movzbl - MAX_BPF_STACK + 32(%rbp),%eax
136107
ret
137108

138109
#define sk_negative_common(SIZE) \
139-
push %rdi; /* save skb */ \
110+
mov %rbx, %rdi; /* arg1 == skb */ \
140111
push %r9; \
141112
push SKBDATA; \
142113
/* rsi already has offset */ \
@@ -145,10 +116,8 @@ bpf_slow_path_byte_msh:
145116
test %rax,%rax; \
146117
pop SKBDATA; \
147118
pop %r9; \
148-
pop %rdi; \
149119
jz bpf_error
150120

151-
152121
bpf_slow_path_word_neg:
153122
cmp SKF_MAX_NEG_OFF, %esi /* test range */
154123
jl bpf_error /* offset lower -> error */
@@ -179,22 +148,12 @@ sk_load_byte_negative_offset:
179148
movzbl (%rax), %eax
180149
ret
181150

182-
bpf_slow_path_byte_msh_neg:
183-
cmp SKF_MAX_NEG_OFF, %esi
184-
jl bpf_error
185-
sk_load_byte_msh_negative_offset:
186-
.globl sk_load_byte_msh_negative_offset
187-
xchg %eax,%ebx /* dont lose A , X is about to be scratched */
188-
sk_negative_common(1)
189-
movzbl (%rax),%eax
190-
and $15,%al
191-
shl $2,%al
192-
xchg %eax,%ebx
193-
ret
194-
195151
bpf_error:
196152
# force a return 0 from jit handler
197-
xor %eax,%eax
198-
mov -8(%rbp),%rbx
153+
xor %eax,%eax
154+
mov - MAX_BPF_STACK(%rbp),%rbx
155+
mov - MAX_BPF_STACK + 8(%rbp),%r13
156+
mov - MAX_BPF_STACK + 16(%rbp),%r14
157+
mov - MAX_BPF_STACK + 24(%rbp),%r15
199158
leaveq
200159
ret

0 commit comments

Comments (0)