authorDavid S. Miller <davem@davemloft.net>2018-05-07 23:35:08 -0400
committerDavid S. Miller <davem@davemloft.net>2018-05-07 23:35:08 -0400
commit01adc4851a8090b46c7a5ed9cfc4b97e65abfbf4 (patch)
tree2ae02593d7139962648dff203f3f9701e34ccbc3
parent18b338f5f9539512e76fd9ebd4c6ca1a0e159e2b (diff)
parente94fa1d93117e7f1eb783dc9cae6c70650944449 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Minor conflict, a CHECK was placed into an if() statement in net-next,
whilst a newline was added to that CHECK call in 'net'. Thanks to Daniel
for the merge resolution.

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  Documentation/networking/af_xdp.rst | 297
-rw-r--r--  Documentation/networking/filter.txt | 6
-rw-r--r--  Documentation/networking/index.rst | 1
-rw-r--r--  Documentation/sysctl/net.txt | 1
-rw-r--r--  MAINTAINERS | 9
-rw-r--r--  arch/arm/net/bpf_jit_32.c | 77
-rw-r--r--  arch/arm64/net/bpf_jit_comp.c | 65
-rw-r--r--  arch/mips/net/ebpf_jit.c | 104
-rw-r--r--  arch/powerpc/net/Makefile | 2
-rw-r--r--  arch/powerpc/net/bpf_jit64.h | 37
-rw-r--r--  arch/powerpc/net/bpf_jit_asm64.S | 180
-rw-r--r--  arch/powerpc/net/bpf_jit_comp64.c | 109
-rw-r--r--  arch/s390/net/Makefile | 2
-rw-r--r--  arch/s390/net/bpf_jit.S | 116
-rw-r--r--  arch/s390/net/bpf_jit.h | 20
-rw-r--r--  arch/s390/net/bpf_jit_comp.c | 127
-rw-r--r--  arch/sparc/net/Makefile | 5
-rw-r--r--  arch/sparc/net/bpf_jit_64.h | 29
-rw-r--r--  arch/sparc/net/bpf_jit_asm_64.S | 162
-rw-r--r--  arch/sparc/net/bpf_jit_comp_64.c | 79
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 30
-rw-r--r--  arch/x86/net/Makefile | 7
-rw-r--r--  arch/x86/net/bpf_jit.S | 154
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 343
-rw-r--r--  arch/x86/net/bpf_jit_comp32.c | 2419
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 16
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/fw.h | 20
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/jit.c | 76
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/main.c | 28
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/main.h | 24
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/offload.c | 172
-rw-r--r--  drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 78
-rw-r--r--  drivers/net/ethernet/netronome/nfp/nfp_app.c | 2
-rw-r--r--  include/linux/bpf.h | 35
-rw-r--r--  include/linux/bpf_trace.h | 1
-rw-r--r--  include/linux/bpf_types.h | 3
-rw-r--r--  include/linux/bpf_verifier.h | 9
-rw-r--r--  include/linux/filter.h | 9
-rw-r--r--  include/linux/netdevice.h | 1
-rw-r--r--  include/linux/socket.h | 5
-rw-r--r--  include/linux/tnum.h | 4
-rw-r--r--  include/net/xdp.h | 1
-rw-r--r--  include/net/xdp_sock.h | 66
-rw-r--r--  include/trace/events/bpf.h | 355
-rw-r--r--  include/uapi/linux/bpf.h | 94
-rw-r--r--  include/uapi/linux/if_xdp.h | 87
-rw-r--r--  kernel/bpf/Makefile | 3
-rw-r--r--  kernel/bpf/core.c | 108
-rw-r--r--  kernel/bpf/inode.c | 16
-rw-r--r--  kernel/bpf/offload.c | 6
-rw-r--r--  kernel/bpf/stackmap.c | 80
-rw-r--r--  kernel/bpf/syscall.c | 17
-rw-r--r--  kernel/bpf/tnum.c | 10
-rw-r--r--  kernel/bpf/verifier.c | 247
-rw-r--r--  kernel/bpf/xskmap.c | 241
-rw-r--r--  kernel/trace/bpf_trace.c | 52
-rw-r--r--  lib/test_bpf.c | 570
-rw-r--r--  net/Kconfig | 1
-rw-r--r--  net/Makefile | 1
-rw-r--r--  net/core/dev.c | 73
-rw-r--r--  net/core/filter.c | 345
-rw-r--r--  net/core/sock.c | 12
-rw-r--r--  net/core/xdp.c | 15
-rw-r--r--  net/packet/af_packet.c | 42
-rw-r--r--  net/xdp/Kconfig | 7
-rw-r--r--  net/xdp/Makefile | 2
-rw-r--r--  net/xdp/xdp_umem.c | 260
-rw-r--r--  net/xdp/xdp_umem.h | 67
-rw-r--r--  net/xdp/xdp_umem_props.h | 23
-rw-r--r--  net/xdp/xsk.c | 656
-rw-r--r--  net/xdp/xsk_queue.c | 73
-rw-r--r--  net/xdp/xsk_queue.h | 247
-rw-r--r--  samples/bpf/Makefile | 15
-rw-r--r--  samples/bpf/bpf_load.c | 97
-rw-r--r--  samples/bpf/bpf_load.h | 7
-rw-r--r--  samples/bpf/offwaketime_user.c | 1
-rw-r--r--  samples/bpf/sampleip_user.c | 1
-rw-r--r--  samples/bpf/spintest_user.c | 1
-rw-r--r--  samples/bpf/trace_event_user.c | 1
-rw-r--r--  samples/bpf/trace_output_user.c | 110
-rw-r--r--  samples/bpf/xdpsock.h | 11
-rw-r--r--  samples/bpf/xdpsock_kern.c | 56
-rw-r--r--  samples/bpf/xdpsock_user.c | 948
-rwxr-xr-x  scripts/bpf_helpers_doc.py | 14
-rw-r--r--  security/selinux/hooks.c | 4
-rw-r--r--  security/selinux/include/classmap.h | 4
-rw-r--r--  tools/bpf/bpftool/Documentation/bpftool-map.rst | 40
-rw-r--r--  tools/bpf/bpftool/Documentation/bpftool.rst | 2
-rw-r--r--  tools/bpf/bpftool/Makefile | 7
-rw-r--r--  tools/bpf/bpftool/bash-completion/bpftool | 36
-rw-r--r--  tools/bpf/bpftool/common.c | 77
-rw-r--r--  tools/bpf/bpftool/main.h | 7
-rw-r--r--  tools/bpf/bpftool/map.c | 80
-rw-r--r--  tools/bpf/bpftool/map_perf_ring.c | 347
-rw-r--r--  tools/bpf/bpftool/prog.c | 8
-rw-r--r--  tools/include/uapi/linux/bpf.h | 93
-rw-r--r--  tools/include/uapi/linux/erspan.h | 52
-rw-r--r--  tools/testing/selftests/bpf/Makefile | 4
-rw-r--r--  tools/testing/selftests/bpf/bpf_helpers.h | 2
-rw-r--r--  tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102
-rw-r--r--  tools/testing/selftests/bpf/test_progs.c | 242
-rw-r--r--  tools/testing/selftests/bpf/test_stacktrace_build_id.c | 20
-rw-r--r--  tools/testing/selftests/bpf/test_stacktrace_map.c | 19
-rw-r--r--  tools/testing/selftests/bpf/test_verifier.c | 311
-rw-r--r--  tools/testing/selftests/bpf/trace_helpers.c | 180
-rw-r--r--  tools/testing/selftests/bpf/trace_helpers.h | 23
107 files changed, 8852 insertions, 2713 deletions
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
new file mode 100644
index 000000000000..91928d9ee4bf
--- /dev/null
+++ b/Documentation/networking/af_xdp.rst
@@ -0,0 +1,297 @@
.. SPDX-License-Identifier: GPL-2.0

======
AF_XDP
======

Overview
========

AF_XDP is an address family that is optimized for high-performance
packet processing.

This document assumes that the reader is familiar with BPF and XDP. If
not, the Cilium project has an excellent reference guide at
http://cilium.readthedocs.io/en/doc-1.0/bpf/.

Using the XDP_REDIRECT action from an XDP program, the program can
redirect ingress frames to other XDP-enabled netdevs, using the
bpf_redirect_map() function. AF_XDP sockets make it possible for XDP
programs to redirect frames to a memory buffer in a user-space
application.

An AF_XDP socket (XSK) is created with the normal socket()
syscall. Associated with each XSK are two rings: the RX ring and the
TX ring. A socket can receive packets on the RX ring and it can send
packets on the TX ring. These rings are registered and sized with the
setsockopts XDP_RX_RING and XDP_TX_RING, respectively. It is mandatory
to have at least one of these rings for each socket. An RX or TX
descriptor ring points to a data buffer in a memory area called a
UMEM. RX and TX can share the same UMEM so that a packet does not have
to be copied between RX and TX. Moreover, if a packet needs to be kept
for a while due to a possible retransmit, the descriptor that points
to that packet can be changed to point to another one and reused right
away. This again avoids copying data.

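A minimal sketch of creating an XSK and sizing its two rings could look
like this (error handling omitted; AF_XDP and SOL_XDP are 44 and 283 in
this series and may need to be defined locally if the libc headers do not
yet carry them)::

    #include <linux/if_xdp.h>
    #include <sys/socket.h>

    #ifndef AF_XDP
    #define AF_XDP 44
    #endif
    #ifndef SOL_XDP
    #define SOL_XDP 283
    #endif

    /* ndescs is the number of descriptors in each ring and must be a
     * power of two, see the Rings section below. */
    int xsk_create(int ndescs)
    {
            int fd = socket(AF_XDP, SOCK_RAW, 0);

            setsockopt(fd, SOL_XDP, XDP_RX_RING, &ndescs, sizeof(ndescs));
            setsockopt(fd, SOL_XDP, XDP_TX_RING, &ndescs, sizeof(ndescs));
            return fd;
    }
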
The UMEM consists of a number of equally sized frames and each frame
has a unique frame id. A descriptor in one of the rings references a
frame by referencing its frame id. User space allocates memory for
this UMEM using whatever means it finds most appropriate (malloc,
mmap, huge pages, etc). This memory area is then registered with the
kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two
rings: the FILL ring and the COMPLETION ring. The FILL ring is used by
the application to send down frame ids for the kernel to fill in with
RX packet data. References to these frames will then appear in the RX
ring once each packet has been received. The COMPLETION ring, on the
other hand, contains frame ids that the kernel has transmitted
completely and can now be used again by user space, for either TX or
RX. Thus, the frame ids appearing in the COMPLETION ring are ids that
were previously transmitted using the TX ring. In summary, the RX and
FILL rings are used for the RX path and the TX and COMPLETION rings
are used for the TX path.

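A sketch of the registration step could look like this (illustrative
sizes; the field and setsockopt names are the ones introduced by this
series, and the FILL and COMPLETION rings are sized just like the RX and
TX rings)::

    #include <linux/if_xdp.h>
    #include <stdlib.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int umem_register(int fd, int ndescs)
    {
            struct xdp_umem_reg req = {
                    .len = 16 * 4096,       /* 16 frames of 4 KiB each */
                    .frame_size = 4096,
                    .frame_headroom = 0,
            };
            void *buf;

            /* page-aligned backing memory; mmap() or huge pages work too */
            posix_memalign(&buf, getpagesize(), req.len);
            req.addr = (__u64)(unsigned long)buf;

            if (setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &req, sizeof(req)))
                    return -1;

            setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
                       &ndescs, sizeof(ndescs));
            setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
                       &ndescs, sizeof(ndescs));
            return 0;
    }
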
The socket is then finally bound with a bind() call to a device and a
specific queue id on that device, and it is not until bind is
completed that traffic starts to flow.

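Binding could look like this (illustrative device and queue id; the
member names are those of struct sockaddr_xdp in this series)::

    #include <linux/if_xdp.h>
    #include <net/if.h>
    #include <string.h>
    #include <sys/socket.h>

    int xsk_bind(int fd, const char *ifname, __u32 queue_id)
    {
            struct sockaddr_xdp addr;

            memset(&addr, 0, sizeof(addr));
            addr.sxdp_family = AF_XDP;
            addr.sxdp_ifindex = if_nametoindex(ifname);
            addr.sxdp_queue_id = queue_id;
            /* for the shared UMEM case described next, additionally set
             * sxdp_flags = XDP_SHARED_UMEM and sxdp_shared_umem_fd */

            return bind(fd, (struct sockaddr *)&addr, sizeof(addr));
    }
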
The UMEM can be shared between processes, if desired. If a process
wants to do this, it simply skips the registration of the UMEM and its
corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
call and submits the XSK of the process it would like to share UMEM
with as well as its own newly created XSK socket. The new process will
then receive frame id references in its own RX ring that point to this
shared UMEM. Note that since the ring structures are single-consumer /
single-producer (for performance reasons), the new process has to
create its own socket with associated RX and TX rings, since it cannot
share this with the other process. This is also the reason that there
is only one set of FILL and COMPLETION rings per UMEM. It is the
responsibility of a single process to handle the UMEM.

How are packets then distributed from an XDP program to the XSKs? There
is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
user-space application can place an XSK at an arbitrary place in this
map. The XDP program can then redirect a packet to a specific index in
this map and at this point XDP validates that the XSK in that map was
indeed bound to that device and ring number. If not, the packet is
dropped. If the map is empty at that index, the packet is also
dropped. This also means that it is currently mandatory to have an XDP
program loaded (and one XSK in the XSKMAP) to be able to get any
traffic to user space through the XSK.

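A sketch of the XDP program could look like this (the complete program is
samples/bpf/xdpsock_kern.c; here the packet's receive queue index is used
directly as the map key)::

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    struct bpf_map_def SEC("maps") xsks_map = {
            .type = BPF_MAP_TYPE_XSKMAP,
            .key_size = sizeof(int),
            .value_size = sizeof(int),
            .max_entries = 4,
    };

    SEC("xdp_sock")
    int xdp_sock_prog(struct xdp_md *ctx)
    {
            /* if no XSK is placed at this index, the packet is dropped */
            return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
    }
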
AF_XDP can operate in two different modes: XDP_SKB and XDP_DRV. If the
driver does not have support for XDP, or XDP_SKB is explicitly chosen
when loading the XDP program, XDP_SKB mode is employed. It uses SKBs
together with the generic XDP support and copies the data out to user
space; it is a fallback mode that works for any network device. On the
other hand, if the driver has support for XDP, it will be used by the
AF_XDP code to provide better performance, but there is still a copy
of the data into user space.

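Which mode gets used is decided when the XDP program is attached. As a
sketch, using the in-tree libbpf helper that the sample application also
relies on (XDP_FLAGS_* come from linux/if_link.h; the include path depends
on how libbpf is vendored)::

    #include <linux/if_link.h>
    #include <bpf/bpf.h>

    int attach_prog_skb_mode(int ifindex, int prog_fd)
    {
            /* use XDP_FLAGS_DRV_MODE to request XDP_DRV instead */
            return bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_SKB_MODE);
    }
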
Concepts
========

In order to use an AF_XDP socket, a number of associated objects need
to be set up.

Jonathan Corbet has also written an excellent article on LWN,
"Accelerating networking with AF_XDP". It can be found at
https://lwn.net/Articles/750845/.

UMEM
----

UMEM is a region of virtually contiguous memory, divided into
equal-sized frames. A UMEM is associated with a netdev and a specific
queue id of that netdev. It is created and configured (frame size,
frame headroom, start address and size) by using the XDP_UMEM_REG
setsockopt system call. A UMEM is bound to a netdev and queue id via
the bind() system call.

An AF_XDP socket is linked to a single UMEM, but one UMEM can have
multiple AF_XDP sockets. To share a UMEM created via one socket A,
another socket B sets the XDP_SHARED_UMEM flag in the
struct sockaddr_xdp member sxdp_flags, and passes the file descriptor
of A in the struct sockaddr_xdp member sxdp_shared_umem_fd.

The UMEM has two single-producer/single-consumer rings that are used
to transfer ownership of UMEM frames between the kernel and the
user-space application.

Rings
-----

There are four different kinds of rings: Fill, Completion, RX and
TX. All rings are single-producer/single-consumer, so the user-space
application needs explicit synchronization if multiple
processes/threads are reading/writing to them.

The UMEM uses two rings: Fill and Completion. Each socket associated
with the UMEM must have an RX queue, TX queue or both. Say that there
is a setup with four sockets (all doing TX and RX). Then there will be
one Fill ring, one Completion ring, four TX rings and four RX rings.

The rings are head(producer)/tail(consumer) based rings. A producer
writes the data ring at the index pointed out by the struct xdp_ring
producer member, and then increments the producer index. A consumer
reads the data ring at the index pointed out by the struct xdp_ring
consumer member, and then increments the consumer index.

The rings are configured and created via the _RING setsockopt system
calls and mmapped into user space using the appropriate offset to
mmap() (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING
and XDP_UMEM_PGOFF_COMPLETION_RING).

The size of the rings needs to be a power of two.

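Assuming the ring layout of this series (a struct xdp_ring header holding
the producer and consumer indices, followed by the descriptor array, as in
the naive example further down), mapping the RX ring could look like this::

    #include <linux/if_xdp.h>
    #include <sys/mman.h>

    /* ndescs is the power-of-two size given to the XDP_RX_RING setsockopt */
    struct xdp_rxtx_ring *rx_ring_map(int fd, unsigned int ndescs)
    {
            size_t len = sizeof(struct xdp_rxtx_ring) +
                         ndescs * sizeof(struct xdp_desc);

            return mmap(NULL, len, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_POPULATE, fd, XDP_PGOFF_RX_RING);
    }
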
UMEM Fill Ring
~~~~~~~~~~~~~~

The Fill ring is used to transfer ownership of UMEM frames from
user-space to kernel-space. The UMEM indices are passed in the
ring. As an example, if the UMEM is 64k and each frame is 4k, then the
UMEM has 16 frames and can pass indices between 0 and 15.

Frames passed to the kernel are used for the ingress path (RX rings).

The user application produces UMEM indices to this ring.

UMEM Completion Ring
~~~~~~~~~~~~~~~~~~~~

The Completion ring is used to transfer ownership of UMEM frames from
kernel-space to user-space. Just like the Fill ring, UMEM indices are
used.

Frames passed from the kernel to user-space are frames that have been
sent (TX ring) and can be used by user-space again.

The user application consumes UMEM indices from this ring.


RX Ring
~~~~~~~

The RX ring is the receiving side of a socket. Each entry in the ring
is a struct xdp_desc descriptor. The descriptor contains the UMEM
index (idx), the length of the data (len) and the offset into the
frame (offset).

If no frames have been passed to the kernel via the Fill ring, no
descriptors will (or can) appear on the RX ring.

The user application consumes struct xdp_desc descriptors from this
ring.

TX Ring
~~~~~~~

The TX ring is used to send frames. The struct xdp_desc descriptor is
filled (index, length and offset) and passed into the ring.

To start the transfer a sendmsg() system call is required. This might
be relaxed in the future.

The user application produces struct xdp_desc descriptors to this
ring.

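The sendmsg() kick mentioned above can be as simple as the following
sketch (the sample application uses an equivalent zero-length send;
MSG_DONTWAIT keeps the call from blocking)::

    #include <string.h>
    #include <sys/socket.h>

    void kick_tx(int fd)
    {
            struct msghdr msg;

            memset(&msg, 0, sizeof(msg));
            sendmsg(fd, &msg, MSG_DONTWAIT);
    }
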
XSKMAP / BPF_MAP_TYPE_XSKMAP
----------------------------

On the XDP side there is a BPF map type BPF_MAP_TYPE_XSKMAP (XSKMAP)
that is used in conjunction with bpf_redirect_map() to pass the
ingress frame to a socket.

The user application inserts the socket into the map via the bpf()
system call.

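With the in-tree libbpf wrappers around bpf(), the insertion could look
like this (xsks_map_fd and queue_id are assumed to come from program load
and socket setup)::

    #include <bpf/bpf.h>

    /* key = index in the XSKMAP, value = the XSK's file descriptor */
    int xsk_insert(int xsks_map_fd, int queue_id, int xsk_fd)
    {
            return bpf_map_update_elem(xsks_map_fd, &queue_id, &xsk_fd, 0);
    }
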
Note that if an XDP program tries to redirect to a socket that does
not match the queue configuration and netdev, the frame will be
dropped. E.g., if an AF_XDP socket is bound to netdev eth0 and
queue 17, only the XDP program executing for eth0 and queue 17 will
successfully pass data to the socket. Please refer to the sample
application (samples/bpf/) for an example.

Usage
=====

In order to use AF_XDP sockets, two parts are needed: the user-space
application and the XDP program. For a complete setup and usage
example, please refer to the sample application. The user-space side
is xdpsock_user.c and the XDP side is xdpsock_kern.c.

Naive ring dequeue and enqueue could look like this::

    // typedef struct xdp_rxtx_ring RING;
    // typedef struct xdp_umem_ring RING;

    // typedef struct xdp_desc RING_TYPE;
    // typedef __u32 RING_TYPE;

    int dequeue_one(RING *ring, RING_TYPE *item)
    {
            __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;

            if (entries == 0)
                    return -1;

            // read-barrier!

            *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
            ring->ptrs.consumer++;
            return 0;
    }

    int enqueue_one(RING *ring, const RING_TYPE *item)
    {
            __u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);

            if (free_entries == 0)
                    return -1;

            ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;

            // write-barrier!

            ring->ptrs.producer++;
            return 0;
    }


For a more optimized version, please refer to the sample application.

Sample application
==================

There is an xdpsock benchmarking/test application included that
demonstrates how to use AF_XDP sockets with both private and shared
UMEMs. Say that you would like your UDP traffic from port 4242 to end
up in queue 16, which we will enable AF_XDP on. Here, we use ethtool
for this::

   ethtool -N p3p2 rx-flow-hash udp4 fn
   ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
       action 16

Running the rxdrop benchmark in XDP_DRV mode can then be done
using::

   samples/bpf/xdpsock -i p3p2 -q 16 -r -N

For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
can be displayed with "-h", as usual.

Credits
=======

- Björn Töpel (AF_XDP core)
- Magnus Karlsson (AF_XDP core)
- Alexander Duyck
- Alexei Starovoitov
- Daniel Borkmann
- Jesper Dangaard Brouer
- John Fastabend
- Jonathan Corbet (LWN coverage)
- Michael S. Tsirkin
- Qi Z Zhang
- Willem de Bruijn

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index fd55c7de9991..5032e1263bc9 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -483,6 +483,12 @@ Example output from dmesg:
483[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 483[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
484[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 484[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3
485 485
486When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and
487setting any other value than that will return in failure. This is even the case for
488setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log
489is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the
490generally recommended approach instead.
491
486In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for 492In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for
487generating disassembly out of the kernel log's hexdump: 493generating disassembly out of the kernel log's hexdump:
488 494
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index f204eaff657d..cbd9bdd4a79e 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -6,6 +6,7 @@ Contents:
6.. toctree:: 6.. toctree::
7 :maxdepth: 2 7 :maxdepth: 2
8 8
9 af_xdp
9 batman-adv 10 batman-adv
10 can 11 can
11 dpaa2/index 12 dpaa2/index
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 5992602469d8..9ecde517728c 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -45,6 +45,7 @@ through bpf(2) and passing a verifier in the kernel, a JIT will then
45translate these BPF proglets into native CPU instructions. There are 45translate these BPF proglets into native CPU instructions. There are
46two flavors of JITs, the newer eBPF JIT currently supported on: 46two flavors of JITs, the newer eBPF JIT currently supported on:
47 - x86_64 47 - x86_64
48 - x86_32
48 - arm64 49 - arm64
49 - arm32 50 - arm32
50 - ppc64 51 - ppc64
diff --git a/MAINTAINERS b/MAINTAINERS
index ebe0b9ed7805..b22be10d5916 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2729,7 +2729,6 @@ F: Documentation/networking/filter.txt
2729F: Documentation/bpf/ 2729F: Documentation/bpf/
2730F: include/linux/bpf* 2730F: include/linux/bpf*
2731F: include/linux/filter.h 2731F: include/linux/filter.h
2732F: include/trace/events/bpf.h
2733F: include/trace/events/xdp.h 2732F: include/trace/events/xdp.h
2734F: include/uapi/linux/bpf* 2733F: include/uapi/linux/bpf*
2735F: include/uapi/linux/filter.h 2734F: include/uapi/linux/filter.h
@@ -15408,6 +15407,14 @@ T: git git://linuxtv.org/media_tree.git
15408S: Maintained 15407S: Maintained
15409F: drivers/media/tuners/tuner-xc2028.* 15408F: drivers/media/tuners/tuner-xc2028.*
15410 15409
15410XDP SOCKETS (AF_XDP)
15411M: Björn Töpel <bjorn.topel@intel.com>
15412M: Magnus Karlsson <magnus.karlsson@intel.com>
15413L: netdev@vger.kernel.org
15414S: Maintained
15415F: kernel/bpf/xskmap.c
15416F: net/xdp/
15417
15411XEN BLOCK SUBSYSTEM 15418XEN BLOCK SUBSYSTEM
15412M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 15419M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
15413M: Roger Pau Monné <roger.pau@citrix.com> 15420M: Roger Pau Monné <roger.pau@citrix.com>
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index b5030e1a41d8..82689b999257 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1452,83 +1452,6 @@ exit:
1452 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); 1452 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx);
1453 emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code)); 1453 emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code));
1454 break; 1454 break;
1455 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
1456 case BPF_LD | BPF_ABS | BPF_W:
1457 case BPF_LD | BPF_ABS | BPF_H:
1458 case BPF_LD | BPF_ABS | BPF_B:
1459 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
1460 case BPF_LD | BPF_IND | BPF_W:
1461 case BPF_LD | BPF_IND | BPF_H:
1462 case BPF_LD | BPF_IND | BPF_B:
1463 {
1464 const u8 r4 = bpf2a32[BPF_REG_6][1]; /* r4 = ptr to sk_buff */
1465 const u8 r0 = bpf2a32[BPF_REG_0][1]; /*r0: struct sk_buff *skb*/
1466 /* rtn value */
1467 const u8 r1 = bpf2a32[BPF_REG_0][0]; /* r1: int k */
1468 const u8 r2 = bpf2a32[BPF_REG_1][1]; /* r2: unsigned int size */
1469 const u8 r3 = bpf2a32[BPF_REG_1][0]; /* r3: void *buffer */
1470 const u8 r6 = bpf2a32[TMP_REG_1][1]; /* r6: void *(*func)(..) */
1471 int size;
1472
1473 /* Setting up first argument */
1474 emit(ARM_MOV_R(r0, r4), ctx);
1475
1476 /* Setting up second argument */
1477 emit_a32_mov_i(r1, imm, false, ctx);
1478 if (BPF_MODE(code) == BPF_IND)
1479 emit_a32_alu_r(r1, src_lo, false, sstk, ctx,
1480 false, false, BPF_ADD);
1481
1482 /* Setting up third argument */
1483 switch (BPF_SIZE(code)) {
1484 case BPF_W:
1485 size = 4;
1486 break;
1487 case BPF_H:
1488 size = 2;
1489 break;
1490 case BPF_B:
1491 size = 1;
1492 break;
1493 default:
1494 return -EINVAL;
1495 }
1496 emit_a32_mov_i(r2, size, false, ctx);
1497
1498 /* Setting up fourth argument */
1499 emit(ARM_ADD_I(r3, ARM_SP, imm8m(SKB_BUFFER)), ctx);
1500
1501 /* Setting up function pointer to call */
1502 emit_a32_mov_i(r6, (unsigned int)bpf_load_pointer, false, ctx);
1503 emit_blx_r(r6, ctx);
1504
1505 emit(ARM_EOR_R(r1, r1, r1), ctx);
1506 /* Check if return address is NULL or not.
1507 * if NULL then jump to epilogue
1508 * else continue to load the value from retn address
1509 */
1510 emit(ARM_CMP_I(r0, 0), ctx);
1511 jmp_offset = epilogue_offset(ctx);
1512 check_imm24(jmp_offset);
1513 _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
1514
1515 /* Load value from the address */
1516 switch (BPF_SIZE(code)) {
1517 case BPF_W:
1518 emit(ARM_LDR_I(r0, r0, 0), ctx);
1519 emit_rev32(r0, r0, ctx);
1520 break;
1521 case BPF_H:
1522 emit(ARM_LDRH_I(r0, r0, 0), ctx);
1523 emit_rev16(r0, r0, ctx);
1524 break;
1525 case BPF_B:
1526 emit(ARM_LDRB_I(r0, r0, 0), ctx);
1527 /* No need to reverse */
1528 break;
1529 }
1530 break;
1531 }
1532 /* ST: *(size *)(dst + off) = imm */ 1455 /* ST: *(size *)(dst + off) = imm */
1533 case BPF_ST | BPF_MEM | BPF_W: 1456 case BPF_ST | BPF_MEM | BPF_W:
1534 case BPF_ST | BPF_MEM | BPF_H: 1457 case BPF_ST | BPF_MEM | BPF_H:
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a93350451e8e..0b40c8fb0706 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -723,71 +723,6 @@ emit_cond_jmp:
723 emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); 723 emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
724 break; 724 break;
725 725
726 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
727 case BPF_LD | BPF_ABS | BPF_W:
728 case BPF_LD | BPF_ABS | BPF_H:
729 case BPF_LD | BPF_ABS | BPF_B:
730 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
731 case BPF_LD | BPF_IND | BPF_W:
732 case BPF_LD | BPF_IND | BPF_H:
733 case BPF_LD | BPF_IND | BPF_B:
734 {
735 const u8 r0 = bpf2a64[BPF_REG_0]; /* r0 = return value */
736 const u8 r6 = bpf2a64[BPF_REG_6]; /* r6 = pointer to sk_buff */
737 const u8 fp = bpf2a64[BPF_REG_FP];
738 const u8 r1 = bpf2a64[BPF_REG_1]; /* r1: struct sk_buff *skb */
739 const u8 r2 = bpf2a64[BPF_REG_2]; /* r2: int k */
740 const u8 r3 = bpf2a64[BPF_REG_3]; /* r3: unsigned int size */
741 const u8 r4 = bpf2a64[BPF_REG_4]; /* r4: void *buffer */
742 const u8 r5 = bpf2a64[BPF_REG_5]; /* r5: void *(*func)(...) */
743 int size;
744
745 emit(A64_MOV(1, r1, r6), ctx);
746 emit_a64_mov_i(0, r2, imm, ctx);
747 if (BPF_MODE(code) == BPF_IND)
748 emit(A64_ADD(0, r2, r2, src), ctx);
749 switch (BPF_SIZE(code)) {
750 case BPF_W:
751 size = 4;
752 break;
753 case BPF_H:
754 size = 2;
755 break;
756 case BPF_B:
757 size = 1;
758 break;
759 default:
760 return -EINVAL;
761 }
762 emit_a64_mov_i64(r3, size, ctx);
763 emit(A64_SUB_I(1, r4, fp, ctx->stack_size), ctx);
764 emit_a64_mov_i64(r5, (unsigned long)bpf_load_pointer, ctx);
765 emit(A64_BLR(r5), ctx);
766 emit(A64_MOV(1, r0, A64_R(0)), ctx);
767
768 jmp_offset = epilogue_offset(ctx);
769 check_imm19(jmp_offset);
770 emit(A64_CBZ(1, r0, jmp_offset), ctx);
771 emit(A64_MOV(1, r5, r0), ctx);
772 switch (BPF_SIZE(code)) {
773 case BPF_W:
774 emit(A64_LDR32(r0, r5, A64_ZR), ctx);
775#ifndef CONFIG_CPU_BIG_ENDIAN
776 emit(A64_REV32(0, r0, r0), ctx);
777#endif
778 break;
779 case BPF_H:
780 emit(A64_LDRH(r0, r5, A64_ZR), ctx);
781#ifndef CONFIG_CPU_BIG_ENDIAN
782 emit(A64_REV16(0, r0, r0), ctx);
783#endif
784 break;
785 case BPF_B:
786 emit(A64_LDRB(r0, r5, A64_ZR), ctx);
787 break;
788 }
789 break;
790 }
791 default: 726 default:
792 pr_err_once("unknown opcode %02x\n", code); 727 pr_err_once("unknown opcode %02x\n", code);
793 return -EINVAL; 728 return -EINVAL;
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 3e2798bfea4f..7ba7df9c28fc 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -1267,110 +1267,6 @@ jeq_common:
1267 return -EINVAL; 1267 return -EINVAL;
1268 break; 1268 break;
1269 1269
1270 case BPF_LD | BPF_B | BPF_ABS:
1271 case BPF_LD | BPF_H | BPF_ABS:
1272 case BPF_LD | BPF_W | BPF_ABS:
1273 case BPF_LD | BPF_DW | BPF_ABS:
1274 ctx->flags |= EBPF_SAVE_RA;
1275
1276 gen_imm_to_reg(insn, MIPS_R_A1, ctx);
1277 emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
1278
1279 if (insn->imm < 0) {
1280 emit_const_to_reg(ctx, MIPS_R_T9, (u64)bpf_internal_load_pointer_neg_helper);
1281 } else {
1282 emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
1283 emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
1284 }
1285 goto ld_skb_common;
1286
1287 case BPF_LD | BPF_B | BPF_IND:
1288 case BPF_LD | BPF_H | BPF_IND:
1289 case BPF_LD | BPF_W | BPF_IND:
1290 case BPF_LD | BPF_DW | BPF_IND:
1291 ctx->flags |= EBPF_SAVE_RA;
1292 src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
1293 if (src < 0)
1294 return src;
1295 ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
1296 if (ts == REG_32BIT_ZERO_EX) {
1297 /* sign extend */
1298 emit_instr(ctx, sll, MIPS_R_A1, src, 0);
1299 src = MIPS_R_A1;
1300 }
1301 if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) {
1302 emit_instr(ctx, daddiu, MIPS_R_A1, src, insn->imm);
1303 } else {
1304 gen_imm_to_reg(insn, MIPS_R_AT, ctx);
1305 emit_instr(ctx, daddu, MIPS_R_A1, MIPS_R_AT, src);
1306 }
1307 /* truncate to 32-bit int */
1308 emit_instr(ctx, sll, MIPS_R_A1, MIPS_R_A1, 0);
1309 emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
1310 emit_instr(ctx, slt, MIPS_R_AT, MIPS_R_A1, MIPS_R_ZERO);
1311
1312 emit_const_to_reg(ctx, MIPS_R_T8, (u64)bpf_internal_load_pointer_neg_helper);
1313 emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
1314 emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
1315 emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_T8, MIPS_R_AT);
1316
1317ld_skb_common:
1318 emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
1319 /* delay slot move */
1320 emit_instr(ctx, daddu, MIPS_R_A0, MIPS_R_S0, MIPS_R_ZERO);
1321
1322 /* Check the error value */
1323 b_off = b_imm(exit_idx, ctx);
1324 if (is_bad_offset(b_off)) {
1325 target = j_target(ctx, exit_idx);
1326 if (target == (unsigned int)-1)
1327 return -E2BIG;
1328
1329 if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
1330 ctx->offsets[this_idx] |= OFFSETS_B_CONV;
1331 ctx->long_b_conversion = 1;
1332 }
1333 emit_instr(ctx, bne, MIPS_R_V0, MIPS_R_ZERO, 4 * 3);
1334 emit_instr(ctx, nop);
1335 emit_instr(ctx, j, target);
1336 emit_instr(ctx, nop);
1337 } else {
1338 emit_instr(ctx, beq, MIPS_R_V0, MIPS_R_ZERO, b_off);
1339 emit_instr(ctx, nop);
1340 }
1341
1342#ifdef __BIG_ENDIAN
1343 need_swap = false;
1344#else
1345 need_swap = true;
1346#endif
1347 dst = MIPS_R_V0;
1348 switch (BPF_SIZE(insn->code)) {
1349 case BPF_B:
1350 emit_instr(ctx, lbu, dst, 0, MIPS_R_V0);
1351 break;
1352 case BPF_H:
1353 emit_instr(ctx, lhu, dst, 0, MIPS_R_V0);
1354 if (need_swap)
1355 emit_instr(ctx, wsbh, dst, dst);
1356 break;
1357 case BPF_W:
1358 emit_instr(ctx, lw, dst, 0, MIPS_R_V0);
1359 if (need_swap) {
1360 emit_instr(ctx, wsbh, dst, dst);
1361 emit_instr(ctx, rotr, dst, dst, 16);
1362 }
1363 break;
1364 case BPF_DW:
1365 emit_instr(ctx, ld, dst, 0, MIPS_R_V0);
1366 if (need_swap) {
1367 emit_instr(ctx, dsbh, dst, dst);
1368 emit_instr(ctx, dshd, dst, dst);
1369 }
1370 break;
1371 }
1372
1373 break;
1374 case BPF_ALU | BPF_END | BPF_FROM_BE: 1270 case BPF_ALU | BPF_END | BPF_FROM_BE:
1375 case BPF_ALU | BPF_END | BPF_FROM_LE: 1271 case BPF_ALU | BPF_END | BPF_FROM_LE:
1376 dst = ebpf_to_mips_reg(ctx, insn, dst_reg); 1272 dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
diff --git a/arch/powerpc/net/Makefile b/arch/powerpc/net/Makefile
index 02d369ca6a53..809f019d3cba 100644
--- a/arch/powerpc/net/Makefile
+++ b/arch/powerpc/net/Makefile
@@ -3,7 +3,7 @@
3# Arch-specific network modules 3# Arch-specific network modules
4# 4#
5ifeq ($(CONFIG_PPC64),y) 5ifeq ($(CONFIG_PPC64),y)
6obj-$(CONFIG_BPF_JIT) += bpf_jit_asm64.o bpf_jit_comp64.o 6obj-$(CONFIG_BPF_JIT) += bpf_jit_comp64.o
7else 7else
8obj-$(CONFIG_BPF_JIT) += bpf_jit_asm.o bpf_jit_comp.o 8obj-$(CONFIG_BPF_JIT) += bpf_jit_asm.o bpf_jit_comp.o
9endif 9endif
diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index 8bdef7ed28a8..3609be4692b3 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -20,7 +20,7 @@
20 * with our redzone usage. 20 * with our redzone usage.
21 * 21 *
22 * [ prev sp ] <------------- 22 * [ prev sp ] <-------------
23 * [ nv gpr save area ] 8*8 | 23 * [ nv gpr save area ] 6*8 |
24 * [ tail_call_cnt ] 8 | 24 * [ tail_call_cnt ] 8 |
25 * [ local_tmp_var ] 8 | 25 * [ local_tmp_var ] 8 |
26 * fp (r31) --> [ ebpf stack space ] upto 512 | 26 * fp (r31) --> [ ebpf stack space ] upto 512 |
@@ -28,8 +28,8 @@
28 * sp (r1) ---> [ stack pointer ] -------------- 28 * sp (r1) ---> [ stack pointer ] --------------
29 */ 29 */
30 30
31/* for gpr non volatile registers BPG_REG_6 to 10, plus skb cache registers */ 31/* for gpr non volatile registers BPG_REG_6 to 10 */
32#define BPF_PPC_STACK_SAVE (8*8) 32#define BPF_PPC_STACK_SAVE (6*8)
33/* for bpf JIT code internal usage */ 33/* for bpf JIT code internal usage */
34#define BPF_PPC_STACK_LOCALS 16 34#define BPF_PPC_STACK_LOCALS 16
35/* stack frame excluding BPF stack, ensure this is quadword aligned */ 35/* stack frame excluding BPF stack, ensure this is quadword aligned */
@@ -39,10 +39,8 @@
39#ifndef __ASSEMBLY__ 39#ifndef __ASSEMBLY__
40 40
41/* BPF register usage */ 41/* BPF register usage */
42#define SKB_HLEN_REG (MAX_BPF_JIT_REG + 0) 42#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
43#define SKB_DATA_REG (MAX_BPF_JIT_REG + 1) 43#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
44#define TMP_REG_1 (MAX_BPF_JIT_REG + 2)
45#define TMP_REG_2 (MAX_BPF_JIT_REG + 3)
46 44
47/* BPF to ppc register mappings */ 45/* BPF to ppc register mappings */
48static const int b2p[] = { 46static const int b2p[] = {
@@ -63,40 +61,23 @@ static const int b2p[] = {
63 [BPF_REG_FP] = 31, 61 [BPF_REG_FP] = 31,
64 /* eBPF jit internal registers */ 62 /* eBPF jit internal registers */
65 [BPF_REG_AX] = 2, 63 [BPF_REG_AX] = 2,
66 [SKB_HLEN_REG] = 25,
67 [SKB_DATA_REG] = 26,
68 [TMP_REG_1] = 9, 64 [TMP_REG_1] = 9,
69 [TMP_REG_2] = 10 65 [TMP_REG_2] = 10
70}; 66};
71 67
72/* PPC NVR range -- update this if we ever use NVRs below r24 */ 68/* PPC NVR range -- update this if we ever use NVRs below r27 */
73#define BPF_PPC_NVR_MIN 24 69#define BPF_PPC_NVR_MIN 27
74
75/* Assembly helpers */
76#define DECLARE_LOAD_FUNC(func) u64 func(u64 r3, u64 r4); \
77 u64 func##_negative_offset(u64 r3, u64 r4); \
78 u64 func##_positive_offset(u64 r3, u64 r4);
79
80DECLARE_LOAD_FUNC(sk_load_word);
81DECLARE_LOAD_FUNC(sk_load_half);
82DECLARE_LOAD_FUNC(sk_load_byte);
83
84#define CHOOSE_LOAD_FUNC(imm, func) \
85 (imm < 0 ? \
86 (imm >= SKF_LL_OFF ? func##_negative_offset : func) : \
87 func##_positive_offset)
88 70
89#define SEEN_FUNC 0x1000 /* might call external helpers */ 71#define SEEN_FUNC 0x1000 /* might call external helpers */
90#define SEEN_STACK 0x2000 /* uses BPF stack */ 72#define SEEN_STACK 0x2000 /* uses BPF stack */
91#define SEEN_SKB 0x4000 /* uses sk_buff */ 73#define SEEN_TAILCALL 0x4000 /* uses tail calls */
92#define SEEN_TAILCALL 0x8000 /* uses tail calls */
93 74
94struct codegen_context { 75struct codegen_context {
95 /* 76 /*
96 * This is used to track register usage as well 77 * This is used to track register usage as well
97 * as calls to external helpers. 78 * as calls to external helpers.
98 * - register usage is tracked with corresponding 79 * - register usage is tracked with corresponding
99 * bits (r3-r10 and r25-r31) 80 * bits (r3-r10 and r27-r31)
100 * - rest of the bits can be used to track other 81 * - rest of the bits can be used to track other
101 * things -- for now, we use bits 16 to 23 82 * things -- for now, we use bits 16 to 23
102 * encoded in SEEN_* macros above 83 * encoded in SEEN_* macros above
diff --git a/arch/powerpc/net/bpf_jit_asm64.S b/arch/powerpc/net/bpf_jit_asm64.S
deleted file mode 100644
index 7e4c51430b84..000000000000
--- a/arch/powerpc/net/bpf_jit_asm64.S
+++ /dev/null
@@ -1,180 +0,0 @@
1/*
2 * bpf_jit_asm64.S: Packet/header access helper functions
3 * for PPC64 BPF compiler.
4 *
5 * Copyright 2016, Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
6 * IBM Corporation
7 *
8 * Based on bpf_jit_asm.S by Matt Evans
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; version 2
13 * of the License.
14 */
15
16#include <asm/ppc_asm.h>
17#include <asm/ptrace.h>
18#include "bpf_jit64.h"
19
20/*
21 * All of these routines are called directly from generated code,
22 * with the below register usage:
23 * r27 skb pointer (ctx)
24 * r25 skb header length
25 * r26 skb->data pointer
26 * r4 offset
27 *
28 * Result is passed back in:
29 * r8 data read in host endian format (accumulator)
30 *
31 * r9 is used as a temporary register
32 */
33
34#define r_skb r27
35#define r_hlen r25
36#define r_data r26
37#define r_off r4
38#define r_val r8
39#define r_tmp r9
40
41_GLOBAL_TOC(sk_load_word)
42 cmpdi r_off, 0
43 blt bpf_slow_path_word_neg
44 b sk_load_word_positive_offset
45
46_GLOBAL_TOC(sk_load_word_positive_offset)
47 /* Are we accessing past headlen? */
48 subi r_tmp, r_hlen, 4
49 cmpd r_tmp, r_off
50 blt bpf_slow_path_word
51 /* Nope, just hitting the header. cr0 here is eq or gt! */
52 LWZX_BE r_val, r_data, r_off
53 blr /* Return success, cr0 != LT */
54
55_GLOBAL_TOC(sk_load_half)
56 cmpdi r_off, 0
57 blt bpf_slow_path_half_neg
58 b sk_load_half_positive_offset
59
60_GLOBAL_TOC(sk_load_half_positive_offset)
61 subi r_tmp, r_hlen, 2
62 cmpd r_tmp, r_off
63 blt bpf_slow_path_half
64 LHZX_BE r_val, r_data, r_off
65 blr
66
67_GLOBAL_TOC(sk_load_byte)
68 cmpdi r_off, 0
69 blt bpf_slow_path_byte_neg
70 b sk_load_byte_positive_offset
71
72_GLOBAL_TOC(sk_load_byte_positive_offset)
73 cmpd r_hlen, r_off
74 ble bpf_slow_path_byte
75 lbzx r_val, r_data, r_off
76 blr
77
78/*
79 * Call out to skb_copy_bits:
80 * Allocate a new stack frame here to remain ABI-compliant in
81 * stashing LR.
82 */
83#define bpf_slow_path_common(SIZE) \
84 mflr r0; \
85 std r0, PPC_LR_STKOFF(r1); \
86 stdu r1, -(STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS)(r1); \
87 mr r3, r_skb; \
88 /* r4 = r_off as passed */ \
89 addi r5, r1, STACK_FRAME_MIN_SIZE; \
90 li r6, SIZE; \
91 bl skb_copy_bits; \
92 nop; \
93 /* save r5 */ \
94 addi r5, r1, STACK_FRAME_MIN_SIZE; \
95 /* r3 = 0 on success */ \
96 addi r1, r1, STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS; \
97 ld r0, PPC_LR_STKOFF(r1); \
98 mtlr r0; \
99 cmpdi r3, 0; \
100 blt bpf_error; /* cr0 = LT */
101
102bpf_slow_path_word:
103 bpf_slow_path_common(4)
104 /* Data value is on stack, and cr0 != LT */
105 LWZX_BE r_val, 0, r5
106 blr
107
108bpf_slow_path_half:
109 bpf_slow_path_common(2)
110 LHZX_BE r_val, 0, r5
111 blr
112
113bpf_slow_path_byte:
114 bpf_slow_path_common(1)
115 lbzx r_val, 0, r5
116 blr
117
118/*
119 * Call out to bpf_internal_load_pointer_neg_helper
120 */
121#define sk_negative_common(SIZE) \
122 mflr r0; \
123 std r0, PPC_LR_STKOFF(r1); \
124 stdu r1, -STACK_FRAME_MIN_SIZE(r1); \
125 mr r3, r_skb; \
126 /* r4 = r_off, as passed */ \
127 li r5, SIZE; \
128 bl bpf_internal_load_pointer_neg_helper; \
129 nop; \
130 addi r1, r1, STACK_FRAME_MIN_SIZE; \
131 ld r0, PPC_LR_STKOFF(r1); \
132 mtlr r0; \
133 /* R3 != 0 on success */ \
134 cmpldi r3, 0; \
135 beq bpf_error_slow; /* cr0 = EQ */
136
137bpf_slow_path_word_neg:
138 lis r_tmp, -32 /* SKF_LL_OFF */
139 cmpd r_off, r_tmp /* addr < SKF_* */
140 blt bpf_error /* cr0 = LT */
141 b sk_load_word_negative_offset
142
143_GLOBAL_TOC(sk_load_word_negative_offset)
144 sk_negative_common(4)
145 LWZX_BE r_val, 0, r3
146 blr
147
148bpf_slow_path_half_neg:
149 lis r_tmp, -32 /* SKF_LL_OFF */
150 cmpd r_off, r_tmp /* addr < SKF_* */
151 blt bpf_error /* cr0 = LT */
152 b sk_load_half_negative_offset
153
154_GLOBAL_TOC(sk_load_half_negative_offset)
155 sk_negative_common(2)
156 LHZX_BE r_val, 0, r3
157 blr
158
159bpf_slow_path_byte_neg:
160 lis r_tmp, -32 /* SKF_LL_OFF */
161 cmpd r_off, r_tmp /* addr < SKF_* */
162 blt bpf_error /* cr0 = LT */
163 b sk_load_byte_negative_offset
164
165_GLOBAL_TOC(sk_load_byte_negative_offset)
166 sk_negative_common(1)
167 lbzx r_val, 0, r3
168 blr
169
170bpf_error_slow:
171 /* fabricate a cr0 = lt */
172 li r_tmp, -1
173 cmpdi r_tmp, 0
174bpf_error:
175 /*
176 * Entered with cr0 = lt
177 * Generated code will 'blt epilogue', returning 0.
178 */
179 li r_val, 0
180 blr
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 0ef3d9580e98..1bdb1aff0619 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -59,7 +59,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
59 * [ prev sp ] <------------- 59 * [ prev sp ] <-------------
60 * [ ... ] | 60 * [ ... ] |
61 * sp (r1) ---> [ stack pointer ] -------------- 61 * sp (r1) ---> [ stack pointer ] --------------
62 * [ nv gpr save area ] 8*8 62 * [ nv gpr save area ] 6*8
63 * [ tail_call_cnt ] 8 63 * [ tail_call_cnt ] 8
64 * [ local_tmp_var ] 8 64 * [ local_tmp_var ] 8
65 * [ unused red zone ] 208 bytes protected 65 * [ unused red zone ] 208 bytes protected
@@ -88,21 +88,6 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
88 BUG(); 88 BUG();
89} 89}
90 90
91static void bpf_jit_emit_skb_loads(u32 *image, struct codegen_context *ctx)
92{
93 /*
94 * Load skb->len and skb->data_len
95 * r3 points to skb
96 */
97 PPC_LWZ(b2p[SKB_HLEN_REG], 3, offsetof(struct sk_buff, len));
98 PPC_LWZ(b2p[TMP_REG_1], 3, offsetof(struct sk_buff, data_len));
99 /* header_len = len - data_len */
100 PPC_SUB(b2p[SKB_HLEN_REG], b2p[SKB_HLEN_REG], b2p[TMP_REG_1]);
101
102 /* skb->data pointer */
103 PPC_BPF_LL(b2p[SKB_DATA_REG], 3, offsetof(struct sk_buff, data));
104}
105
106static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) 91static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
107{ 92{
108 int i; 93 int i;
@@ -145,18 +130,6 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
145 if (bpf_is_seen_register(ctx, i)) 130 if (bpf_is_seen_register(ctx, i))
146 PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); 131 PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i]));
147 132
148 /*
149 * Save additional non-volatile regs if we cache skb
150 * Also, setup skb data
151 */
152 if (ctx->seen & SEEN_SKB) {
153 PPC_BPF_STL(b2p[SKB_HLEN_REG], 1,
154 bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG]));
155 PPC_BPF_STL(b2p[SKB_DATA_REG], 1,
156 bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG]));
157 bpf_jit_emit_skb_loads(image, ctx);
158 }
159
160 /* Setup frame pointer to point to the bpf stack area */ 133 /* Setup frame pointer to point to the bpf stack area */
161 if (bpf_is_seen_register(ctx, BPF_REG_FP)) 134 if (bpf_is_seen_register(ctx, BPF_REG_FP))
162 PPC_ADDI(b2p[BPF_REG_FP], 1, 135 PPC_ADDI(b2p[BPF_REG_FP], 1,
@@ -172,14 +145,6 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
172 if (bpf_is_seen_register(ctx, i)) 145 if (bpf_is_seen_register(ctx, i))
173 PPC_BPF_LL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); 146 PPC_BPF_LL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i]));
174 147
175 /* Restore non-volatile registers used for skb cache */
176 if (ctx->seen & SEEN_SKB) {
177 PPC_BPF_LL(b2p[SKB_HLEN_REG], 1,
178 bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG]));
179 PPC_BPF_LL(b2p[SKB_DATA_REG], 1,
180 bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG]));
181 }
182
183 /* Tear down our stack frame */ 148 /* Tear down our stack frame */
184 if (bpf_has_stack_frame(ctx)) { 149 if (bpf_has_stack_frame(ctx)) {
185 PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size); 150 PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size);
@@ -753,23 +718,10 @@ emit_clear:
753 ctx->seen |= SEEN_FUNC; 718 ctx->seen |= SEEN_FUNC;
754 func = (u8 *) __bpf_call_base + imm; 719 func = (u8 *) __bpf_call_base + imm;
755 720
756 /* Save skb pointer if we need to re-cache skb data */
757 if ((ctx->seen & SEEN_SKB) &&
758 bpf_helper_changes_pkt_data(func))
759 PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
760
761 bpf_jit_emit_func_call(image, ctx, (u64)func); 721 bpf_jit_emit_func_call(image, ctx, (u64)func);
762 722
763 /* move return value from r3 to BPF_REG_0 */ 723 /* move return value from r3 to BPF_REG_0 */
764 PPC_MR(b2p[BPF_REG_0], 3); 724 PPC_MR(b2p[BPF_REG_0], 3);
765
766 /* refresh skb cache */
767 if ((ctx->seen & SEEN_SKB) &&
768 bpf_helper_changes_pkt_data(func)) {
769 /* reload skb pointer to r3 */
770 PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
771 bpf_jit_emit_skb_loads(image, ctx);
772 }
773 break; 725 break;
774 726
775 /* 727 /*
@@ -887,65 +839,6 @@ cond_branch:
887 break; 839 break;
888 840
889 /* 841 /*
890 * Loads from packet header/data
891 * Assume 32-bit input value in imm and X (src_reg)
892 */
893
894 /* Absolute loads */
895 case BPF_LD | BPF_W | BPF_ABS:
896 func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_word);
897 goto common_load_abs;
898 case BPF_LD | BPF_H | BPF_ABS:
899 func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_half);
900 goto common_load_abs;
901 case BPF_LD | BPF_B | BPF_ABS:
902 func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_byte);
903common_load_abs:
904 /*
905 * Load from [imm]
906 * Load into r4, which can just be passed onto
907 * skb load helpers as the second parameter
908 */
909 PPC_LI32(4, imm);
910 goto common_load;
911
912 /* Indirect loads */
913 case BPF_LD | BPF_W | BPF_IND:
914 func = (u8 *)sk_load_word;
915 goto common_load_ind;
916 case BPF_LD | BPF_H | BPF_IND:
917 func = (u8 *)sk_load_half;
918 goto common_load_ind;
919 case BPF_LD | BPF_B | BPF_IND:
920 func = (u8 *)sk_load_byte;
921common_load_ind:
922 /*
923 * Load from [src_reg + imm]
924 * Treat src_reg as a 32-bit value
925 */
926 PPC_EXTSW(4, src_reg);
927 if (imm) {
928 if (imm >= -32768 && imm < 32768)
929 PPC_ADDI(4, 4, IMM_L(imm));
930 else {
931 PPC_LI32(b2p[TMP_REG_1], imm);
932 PPC_ADD(4, 4, b2p[TMP_REG_1]);
933 }
934 }
935
936common_load:
937 ctx->seen |= SEEN_SKB;
938 ctx->seen |= SEEN_FUNC;
939 bpf_jit_emit_func_call(image, ctx, (u64)func);
940
941 /*
942 * Helper returns 'lt' condition on error, and an
943 * appropriate return value in BPF_REG_0
944 */
945 PPC_BCC(COND_LT, exit_addr);
946 break;
947
948 /*
949 * Tail call 842 * Tail call
950 */ 843 */
951 case BPF_JMP | BPF_TAIL_CALL: 844 case BPF_JMP | BPF_TAIL_CALL:
diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile
index e0d5f245e42b..d4663b4bf509 100644
--- a/arch/s390/net/Makefile
+++ b/arch/s390/net/Makefile
@@ -2,4 +2,4 @@
2# 2#
3# Arch-specific network modules 3# Arch-specific network modules
4# 4#
5obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o 5obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
diff --git a/arch/s390/net/bpf_jit.S b/arch/s390/net/bpf_jit.S
deleted file mode 100644
index 25bb4643c4f4..000000000000
--- a/arch/s390/net/bpf_jit.S
+++ /dev/null
@@ -1,116 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * BPF Jit compiler for s390, help functions.
4 *
5 * Copyright IBM Corp. 2012,2015
6 *
7 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
8 * Michael Holzheu <holzheu@linux.vnet.ibm.com>
9 */
10
11#include <linux/linkage.h>
12#include "bpf_jit.h"
13
14/*
15 * Calling convention:
16 * registers %r7-%r10, %r11,%r13, and %r15 are call saved
17 *
18 * Input (64 bit):
19 * %r3 (%b2) = offset into skb data
20 * %r6 (%b5) = return address
21 * %r7 (%b6) = skb pointer
22 * %r12 = skb data pointer
23 *
24 * Output:
25 * %r14= %b0 = return value (read skb value)
26 *
27 * Work registers: %r2,%r4,%r5,%r14
28 *
29 * skb_copy_bits takes 4 parameters:
30 * %r2 = skb pointer
31 * %r3 = offset into skb data
32 * %r4 = pointer to temp buffer
33 * %r5 = length to copy
34 * Return value in %r2: 0 = ok
35 *
36 * bpf_internal_load_pointer_neg_helper takes 3 parameters:
37 * %r2 = skb pointer
38 * %r3 = offset into data
39 * %r4 = length to copy
40 * Return value in %r2: Pointer to data
41 */
42
43#define SKF_MAX_NEG_OFF -0x200000 /* SKF_LL_OFF from filter.h */
44
45/*
46 * Load SIZE bytes from SKB
47 */
48#define sk_load_common(NAME, SIZE, LOAD) \
49ENTRY(sk_load_##NAME); \
50 ltgr %r3,%r3; /* Is offset negative? */ \
51 jl sk_load_##NAME##_slow_neg; \
52ENTRY(sk_load_##NAME##_pos); \
53 aghi %r3,SIZE; /* Offset + SIZE */ \
54 clg %r3,STK_OFF_HLEN(%r15); /* Offset + SIZE > hlen? */ \
55 jh sk_load_##NAME##_slow; \
56 LOAD %r14,-SIZE(%r3,%r12); /* Get data from skb */ \
57 b OFF_OK(%r6); /* Return */ \
58 \
59sk_load_##NAME##_slow:; \
60 lgr %r2,%r7; /* Arg1 = skb pointer */ \
61 aghi %r3,-SIZE; /* Arg2 = offset */ \
62 la %r4,STK_OFF_TMP(%r15); /* Arg3 = temp bufffer */ \
63 lghi %r5,SIZE; /* Arg4 = size */ \
64 brasl %r14,skb_copy_bits; /* Get data from skb */ \
65 LOAD %r14,STK_OFF_TMP(%r15); /* Load from temp bufffer */ \
66 ltgr %r2,%r2; /* Set cc to (%r2 != 0) */ \
67 br %r6; /* Return */
68
69sk_load_common(word, 4, llgf) /* r14 = *(u32 *) (skb->data+offset) */
70sk_load_common(half, 2, llgh) /* r14 = *(u16 *) (skb->data+offset) */
71
72/*
73 * Load 1 byte from SKB (optimized version)
74 */
75 /* r14 = *(u8 *) (skb->data+offset) */
76ENTRY(sk_load_byte)
77 ltgr %r3,%r3 # Is offset negative?
78 jl sk_load_byte_slow_neg
79ENTRY(sk_load_byte_pos)
80 clg %r3,STK_OFF_HLEN(%r15) # Offset >= hlen?
81 jnl sk_load_byte_slow
82 llgc %r14,0(%r3,%r12) # Get byte from skb
83 b OFF_OK(%r6) # Return OK
84
85sk_load_byte_slow:
86 lgr %r2,%r7 # Arg1 = skb pointer
87 # Arg2 = offset
88 la %r4,STK_OFF_TMP(%r15) # Arg3 = pointer to temp buffer
89 lghi %r5,1 # Arg4 = size (1 byte)
90 brasl %r14,skb_copy_bits # Get data from skb
91 llgc %r14,STK_OFF_TMP(%r15) # Load result from temp buffer
92 ltgr %r2,%r2 # Set cc to (%r2 != 0)
93 br %r6 # Return cc
94
95#define sk_negative_common(NAME, SIZE, LOAD) \
96sk_load_##NAME##_slow_neg:; \
97 cgfi %r3,SKF_MAX_NEG_OFF; \
98 jl bpf_error; \
99 lgr %r2,%r7; /* Arg1 = skb pointer */ \
100 /* Arg2 = offset */ \
101 lghi %r4,SIZE; /* Arg3 = size */ \
102 brasl %r14,bpf_internal_load_pointer_neg_helper; \
103 ltgr %r2,%r2; \
104 jz bpf_error; \
105 LOAD %r14,0(%r2); /* Get data from pointer */ \
106 xr %r3,%r3; /* Set cc to zero */ \
107 br %r6; /* Return cc */
108
109sk_negative_common(word, 4, llgf)
110sk_negative_common(half, 2, llgh)
111sk_negative_common(byte, 1, llgc)
112
113bpf_error:
114# force a return 0 from jit handler
115 ltgr %r15,%r15 # Set condition code
116 br %r6
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
index 5e1e5133132d..7822ea92e54a 100644
--- a/arch/s390/net/bpf_jit.h
+++ b/arch/s390/net/bpf_jit.h
@@ -16,9 +16,6 @@
16#include <linux/filter.h> 16#include <linux/filter.h>
17#include <linux/types.h> 17#include <linux/types.h>
18 18
19extern u8 sk_load_word_pos[], sk_load_half_pos[], sk_load_byte_pos[];
20extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
21
22#endif /* __ASSEMBLY__ */ 19#endif /* __ASSEMBLY__ */
23 20
24/* 21/*
@@ -36,15 +33,6 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
36 * | | | 33 * | | |
37 * | BPF stack | | 34 * | BPF stack | |
38 * | | | 35 * | | |
39 * +---------------+ |
40 * | 8 byte skbp | |
41 * R15+176 -> +---------------+ |
42 * | 8 byte hlen | |
43 * R15+168 -> +---------------+ |
44 * | 4 byte align | |
45 * +---------------+ |
46 * | 4 byte temp | |
47 * | for bpf_jit.S | |
48 * R15+160 -> +---------------+ | 36 * R15+160 -> +---------------+ |
49 * | new backchain | | 37 * | new backchain | |
50 * R15+152 -> +---------------+ | 38 * R15+152 -> +---------------+ |
@@ -57,17 +45,11 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
57 * The stack size used by the BPF program ("BPF stack" above) is passed 45 * The stack size used by the BPF program ("BPF stack" above) is passed
58 * via "aux->stack_depth". 46 * via "aux->stack_depth".
59 */ 47 */
60#define STK_SPACE_ADD (8 + 8 + 4 + 4 + 160) 48#define STK_SPACE_ADD (160)
61#define STK_160_UNUSED (160 - 12 * 8) 49#define STK_160_UNUSED (160 - 12 * 8)
62#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED) 50#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED)
63#define STK_OFF_TMP 160 /* Offset of tmp buffer on stack */
64#define STK_OFF_HLEN 168 /* Offset of SKB header length on stack */
65#define STK_OFF_SKBP 176 /* Offset of SKB pointer on stack */
66 51
67#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */ 52#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */
68#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */ 53#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */
69 54
70/* Offset to skip condition code check */
71#define OFF_OK 4
72
73#endif /* __ARCH_S390_NET_BPF_JIT_H */ 55#endif /* __ARCH_S390_NET_BPF_JIT_H */
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 78a19c93b380..b020bea040b7 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -47,23 +47,21 @@ struct bpf_jit {
47 47
48#define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */ 48#define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */
49 49
50#define SEEN_SKB 1 /* skb access */ 50#define SEEN_MEM (1 << 0) /* use mem[] for temporary storage */
51#define SEEN_MEM 2 /* use mem[] for temporary storage */ 51#define SEEN_RET0 (1 << 1) /* ret0_ip points to a valid return 0 */
52#define SEEN_RET0 4 /* ret0_ip points to a valid return 0 */ 52#define SEEN_LITERAL (1 << 2) /* code uses literals */
53#define SEEN_LITERAL 8 /* code uses literals */ 53#define SEEN_FUNC (1 << 3) /* calls C functions */
54#define SEEN_FUNC 16 /* calls C functions */ 54#define SEEN_TAIL_CALL (1 << 4) /* code uses tail calls */
55#define SEEN_TAIL_CALL 32 /* code uses tail calls */ 55#define SEEN_REG_AX (1 << 5) /* code uses constant blinding */
56#define SEEN_REG_AX 64 /* code uses constant blinding */ 56#define SEEN_STACK (SEEN_FUNC | SEEN_MEM)
57#define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
58 57
59/* 58/*
60 * s390 registers 59 * s390 registers
61 */ 60 */
62#define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */ 61#define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */
63#define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */ 62#define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */
64#define REG_SKB_DATA (MAX_BPF_JIT_REG + 2) /* SKB data register */ 63#define REG_L (MAX_BPF_JIT_REG + 2) /* Literal pool register */
65#define REG_L (MAX_BPF_JIT_REG + 3) /* Literal pool register */ 64#define REG_15 (MAX_BPF_JIT_REG + 3) /* Register 15 */
66#define REG_15 (MAX_BPF_JIT_REG + 4) /* Register 15 */
67#define REG_0 REG_W0 /* Register 0 */ 65#define REG_0 REG_W0 /* Register 0 */
68#define REG_1 REG_W1 /* Register 1 */ 66#define REG_1 REG_W1 /* Register 1 */
69#define REG_2 BPF_REG_1 /* Register 2 */ 67#define REG_2 BPF_REG_1 /* Register 2 */
@@ -88,10 +86,8 @@ static const int reg2hex[] = {
88 [BPF_REG_9] = 10, 86 [BPF_REG_9] = 10,
89 /* BPF stack pointer */ 87 /* BPF stack pointer */
90 [BPF_REG_FP] = 13, 88 [BPF_REG_FP] = 13,
91 /* Register for blinding (shared with REG_SKB_DATA) */ 89 /* Register for blinding */
92 [BPF_REG_AX] = 12, 90 [BPF_REG_AX] = 12,
93 /* SKB data pointer */
94 [REG_SKB_DATA] = 12,
95 /* Work registers for s390x backend */ 91 /* Work registers for s390x backend */
96 [REG_W0] = 0, 92 [REG_W0] = 0,
97 [REG_W1] = 1, 93 [REG_W1] = 1,
@@ -385,27 +381,6 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
385} 381}
386 382
387/* 383/*
388 * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S"
389 * we store the SKB header length on the stack and the SKB data
390 * pointer in REG_SKB_DATA if BPF_REG_AX is not used.
391 */
392static void emit_load_skb_data_hlen(struct bpf_jit *jit)
393{
394 /* Header length: llgf %w1,<len>(%b1) */
395 EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_1,
396 offsetof(struct sk_buff, len));
397 /* s %w1,<data_len>(%b1) */
398 EMIT4_DISP(0x5b000000, REG_W1, BPF_REG_1,
399 offsetof(struct sk_buff, data_len));
400 /* stg %w1,ST_OFF_HLEN(%r0,%r15) */
401 EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN);
402 if (!(jit->seen & SEEN_REG_AX))
403 /* lg %skb_data,data_off(%b1) */
404 EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0,
405 BPF_REG_1, offsetof(struct sk_buff, data));
406}
407
408/*
409 * Emit function prologue 384 * Emit function prologue
410 * 385 *
411 * Save registers and create stack frame if necessary. 386 * Save registers and create stack frame if necessary.
@@ -445,12 +420,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
445 EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, 420 EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
446 REG_15, 152); 421 REG_15, 152);
447 } 422 }
448 if (jit->seen & SEEN_SKB) {
449 emit_load_skb_data_hlen(jit);
450 /* stg %b1,ST_OFF_SKBP(%r0,%r15) */
451 EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
452 STK_OFF_SKBP);
453 }
454} 423}
455 424
456/* 425/*
@@ -483,12 +452,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
483{ 452{
484 struct bpf_insn *insn = &fp->insnsi[i]; 453 struct bpf_insn *insn = &fp->insnsi[i];
485 int jmp_off, last, insn_count = 1; 454 int jmp_off, last, insn_count = 1;
486 unsigned int func_addr, mask;
487 u32 dst_reg = insn->dst_reg; 455 u32 dst_reg = insn->dst_reg;
488 u32 src_reg = insn->src_reg; 456 u32 src_reg = insn->src_reg;
489 u32 *addrs = jit->addrs; 457 u32 *addrs = jit->addrs;
490 s32 imm = insn->imm; 458 s32 imm = insn->imm;
491 s16 off = insn->off; 459 s16 off = insn->off;
460 unsigned int mask;
492 461
493 if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) 462 if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
494 jit->seen |= SEEN_REG_AX; 463 jit->seen |= SEEN_REG_AX;
@@ -970,13 +939,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
970 EMIT2(0x0d00, REG_14, REG_W1); 939 EMIT2(0x0d00, REG_14, REG_W1);
971 /* lgr %b0,%r2: load return value into %b0 */ 940 /* lgr %b0,%r2: load return value into %b0 */
972 EMIT4(0xb9040000, BPF_REG_0, REG_2); 941 EMIT4(0xb9040000, BPF_REG_0, REG_2);
973 if ((jit->seen & SEEN_SKB) &&
974 bpf_helper_changes_pkt_data((void *)func)) {
975 /* lg %b1,ST_OFF_SKBP(%r15) */
976 EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
977 REG_15, STK_OFF_SKBP);
978 emit_load_skb_data_hlen(jit);
979 }
980 break; 942 break;
981 } 943 }
982 case BPF_JMP | BPF_TAIL_CALL: 944 case BPF_JMP | BPF_TAIL_CALL:
@@ -1176,73 +1138,6 @@ branch_oc:
1176 jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4); 1138 jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4);
1177 EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off); 1139 EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off);
1178 break; 1140 break;
1179 /*
1180 * BPF_LD
1181 */
1182 case BPF_LD | BPF_ABS | BPF_B: /* b0 = *(u8 *) (skb->data+imm) */
1183 case BPF_LD | BPF_IND | BPF_B: /* b0 = *(u8 *) (skb->data+imm+src) */
1184 if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
1185 func_addr = __pa(sk_load_byte_pos);
1186 else
1187 func_addr = __pa(sk_load_byte);
1188 goto call_fn;
1189 case BPF_LD | BPF_ABS | BPF_H: /* b0 = *(u16 *) (skb->data+imm) */
1190 case BPF_LD | BPF_IND | BPF_H: /* b0 = *(u16 *) (skb->data+imm+src) */
1191 if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
1192 func_addr = __pa(sk_load_half_pos);
1193 else
1194 func_addr = __pa(sk_load_half);
1195 goto call_fn;
1196 case BPF_LD | BPF_ABS | BPF_W: /* b0 = *(u32 *) (skb->data+imm) */
1197 case BPF_LD | BPF_IND | BPF_W: /* b0 = *(u32 *) (skb->data+imm+src) */
1198 if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
1199 func_addr = __pa(sk_load_word_pos);
1200 else
1201 func_addr = __pa(sk_load_word);
1202 goto call_fn;
1203call_fn:
1204 jit->seen |= SEEN_SKB | SEEN_RET0 | SEEN_FUNC;
1205 REG_SET_SEEN(REG_14); /* Return address of possible func call */
1206
1207 /*
1208 * Implicit input:
1209 * BPF_REG_6 (R7) : skb pointer
1210 * REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX)
1211 *
1212 * Calculated input:
1213 * BPF_REG_2 (R3) : offset of byte(s) to fetch in skb
1214 * BPF_REG_5 (R6) : return address
1215 *
1216 * Output:
1217 * BPF_REG_0 (R14): data read from skb
1218 *
1219 * Scratch registers (BPF_REG_1-5)
1220 */
1221
1222 /* Call function: llilf %w1,func_addr */
1223 EMIT6_IMM(0xc00f0000, REG_W1, func_addr);
1224
1225 /* Offset: lgfi %b2,imm */
1226 EMIT6_IMM(0xc0010000, BPF_REG_2, imm);
1227 if (BPF_MODE(insn->code) == BPF_IND)
1228 /* agfr %b2,%src (%src is s32 here) */
1229 EMIT4(0xb9180000, BPF_REG_2, src_reg);
1230
1231 /* Reload REG_SKB_DATA if BPF_REG_AX is used */
1232 if (jit->seen & SEEN_REG_AX)
1233 /* lg %skb_data,data_off(%b6) */
1234 EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0,
1235 BPF_REG_6, offsetof(struct sk_buff, data));
1236 /* basr %b5,%w1 (%b5 is call saved) */
1237 EMIT2(0x0d00, BPF_REG_5, REG_W1);
1238
1239 /*
1240 * Note: For fast access we jump directly after the
1241 * jnz instruction from bpf_jit.S
1242 */
1243 /* jnz <ret0> */
1244 EMIT4_PCREL(0xa7740000, jit->ret0_ip - jit->prg);
1245 break;
1246 default: /* too complex, give up */ 1141 default: /* too complex, give up */
1247 pr_err("Unknown opcode %02x\n", insn->code); 1142 pr_err("Unknown opcode %02x\n", insn->code);
1248 return -1; 1143 return -1;
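Context for the removals above: the BPF_LD | BPF_ABS and BPF_LD | BPF_IND cases drop out of the s390 JIT (and of the other JITs further down) because these classic packet loads are now rewritten by the core before JITing, so the back ends only see ordinary loads and helper calls. The sketch below describes the semantics the removed helpers provided, written against the long-standing skb_header_pointer() API purely for illustration; the in-kernel rewrite goes through internal BPF helpers instead, and negative (SKF_*_OFF) offsets are not covered here.

#include <linux/skbuff.h>

/*
 * BPF_LD | BPF_ABS | BPF_W, sketched: read 4 bytes at skb->data + off in
 * network byte order, falling back to a copy for nonlinear skbs. A failed
 * load makes a classic filter return 0; here it just reports a fault.
 * off is assumed to be >= 0.
 */
static u32 ld_abs_word(const struct sk_buff *skb, int off, bool *fault)
{
	__be32 buf;
	const __be32 *p;

	p = skb_header_pointer(skb, off, sizeof(buf), &buf);
	if (!p) {
		*fault = true;
		return 0;
	}
	return be32_to_cpu(*p);
}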
diff --git a/arch/sparc/net/Makefile b/arch/sparc/net/Makefile
index 76fa8e95b721..d32aac3a25b8 100644
--- a/arch/sparc/net/Makefile
+++ b/arch/sparc/net/Makefile
@@ -1,4 +1,7 @@
1# 1#
2# Arch-specific network modules 2# Arch-specific network modules
3# 3#
4obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_$(BITS).o bpf_jit_comp_$(BITS).o 4obj-$(CONFIG_BPF_JIT) += bpf_jit_comp_$(BITS).o
5ifeq ($(BITS),32)
6obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_32.o
7endif
diff --git a/arch/sparc/net/bpf_jit_64.h b/arch/sparc/net/bpf_jit_64.h
index 428f7fd19175..fbc836f1c51c 100644
--- a/arch/sparc/net/bpf_jit_64.h
+++ b/arch/sparc/net/bpf_jit_64.h
@@ -33,35 +33,6 @@
33#define I5 0x1d 33#define I5 0x1d
34#define FP 0x1e 34#define FP 0x1e
35#define I7 0x1f 35#define I7 0x1f
36
37#define r_SKB L0
38#define r_HEADLEN L4
39#define r_SKB_DATA L5
40#define r_TMP G1
41#define r_TMP2 G3
42
43/* assembly code in arch/sparc/net/bpf_jit_asm_64.S */
44extern u32 bpf_jit_load_word[];
45extern u32 bpf_jit_load_half[];
46extern u32 bpf_jit_load_byte[];
47extern u32 bpf_jit_load_byte_msh[];
48extern u32 bpf_jit_load_word_positive_offset[];
49extern u32 bpf_jit_load_half_positive_offset[];
50extern u32 bpf_jit_load_byte_positive_offset[];
51extern u32 bpf_jit_load_byte_msh_positive_offset[];
52extern u32 bpf_jit_load_word_negative_offset[];
53extern u32 bpf_jit_load_half_negative_offset[];
54extern u32 bpf_jit_load_byte_negative_offset[];
55extern u32 bpf_jit_load_byte_msh_negative_offset[];
56
57#else
58#define r_RESULT %o0
59#define r_SKB %o0
60#define r_OFF %o1
61#define r_HEADLEN %l4
62#define r_SKB_DATA %l5
63#define r_TMP %g1
64#define r_TMP2 %g3
65#endif 36#endif
66 37
67#endif /* _BPF_JIT_H */ 38#endif /* _BPF_JIT_H */
diff --git a/arch/sparc/net/bpf_jit_asm_64.S b/arch/sparc/net/bpf_jit_asm_64.S
deleted file mode 100644
index 7177867052a1..000000000000
--- a/arch/sparc/net/bpf_jit_asm_64.S
+++ /dev/null
@@ -1,162 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <asm/ptrace.h>
3
4#include "bpf_jit_64.h"
5
6#define SAVE_SZ 176
7#define SCRATCH_OFF STACK_BIAS + 128
8#define BE_PTR(label) be,pn %xcc, label
9#define SIGN_EXTEND(reg) sra reg, 0, reg
10
11#define SKF_MAX_NEG_OFF (-0x200000) /* SKF_LL_OFF from filter.h */
12
13 .text
14 .globl bpf_jit_load_word
15bpf_jit_load_word:
16 cmp r_OFF, 0
17 bl bpf_slow_path_word_neg
18 nop
19 .globl bpf_jit_load_word_positive_offset
20bpf_jit_load_word_positive_offset:
21 sub r_HEADLEN, r_OFF, r_TMP
22 cmp r_TMP, 3
23 ble bpf_slow_path_word
24 add r_SKB_DATA, r_OFF, r_TMP
25 andcc r_TMP, 3, %g0
26 bne load_word_unaligned
27 nop
28 retl
29 ld [r_TMP], r_RESULT
30load_word_unaligned:
31 ldub [r_TMP + 0x0], r_OFF
32 ldub [r_TMP + 0x1], r_TMP2
33 sll r_OFF, 8, r_OFF
34 or r_OFF, r_TMP2, r_OFF
35 ldub [r_TMP + 0x2], r_TMP2
36 sll r_OFF, 8, r_OFF
37 or r_OFF, r_TMP2, r_OFF
38 ldub [r_TMP + 0x3], r_TMP2
39 sll r_OFF, 8, r_OFF
40 retl
41 or r_OFF, r_TMP2, r_RESULT
42
43 .globl bpf_jit_load_half
44bpf_jit_load_half:
45 cmp r_OFF, 0
46 bl bpf_slow_path_half_neg
47 nop
48 .globl bpf_jit_load_half_positive_offset
49bpf_jit_load_half_positive_offset:
50 sub r_HEADLEN, r_OFF, r_TMP
51 cmp r_TMP, 1
52 ble bpf_slow_path_half
53 add r_SKB_DATA, r_OFF, r_TMP
54 andcc r_TMP, 1, %g0
55 bne load_half_unaligned
56 nop
57 retl
58 lduh [r_TMP], r_RESULT
59load_half_unaligned:
60 ldub [r_TMP + 0x0], r_OFF
61 ldub [r_TMP + 0x1], r_TMP2
62 sll r_OFF, 8, r_OFF
63 retl
64 or r_OFF, r_TMP2, r_RESULT
65
66 .globl bpf_jit_load_byte
67bpf_jit_load_byte:
68 cmp r_OFF, 0
69 bl bpf_slow_path_byte_neg
70 nop
71 .globl bpf_jit_load_byte_positive_offset
72bpf_jit_load_byte_positive_offset:
73 cmp r_OFF, r_HEADLEN
74 bge bpf_slow_path_byte
75 nop
76 retl
77 ldub [r_SKB_DATA + r_OFF], r_RESULT
78
79#define bpf_slow_path_common(LEN) \
80 save %sp, -SAVE_SZ, %sp; \
81 mov %i0, %o0; \
82 mov %i1, %o1; \
83 add %fp, SCRATCH_OFF, %o2; \
84 call skb_copy_bits; \
85 mov (LEN), %o3; \
86 cmp %o0, 0; \
87 restore;
88
89bpf_slow_path_word:
90 bpf_slow_path_common(4)
91 bl bpf_error
92 ld [%sp + SCRATCH_OFF], r_RESULT
93 retl
94 nop
95bpf_slow_path_half:
96 bpf_slow_path_common(2)
97 bl bpf_error
98 lduh [%sp + SCRATCH_OFF], r_RESULT
99 retl
100 nop
101bpf_slow_path_byte:
102 bpf_slow_path_common(1)
103 bl bpf_error
104 ldub [%sp + SCRATCH_OFF], r_RESULT
105 retl
106 nop
107
108#define bpf_negative_common(LEN) \
109 save %sp, -SAVE_SZ, %sp; \
110 mov %i0, %o0; \
111 mov %i1, %o1; \
112 SIGN_EXTEND(%o1); \
113 call bpf_internal_load_pointer_neg_helper; \
114 mov (LEN), %o2; \
115 mov %o0, r_TMP; \
116 cmp %o0, 0; \
117 BE_PTR(bpf_error); \
118 restore;
119
120bpf_slow_path_word_neg:
121 sethi %hi(SKF_MAX_NEG_OFF), r_TMP
122 cmp r_OFF, r_TMP
123 bl bpf_error
124 nop
125 .globl bpf_jit_load_word_negative_offset
126bpf_jit_load_word_negative_offset:
127 bpf_negative_common(4)
128 andcc r_TMP, 3, %g0
129 bne load_word_unaligned
130 nop
131 retl
132 ld [r_TMP], r_RESULT
133
134bpf_slow_path_half_neg:
135 sethi %hi(SKF_MAX_NEG_OFF), r_TMP
136 cmp r_OFF, r_TMP
137 bl bpf_error
138 nop
139 .globl bpf_jit_load_half_negative_offset
140bpf_jit_load_half_negative_offset:
141 bpf_negative_common(2)
142 andcc r_TMP, 1, %g0
143 bne load_half_unaligned
144 nop
145 retl
146 lduh [r_TMP], r_RESULT
147
148bpf_slow_path_byte_neg:
149 sethi %hi(SKF_MAX_NEG_OFF), r_TMP
150 cmp r_OFF, r_TMP
151 bl bpf_error
152 nop
153 .globl bpf_jit_load_byte_negative_offset
154bpf_jit_load_byte_negative_offset:
155 bpf_negative_common(1)
156 retl
157 ldub [r_TMP], r_RESULT
158
159bpf_error:
160 /* Make the JIT program itself return zero. */
161 ret
162 restore %g0, %g0, %o0
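For readers who do not read SPARC assembly, the deleted helpers above split each packet load into a fast path for offsets that fit in the linear skb head and a slow path through skb_copy_bits(); negative offsets went through bpf_internal_load_pointer_neg_helper(). A hedged C rendering of the word variant for non-negative offsets (function name invented for the sketch):

#include <linux/skbuff.h>

static u32 jit_load_word_positive(const struct sk_buff *skb, const u8 *data,
				  s32 headlen, s32 off)
{
	if (headlen - off >= 4) {
		/* Fast path: read the linear data directly. The assembly
		 * used a single big-endian ld when the address was 4-byte
		 * aligned; the byte-by-byte form gives the same value. */
		const u8 *p = data + off;

		return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
		       ((u32)p[2] << 8) | (u32)p[3];
	} else {
		/* Slow path: copy through a scratch buffer on the stack. */
		__be32 buf;

		if (skb_copy_bits(skb, off, &buf, 4) < 0)
			return 0; /* the real helpers abort the filter here */
		return be32_to_cpu(buf);
	}
}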
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 48a25869349b..9f5918e0693a 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -48,10 +48,6 @@ static void bpf_flush_icache(void *start_, void *end_)
48 } 48 }
49} 49}
50 50
51#define SEEN_DATAREF 1 /* might call external helpers */
52#define SEEN_XREG 2 /* ebx is used */
53#define SEEN_MEM 4 /* use mem[] for temporary storage */
54
55#define S13(X) ((X) & 0x1fff) 51#define S13(X) ((X) & 0x1fff)
56#define S5(X) ((X) & 0x1f) 52#define S5(X) ((X) & 0x1f)
57#define IMMED 0x00002000 53#define IMMED 0x00002000
@@ -198,7 +194,6 @@ struct jit_ctx {
198 bool tmp_1_used; 194 bool tmp_1_used;
199 bool tmp_2_used; 195 bool tmp_2_used;
200 bool tmp_3_used; 196 bool tmp_3_used;
201 bool saw_ld_abs_ind;
202 bool saw_frame_pointer; 197 bool saw_frame_pointer;
203 bool saw_call; 198 bool saw_call;
204 bool saw_tail_call; 199 bool saw_tail_call;
@@ -207,9 +202,7 @@ struct jit_ctx {
207 202
208#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) 203#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
209#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) 204#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
210#define SKB_HLEN_REG (MAX_BPF_JIT_REG + 2) 205#define TMP_REG_3 (MAX_BPF_JIT_REG + 2)
211#define SKB_DATA_REG (MAX_BPF_JIT_REG + 3)
212#define TMP_REG_3 (MAX_BPF_JIT_REG + 4)
213 206
214/* Map BPF registers to SPARC registers */ 207/* Map BPF registers to SPARC registers */
215static const int bpf2sparc[] = { 208static const int bpf2sparc[] = {
@@ -238,9 +231,6 @@ static const int bpf2sparc[] = {
238 [TMP_REG_1] = G1, 231 [TMP_REG_1] = G1,
239 [TMP_REG_2] = G2, 232 [TMP_REG_2] = G2,
240 [TMP_REG_3] = G3, 233 [TMP_REG_3] = G3,
241
242 [SKB_HLEN_REG] = L4,
243 [SKB_DATA_REG] = L5,
244}; 234};
245 235
246static void emit(const u32 insn, struct jit_ctx *ctx) 236static void emit(const u32 insn, struct jit_ctx *ctx)
@@ -800,25 +790,6 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src,
800 return 0; 790 return 0;
801} 791}
802 792
803static void load_skb_regs(struct jit_ctx *ctx, u8 r_skb)
804{
805 const u8 r_headlen = bpf2sparc[SKB_HLEN_REG];
806 const u8 r_data = bpf2sparc[SKB_DATA_REG];
807 const u8 r_tmp = bpf2sparc[TMP_REG_1];
808 unsigned int off;
809
810 off = offsetof(struct sk_buff, len);
811 emit(LD32I | RS1(r_skb) | S13(off) | RD(r_headlen), ctx);
812
813 off = offsetof(struct sk_buff, data_len);
814 emit(LD32I | RS1(r_skb) | S13(off) | RD(r_tmp), ctx);
815
816 emit(SUB | RS1(r_headlen) | RS2(r_tmp) | RD(r_headlen), ctx);
817
818 off = offsetof(struct sk_buff, data);
819 emit(LDPTRI | RS1(r_skb) | S13(off) | RD(r_data), ctx);
820}
821
822/* Just skip the save instruction and the ctx register move. */ 793/* Just skip the save instruction and the ctx register move. */
823#define BPF_TAILCALL_PROLOGUE_SKIP 16 794#define BPF_TAILCALL_PROLOGUE_SKIP 16
824#define BPF_TAILCALL_CNT_SP_OFF (STACK_BIAS + 128) 795#define BPF_TAILCALL_CNT_SP_OFF (STACK_BIAS + 128)
@@ -857,9 +828,6 @@ static void build_prologue(struct jit_ctx *ctx)
857 828
858 emit_reg_move(I0, O0, ctx); 829 emit_reg_move(I0, O0, ctx);
859 /* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */ 830 /* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */
860
861 if (ctx->saw_ld_abs_ind)
862 load_skb_regs(ctx, bpf2sparc[BPF_REG_1]);
863} 831}
864 832
865static void build_epilogue(struct jit_ctx *ctx) 833static void build_epilogue(struct jit_ctx *ctx)
@@ -1225,16 +1193,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
1225 u8 *func = ((u8 *)__bpf_call_base) + imm; 1193 u8 *func = ((u8 *)__bpf_call_base) + imm;
1226 1194
1227 ctx->saw_call = true; 1195 ctx->saw_call = true;
1228 if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
1229 emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
1230 1196
1231 emit_call((u32 *)func, ctx); 1197 emit_call((u32 *)func, ctx);
1232 emit_nop(ctx); 1198 emit_nop(ctx);
1233 1199
1234 emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx); 1200 emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
1235
1236 if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
1237 load_skb_regs(ctx, L7);
1238 break; 1201 break;
1239 } 1202 }
1240 1203
@@ -1412,43 +1375,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
1412 emit_nop(ctx); 1375 emit_nop(ctx);
1413 break; 1376 break;
1414 } 1377 }
1415#define CHOOSE_LOAD_FUNC(K, func) \
1416 ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
1417
1418 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
1419 case BPF_LD | BPF_ABS | BPF_W:
1420 func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_word);
1421 goto common_load;
1422 case BPF_LD | BPF_ABS | BPF_H:
1423 func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_half);
1424 goto common_load;
1425 case BPF_LD | BPF_ABS | BPF_B:
1426 func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_byte);
1427 goto common_load;
1428 /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
1429 case BPF_LD | BPF_IND | BPF_W:
1430 func = bpf_jit_load_word;
1431 goto common_load;
1432 case BPF_LD | BPF_IND | BPF_H:
1433 func = bpf_jit_load_half;
1434 goto common_load;
1435
1436 case BPF_LD | BPF_IND | BPF_B:
1437 func = bpf_jit_load_byte;
1438 common_load:
1439 ctx->saw_ld_abs_ind = true;
1440
1441 emit_reg_move(bpf2sparc[BPF_REG_6], O0, ctx);
1442 emit_loadimm(imm, O1, ctx);
1443
1444 if (BPF_MODE(code) == BPF_IND)
1445 emit_alu(ADD, src, O1, ctx);
1446
1447 emit_call(func, ctx);
1448 emit_alu_K(SRA, O1, 0, ctx);
1449
1450 emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
1451 break;
1452 1378
1453 default: 1379 default:
1454 pr_err_once("unknown opcode %02x\n", code); 1380 pr_err_once("unknown opcode %02x\n", code);
@@ -1583,12 +1509,11 @@ skip_init_ctx:
1583 build_epilogue(&ctx); 1509 build_epilogue(&ctx);
1584 1510
1585 if (bpf_jit_enable > 1) 1511 if (bpf_jit_enable > 1)
1586 pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c%c]\n", pass, 1512 pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c]\n", pass,
1587 image_size - (ctx.idx * 4), 1513 image_size - (ctx.idx * 4),
1588 ctx.tmp_1_used ? '1' : ' ', 1514 ctx.tmp_1_used ? '1' : ' ',
1589 ctx.tmp_2_used ? '2' : ' ', 1515 ctx.tmp_2_used ? '2' : ' ',
1590 ctx.tmp_3_used ? '3' : ' ', 1516 ctx.tmp_3_used ? '3' : ' ',
1591 ctx.saw_ld_abs_ind ? 'L' : ' ',
1592 ctx.saw_frame_pointer ? 'F' : ' ', 1517 ctx.saw_frame_pointer ? 'F' : ' ',
1593 ctx.saw_call ? 'C' : ' ', 1518 ctx.saw_call ? 'C' : ' ',
1594 ctx.saw_tail_call ? 'T' : ' '); 1519 ctx.saw_tail_call ? 'T' : ' ');
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..d51a71dcbac2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -138,7 +138,7 @@ config X86
138 select HAVE_DMA_CONTIGUOUS 138 select HAVE_DMA_CONTIGUOUS
139 select HAVE_DYNAMIC_FTRACE 139 select HAVE_DYNAMIC_FTRACE
140 select HAVE_DYNAMIC_FTRACE_WITH_REGS 140 select HAVE_DYNAMIC_FTRACE_WITH_REGS
141 select HAVE_EBPF_JIT if X86_64 141 select HAVE_EBPF_JIT
142 select HAVE_EFFICIENT_UNALIGNED_ACCESS 142 select HAVE_EFFICIENT_UNALIGNED_ACCESS
143 select HAVE_EXIT_THREAD 143 select HAVE_EXIT_THREAD
144 select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE 144 select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9b143f..2cd344d1a6e5 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -291,16 +291,20 @@ do { \
291 * lfence 291 * lfence
292 * jmp spec_trap 292 * jmp spec_trap
293 * do_rop: 293 * do_rop:
294 * mov %rax,(%rsp) 294 * mov %rax,(%rsp) for x86_64
295 * mov %edx,(%esp) for x86_32
295 * retq 296 * retq
296 * 297 *
297 * Without retpolines configured: 298 * Without retpolines configured:
298 * 299 *
299 * jmp *%rax 300 * jmp *%rax for x86_64
301 * jmp *%edx for x86_32
300 */ 302 */
301#ifdef CONFIG_RETPOLINE 303#ifdef CONFIG_RETPOLINE
304#ifdef CONFIG_X86_64
302# define RETPOLINE_RAX_BPF_JIT_SIZE 17 305# define RETPOLINE_RAX_BPF_JIT_SIZE 17
303# define RETPOLINE_RAX_BPF_JIT() \ 306# define RETPOLINE_RAX_BPF_JIT() \
307do { \
304 EMIT1_off32(0xE8, 7); /* callq do_rop */ \ 308 EMIT1_off32(0xE8, 7); /* callq do_rop */ \
305 /* spec_trap: */ \ 309 /* spec_trap: */ \
306 EMIT2(0xF3, 0x90); /* pause */ \ 310 EMIT2(0xF3, 0x90); /* pause */ \
@@ -308,11 +312,31 @@ do { \
308 EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ 312 EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
309 /* do_rop: */ \ 313 /* do_rop: */ \
310 EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ 314 EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \
311 EMIT1(0xC3); /* retq */ 315 EMIT1(0xC3); /* retq */ \
316} while (0)
312#else 317#else
318# define RETPOLINE_EDX_BPF_JIT() \
319do { \
320 EMIT1_off32(0xE8, 7); /* call do_rop */ \
321 /* spec_trap: */ \
322 EMIT2(0xF3, 0x90); /* pause */ \
323 EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
324 EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
325 /* do_rop: */ \
326 EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
327 EMIT1(0xC3); /* ret */ \
328} while (0)
329#endif
330#else /* !CONFIG_RETPOLINE */
331
332#ifdef CONFIG_X86_64
313# define RETPOLINE_RAX_BPF_JIT_SIZE 2 333# define RETPOLINE_RAX_BPF_JIT_SIZE 2
314# define RETPOLINE_RAX_BPF_JIT() \ 334# define RETPOLINE_RAX_BPF_JIT() \
315 EMIT2(0xFF, 0xE0); /* jmp *%rax */ 335 EMIT2(0xFF, 0xE0); /* jmp *%rax */
336#else
337# define RETPOLINE_EDX_BPF_JIT() \
338 EMIT2(0xFF, 0xE2) /* jmp *%edx */
339#endif
316#endif 340#endif
317 341
318#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ 342#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
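/*
 * Byte-level view of the new RETPOLINE_EDX_BPF_JIT() with CONFIG_RETPOLINE=y,
 * reconstructed from the EMIT*() calls above (not part of the patch): the
 * call plants the address of spec_trap as the return address, do_rop then
 * overwrites that return address with %edx, so the final ret architecturally
 * jumps to *%edx while any speculation is parked in the pause/lfence loop.
 *
 *             e8 07 00 00 00    call   do_rop
 *  spec_trap:
 *             f3 90             pause
 *             0f ae e8          lfence
 *             eb f9             jmp    spec_trap
 *  do_rop:
 *             89 14 24          mov    %edx,(%esp)
 *             c3                ret
 */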
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b619598..59e123da580c 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,9 @@
1# 1#
2# Arch-specific network modules 2# Arch-specific network modules
3# 3#
4OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
5 4
6obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o 5ifeq ($(CONFIG_X86_32),y)
6 obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
7else
8 obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
9endif
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
deleted file mode 100644
index b33093f84528..000000000000
--- a/arch/x86/net/bpf_jit.S
+++ /dev/null
@@ -1,154 +0,0 @@
1/* bpf_jit.S : BPF JIT helper functions
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/linkage.h>
11#include <asm/frame.h>
12
13/*
14 * Calling convention :
15 * rbx : skb pointer (callee saved)
16 * esi : offset of byte(s) to fetch in skb (can be scratched)
17 * r10 : copy of skb->data
18 * r9d : hlen = skb->len - skb->data_len
19 */
20#define SKBDATA %r10
21#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */
22
23#define FUNC(name) \
24 .globl name; \
25 .type name, @function; \
26 name:
27
28FUNC(sk_load_word)
29 test %esi,%esi
30 js bpf_slow_path_word_neg
31
32FUNC(sk_load_word_positive_offset)
33 mov %r9d,%eax # hlen
34 sub %esi,%eax # hlen - offset
35 cmp $3,%eax
36 jle bpf_slow_path_word
37 mov (SKBDATA,%rsi),%eax
38 bswap %eax /* ntohl() */
39 ret
40
41FUNC(sk_load_half)
42 test %esi,%esi
43 js bpf_slow_path_half_neg
44
45FUNC(sk_load_half_positive_offset)
46 mov %r9d,%eax
47 sub %esi,%eax # hlen - offset
48 cmp $1,%eax
49 jle bpf_slow_path_half
50 movzwl (SKBDATA,%rsi),%eax
51 rol $8,%ax # ntohs()
52 ret
53
54FUNC(sk_load_byte)
55 test %esi,%esi
56 js bpf_slow_path_byte_neg
57
58FUNC(sk_load_byte_positive_offset)
59 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
60 jle bpf_slow_path_byte
61 movzbl (SKBDATA,%rsi),%eax
62 ret
63
64/* rsi contains offset and can be scratched */
65#define bpf_slow_path_common(LEN) \
66 lea 32(%rbp), %rdx;\
67 FRAME_BEGIN; \
68 mov %rbx, %rdi; /* arg1 == skb */ \
69 push %r9; \
70 push SKBDATA; \
71/* rsi already has offset */ \
72 mov $LEN,%ecx; /* len */ \
73 call skb_copy_bits; \
74 test %eax,%eax; \
75 pop SKBDATA; \
76 pop %r9; \
77 FRAME_END
78
79
80bpf_slow_path_word:
81 bpf_slow_path_common(4)
82 js bpf_error
83 mov 32(%rbp),%eax
84 bswap %eax
85 ret
86
87bpf_slow_path_half:
88 bpf_slow_path_common(2)
89 js bpf_error
90 mov 32(%rbp),%ax
91 rol $8,%ax
92 movzwl %ax,%eax
93 ret
94
95bpf_slow_path_byte:
96 bpf_slow_path_common(1)
97 js bpf_error
98 movzbl 32(%rbp),%eax
99 ret
100
101#define sk_negative_common(SIZE) \
102 FRAME_BEGIN; \
103 mov %rbx, %rdi; /* arg1 == skb */ \
104 push %r9; \
105 push SKBDATA; \
106/* rsi already has offset */ \
107 mov $SIZE,%edx; /* size */ \
108 call bpf_internal_load_pointer_neg_helper; \
109 test %rax,%rax; \
110 pop SKBDATA; \
111 pop %r9; \
112 FRAME_END; \
113 jz bpf_error
114
115bpf_slow_path_word_neg:
116 cmp SKF_MAX_NEG_OFF, %esi /* test range */
117 jl bpf_error /* offset lower -> error */
118
119FUNC(sk_load_word_negative_offset)
120 sk_negative_common(4)
121 mov (%rax), %eax
122 bswap %eax
123 ret
124
125bpf_slow_path_half_neg:
126 cmp SKF_MAX_NEG_OFF, %esi
127 jl bpf_error
128
129FUNC(sk_load_half_negative_offset)
130 sk_negative_common(2)
131 mov (%rax),%ax
132 rol $8,%ax
133 movzwl %ax,%eax
134 ret
135
136bpf_slow_path_byte_neg:
137 cmp SKF_MAX_NEG_OFF, %esi
138 jl bpf_error
139
140FUNC(sk_load_byte_negative_offset)
141 sk_negative_common(1)
142 movzbl (%rax), %eax
143 ret
144
145bpf_error:
146# force a return 0 from jit handler
147 xor %eax,%eax
148 mov (%rbp),%rbx
149 mov 8(%rbp),%r13
150 mov 16(%rbp),%r14
151 mov 24(%rbp),%r15
152 add $40, %rbp
153 leaveq
154 ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 263c8453815e..8fca446aaef6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,4 +1,5 @@
1/* bpf_jit_comp.c : BPF JIT compiler 1/*
2 * bpf_jit_comp.c: BPF JIT compiler
2 * 3 *
3 * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) 4 * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
4 * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 5 * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
@@ -16,15 +17,6 @@
16#include <asm/set_memory.h> 17#include <asm/set_memory.h>
17#include <asm/nospec-branch.h> 18#include <asm/nospec-branch.h>
18 19
19/*
20 * assembly code in arch/x86/net/bpf_jit.S
21 */
22extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
23extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
24extern u8 sk_load_byte_positive_offset[];
25extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
26extern u8 sk_load_byte_negative_offset[];
27
28static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) 20static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
29{ 21{
30 if (len == 1) 22 if (len == 1)
@@ -45,14 +37,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
45#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) 37#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
46#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) 38#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
47#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) 39#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
40
48#define EMIT1_off32(b1, off) \ 41#define EMIT1_off32(b1, off) \
49 do {EMIT1(b1); EMIT(off, 4); } while (0) 42 do { EMIT1(b1); EMIT(off, 4); } while (0)
50#define EMIT2_off32(b1, b2, off) \ 43#define EMIT2_off32(b1, b2, off) \
51 do {EMIT2(b1, b2); EMIT(off, 4); } while (0) 44 do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
52#define EMIT3_off32(b1, b2, b3, off) \ 45#define EMIT3_off32(b1, b2, b3, off) \
53 do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) 46 do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
54#define EMIT4_off32(b1, b2, b3, b4, off) \ 47#define EMIT4_off32(b1, b2, b3, b4, off) \
55 do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) 48 do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
56 49
57static bool is_imm8(int value) 50static bool is_imm8(int value)
58{ 51{
@@ -70,9 +63,10 @@ static bool is_uimm32(u64 value)
70} 63}
71 64
72/* mov dst, src */ 65/* mov dst, src */
73#define EMIT_mov(DST, SRC) \ 66#define EMIT_mov(DST, SRC) \
74 do {if (DST != SRC) \ 67 do { \
75 EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \ 68 if (DST != SRC) \
69 EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
76 } while (0) 70 } while (0)
77 71
78static int bpf_size_to_x86_bytes(int bpf_size) 72static int bpf_size_to_x86_bytes(int bpf_size)
@@ -89,7 +83,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
89 return 0; 83 return 0;
90} 84}
91 85
92/* list of x86 cond jumps opcodes (. + s8) 86/*
87 * List of x86 cond jumps opcodes (. + s8)
93 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) 88 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
94 */ 89 */
95#define X86_JB 0x72 90#define X86_JB 0x72
@@ -103,38 +98,37 @@ static int bpf_size_to_x86_bytes(int bpf_size)
103#define X86_JLE 0x7E 98#define X86_JLE 0x7E
104#define X86_JG 0x7F 99#define X86_JG 0x7F
105 100
106#define CHOOSE_LOAD_FUNC(K, func) \ 101/* Pick a register outside of BPF range for JIT internal work */
107 ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
108
109/* pick a register outside of BPF range for JIT internal work */
110#define AUX_REG (MAX_BPF_JIT_REG + 1) 102#define AUX_REG (MAX_BPF_JIT_REG + 1)
111 103
112/* The following table maps BPF registers to x64 registers. 104/*
105 * The following table maps BPF registers to x86-64 registers.
113 * 106 *
114 * x64 register r12 is unused, since if used as base address 107 * x86-64 register R12 is unused, since if used as base address
115 * register in load/store instructions, it always needs an 108 * register in load/store instructions, it always needs an
116 * extra byte of encoding and is callee saved. 109 * extra byte of encoding and is callee saved.
117 * 110 *
118 * r9 caches skb->len - skb->data_len 111 * Also x86-64 register R9 is unused. x86-64 register R10 is
119 * r10 caches skb->data, and used for blinding (if enabled) 112 * used for blinding (if enabled).
120 */ 113 */
121static const int reg2hex[] = { 114static const int reg2hex[] = {
122 [BPF_REG_0] = 0, /* rax */ 115 [BPF_REG_0] = 0, /* RAX */
123 [BPF_REG_1] = 7, /* rdi */ 116 [BPF_REG_1] = 7, /* RDI */
124 [BPF_REG_2] = 6, /* rsi */ 117 [BPF_REG_2] = 6, /* RSI */
125 [BPF_REG_3] = 2, /* rdx */ 118 [BPF_REG_3] = 2, /* RDX */
126 [BPF_REG_4] = 1, /* rcx */ 119 [BPF_REG_4] = 1, /* RCX */
127 [BPF_REG_5] = 0, /* r8 */ 120 [BPF_REG_5] = 0, /* R8 */
128 [BPF_REG_6] = 3, /* rbx callee saved */ 121 [BPF_REG_6] = 3, /* RBX callee saved */
129 [BPF_REG_7] = 5, /* r13 callee saved */ 122 [BPF_REG_7] = 5, /* R13 callee saved */
130 [BPF_REG_8] = 6, /* r14 callee saved */ 123 [BPF_REG_8] = 6, /* R14 callee saved */
131 [BPF_REG_9] = 7, /* r15 callee saved */ 124 [BPF_REG_9] = 7, /* R15 callee saved */
132 [BPF_REG_FP] = 5, /* rbp readonly */ 125 [BPF_REG_FP] = 5, /* RBP readonly */
133 [BPF_REG_AX] = 2, /* r10 temp register */ 126 [BPF_REG_AX] = 2, /* R10 temp register */
134 [AUX_REG] = 3, /* r11 temp register */ 127 [AUX_REG] = 3, /* R11 temp register */
135}; 128};
136 129
137/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15 130/*
131 * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
138 * which need extra byte of encoding. 132 * which need extra byte of encoding.
139 * rax,rcx,...,rbp have simpler encoding 133 * rax,rcx,...,rbp have simpler encoding
140 */ 134 */
@@ -153,7 +147,7 @@ static bool is_axreg(u32 reg)
153 return reg == BPF_REG_0; 147 return reg == BPF_REG_0;
154} 148}
155 149
156/* add modifiers if 'reg' maps to x64 registers r8..r15 */ 150/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
157static u8 add_1mod(u8 byte, u32 reg) 151static u8 add_1mod(u8 byte, u32 reg)
158{ 152{
159 if (is_ereg(reg)) 153 if (is_ereg(reg))
@@ -170,13 +164,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
170 return byte; 164 return byte;
171} 165}
172 166
173/* encode 'dst_reg' register into x64 opcode 'byte' */ 167/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
174static u8 add_1reg(u8 byte, u32 dst_reg) 168static u8 add_1reg(u8 byte, u32 dst_reg)
175{ 169{
176 return byte + reg2hex[dst_reg]; 170 return byte + reg2hex[dst_reg];
177} 171}
178 172
179/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */ 173/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
180static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) 174static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
181{ 175{
182 return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); 176 return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
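/*
 * Worked example of the encoding helpers above (illustrative, not part of
 * the patch): EMIT_mov(BPF_REG_1, BPF_REG_2) copies BPF R2 into R1, i.e.
 * rdi <- rsi. Neither register is an "ereg", so add_2mod(0x48, ...) stays
 * 0x48, and add_2reg(0xC0, BPF_REG_1, BPF_REG_2) = 0xC0 + 7 + (6 << 3) =
 * 0xf7, giving the three-byte instruction:
 *
 *	48 89 f7	mov %rsi,%rdi
 */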
@@ -184,27 +178,24 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
184 178
185static void jit_fill_hole(void *area, unsigned int size) 179static void jit_fill_hole(void *area, unsigned int size)
186{ 180{
187 /* fill whole space with int3 instructions */ 181 /* Fill whole space with INT3 instructions */
188 memset(area, 0xcc, size); 182 memset(area, 0xcc, size);
189} 183}
190 184
191struct jit_context { 185struct jit_context {
192 int cleanup_addr; /* epilogue code offset */ 186 int cleanup_addr; /* Epilogue code offset */
193 bool seen_ld_abs;
194 bool seen_ax_reg;
195}; 187};
196 188
197/* maximum number of bytes emitted while JITing one eBPF insn */ 189/* Maximum number of bytes emitted while JITing one eBPF insn */
198#define BPF_MAX_INSN_SIZE 128 190#define BPF_MAX_INSN_SIZE 128
199#define BPF_INSN_SAFETY 64 191#define BPF_INSN_SAFETY 64
200 192
201#define AUX_STACK_SPACE \ 193#define AUX_STACK_SPACE 40 /* Space for RBX, R13, R14, R15, tailcnt */
202 (32 /* space for rbx, r13, r14, r15 */ + \
203 8 /* space for skb_copy_bits() buffer */)
204 194
205#define PROLOGUE_SIZE 37 195#define PROLOGUE_SIZE 37
206 196
207/* emit x64 prologue code for BPF program and check it's size. 197/*
198 * Emit x86-64 prologue code for BPF program and check its size.
208 * bpf_tail_call helper will skip it while jumping into another program 199 * bpf_tail_call helper will skip it while jumping into another program
209 */ 200 */
210static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) 201static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
@@ -212,8 +203,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
212 u8 *prog = *pprog; 203 u8 *prog = *pprog;
213 int cnt = 0; 204 int cnt = 0;
214 205
215 EMIT1(0x55); /* push rbp */ 206 /* push rbp */
216 EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ 207 EMIT1(0x55);
208
209 /* mov rbp,rsp */
210 EMIT3(0x48, 0x89, 0xE5);
217 211
218 /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ 212 /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
219 EMIT3_off32(0x48, 0x81, 0xEC, 213 EMIT3_off32(0x48, 0x81, 0xEC,
@@ -222,19 +216,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
222 /* sub rbp, AUX_STACK_SPACE */ 216 /* sub rbp, AUX_STACK_SPACE */
223 EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); 217 EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
224 218
225 /* all classic BPF filters use R6(rbx) save it */
226
227 /* mov qword ptr [rbp+0],rbx */ 219 /* mov qword ptr [rbp+0],rbx */
228 EMIT4(0x48, 0x89, 0x5D, 0); 220 EMIT4(0x48, 0x89, 0x5D, 0);
229
230 /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
231 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
232 * R8(r14). R9(r15) spill could be made conditional, but there is only
233 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
234 * The overhead of extra spill is negligible for any filter other
235 * than synthetic ones. Therefore not worth adding complexity.
236 */
237
238 /* mov qword ptr [rbp+8],r13 */ 221 /* mov qword ptr [rbp+8],r13 */
239 EMIT4(0x4C, 0x89, 0x6D, 8); 222 EMIT4(0x4C, 0x89, 0x6D, 8);
240 /* mov qword ptr [rbp+16],r14 */ 223 /* mov qword ptr [rbp+16],r14 */
@@ -243,9 +226,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
243 EMIT4(0x4C, 0x89, 0x7D, 24); 226 EMIT4(0x4C, 0x89, 0x7D, 24);
244 227
245 if (!ebpf_from_cbpf) { 228 if (!ebpf_from_cbpf) {
246 /* Clear the tail call counter (tail_call_cnt): for eBPF tail 229 /*
230 * Clear the tail call counter (tail_call_cnt): for eBPF tail
247 * calls we need to reset the counter to 0. It's done in two 231 * calls we need to reset the counter to 0. It's done in two
248 * instructions, resetting rax register to 0, and moving it 232 * instructions, resetting RAX register to 0, and moving it
249 * to the counter location. 233 * to the counter location.
250 */ 234 */
251 235
@@ -260,7 +244,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
260 *pprog = prog; 244 *pprog = prog;
261} 245}
262 246
263/* generate the following code: 247/*
248 * Generate the following code:
249 *
264 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... 250 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
265 * if (index >= array->map.max_entries) 251 * if (index >= array->map.max_entries)
266 * goto out; 252 * goto out;
@@ -278,23 +264,26 @@ static void emit_bpf_tail_call(u8 **pprog)
278 int label1, label2, label3; 264 int label1, label2, label3;
279 int cnt = 0; 265 int cnt = 0;
280 266
281 /* rdi - pointer to ctx 267 /*
268 * rdi - pointer to ctx
282 * rsi - pointer to bpf_array 269 * rsi - pointer to bpf_array
283 * rdx - index in bpf_array 270 * rdx - index in bpf_array
284 */ 271 */
285 272
286 /* if (index >= array->map.max_entries) 273 /*
287 * goto out; 274 * if (index >= array->map.max_entries)
275 * goto out;
288 */ 276 */
289 EMIT2(0x89, 0xD2); /* mov edx, edx */ 277 EMIT2(0x89, 0xD2); /* mov edx, edx */
290 EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ 278 EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
291 offsetof(struct bpf_array, map.max_entries)); 279 offsetof(struct bpf_array, map.max_entries));
292#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */ 280#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
293 EMIT2(X86_JBE, OFFSET1); /* jbe out */ 281 EMIT2(X86_JBE, OFFSET1); /* jbe out */
294 label1 = cnt; 282 label1 = cnt;
295 283
296 /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) 284 /*
297 * goto out; 285 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
286 * goto out;
298 */ 287 */
299 EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ 288 EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */
300 EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ 289 EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
@@ -308,8 +297,9 @@ static void emit_bpf_tail_call(u8 **pprog)
308 EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ 297 EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
309 offsetof(struct bpf_array, ptrs)); 298 offsetof(struct bpf_array, ptrs));
310 299
311 /* if (prog == NULL) 300 /*
312 * goto out; 301 * if (prog == NULL)
302 * goto out;
313 */ 303 */
314 EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ 304 EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
315#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) 305#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
@@ -321,7 +311,8 @@ static void emit_bpf_tail_call(u8 **pprog)
321 offsetof(struct bpf_prog, bpf_func)); 311 offsetof(struct bpf_prog, bpf_func));
322 EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ 312 EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */
323 313
324 /* now we're ready to jump into next BPF program 314 /*
 315 * Now we're ready to jump into next BPF program
325 * rdi == ctx (1st arg) 316 * rdi == ctx (1st arg)
326 * rax == prog->bpf_func + prologue_size 317 * rax == prog->bpf_func + prologue_size
327 */ 318 */
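/*
 * Hedged C-level view of the sequence emit_bpf_tail_call() generates (the
 * EMIT*() bytes around this hunk implement exactly this control flow; the
 * OFFSET constants are the hand-counted byte distances to "out", which is
 * why they include RETPOLINE_RAX_BPF_JIT_SIZE - 17 bytes with retpolines,
 * 2 bytes without):
 *
 *	if (index >= array->map.max_entries)
 *		goto out;
 *	if (tail_call_cnt > MAX_TAIL_CALL_CNT)
 *		goto out;
 *	tail_call_cnt++;
 *	prog = array->ptrs[index];
 *	if (prog == NULL)
 *		goto out;
 *	jump to prog->bpf_func + PROLOGUE_SIZE;	(skip the callee's prologue)
 * out:
 *	continue with the next instruction of the current program
 */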
@@ -334,26 +325,6 @@ static void emit_bpf_tail_call(u8 **pprog)
334 *pprog = prog; 325 *pprog = prog;
335} 326}
336 327
337
338static void emit_load_skb_data_hlen(u8 **pprog)
339{
340 u8 *prog = *pprog;
341 int cnt = 0;
342
343 /* r9d = skb->len - skb->data_len (headlen)
344 * r10 = skb->data
345 */
346 /* mov %r9d, off32(%rdi) */
347 EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
348
349 /* sub %r9d, off32(%rdi) */
350 EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
351
352 /* mov %r10, off32(%rdi) */
353 EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
354 *pprog = prog;
355}
356
357static void emit_mov_imm32(u8 **pprog, bool sign_propagate, 328static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
358 u32 dst_reg, const u32 imm32) 329 u32 dst_reg, const u32 imm32)
359{ 330{
@@ -361,7 +332,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
361 u8 b1, b2, b3; 332 u8 b1, b2, b3;
362 int cnt = 0; 333 int cnt = 0;
363 334
364 /* optimization: if imm32 is positive, use 'mov %eax, imm32' 335 /*
336 * Optimization: if imm32 is positive, use 'mov %eax, imm32'
365 * (which zero-extends imm32) to save 2 bytes. 337 * (which zero-extends imm32) to save 2 bytes.
366 */ 338 */
367 if (sign_propagate && (s32)imm32 < 0) { 339 if (sign_propagate && (s32)imm32 < 0) {
@@ -373,7 +345,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
373 goto done; 345 goto done;
374 } 346 }
375 347
376 /* optimization: if imm32 is zero, use 'xor %eax, %eax' 348 /*
349 * Optimization: if imm32 is zero, use 'xor %eax, %eax'
377 * to save 3 bytes. 350 * to save 3 bytes.
378 */ 351 */
379 if (imm32 == 0) { 352 if (imm32 == 0) {
@@ -400,7 +373,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
400 int cnt = 0; 373 int cnt = 0;
401 374
402 if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { 375 if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
403 /* For emitting plain u32, where sign bit must not be 376 /*
377 * For emitting plain u32, where sign bit must not be
404 * propagated LLVM tends to load imm64 over mov32 378 * propagated LLVM tends to load imm64 over mov32
405 * directly, so save couple of bytes by just doing 379 * directly, so save couple of bytes by just doing
406 * 'mov %eax, imm32' instead. 380 * 'mov %eax, imm32' instead.
@@ -439,8 +413,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
439{ 413{
440 struct bpf_insn *insn = bpf_prog->insnsi; 414 struct bpf_insn *insn = bpf_prog->insnsi;
441 int insn_cnt = bpf_prog->len; 415 int insn_cnt = bpf_prog->len;
442 bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
443 bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0);
444 bool seen_exit = false; 416 bool seen_exit = false;
445 u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; 417 u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
446 int i, cnt = 0; 418 int i, cnt = 0;
@@ -450,9 +422,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
450 emit_prologue(&prog, bpf_prog->aux->stack_depth, 422 emit_prologue(&prog, bpf_prog->aux->stack_depth,
451 bpf_prog_was_classic(bpf_prog)); 423 bpf_prog_was_classic(bpf_prog));
452 424
453 if (seen_ld_abs)
454 emit_load_skb_data_hlen(&prog);
455
456 for (i = 0; i < insn_cnt; i++, insn++) { 425 for (i = 0; i < insn_cnt; i++, insn++) {
457 const s32 imm32 = insn->imm; 426 const s32 imm32 = insn->imm;
458 u32 dst_reg = insn->dst_reg; 427 u32 dst_reg = insn->dst_reg;
@@ -460,13 +429,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
460 u8 b2 = 0, b3 = 0; 429 u8 b2 = 0, b3 = 0;
461 s64 jmp_offset; 430 s64 jmp_offset;
462 u8 jmp_cond; 431 u8 jmp_cond;
463 bool reload_skb_data;
464 int ilen; 432 int ilen;
465 u8 *func; 433 u8 *func;
466 434
467 if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
468 ctx->seen_ax_reg = seen_ax_reg = true;
469
470 switch (insn->code) { 435 switch (insn->code) {
471 /* ALU */ 436 /* ALU */
472 case BPF_ALU | BPF_ADD | BPF_X: 437 case BPF_ALU | BPF_ADD | BPF_X:
@@ -525,7 +490,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
525 else if (is_ereg(dst_reg)) 490 else if (is_ereg(dst_reg))
526 EMIT1(add_1mod(0x40, dst_reg)); 491 EMIT1(add_1mod(0x40, dst_reg));
527 492
528 /* b3 holds 'normal' opcode, b2 short form only valid 493 /*
494 * b3 holds 'normal' opcode, b2 short form only valid
529 * in case dst is eax/rax. 495 * in case dst is eax/rax.
530 */ 496 */
531 switch (BPF_OP(insn->code)) { 497 switch (BPF_OP(insn->code)) {
@@ -593,7 +559,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
593 /* mov rax, dst_reg */ 559 /* mov rax, dst_reg */
594 EMIT_mov(BPF_REG_0, dst_reg); 560 EMIT_mov(BPF_REG_0, dst_reg);
595 561
596 /* xor edx, edx 562 /*
563 * xor edx, edx
597 * equivalent to 'xor rdx, rdx', but one byte less 564 * equivalent to 'xor rdx, rdx', but one byte less
598 */ 565 */
599 EMIT2(0x31, 0xd2); 566 EMIT2(0x31, 0xd2);
@@ -655,7 +622,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
655 } 622 }
656 break; 623 break;
657 } 624 }
658 /* shifts */ 625 /* Shifts */
659 case BPF_ALU | BPF_LSH | BPF_K: 626 case BPF_ALU | BPF_LSH | BPF_K:
660 case BPF_ALU | BPF_RSH | BPF_K: 627 case BPF_ALU | BPF_RSH | BPF_K:
661 case BPF_ALU | BPF_ARSH | BPF_K: 628 case BPF_ALU | BPF_ARSH | BPF_K:
@@ -686,7 +653,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
686 case BPF_ALU64 | BPF_RSH | BPF_X: 653 case BPF_ALU64 | BPF_RSH | BPF_X:
687 case BPF_ALU64 | BPF_ARSH | BPF_X: 654 case BPF_ALU64 | BPF_ARSH | BPF_X:
688 655
689 /* check for bad case when dst_reg == rcx */ 656 /* Check for bad case when dst_reg == rcx */
690 if (dst_reg == BPF_REG_4) { 657 if (dst_reg == BPF_REG_4) {
691 /* mov r11, dst_reg */ 658 /* mov r11, dst_reg */
692 EMIT_mov(AUX_REG, dst_reg); 659 EMIT_mov(AUX_REG, dst_reg);
@@ -724,13 +691,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
724 case BPF_ALU | BPF_END | BPF_FROM_BE: 691 case BPF_ALU | BPF_END | BPF_FROM_BE:
725 switch (imm32) { 692 switch (imm32) {
726 case 16: 693 case 16:
727 /* emit 'ror %ax, 8' to swap lower 2 bytes */ 694 /* Emit 'ror %ax, 8' to swap lower 2 bytes */
728 EMIT1(0x66); 695 EMIT1(0x66);
729 if (is_ereg(dst_reg)) 696 if (is_ereg(dst_reg))
730 EMIT1(0x41); 697 EMIT1(0x41);
731 EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8); 698 EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
732 699
733 /* emit 'movzwl eax, ax' */ 700 /* Emit 'movzwl eax, ax' */
734 if (is_ereg(dst_reg)) 701 if (is_ereg(dst_reg))
735 EMIT3(0x45, 0x0F, 0xB7); 702 EMIT3(0x45, 0x0F, 0xB7);
736 else 703 else
@@ -738,7 +705,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
738 EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); 705 EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
739 break; 706 break;
740 case 32: 707 case 32:
741 /* emit 'bswap eax' to swap lower 4 bytes */ 708 /* Emit 'bswap eax' to swap lower 4 bytes */
742 if (is_ereg(dst_reg)) 709 if (is_ereg(dst_reg))
743 EMIT2(0x41, 0x0F); 710 EMIT2(0x41, 0x0F);
744 else 711 else
@@ -746,7 +713,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
746 EMIT1(add_1reg(0xC8, dst_reg)); 713 EMIT1(add_1reg(0xC8, dst_reg));
747 break; 714 break;
748 case 64: 715 case 64:
749 /* emit 'bswap rax' to swap 8 bytes */ 716 /* Emit 'bswap rax' to swap 8 bytes */
750 EMIT3(add_1mod(0x48, dst_reg), 0x0F, 717 EMIT3(add_1mod(0x48, dst_reg), 0x0F,
751 add_1reg(0xC8, dst_reg)); 718 add_1reg(0xC8, dst_reg));
752 break; 719 break;
@@ -756,7 +723,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
756 case BPF_ALU | BPF_END | BPF_FROM_LE: 723 case BPF_ALU | BPF_END | BPF_FROM_LE:
757 switch (imm32) { 724 switch (imm32) {
758 case 16: 725 case 16:
759 /* emit 'movzwl eax, ax' to zero extend 16-bit 726 /*
727 * Emit 'movzwl eax, ax' to zero extend 16-bit
760 * into 64 bit 728 * into 64 bit
761 */ 729 */
762 if (is_ereg(dst_reg)) 730 if (is_ereg(dst_reg))
@@ -766,7 +734,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
766 EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); 734 EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
767 break; 735 break;
768 case 32: 736 case 32:
769 /* emit 'mov eax, eax' to clear upper 32-bits */ 737 /* Emit 'mov eax, eax' to clear upper 32-bits */
770 if (is_ereg(dst_reg)) 738 if (is_ereg(dst_reg))
771 EMIT1(0x45); 739 EMIT1(0x45);
772 EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg)); 740 EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
@@ -809,9 +777,9 @@ st: if (is_imm8(insn->off))
809 777
810 /* STX: *(u8*)(dst_reg + off) = src_reg */ 778 /* STX: *(u8*)(dst_reg + off) = src_reg */
811 case BPF_STX | BPF_MEM | BPF_B: 779 case BPF_STX | BPF_MEM | BPF_B:
812 /* emit 'mov byte ptr [rax + off], al' */ 780 /* Emit 'mov byte ptr [rax + off], al' */
813 if (is_ereg(dst_reg) || is_ereg(src_reg) || 781 if (is_ereg(dst_reg) || is_ereg(src_reg) ||
814 /* have to add extra byte for x86 SIL, DIL regs */ 782 /* We have to add extra byte for x86 SIL, DIL regs */
815 src_reg == BPF_REG_1 || src_reg == BPF_REG_2) 783 src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
816 EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); 784 EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
817 else 785 else
@@ -840,25 +808,26 @@ stx: if (is_imm8(insn->off))
840 808
841 /* LDX: dst_reg = *(u8*)(src_reg + off) */ 809 /* LDX: dst_reg = *(u8*)(src_reg + off) */
842 case BPF_LDX | BPF_MEM | BPF_B: 810 case BPF_LDX | BPF_MEM | BPF_B:
843 /* emit 'movzx rax, byte ptr [rax + off]' */ 811 /* Emit 'movzx rax, byte ptr [rax + off]' */
844 EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); 812 EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
845 goto ldx; 813 goto ldx;
846 case BPF_LDX | BPF_MEM | BPF_H: 814 case BPF_LDX | BPF_MEM | BPF_H:
847 /* emit 'movzx rax, word ptr [rax + off]' */ 815 /* Emit 'movzx rax, word ptr [rax + off]' */
848 EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); 816 EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
849 goto ldx; 817 goto ldx;
850 case BPF_LDX | BPF_MEM | BPF_W: 818 case BPF_LDX | BPF_MEM | BPF_W:
851 /* emit 'mov eax, dword ptr [rax+0x14]' */ 819 /* Emit 'mov eax, dword ptr [rax+0x14]' */
852 if (is_ereg(dst_reg) || is_ereg(src_reg)) 820 if (is_ereg(dst_reg) || is_ereg(src_reg))
853 EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); 821 EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
854 else 822 else
855 EMIT1(0x8B); 823 EMIT1(0x8B);
856 goto ldx; 824 goto ldx;
857 case BPF_LDX | BPF_MEM | BPF_DW: 825 case BPF_LDX | BPF_MEM | BPF_DW:
858 /* emit 'mov rax, qword ptr [rax+0x14]' */ 826 /* Emit 'mov rax, qword ptr [rax+0x14]' */
859 EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); 827 EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
860ldx: /* if insn->off == 0 we can save one extra byte, but 828ldx: /*
861 * special case of x86 r13 which always needs an offset 829 * If insn->off == 0 we can save one extra byte, but
830 * special case of x86 R13 which always needs an offset
862 * is not worth the hassle 831 * is not worth the hassle
863 */ 832 */
864 if (is_imm8(insn->off)) 833 if (is_imm8(insn->off))
@@ -870,7 +839,7 @@ ldx: /* if insn->off == 0 we can save one extra byte, but
870 839
871 /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ 840 /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
872 case BPF_STX | BPF_XADD | BPF_W: 841 case BPF_STX | BPF_XADD | BPF_W:
873 /* emit 'lock add dword ptr [rax + off], eax' */ 842 /* Emit 'lock add dword ptr [rax + off], eax' */
874 if (is_ereg(dst_reg) || is_ereg(src_reg)) 843 if (is_ereg(dst_reg) || is_ereg(src_reg))
875 EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); 844 EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
876 else 845 else
@@ -889,35 +858,12 @@ xadd: if (is_imm8(insn->off))
889 case BPF_JMP | BPF_CALL: 858 case BPF_JMP | BPF_CALL:
890 func = (u8 *) __bpf_call_base + imm32; 859 func = (u8 *) __bpf_call_base + imm32;
891 jmp_offset = func - (image + addrs[i]); 860 jmp_offset = func - (image + addrs[i]);
892 if (seen_ld_abs) {
893 reload_skb_data = bpf_helper_changes_pkt_data(func);
894 if (reload_skb_data) {
895 EMIT1(0x57); /* push %rdi */
896 jmp_offset += 22; /* pop, mov, sub, mov */
897 } else {
898 EMIT2(0x41, 0x52); /* push %r10 */
899 EMIT2(0x41, 0x51); /* push %r9 */
900 /* need to adjust jmp offset, since
901 * pop %r9, pop %r10 take 4 bytes after call insn
902 */
903 jmp_offset += 4;
904 }
905 }
906 if (!imm32 || !is_simm32(jmp_offset)) { 861 if (!imm32 || !is_simm32(jmp_offset)) {
907 pr_err("unsupported bpf func %d addr %p image %p\n", 862 pr_err("unsupported BPF func %d addr %p image %p\n",
908 imm32, func, image); 863 imm32, func, image);
909 return -EINVAL; 864 return -EINVAL;
910 } 865 }
911 EMIT1_off32(0xE8, jmp_offset); 866 EMIT1_off32(0xE8, jmp_offset);
912 if (seen_ld_abs) {
913 if (reload_skb_data) {
914 EMIT1(0x5F); /* pop %rdi */
915 emit_load_skb_data_hlen(&prog);
916 } else {
917 EMIT2(0x41, 0x59); /* pop %r9 */
918 EMIT2(0x41, 0x5A); /* pop %r10 */
919 }
920 }
921 break; 867 break;
922 868
923 case BPF_JMP | BPF_TAIL_CALL: 869 case BPF_JMP | BPF_TAIL_CALL:
@@ -970,7 +916,7 @@ xadd: if (is_imm8(insn->off))
970 else 916 else
971 EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32); 917 EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
972 918
973emit_cond_jmp: /* convert BPF opcode to x86 */ 919emit_cond_jmp: /* Convert BPF opcode to x86 */
974 switch (BPF_OP(insn->code)) { 920 switch (BPF_OP(insn->code)) {
975 case BPF_JEQ: 921 case BPF_JEQ:
976 jmp_cond = X86_JE; 922 jmp_cond = X86_JE;
@@ -996,22 +942,22 @@ emit_cond_jmp: /* convert BPF opcode to x86 */
996 jmp_cond = X86_JBE; 942 jmp_cond = X86_JBE;
997 break; 943 break;
998 case BPF_JSGT: 944 case BPF_JSGT:
999 /* signed '>', GT in x86 */ 945 /* Signed '>', GT in x86 */
1000 jmp_cond = X86_JG; 946 jmp_cond = X86_JG;
1001 break; 947 break;
1002 case BPF_JSLT: 948 case BPF_JSLT:
1003 /* signed '<', LT in x86 */ 949 /* Signed '<', LT in x86 */
1004 jmp_cond = X86_JL; 950 jmp_cond = X86_JL;
1005 break; 951 break;
1006 case BPF_JSGE: 952 case BPF_JSGE:
1007 /* signed '>=', GE in x86 */ 953 /* Signed '>=', GE in x86 */
1008 jmp_cond = X86_JGE; 954 jmp_cond = X86_JGE;
1009 break; 955 break;
1010 case BPF_JSLE: 956 case BPF_JSLE:
1011 /* signed '<=', LE in x86 */ 957 /* Signed '<=', LE in x86 */
1012 jmp_cond = X86_JLE; 958 jmp_cond = X86_JLE;
1013 break; 959 break;
1014 default: /* to silence gcc warning */ 960 default: /* to silence GCC warning */
1015 return -EFAULT; 961 return -EFAULT;
1016 } 962 }
1017 jmp_offset = addrs[i + insn->off] - addrs[i]; 963 jmp_offset = addrs[i + insn->off] - addrs[i];
@@ -1039,7 +985,7 @@ emit_cond_jmp: /* convert BPF opcode to x86 */
1039 jmp_offset = addrs[i + insn->off] - addrs[i]; 985 jmp_offset = addrs[i + insn->off] - addrs[i];
1040 986
1041 if (!jmp_offset) 987 if (!jmp_offset)
1042 /* optimize out nop jumps */ 988 /* Optimize out nop jumps */
1043 break; 989 break;
1044emit_jmp: 990emit_jmp:
1045 if (is_imm8(jmp_offset)) { 991 if (is_imm8(jmp_offset)) {
@@ -1052,66 +998,13 @@ emit_jmp:
1052 } 998 }
1053 break; 999 break;
1054 1000
1055 case BPF_LD | BPF_IND | BPF_W:
1056 func = sk_load_word;
1057 goto common_load;
1058 case BPF_LD | BPF_ABS | BPF_W:
1059 func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
1060common_load:
1061 ctx->seen_ld_abs = seen_ld_abs = true;
1062 jmp_offset = func - (image + addrs[i]);
1063 if (!func || !is_simm32(jmp_offset)) {
1064 pr_err("unsupported bpf func %d addr %p image %p\n",
1065 imm32, func, image);
1066 return -EINVAL;
1067 }
1068 if (BPF_MODE(insn->code) == BPF_ABS) {
1069 /* mov %esi, imm32 */
1070 EMIT1_off32(0xBE, imm32);
1071 } else {
1072 /* mov %rsi, src_reg */
1073 EMIT_mov(BPF_REG_2, src_reg);
1074 if (imm32) {
1075 if (is_imm8(imm32))
1076 /* add %esi, imm8 */
1077 EMIT3(0x83, 0xC6, imm32);
1078 else
1079 /* add %esi, imm32 */
1080 EMIT2_off32(0x81, 0xC6, imm32);
1081 }
1082 }
1083 /* skb pointer is in R6 (%rbx), it will be copied into
1084 * %rdi if skb_copy_bits() call is necessary.
1085 * sk_load_* helpers also use %r10 and %r9d.
1086 * See bpf_jit.S
1087 */
1088 if (seen_ax_reg)
1089 /* r10 = skb->data, mov %r10, off32(%rbx) */
1090 EMIT3_off32(0x4c, 0x8b, 0x93,
1091 offsetof(struct sk_buff, data));
1092 EMIT1_off32(0xE8, jmp_offset); /* call */
1093 break;
1094
1095 case BPF_LD | BPF_IND | BPF_H:
1096 func = sk_load_half;
1097 goto common_load;
1098 case BPF_LD | BPF_ABS | BPF_H:
1099 func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
1100 goto common_load;
1101 case BPF_LD | BPF_IND | BPF_B:
1102 func = sk_load_byte;
1103 goto common_load;
1104 case BPF_LD | BPF_ABS | BPF_B:
1105 func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
1106 goto common_load;
1107
1108 case BPF_JMP | BPF_EXIT: 1001 case BPF_JMP | BPF_EXIT:
1109 if (seen_exit) { 1002 if (seen_exit) {
1110 jmp_offset = ctx->cleanup_addr - addrs[i]; 1003 jmp_offset = ctx->cleanup_addr - addrs[i];
1111 goto emit_jmp; 1004 goto emit_jmp;
1112 } 1005 }
1113 seen_exit = true; 1006 seen_exit = true;
1114 /* update cleanup_addr */ 1007 /* Update cleanup_addr */
1115 ctx->cleanup_addr = proglen; 1008 ctx->cleanup_addr = proglen;
1116 /* mov rbx, qword ptr [rbp+0] */ 1009 /* mov rbx, qword ptr [rbp+0] */
1117 EMIT4(0x48, 0x8B, 0x5D, 0); 1010 EMIT4(0x48, 0x8B, 0x5D, 0);
@@ -1129,10 +1022,11 @@ common_load:
1129 break; 1022 break;
1130 1023
1131 default: 1024 default:
1132 /* By design x64 JIT should support all BPF instructions 1025 /*
1026 * By design x86-64 JIT should support all BPF instructions.
1133 * This error will be seen if new instruction was added 1027 * This error will be seen if new instruction was added
1134 * to interpreter, but not to JIT 1028 * to the interpreter, but not to the JIT, or if there is
1135 * or if there is junk in bpf_prog 1029 * junk in bpf_prog.
1136 */ 1030 */
1137 pr_err("bpf_jit: unknown opcode %02x\n", insn->code); 1031 pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
1138 return -EINVAL; 1032 return -EINVAL;
@@ -1184,7 +1078,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
1184 return orig_prog; 1078 return orig_prog;
1185 1079
1186 tmp = bpf_jit_blind_constants(prog); 1080 tmp = bpf_jit_blind_constants(prog);
1187 /* If blinding was requested and we failed during blinding, 1081 /*
1082 * If blinding was requested and we failed during blinding,
1188 * we must fall back to the interpreter. 1083 * we must fall back to the interpreter.
1189 */ 1084 */
1190 if (IS_ERR(tmp)) 1085 if (IS_ERR(tmp))
@@ -1218,8 +1113,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
1218 goto out_addrs; 1113 goto out_addrs;
1219 } 1114 }
1220 1115
1221 /* Before first pass, make a rough estimation of addrs[] 1116 /*
1222 * each bpf instruction is translated to less than 64 bytes 1117 * Before first pass, make a rough estimation of addrs[]
1118 * each BPF instruction is translated to less than 64 bytes
1223 */ 1119 */
1224 for (proglen = 0, i = 0; i < prog->len; i++) { 1120 for (proglen = 0, i = 0; i < prog->len; i++) {
1225 proglen += 64; 1121 proglen += 64;
@@ -1228,10 +1124,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
1228 ctx.cleanup_addr = proglen; 1124 ctx.cleanup_addr = proglen;
1229skip_init_addrs: 1125skip_init_addrs:
1230 1126
1231 /* JITed image shrinks with every pass and the loop iterates 1127 /*
1232 * until the image stops shrinking. Very large bpf programs 1128 * JITed image shrinks with every pass and the loop iterates
1129 * until the image stops shrinking. Very large BPF programs
1233 * may converge on the last pass. In such case do one more 1130 * may converge on the last pass. In such case do one more
1234 * pass to emit the final image 1131 * pass to emit the final image.
1235 */ 1132 */
1236 for (pass = 0; pass < 20 || image; pass++) { 1133 for (pass = 0; pass < 20 || image; pass++) {
1237 proglen = do_jit(prog, addrs, image, oldproglen, &ctx); 1134 proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
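The comment above describes the size-convergence scheme: do_jit() is run repeatedly with image == NULL until two consecutive passes produce the same length, then the image is allocated and one final pass emits the actual bytes. A trimmed sketch of that driver loop, with error handling reduced to gotos and alloc_image() standing in for the real bpf_jit_binary_alloc() call:

	for (pass = 0; pass < 20 || image; pass++) {
		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
		if (proglen <= 0)
			goto out_err;		/* JIT bailed out */
		if (image) {
			if (proglen != oldproglen)
				goto out_err;	/* must not change on the final pass */
			break;			/* final image emitted */
		}
		if (proglen == oldproglen)	/* sizes converged */
			image = alloc_image(proglen);	/* stand-in for bpf_jit_binary_alloc() */
		oldproglen = proglen;
	}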
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 000000000000..0cc04e30adc1
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2419 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
4 *
5 * Author: Wang YanQing (udknight@gmail.com)
6 * The code is based on code and ideas from:
7 * Eric Dumazet (eric.dumazet@gmail.com)
8 * and from:
9 * Shubham Bansal <illusionist.neo@gmail.com>
10 */
11
12#include <linux/netdevice.h>
13#include <linux/filter.h>
14#include <linux/if_vlan.h>
15#include <asm/cacheflush.h>
16#include <asm/set_memory.h>
17#include <asm/nospec-branch.h>
18#include <linux/bpf.h>
19
20/*
21 * eBPF prog stack layout:
22 *
23 * high
24 * original ESP => +-----+
25 * | | callee saved registers
26 * +-----+
27 * | ... | eBPF JIT scratch space
28 * BPF_FP,IA32_EBP => +-----+
29 * | ... | eBPF prog stack
30 * +-----+
31 * |RSVD | JIT scratchpad
32 * current ESP => +-----+
33 * | |
34 * | ... | Function call stack
35 * | |
36 * +-----+
37 * low
38 *
39 * The callee saved registers:
40 *
41 * high
42 * original ESP => +------------------+ \
43 * | ebp | |
44 * current EBP => +------------------+ } callee saved registers
45 * | ebx,esi,edi | |
46 * +------------------+ /
47 * low
48 */
49
50static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
51{
52 if (len == 1)
53 *ptr = bytes;
54 else if (len == 2)
55 *(u16 *)ptr = bytes;
56 else {
57 *(u32 *)ptr = bytes;
58 barrier();
59 }
60 return ptr + len;
61}
62
63#define EMIT(bytes, len) \
64 do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
65
66#define EMIT1(b1) EMIT(b1, 1)
67#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
68#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
69#define EMIT4(b1, b2, b3, b4) \
70 EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
71
72#define EMIT1_off32(b1, off) \
73 do { EMIT1(b1); EMIT(off, 4); } while (0)
74#define EMIT2_off32(b1, b2, off) \
75 do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
76#define EMIT3_off32(b1, b2, b3, off) \
77 do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
78#define EMIT4_off32(b1, b2, b3, b4, off) \
79 do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
80
81#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
82
83static bool is_imm8(int value)
84{
85 return value <= 127 && value >= -128;
86}
87
88static bool is_simm32(s64 value)
89{
90 return value == (s64) (s32) value;
91}
92
93#define STACK_OFFSET(k) (k)
94#define TCALL_CNT (MAX_BPF_JIT_REG + 0) /* Tail Call Count */
95
96#define IA32_EAX (0x0)
97#define IA32_EBX (0x3)
98#define IA32_ECX (0x1)
99#define IA32_EDX (0x2)
100#define IA32_ESI (0x6)
101#define IA32_EDI (0x7)
102#define IA32_EBP (0x5)
103#define IA32_ESP (0x4)
104
105/*
106 * List of x86 cond jumps opcodes (. + s8)
107 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
108 */
109#define IA32_JB 0x72
110#define IA32_JAE 0x73
111#define IA32_JE 0x74
112#define IA32_JNE 0x75
113#define IA32_JBE 0x76
114#define IA32_JA 0x77
115#define IA32_JL 0x7C
116#define IA32_JGE 0x7D
117#define IA32_JLE 0x7E
118#define IA32_JG 0x7F
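/*
 * Encoding example: IA32_JAE is 0x73, so EMIT2(IA32_JAE, rel8) emits the
 * short form "73 rel8".  When the target is out of 8-bit range, the far
 * form used below is EMIT2_off32(0x0F, IA32_JAE + 0x10, rel32), i.e.
 * "0F 83 rel32", the same condition with a 32-bit displacement.
 */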
119
120/*
121 * Map eBPF registers to IA32 32bit registers or stack scratch space.
122 *
123 * 1. All the registers, R0-R10, are mapped to scratch space on stack.
124 * 2. We need two 64 bit temp registers to do complex operations on eBPF
125 * registers.
126 * 3. For performance reasons, BPF_REG_AX, which is used for blinding
127 *    constants, is mapped to the real hardware register pair IA32_ESI/IA32_EDI.
128 *
129 * As the eBPF registers are all 64 bit and IA32 has only 32 bit registers,
130 * each eBPF register is mapped to a pair of IA32 32 bit registers or to
131 * stack scratch space, and the 64 bit eBPF value is built from that pair.
132 *
133 * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
134 */
135static const u8 bpf2ia32[][2] = {
136 /* Return value from in-kernel function, and exit value from eBPF */
137 [BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
138
139 /* The arguments from eBPF program to in-kernel function */
140 /* Stored on stack scratch space */
141 [BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
142 [BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
143 [BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
144 [BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
145 [BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
146
147 /* Callee saved registers that in-kernel function will preserve */
148 /* Stored on stack scratch space */
149 [BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
150 [BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
151 [BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
152 [BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
153
154 /* Read only Frame Pointer to access Stack */
155 [BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
156
157 /* Temporary register for blinding constants. */
158 [BPF_REG_AX] = {IA32_ESI, IA32_EDI},
159
160 /* Tail call count. Stored on stack scratch space. */
161 [TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
162};
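/*
 * Example of how this table is used: in do_jit() below,
 * "dst = bpf2ia32[insn->dst_reg]" resolves a BPF register to such a pair.
 * For BPF_REG_2 the pair is {16, 20}, so its low half is read with
 * "mov eax, dword ptr [ebp+16]" and its high half lives at [ebp+20],
 * while BPF_REG_AX is accessed directly in ESI/EDI.
 */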
163
164#define dst_lo dst[0]
165#define dst_hi dst[1]
166#define src_lo src[0]
167#define src_hi src[1]
168
169#define STACK_ALIGNMENT 8
170/*
171 * Stack space for BPF_REG_0 through BPF_REG_9, BPF_REG_FP and the
172 * tail call count: 12 slots of 8 bytes each (96 bytes). BPF_REG_AX
173 * lives in IA32_ESI/IA32_EDI and needs no scratch slot.
174 */
175#define SCRATCH_SIZE 96
176
177/* Total stack size used in JITed code */
178#define _STACK_SIZE (stack_depth + SCRATCH_SIZE)
179
180#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
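/*
 * Worked example: for a program with stack_depth == 64, _STACK_SIZE is
 * 64 + 96 = 160 bytes, which is already 8-byte aligned, so STACK_SIZE is
 * 160 and the prologue below emits "sub esp, 160".
 */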
181
182/* Get the offset of eBPF REGISTERs stored on scratch space. */
183#define STACK_VAR(off) (off)
184
185/* Encode 'dst_reg' register into IA32 opcode 'byte' */
186static u8 add_1reg(u8 byte, u32 dst_reg)
187{
188 return byte + dst_reg;
189}
190
191/* Encode 'dst_reg' and 'src_reg' registers into IA32 opcode 'byte' */
192static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
193{
194 return byte + dst_reg + (src_reg << 3);
195}
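/*
 * Encoding example: these helpers build the x86 ModRM byte.
 * add_2reg(0xC0, IA32_ECX, IA32_EAX) = 0xC0 + 1 + (0 << 3) = 0xC1, so
 * EMIT2(0x89, 0xC1) emits "mov ecx,eax".  With a 0x40 base,
 * add_2reg(0x40, IA32_EBP, IA32_EAX) = 0x45 and EMIT3(0x89, 0x45, off)
 * emits "mov dword ptr [ebp+off],eax", the pattern used throughout.
 */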
196
197static void jit_fill_hole(void *area, unsigned int size)
198{
199 /* Fill whole space with int3 instructions */
200 memset(area, 0xcc, size);
201}
202
203static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
204 u8 **pprog)
205{
206 u8 *prog = *pprog;
207 int cnt = 0;
208
209 if (dstk) {
210 if (val == 0) {
211 /* xor eax,eax */
212 EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
213 /* mov dword ptr [ebp+off],eax */
214 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
215 STACK_VAR(dst));
216 } else {
217 EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
218 STACK_VAR(dst), val);
219 }
220 } else {
221 if (val == 0)
222 EMIT2(0x33, add_2reg(0xC0, dst, dst));
223 else
224 EMIT2_off32(0xC7, add_1reg(0xC0, dst),
225 val);
226 }
227 *pprog = prog;
228}
229
230/* dst = src (4 bytes) */
231static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
232 bool sstk, u8 **pprog)
233{
234 u8 *prog = *pprog;
235 int cnt = 0;
236 u8 sreg = sstk ? IA32_EAX : src;
237
238 if (sstk)
239 /* mov eax,dword ptr [ebp+off] */
240 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
241 if (dstk)
242 /* mov dword ptr [ebp+off],eax */
243 EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst));
244 else
245 /* mov dst,sreg */
246 EMIT2(0x89, add_2reg(0xC0, dst, sreg));
247
248 *pprog = prog;
249}
250
251/* dst = src */
252static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
253 const u8 src[], bool dstk,
254 bool sstk, u8 **pprog)
255{
256 emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
257 if (is64)
258 /* complete 8 byte move */
259 emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
260 else
261 /* zero out high 4 bytes */
262 emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
263}
264
265/* Sign extended move */
266static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
267 const u32 val, bool dstk, u8 **pprog)
268{
269 u32 hi = 0;
270
271 if (is64 && (val & (1<<31)))
272 hi = (u32)~0;
273 emit_ia32_mov_i(dst_lo, val, dstk, pprog);
274 emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
275}
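/*
 * Example: for a BPF_ALU64 move of imm = -5, val is 0xFFFFFFFB, bit 31 is
 * set, so hi becomes 0xFFFFFFFF and (lo, hi) = (0xFFFFFFFB, 0xFFFFFFFF) is
 * the sign-extended 64-bit value.  For imm = 7 the high half is zeroed.
 */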
276
277/*
278 * ALU operation (32 bit)
279 * dst = dst * src
280 */
281static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
282 bool sstk, u8 **pprog)
283{
284 u8 *prog = *pprog;
285 int cnt = 0;
286 u8 sreg = sstk ? IA32_ECX : src;
287
288 if (sstk)
289 /* mov ecx,dword ptr [ebp+off] */
290 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
291
292 if (dstk)
293 /* mov eax,dword ptr [ebp+off] */
294 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
295 else
296 /* mov eax,dst */
297 EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
298
299 /* mul sreg */
300 EMIT2(0xF7, add_1reg(0xE0, sreg));
301
302 if (dstk)
303 /* mov dword ptr [ebp+off],eax */
304 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
305 STACK_VAR(dst));
306 else
307 /* mov dst,eax */
308 EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
309
310 *pprog = prog;
311}
312
313static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
314 bool dstk, u8 **pprog)
315{
316 u8 *prog = *pprog;
317 int cnt = 0;
318 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
319 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
320
321 if (dstk && val != 64) {
322 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
323 STACK_VAR(dst_lo));
324 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
325 STACK_VAR(dst_hi));
326 }
327 switch (val) {
328 case 16:
329 /*
330 * Emit 'movzwl eax,ax' to zero extend 16-bit
331 * into 64 bit
332 */
333 EMIT2(0x0F, 0xB7);
334 EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
335 /* xor dreg_hi,dreg_hi */
336 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
337 break;
338 case 32:
339 /* xor dreg_hi,dreg_hi */
340 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
341 break;
342 case 64:
343 /* nop */
344 break;
345 }
346
347 if (dstk && val != 64) {
348 /* mov dword ptr [ebp+off],dreg_lo */
349 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
350 STACK_VAR(dst_lo));
351 /* mov dword ptr [ebp+off],dreg_hi */
352 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
353 STACK_VAR(dst_hi));
354 }
355 *pprog = prog;
356}
357
358static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
359 bool dstk, u8 **pprog)
360{
361 u8 *prog = *pprog;
362 int cnt = 0;
363 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
364 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
365
366 if (dstk) {
367 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
368 STACK_VAR(dst_lo));
369 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
370 STACK_VAR(dst_hi));
371 }
372 switch (val) {
373 case 16:
374 /* Emit 'ror %ax, 8' to swap lower 2 bytes */
375 EMIT1(0x66);
376 EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
377
378 EMIT2(0x0F, 0xB7);
379 EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
380
381 /* xor dreg_hi,dreg_hi */
382 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
383 break;
384 case 32:
385 /* Emit 'bswap eax' to swap lower 4 bytes */
386 EMIT1(0x0F);
387 EMIT1(add_1reg(0xC8, dreg_lo));
388
389 /* xor dreg_hi,dreg_hi */
390 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
391 break;
392 case 64:
393 /* Emit 'bswap eax' to swap lower 4 bytes */
394 EMIT1(0x0F);
395 EMIT1(add_1reg(0xC8, dreg_lo));
396
397 /* Emit 'bswap edx' to swap lower 4 bytes */
398 EMIT1(0x0F);
399 EMIT1(add_1reg(0xC8, dreg_hi));
400
401 /* mov ecx,dreg_hi */
402 EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
403 /* mov dreg_hi,dreg_lo */
404 EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
405 /* mov dreg_lo,ecx */
406 EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
407
408 break;
409 }
410 if (dstk) {
411 /* mov dword ptr [ebp+off],dreg_lo */
412 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
413 STACK_VAR(dst_lo));
414 /* mov dword ptr [ebp+off],dreg_hi */
415 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
416 STACK_VAR(dst_hi));
417 }
418 *pprog = prog;
419}
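/*
 * Example: a BPF_FROM_BE with imm32 == 64 on dst = 0x1122334455667788
 * (lo = 0x55667788, hi = 0x11223344) byte-swaps each half (0x88776655 and
 * 0x44332211) and then exchanges them through ECX, giving lo = 0x44332211,
 * hi = 0x88776655, i.e. 0x8877665544332211.
 */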
420
421/*
422 * ALU operation (32 bit)
423 * dst = dst (div|mod) src
424 */
425static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
426 bool dstk, bool sstk, u8 **pprog)
427{
428 u8 *prog = *pprog;
429 int cnt = 0;
430
431 if (sstk)
432 /* mov ecx,dword ptr [ebp+off] */
433 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
434 STACK_VAR(src));
435 else if (src != IA32_ECX)
436 /* mov ecx,src */
437 EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
438
439 if (dstk)
440 /* mov eax,dword ptr [ebp+off] */
441 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
442 STACK_VAR(dst));
443 else
444 /* mov eax,dst */
445 EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
446
447 /* xor edx,edx */
448 EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
449 /* div ecx */
450 EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
451
452 if (op == BPF_MOD) {
453 if (dstk)
454 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
455 STACK_VAR(dst));
456 else
457 EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
458 } else {
459 if (dstk)
460 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
461 STACK_VAR(dst));
462 else
463 EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
464 }
465 *pprog = prog;
466}
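/*
 * Example: for dst = 7, src = 3 the sequence is eax = 7, edx = 0, ecx = 3,
 * "div ecx", leaving the quotient 2 in eax and the remainder 1 in edx;
 * BPF_DIV writes eax back to dst, BPF_MOD writes edx back.
 */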
467
468/*
469 * ALU operation (32 bit)
470 * dst = dst (shift) src
471 */
472static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
473 bool dstk, bool sstk, u8 **pprog)
474{
475 u8 *prog = *pprog;
476 int cnt = 0;
477 u8 dreg = dstk ? IA32_EAX : dst;
478 u8 b2;
479
480 if (dstk)
481 /* mov eax,dword ptr [ebp+off] */
482 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
483
484 if (sstk)
485 /* mov ecx,dword ptr [ebp+off] */
486 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
487 else if (src != IA32_ECX)
488 /* mov ecx,src */
489 EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
490
491 switch (op) {
492 case BPF_LSH:
493 b2 = 0xE0; break;
494 case BPF_RSH:
495 b2 = 0xE8; break;
496 case BPF_ARSH:
497 b2 = 0xF8; break;
498 default:
499 return;
500 }
501 EMIT2(0xD3, add_1reg(b2, dreg));
502
503 if (dstk)
504 /* mov dword ptr [ebp+off],dreg */
505 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
506 *pprog = prog;
507}
508
509/*
510 * ALU operation (32 bit)
511 * dst = dst (op) src
512 */
513static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
514 const u8 dst, const u8 src, bool dstk,
515 bool sstk, u8 **pprog)
516{
517 u8 *prog = *pprog;
518 int cnt = 0;
519 u8 sreg = sstk ? IA32_EAX : src;
520 u8 dreg = dstk ? IA32_EDX : dst;
521
522 if (sstk)
523 /* mov eax,dword ptr [ebp+off] */
524 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
525
526 if (dstk)
527 /* mov eax,dword ptr [ebp+off] */
528 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
529
530 switch (BPF_OP(op)) {
531 /* dst = dst + src */
532 case BPF_ADD:
533 if (hi && is64)
534 EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
535 else
536 EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
537 break;
538 /* dst = dst - src */
539 case BPF_SUB:
540 if (hi && is64)
541 EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
542 else
543 EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
544 break;
545 /* dst = dst | src */
546 case BPF_OR:
547 EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
548 break;
549 /* dst = dst & src */
550 case BPF_AND:
551 EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
552 break;
553 /* dst = dst ^ src */
554 case BPF_XOR:
555 EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
556 break;
557 }
558
559 if (dstk)
560 /* mov dword ptr [ebp+off],dreg */
561 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
562 STACK_VAR(dst));
563 *pprog = prog;
564}
565
566/* ALU operation (64 bit) */
567static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
568 const u8 dst[], const u8 src[],
569 bool dstk, bool sstk,
570 u8 **pprog)
571{
572 u8 *prog = *pprog;
573
574 emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog);
575 if (is64)
576 emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk,
577 &prog);
578 else
579 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
580 *pprog = prog;
581}
582
583/*
584 * ALU operation (32 bit)
585 * dst = dst (op) val
586 */
587static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
588 const u8 dst, const s32 val, bool dstk,
589 u8 **pprog)
590{
591 u8 *prog = *pprog;
592 int cnt = 0;
593 u8 dreg = dstk ? IA32_EAX : dst;
594 u8 sreg = IA32_EDX;
595
596 if (dstk)
597 /* mov eax,dword ptr [ebp+off] */
598 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
599
600 if (!is_imm8(val))
601 /* mov edx,imm32*/
602 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
603
604 switch (op) {
605 /* dst = dst + val */
606 case BPF_ADD:
607 if (hi && is64) {
608 if (is_imm8(val))
609 EMIT3(0x83, add_1reg(0xD0, dreg), val);
610 else
611 EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
612 } else {
613 if (is_imm8(val))
614 EMIT3(0x83, add_1reg(0xC0, dreg), val);
615 else
616 EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
617 }
618 break;
619 /* dst = dst - val */
620 case BPF_SUB:
621 if (hi && is64) {
622 if (is_imm8(val))
623 EMIT3(0x83, add_1reg(0xD8, dreg), val);
624 else
625 EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
626 } else {
627 if (is_imm8(val))
628 EMIT3(0x83, add_1reg(0xE8, dreg), val);
629 else
630 EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
631 }
632 break;
633 /* dst = dst | val */
634 case BPF_OR:
635 if (is_imm8(val))
636 EMIT3(0x83, add_1reg(0xC8, dreg), val);
637 else
638 EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
639 break;
640 /* dst = dst & val */
641 case BPF_AND:
642 if (is_imm8(val))
643 EMIT3(0x83, add_1reg(0xE0, dreg), val);
644 else
645 EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
646 break;
647 /* dst = dst ^ val */
648 case BPF_XOR:
649 if (is_imm8(val))
650 EMIT3(0x83, add_1reg(0xF0, dreg), val);
651 else
652 EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
653 break;
654 case BPF_NEG:
655 EMIT2(0xF7, add_1reg(0xD8, dreg));
656 break;
657 }
658
659 if (dstk)
660 /* mov dword ptr [ebp+off],dreg */
661 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
662 STACK_VAR(dst));
663 *pprog = prog;
664}
665
666/* ALU operation (64 bit) */
667static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
668 const u8 dst[], const u32 val,
669 bool dstk, u8 **pprog)
670{
671 u8 *prog = *pprog;
672 u32 hi = 0;
673
674 if (is64 && (val & (1<<31)))
675 hi = (u32)~0;
676
677 emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog);
678 if (is64)
679 emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog);
680 else
681 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
682
683 *pprog = prog;
684}
685
686/* dst = ~dst (64 bit) */
687static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
688{
689 u8 *prog = *pprog;
690 int cnt = 0;
691 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
692 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
693
694 if (dstk) {
695 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
696 STACK_VAR(dst_lo));
697 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
698 STACK_VAR(dst_hi));
699 }
700
701 /* xor ecx,ecx */
702 EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
703 /* sub dreg_lo,ecx */
704 EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX));
705 /* mov dreg_lo,ecx */
706 EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
707
708 /* xor ecx,ecx */
709 EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
710 /* sbb dreg_hi,ecx */
711 EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX));
712 /* mov dreg_hi,ecx */
713 EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX));
714
715 if (dstk) {
716 /* mov dword ptr [ebp+off],dreg_lo */
717 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
718 STACK_VAR(dst_lo));
719 /* mov dword ptr [ebp+off],dreg_hi */
720 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
721 STACK_VAR(dst_hi));
722 }
723 *pprog = prog;
724}
725
726/* dst = dst << src */
727static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
728 bool dstk, bool sstk, u8 **pprog)
729{
730 u8 *prog = *pprog;
731 int cnt = 0;
732 static int jmp_label1 = -1;
733 static int jmp_label2 = -1;
734 static int jmp_label3 = -1;
735 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
736 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
737
738 if (dstk) {
739 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
740 STACK_VAR(dst_lo));
741 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
742 STACK_VAR(dst_hi));
743 }
744
745 if (sstk)
746 /* mov ecx,dword ptr [ebp+off] */
747 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
748 STACK_VAR(src_lo));
749 else
750 /* mov ecx,src_lo */
751 EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
752
753 /* cmp ecx,32 */
754 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
755 /* Jumps when >= 32 */
756 if (is_imm8(jmp_label(jmp_label1, 2)))
757 EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
758 else
759 EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
760
761 /* < 32 */
762 /* shl dreg_hi,cl */
763 EMIT2(0xD3, add_1reg(0xE0, dreg_hi));
764 /* mov ebx,dreg_lo */
765 EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
766 /* shl dreg_lo,cl */
767 EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
768
769 /* IA32_ECX = -IA32_ECX + 32 */
770 /* neg ecx */
771 EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
772 /* add ecx,32 */
773 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
774
775 /* shr ebx,cl */
776 EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
777 /* or dreg_hi,ebx */
778 EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
779
780 /* goto out; */
781 if (is_imm8(jmp_label(jmp_label3, 2)))
782 EMIT2(0xEB, jmp_label(jmp_label3, 2));
783 else
784 EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
785
786 /* >= 32 */
787 if (jmp_label1 == -1)
788 jmp_label1 = cnt;
789
790 /* cmp ecx,64 */
791 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
792 /* Jumps when >= 64 */
793 if (is_imm8(jmp_label(jmp_label2, 2)))
794 EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
795 else
796 EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
797
798 /* >= 32 && < 64 */
799 /* sub ecx,32 */
800 EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
801 /* shl dreg_lo,cl */
802 EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
803 /* mov dreg_hi,dreg_lo */
804 EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
805
806 /* xor dreg_lo,dreg_lo */
807 EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
808
809 /* goto out; */
810 if (is_imm8(jmp_label(jmp_label3, 2)))
811 EMIT2(0xEB, jmp_label(jmp_label3, 2));
812 else
813 EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
814
815 /* >= 64 */
816 if (jmp_label2 == -1)
817 jmp_label2 = cnt;
818 /* xor dreg_lo,dreg_lo */
819 EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
820 /* xor dreg_hi,dreg_hi */
821 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
822
823 if (jmp_label3 == -1)
824 jmp_label3 = cnt;
825
826 if (dstk) {
827 /* mov dword ptr [ebp+off],dreg_lo */
828 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
829 STACK_VAR(dst_lo));
830 /* mov dword ptr [ebp+off],dreg_hi */
831 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
832 STACK_VAR(dst_hi));
833 }
834 /* out: */
835 *pprog = prog;
836}
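/*
 * Worked example: for a shift count of 40 the "< 32" path is skipped,
 * 32 is subtracted leaving cl = 8, so dreg_hi becomes dreg_lo << 8 and
 * dreg_lo becomes 0 -- exactly the low 64 bits of dst << 40.  Counts of
 * 64 or more zero both halves.
 */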
837
838/* dst = dst >> src (signed)*/
839static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
840 bool dstk, bool sstk, u8 **pprog)
841{
842 u8 *prog = *pprog;
843 int cnt = 0;
844 static int jmp_label1 = -1;
845 static int jmp_label2 = -1;
846 static int jmp_label3 = -1;
847 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
848 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
849
850 if (dstk) {
851 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
852 STACK_VAR(dst_lo));
853 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
854 STACK_VAR(dst_hi));
855 }
856
857 if (sstk)
858 /* mov ecx,dword ptr [ebp+off] */
859 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
860 STACK_VAR(src_lo));
861 else
862 /* mov ecx,src_lo */
863 EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
864
865 /* cmp ecx,32 */
866 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
867 /* Jumps when >= 32 */
868 if (is_imm8(jmp_label(jmp_label1, 2)))
869 EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
870 else
871 EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
872
873 /* < 32 */
874 /* lshr dreg_lo,cl */
875 EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
876 /* mov ebx,dreg_hi */
877 EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
878 /* ashr dreg_hi,cl */
879 EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
880
881 /* IA32_ECX = -IA32_ECX + 32 */
882 /* neg ecx */
883 EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
884 /* add ecx,32 */
885 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
886
887 /* shl ebx,cl */
888 EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
889 /* or dreg_lo,ebx */
890 EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
891
892 /* goto out; */
893 if (is_imm8(jmp_label(jmp_label3, 2)))
894 EMIT2(0xEB, jmp_label(jmp_label3, 2));
895 else
896 EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
897
898 /* >= 32 */
899 if (jmp_label1 == -1)
900 jmp_label1 = cnt;
901
902 /* cmp ecx,64 */
903 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
904 /* Jumps when >= 64 */
905 if (is_imm8(jmp_label(jmp_label2, 2)))
906 EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
907 else
908 EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
909
910 /* >= 32 && < 64 */
911 /* sub ecx,32 */
912 EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
913 /* ashr dreg_hi,cl */
914 EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
915 /* mov dreg_lo,dreg_hi */
916 EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
917
918 /* ashr dreg_hi,imm8 */
919 EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
920
921 /* goto out; */
922 if (is_imm8(jmp_label(jmp_label3, 2)))
923 EMIT2(0xEB, jmp_label(jmp_label3, 2));
924 else
925 EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
926
927 /* >= 64 */
928 if (jmp_label2 == -1)
929 jmp_label2 = cnt;
930 /* ashr dreg_hi,imm8 */
931 EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
932 /* mov dreg_lo,dreg_hi */
933 EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
934
935 if (jmp_label3 == -1)
936 jmp_label3 = cnt;
937
938 if (dstk) {
939 /* mov dword ptr [ebp+off],dreg_lo */
940 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
941 STACK_VAR(dst_lo));
942 /* mov dword ptr [ebp+off],dreg_hi */
943 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
944 STACK_VAR(dst_hi));
945 }
946 /* out: */
947 *pprog = prog;
948}
949
950/* dst = dst >> src */
951static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
952 bool sstk, u8 **pprog)
953{
954 u8 *prog = *pprog;
955 int cnt = 0;
956 static int jmp_label1 = -1;
957 static int jmp_label2 = -1;
958 static int jmp_label3 = -1;
959 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
960 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
961
962 if (dstk) {
963 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
964 STACK_VAR(dst_lo));
965 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
966 STACK_VAR(dst_hi));
967 }
968
969 if (sstk)
970 /* mov ecx,dword ptr [ebp+off] */
971 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
972 STACK_VAR(src_lo));
973 else
974 /* mov ecx,src_lo */
975 EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
976
977 /* cmp ecx,32 */
978 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
979 /* Jumps when >= 32 */
980 if (is_imm8(jmp_label(jmp_label1, 2)))
981 EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
982 else
983 EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
984
985 /* < 32 */
986 /* lshr dreg_lo,cl */
987 EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
988 /* mov ebx,dreg_hi */
989 EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
990 /* shr dreg_hi,cl */
991 EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
992
993 /* IA32_ECX = -IA32_ECX + 32 */
994 /* neg ecx */
995 EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
996 /* add ecx,32 */
997 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
998
999 /* shl ebx,cl */
1000 EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
1001 /* or dreg_lo,ebx */
1002 EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
1003
1004 /* goto out; */
1005 if (is_imm8(jmp_label(jmp_label3, 2)))
1006 EMIT2(0xEB, jmp_label(jmp_label3, 2));
1007 else
1008 EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
1009
1010 /* >= 32 */
1011 if (jmp_label1 == -1)
1012 jmp_label1 = cnt;
1013 /* cmp ecx,64 */
1014 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
1015 /* Jumps when >= 64 */
1016 if (is_imm8(jmp_label(jmp_label2, 2)))
1017 EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
1018 else
1019 EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
1020
1021 /* >= 32 && < 64 */
1022 /* sub ecx,32 */
1023 EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
1024 /* shr dreg_hi,cl */
1025 EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
1026 /* mov dreg_lo,dreg_hi */
1027 EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
1028 /* xor dreg_hi,dreg_hi */
1029 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
1030
1031 /* goto out; */
1032 if (is_imm8(jmp_label(jmp_label3, 2)))
1033 EMIT2(0xEB, jmp_label(jmp_label3, 2));
1034 else
1035 EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
1036
1037 /* >= 64 */
1038 if (jmp_label2 == -1)
1039 jmp_label2 = cnt;
1040 /* xor dreg_lo,dreg_lo */
1041 EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
1042 /* xor dreg_hi,dreg_hi */
1043 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
1044
1045 if (jmp_label3 == -1)
1046 jmp_label3 = cnt;
1047
1048 if (dstk) {
1049 /* mov dword ptr [ebp+off],dreg_lo */
1050 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
1051 STACK_VAR(dst_lo));
1052 /* mov dword ptr [ebp+off],dreg_hi */
1053 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
1054 STACK_VAR(dst_hi));
1055 }
1056 /* out: */
1057 *pprog = prog;
1058}
1059
1060/* dst = dst << val */
1061static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
1062 bool dstk, u8 **pprog)
1063{
1064 u8 *prog = *pprog;
1065 int cnt = 0;
1066 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
1067 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
1068
1069 if (dstk) {
1070 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1071 STACK_VAR(dst_lo));
1072 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
1073 STACK_VAR(dst_hi));
1074 }
1075 /* Do LSH operation */
1076 if (val < 32) {
1077 /* shl dreg_hi,imm8 */
1078 EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val);
1079 /* mov ebx,dreg_lo */
1080 EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
1081 /* shl dreg_lo,imm8 */
1082 EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
1083
1084 /* IA32_ECX = 32 - val */
1085 /* mov ecx,val */
1086 EMIT2(0xB1, val);
1087 /* movzx ecx,ecx */
1088 EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
1089 /* neg ecx */
1090 EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
1091 /* add ecx,32 */
1092 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
1093
1094 /* shr ebx,cl */
1095 EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
1096 /* or dreg_hi,ebx */
1097 EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
1098 } else if (val >= 32 && val < 64) {
1099 u32 value = val - 32;
1100
1101 /* shl dreg_lo,imm8 */
1102 EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
1103 /* mov dreg_hi,dreg_lo */
1104 EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
1105 /* xor dreg_lo,dreg_lo */
1106 EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
1107 } else {
1108 /* xor dreg_lo,dreg_lo */
1109 EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
1110 /* xor dreg_hi,dreg_hi */
1111 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
1112 }
1113
1114 if (dstk) {
1115 /* mov dword ptr [ebp+off],dreg_lo */
1116 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
1117 STACK_VAR(dst_lo));
1118 /* mov dword ptr [ebp+off],dreg_hi */
1119 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
1120 STACK_VAR(dst_hi));
1121 }
1122 *pprog = prog;
1123}
1124
1125/* dst = dst >> val */
1126static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
1127 bool dstk, u8 **pprog)
1128{
1129 u8 *prog = *pprog;
1130 int cnt = 0;
1131 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
1132 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
1133
1134 if (dstk) {
1135 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1136 STACK_VAR(dst_lo));
1137 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
1138 STACK_VAR(dst_hi));
1139 }
1140
1141 /* Do RSH operation */
1142 if (val < 32) {
1143 /* shr dreg_lo,imm8 */
1144 EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
1145 /* mov ebx,dreg_hi */
1146 EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
1147 /* shr dreg_hi,imm8 */
1148 EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
1149
1150 /* IA32_ECX = 32 - val */
1151 /* mov ecx,val */
1152 EMIT2(0xB1, val);
1153 /* movzx ecx,ecx */
1154 EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
1155 /* neg ecx */
1156 EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
1157 /* add ecx,32 */
1158 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
1159
1160 /* shl ebx,cl */
1161 EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
1162 /* or dreg_lo,ebx */
1163 EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
1164 } else if (val >= 32 && val < 64) {
1165 u32 value = val - 32;
1166
1167 /* shr dreg_hi,imm8 */
1168 EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
1169 /* mov dreg_lo,dreg_hi */
1170 EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
1171 /* xor dreg_hi,dreg_hi */
1172 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
1173 } else {
1174 /* xor dreg_lo,dreg_lo */
1175 EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
1176 /* xor dreg_hi,dreg_hi */
1177 EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
1178 }
1179
1180 if (dstk) {
1181 /* mov dword ptr [ebp+off],dreg_lo */
1182 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
1183 STACK_VAR(dst_lo));
1184 /* mov dword ptr [ebp+off],dreg_hi */
1185 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
1186 STACK_VAR(dst_hi));
1187 }
1188 *pprog = prog;
1189}
1190
1191/* dst = dst >> val (signed) */
1192static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
1193 bool dstk, u8 **pprog)
1194{
1195 u8 *prog = *pprog;
1196 int cnt = 0;
1197 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
1198 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
1199
1200 if (dstk) {
1201 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1202 STACK_VAR(dst_lo));
1203 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
1204 STACK_VAR(dst_hi));
1205 }
1206 /* Do RSH operation */
1207 if (val < 32) {
1208 /* shr dreg_lo,imm8 */
1209 EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
1210 /* mov ebx,dreg_hi */
1211 EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
1212 /* ashr dreg_hi,imm8 */
1213 EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
1214
1215 /* IA32_ECX = 32 - val */
1216 /* mov ecx,val */
1217 EMIT2(0xB1, val);
1218 /* movzx ecx,ecx */
1219 EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
1220 /* neg ecx */
1221 EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
1222 /* add ecx,32 */
1223 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
1224
1225 /* shl ebx,cl */
1226 EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
1227 /* or dreg_lo,ebx */
1228 EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
1229 } else if (val >= 32 && val < 64) {
1230 u32 value = val - 32;
1231
1232 /* ashr dreg_hi,imm8 */
1233 EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
1234 /* mov dreg_lo,dreg_hi */
1235 EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
1236
1237 /* ashr dreg_hi,imm8 */
1238 EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
1239 } else {
1240 /* ashr dreg_hi,imm8 */
1241 EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
1242 /* mov dreg_lo,dreg_hi */
1243 EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
1244 }
1245
1246 if (dstk) {
1247 /* mov dword ptr [ebp+off],dreg_lo */
1248 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
1249 STACK_VAR(dst_lo));
1250 /* mov dword ptr [ebp+off],dreg_hi */
1251 EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
1252 STACK_VAR(dst_hi));
1253 }
1254 *pprog = prog;
1255}
1256
1257static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
1258 bool sstk, u8 **pprog)
1259{
1260 u8 *prog = *pprog;
1261 int cnt = 0;
1262
1263 if (dstk)
1264 /* mov eax,dword ptr [ebp+off] */
1265 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1266 STACK_VAR(dst_hi));
1267 else
1268 /* mov eax,dst_hi */
1269 EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
1270
1271 if (sstk)
1272 /* mul dword ptr [ebp+off] */
1273 EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
1274 else
1275 /* mul src_lo */
1276 EMIT2(0xF7, add_1reg(0xE0, src_lo));
1277
1278 /* mov ecx,eax */
1279 EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
1280
1281 if (dstk)
1282 /* mov eax,dword ptr [ebp+off] */
1283 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1284 STACK_VAR(dst_lo));
1285 else
1286 /* mov eax,dst_lo */
1287 EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
1288
1289 if (sstk)
1290 /* mul dword ptr [ebp+off] */
1291 EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
1292 else
1293 /* mul src_hi */
1294 EMIT2(0xF7, add_1reg(0xE0, src_hi));
1295
1296 /* add ecx,eax */
1297 EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
1298
1299 if (dstk)
1300 /* mov eax,dword ptr [ebp+off] */
1301 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1302 STACK_VAR(dst_lo));
1303 else
1304 /* mov eax,dst_lo */
1305 EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
1306
1307 if (sstk)
1308 /* mul dword ptr [ebp+off] */
1309 EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
1310 else
1311 /* mul src_lo */
1312 EMIT2(0xF7, add_1reg(0xE0, src_lo));
1313
1314 /* add ecx,edx */
1315 EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
1316
1317 if (dstk) {
1318 /* mov dword ptr [ebp+off],eax */
1319 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
1320 STACK_VAR(dst_lo));
1321 /* mov dword ptr [ebp+off],ecx */
1322 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
1323 STACK_VAR(dst_hi));
1324 } else {
1325 /* mov dst_lo,eax */
1326 EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
1327 /* mov dst_hi,ecx */
1328 EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
1329 }
1330
1331 *pprog = prog;
1332}
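/*
 * Note on the decomposition used above: with dst = dst_hi*2^32 + dst_lo
 * and src = src_hi*2^32 + src_lo, the low 64 bits of dst * src are
 * dst_lo*src_lo + ((dst_hi*src_lo + dst_lo*src_hi) << 32).  The first two
 * "mul" results contribute only their low 32 bits (collected in ecx), the
 * final "mul src_lo" supplies the full dst_lo*src_lo in edx:eax, and edx
 * is folded into ecx to form the high half of the product.
 */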
1333
1334static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
1335 bool dstk, u8 **pprog)
1336{
1337 u8 *prog = *pprog;
1338 int cnt = 0;
1339 u32 hi;
1340
1341 hi = val & (1<<31) ? (u32)~0 : 0;
1342 /* movl eax,imm32 */
1343 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
1344 if (dstk)
1345 /* mul dword ptr [ebp+off] */
1346 EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
1347 else
1348 /* mul dst_hi */
1349 EMIT2(0xF7, add_1reg(0xE0, dst_hi));
1350
1351 /* mov ecx,eax */
1352 EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
1353
1354 /* movl eax,imm32 */
1355 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
1356 if (dstk)
1357 /* mul dword ptr [ebp+off] */
1358 EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
1359 else
1360 /* mul dst_lo */
1361 EMIT2(0xF7, add_1reg(0xE0, dst_lo));
1362 /* add ecx,eax */
1363 EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
1364
1365 /* movl eax,imm32 */
1366 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
1367 if (dstk)
1368 /* mul dword ptr [ebp+off] */
1369 EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
1370 else
1371 /* mul dst_lo */
1372 EMIT2(0xF7, add_1reg(0xE0, dst_lo));
1373
1374 /* add ecx,edx */
1375 EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
1376
1377 if (dstk) {
1378 /* mov dword ptr [ebp+off],eax */
1379 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
1380 STACK_VAR(dst_lo));
1381 /* mov dword ptr [ebp+off],ecx */
1382 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
1383 STACK_VAR(dst_hi));
1384 } else {
1385 /* mov dst_lo,eax */
1386 EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
1387 /* mov dst_hi,ecx */
1388 EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
1389 }
1390
1391 *pprog = prog;
1392}
1393
1394static int bpf_size_to_x86_bytes(int bpf_size)
1395{
1396 if (bpf_size == BPF_W)
1397 return 4;
1398 else if (bpf_size == BPF_H)
1399 return 2;
1400 else if (bpf_size == BPF_B)
1401 return 1;
1402 else if (bpf_size == BPF_DW)
1403 return 4; /* imm32 */
1404 else
1405 return 0;
1406}
1407
1408struct jit_context {
1409 int cleanup_addr; /* Epilogue code offset */
1410};
1411
1412/* Maximum number of bytes emitted while JITing one eBPF insn */
1413#define BPF_MAX_INSN_SIZE 128
1414#define BPF_INSN_SAFETY 64
1415
1416#define PROLOGUE_SIZE 35
1417
1418/*
1419 * Emit prologue code for the BPF program and check its size.
1420 * bpf_tail_call helper will skip it while jumping into another program.
1421 */
1422static void emit_prologue(u8 **pprog, u32 stack_depth)
1423{
1424 u8 *prog = *pprog;
1425 int cnt = 0;
1426 const u8 *r1 = bpf2ia32[BPF_REG_1];
1427 const u8 fplo = bpf2ia32[BPF_REG_FP][0];
1428 const u8 fphi = bpf2ia32[BPF_REG_FP][1];
1429 const u8 *tcc = bpf2ia32[TCALL_CNT];
1430
1431 /* push ebp */
1432 EMIT1(0x55);
1433 /* mov ebp,esp */
1434 EMIT2(0x89, 0xE5);
1435 /* push edi */
1436 EMIT1(0x57);
1437 /* push esi */
1438 EMIT1(0x56);
1439 /* push ebx */
1440 EMIT1(0x53);
1441
1442 /* sub esp,STACK_SIZE */
1443 EMIT2_off32(0x81, 0xEC, STACK_SIZE);
1444 /* sub ebp,SCRATCH_SIZE+4+12*/
1445 EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
1446 /* xor ebx,ebx */
1447 EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
1448
1449 /* Set up BPF prog stack base register */
1450 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
1451 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
1452
1453 /* Move BPF_CTX (EAX) to BPF_REG_R1 */
1454 /* mov dword ptr [ebp+off],eax */
1455 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
1456 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
1457
1458 /* Initialize Tail Count */
1459 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
1460 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
1461
1462 BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
1463 *pprog = prog;
1464}
1465
1466/* Emit epilogue code for BPF program */
1467static void emit_epilogue(u8 **pprog, u32 stack_depth)
1468{
1469 u8 *prog = *pprog;
1470 const u8 *r0 = bpf2ia32[BPF_REG_0];
1471 int cnt = 0;
1472
1473 /* mov eax,dword ptr [ebp+off]*/
1474 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
1475 /* mov edx,dword ptr [ebp+off]*/
1476 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
1477
1478 /* add ebp,SCRATCH_SIZE+4+12*/
1479 EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
1480
1481 /* mov ebx,dword ptr [ebp-12]*/
1482 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
1483 /* mov esi,dword ptr [ebp-8]*/
1484 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
1485 /* mov edi,dword ptr [ebp-4]*/
1486 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
1487
1488 EMIT1(0xC9); /* leave */
1489 EMIT1(0xC3); /* ret */
1490 *pprog = prog;
1491}
1492
1493/*
1494 * Generate the following code:
1495 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
1496 * if (index >= array->map.max_entries)
1497 * goto out;
1498 * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
1499 * goto out;
1500 * prog = array->ptrs[index];
1501 * if (prog == NULL)
1502 * goto out;
1503 * goto *(prog->bpf_func + prologue_size);
1504 * out:
1505 */
1506static void emit_bpf_tail_call(u8 **pprog)
1507{
1508 u8 *prog = *pprog;
1509 int cnt = 0;
1510 const u8 *r1 = bpf2ia32[BPF_REG_1];
1511 const u8 *r2 = bpf2ia32[BPF_REG_2];
1512 const u8 *r3 = bpf2ia32[BPF_REG_3];
1513 const u8 *tcc = bpf2ia32[TCALL_CNT];
1514 u32 lo, hi;
1515 static int jmp_label1 = -1;
1516
1517 /*
1518 * if (index >= array->map.max_entries)
1519 * goto out;
1520 */
1521 /* mov eax,dword ptr [ebp+off] */
1522 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
1523 /* mov edx,dword ptr [ebp+off] */
1524 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
1525
1526 /* cmp dword ptr [eax+off],edx */
1527 EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
1528 offsetof(struct bpf_array, map.max_entries));
1529 /* jbe out */
1530 EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
1531
1532 /*
1533 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
1534 * goto out;
1535 */
1536 lo = (u32)MAX_TAIL_CALL_CNT;
1537 hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
1538 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
1539 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
1540
1541 /* cmp ebx,hi */
1542 EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi);
1543 EMIT2(IA32_JNE, 3);
1544 /* cmp ecx,lo */
1545 EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
1546
1547 /* ja out */
1548 EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
1549
1550 /* add ecx,0x1 */
1551 EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01);
1552 /* adc ebx,0x0 */
1553 EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00);
1554
1555 /* mov dword ptr [ebp+off],ecx */
1556 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
1557 /* mov dword ptr [ebp+off],ebx */
1558 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
1559
1560 /* prog = array->ptrs[index]; */
1561 /* mov edx, [eax + edx * 4 + offsetof(...)] */
1562 EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
1563
1564 /*
1565 * if (prog == NULL)
1566 * goto out;
1567 */
1568 /* test edx,edx */
1569 EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX));
1570 /* je out */
1571 EMIT2(IA32_JE, jmp_label(jmp_label1, 2));
1572
1573 /* goto *(prog->bpf_func + prologue_size); */
1574 /* mov edx, dword ptr [edx + 32] */
1575 EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
1576 offsetof(struct bpf_prog, bpf_func));
1577 /* add edx,prologue_size */
1578 EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
1579
1580 /* mov eax,dword ptr [ebp+off] */
1581 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
1582
1583 /*
1584 * Now we're ready to jump into next BPF program:
1585 * eax == ctx (1st arg)
1586 * edx == prog->bpf_func + prologue_size
1587 */
1588 RETPOLINE_EDX_BPF_JIT();
1589
1590 if (jmp_label1 == -1)
1591 jmp_label1 = cnt;
1592
1593 /* out: */
1594 *pprog = prog;
1595}
1596
1597/* Push a 64-bit eBPF register from its stack scratch slot onto the stack. */
1598static inline void emit_push_r64(const u8 src[], u8 **pprog)
1599{
1600 u8 *prog = *pprog;
1601 int cnt = 0;
1602
1603 /* mov ecx,dword ptr [ebp+off] */
1604 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
1605 /* push ecx */
1606 EMIT1(0x51);
1607
1608 /* mov ecx,dword ptr [ebp+off] */
1609 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
1610 /* push ecx */
1611 EMIT1(0x51);
1612
1613 *pprog = prog;
1614}
1615
1616static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
1617 int oldproglen, struct jit_context *ctx)
1618{
1619 struct bpf_insn *insn = bpf_prog->insnsi;
1620 int insn_cnt = bpf_prog->len;
1621 bool seen_exit = false;
1622 u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
1623 int i, cnt = 0;
1624 int proglen = 0;
1625 u8 *prog = temp;
1626
1627 emit_prologue(&prog, bpf_prog->aux->stack_depth);
1628
1629 for (i = 0; i < insn_cnt; i++, insn++) {
1630 const s32 imm32 = insn->imm;
1631 const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
1632 const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
1633 const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
1634 const u8 code = insn->code;
1635 const u8 *dst = bpf2ia32[insn->dst_reg];
1636 const u8 *src = bpf2ia32[insn->src_reg];
1637 const u8 *r0 = bpf2ia32[BPF_REG_0];
1638 s64 jmp_offset;
1639 u8 jmp_cond;
1640 int ilen;
1641 u8 *func;
1642
1643 switch (code) {
1644 /* ALU operations */
1645 /* dst = src */
1646 case BPF_ALU | BPF_MOV | BPF_K:
1647 case BPF_ALU | BPF_MOV | BPF_X:
1648 case BPF_ALU64 | BPF_MOV | BPF_K:
1649 case BPF_ALU64 | BPF_MOV | BPF_X:
1650 switch (BPF_SRC(code)) {
1651 case BPF_X:
1652 emit_ia32_mov_r64(is64, dst, src, dstk,
1653 sstk, &prog);
1654 break;
1655 case BPF_K:
1656 /* Sign-extend immediate value to dst reg */
1657 emit_ia32_mov_i64(is64, dst, imm32,
1658 dstk, &prog);
1659 break;
1660 }
1661 break;
1662 /* dst = dst + src/imm */
1663 /* dst = dst - src/imm */
1664 /* dst = dst | src/imm */
1665 /* dst = dst & src/imm */
1666 /* dst = dst ^ src/imm */
1667 /* dst = dst * src/imm */
1668 /* dst = dst << src */
1669 /* dst = dst >> src */
1670 case BPF_ALU | BPF_ADD | BPF_K:
1671 case BPF_ALU | BPF_ADD | BPF_X:
1672 case BPF_ALU | BPF_SUB | BPF_K:
1673 case BPF_ALU | BPF_SUB | BPF_X:
1674 case BPF_ALU | BPF_OR | BPF_K:
1675 case BPF_ALU | BPF_OR | BPF_X:
1676 case BPF_ALU | BPF_AND | BPF_K:
1677 case BPF_ALU | BPF_AND | BPF_X:
1678 case BPF_ALU | BPF_XOR | BPF_K:
1679 case BPF_ALU | BPF_XOR | BPF_X:
1680 case BPF_ALU64 | BPF_ADD | BPF_K:
1681 case BPF_ALU64 | BPF_ADD | BPF_X:
1682 case BPF_ALU64 | BPF_SUB | BPF_K:
1683 case BPF_ALU64 | BPF_SUB | BPF_X:
1684 case BPF_ALU64 | BPF_OR | BPF_K:
1685 case BPF_ALU64 | BPF_OR | BPF_X:
1686 case BPF_ALU64 | BPF_AND | BPF_K:
1687 case BPF_ALU64 | BPF_AND | BPF_X:
1688 case BPF_ALU64 | BPF_XOR | BPF_K:
1689 case BPF_ALU64 | BPF_XOR | BPF_X:
1690 switch (BPF_SRC(code)) {
1691 case BPF_X:
1692 emit_ia32_alu_r64(is64, BPF_OP(code), dst,
1693 src, dstk, sstk, &prog);
1694 break;
1695 case BPF_K:
1696 emit_ia32_alu_i64(is64, BPF_OP(code), dst,
1697 imm32, dstk, &prog);
1698 break;
1699 }
1700 break;
1701 case BPF_ALU | BPF_MUL | BPF_K:
1702 case BPF_ALU | BPF_MUL | BPF_X:
1703 switch (BPF_SRC(code)) {
1704 case BPF_X:
1705 emit_ia32_mul_r(dst_lo, src_lo, dstk,
1706 sstk, &prog);
1707 break;
1708 case BPF_K:
1709 /* mov ecx,imm32*/
1710 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
1711 imm32);
1712 emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
1713 false, &prog);
1714 break;
1715 }
1716 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
1717 break;
1718 case BPF_ALU | BPF_LSH | BPF_X:
1719 case BPF_ALU | BPF_RSH | BPF_X:
1720 case BPF_ALU | BPF_ARSH | BPF_K:
1721 case BPF_ALU | BPF_ARSH | BPF_X:
1722 switch (BPF_SRC(code)) {
1723 case BPF_X:
1724 emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
1725 dstk, sstk, &prog);
1726 break;
1727 case BPF_K:
1728 /* mov ecx,imm32*/
1729 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
1730 imm32);
1731 emit_ia32_shift_r(BPF_OP(code), dst_lo,
1732 IA32_ECX, dstk, false,
1733 &prog);
1734 break;
1735 }
1736 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
1737 break;
1738 /* dst = dst / src(imm) */
1739 /* dst = dst % src(imm) */
1740 case BPF_ALU | BPF_DIV | BPF_K:
1741 case BPF_ALU | BPF_DIV | BPF_X:
1742 case BPF_ALU | BPF_MOD | BPF_K:
1743 case BPF_ALU | BPF_MOD | BPF_X:
1744 switch (BPF_SRC(code)) {
1745 case BPF_X:
1746 emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
1747 src_lo, dstk, sstk, &prog);
1748 break;
1749 case BPF_K:
1750 /* mov ecx,imm32*/
1751 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
1752 imm32);
1753 emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
1754 IA32_ECX, dstk, false,
1755 &prog);
1756 break;
1757 }
1758 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
1759 break;
1760 case BPF_ALU64 | BPF_DIV | BPF_K:
1761 case BPF_ALU64 | BPF_DIV | BPF_X:
1762 case BPF_ALU64 | BPF_MOD | BPF_K:
1763 case BPF_ALU64 | BPF_MOD | BPF_X:
1764 goto notyet;
1765 /* dst = dst >> imm */
1766 /* dst = dst << imm */
1767 case BPF_ALU | BPF_RSH | BPF_K:
1768 case BPF_ALU | BPF_LSH | BPF_K:
1769 if (unlikely(imm32 > 31))
1770 return -EINVAL;
1771 /* mov ecx,imm32*/
1772 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
1773 emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
1774 false, &prog);
1775 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
1776 break;
1777 /* dst = dst << imm */
1778 case BPF_ALU64 | BPF_LSH | BPF_K:
1779 if (unlikely(imm32 > 63))
1780 return -EINVAL;
1781 emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
1782 break;
1783 /* dst = dst >> imm */
1784 case BPF_ALU64 | BPF_RSH | BPF_K:
1785 if (unlikely(imm32 > 63))
1786 return -EINVAL;
1787 emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
1788 break;
1789 /* dst = dst << src */
1790 case BPF_ALU64 | BPF_LSH | BPF_X:
1791 emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
1792 break;
1793 /* dst = dst >> src */
1794 case BPF_ALU64 | BPF_RSH | BPF_X:
1795 emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
1796 break;
1797 /* dst = dst >> src (signed) */
1798 case BPF_ALU64 | BPF_ARSH | BPF_X:
1799 emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
1800 break;
1801 /* dst = dst >> imm (signed) */
1802 case BPF_ALU64 | BPF_ARSH | BPF_K:
1803 if (unlikely(imm32 > 63))
1804 return -EINVAL;
1805 emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
1806 break;
1807 /* dst = ~dst */
1808 case BPF_ALU | BPF_NEG:
1809 emit_ia32_alu_i(is64, false, BPF_OP(code),
1810 dst_lo, 0, dstk, &prog);
1811 emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
1812 break;
1813 /* dst = ~dst (64 bit) */
1814 case BPF_ALU64 | BPF_NEG:
1815 emit_ia32_neg64(dst, dstk, &prog);
1816 break;
1817 /* dst = dst * src/imm */
1818 case BPF_ALU64 | BPF_MUL | BPF_X:
1819 case BPF_ALU64 | BPF_MUL | BPF_K:
1820 switch (BPF_SRC(code)) {
1821 case BPF_X:
1822 emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
1823 break;
1824 case BPF_K:
1825 emit_ia32_mul_i64(dst, imm32, dstk, &prog);
1826 break;
1827 }
1828 break;
1829 /* dst = htole(dst) */
1830 case BPF_ALU | BPF_END | BPF_FROM_LE:
1831 emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
1832 break;
1833 /* dst = htobe(dst) */
1834 case BPF_ALU | BPF_END | BPF_FROM_BE:
1835 emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
1836 break;
1837 /* dst = imm64 */
1838 case BPF_LD | BPF_IMM | BPF_DW: {
1839 s32 hi, lo = imm32;
1840
1841 hi = insn[1].imm;
1842 emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
1843 emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
1844 insn++;
1845 i++;
1846 break;
1847 }
1848 /* ST: *(u8*)(dst_reg + off) = imm */
1849 case BPF_ST | BPF_MEM | BPF_H:
1850 case BPF_ST | BPF_MEM | BPF_B:
1851 case BPF_ST | BPF_MEM | BPF_W:
1852 case BPF_ST | BPF_MEM | BPF_DW:
1853 if (dstk)
1854 /* mov eax,dword ptr [ebp+off] */
1855 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1856 STACK_VAR(dst_lo));
1857 else
1858 /* mov eax,dst_lo */
1859 EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
1860
1861 switch (BPF_SIZE(code)) {
1862 case BPF_B:
1863 EMIT(0xC6, 1); break;
1864 case BPF_H:
1865 EMIT2(0x66, 0xC7); break;
1866 case BPF_W:
1867 case BPF_DW:
1868 EMIT(0xC7, 1); break;
1869 }
1870
1871 if (is_imm8(insn->off))
1872 EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
1873 else
1874 EMIT1_off32(add_1reg(0x80, IA32_EAX),
1875 insn->off);
1876 EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
1877
1878 if (BPF_SIZE(code) == BPF_DW) {
1879 u32 hi;
1880
1881 hi = imm32 & (1<<31) ? (u32)~0 : 0;
1882 EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
1883 insn->off + 4);
1884 EMIT(hi, 4);
1885 }
1886 break;
1887
1888 /* STX: *(u8*)(dst_reg + off) = src_reg */
1889 case BPF_STX | BPF_MEM | BPF_B:
1890 case BPF_STX | BPF_MEM | BPF_H:
1891 case BPF_STX | BPF_MEM | BPF_W:
1892 case BPF_STX | BPF_MEM | BPF_DW:
1893 if (dstk)
1894 /* mov eax,dword ptr [ebp+off] */
1895 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1896 STACK_VAR(dst_lo));
1897 else
1898 /* mov eax,dst_lo */
1899 EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
1900
1901 if (sstk)
1902 /* mov edx,dword ptr [ebp+off] */
1903 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
1904 STACK_VAR(src_lo));
1905 else
1906 /* mov edx,src_lo */
1907 EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
1908
1909 switch (BPF_SIZE(code)) {
1910 case BPF_B:
1911 EMIT(0x88, 1); break;
1912 case BPF_H:
1913 EMIT2(0x66, 0x89); break;
1914 case BPF_W:
1915 case BPF_DW:
1916 EMIT(0x89, 1); break;
1917 }
1918
1919 if (is_imm8(insn->off))
1920 EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
1921 insn->off);
1922 else
1923 EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
1924 insn->off);
1925
1926 if (BPF_SIZE(code) == BPF_DW) {
1927 if (sstk)
1928 /* mov edx,dword ptr [ebp+off] */
1929 EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
1930 IA32_EDX),
1931 STACK_VAR(src_hi));
1932 else
1933 /* mov edx,src_hi */
1934 EMIT2(0x8B, add_2reg(0xC0, src_hi,
1935 IA32_EDX));
1936 EMIT1(0x89);
1937 if (is_imm8(insn->off + 4)) {
1938 EMIT2(add_2reg(0x40, IA32_EAX,
1939 IA32_EDX),
1940 insn->off + 4);
1941 } else {
1942 EMIT1(add_2reg(0x80, IA32_EAX,
1943 IA32_EDX));
1944 EMIT(insn->off + 4, 4);
1945 }
1946 }
1947 break;
1948
1949 /* LDX: dst_reg = *(u8*)(src_reg + off) */
1950 case BPF_LDX | BPF_MEM | BPF_B:
1951 case BPF_LDX | BPF_MEM | BPF_H:
1952 case BPF_LDX | BPF_MEM | BPF_W:
1953 case BPF_LDX | BPF_MEM | BPF_DW:
1954 if (sstk)
1955 /* mov eax,dword ptr [ebp+off] */
1956 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
1957 STACK_VAR(src_lo));
1958 else
1959 /* mov eax,src_lo */
1960 EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
1961
1962 switch (BPF_SIZE(code)) {
1963 case BPF_B:
1964 EMIT2(0x0F, 0xB6); break;
1965 case BPF_H:
1966 EMIT2(0x0F, 0xB7); break;
1967 case BPF_W:
1968 case BPF_DW:
1969 EMIT(0x8B, 1); break;
1970 }
1971
1972 if (is_imm8(insn->off))
1973 EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
1974 insn->off);
1975 else
1976 EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
1977 insn->off);
1978
1979 if (dstk)
1980 /* mov dword ptr [ebp+off],edx */
1981 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
1982 STACK_VAR(dst_lo));
1983 else
1984 /* mov dst_lo,edx */
1985 EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
1986 switch (BPF_SIZE(code)) {
1987 case BPF_B:
1988 case BPF_H:
1989 case BPF_W:
1990 if (dstk) {
1991 EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
1992 STACK_VAR(dst_hi));
1993 EMIT(0x0, 4);
1994 } else {
1995 EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
1996 }
1997 break;
1998 case BPF_DW:
1999 EMIT2_off32(0x8B,
2000 add_2reg(0x80, IA32_EAX, IA32_EDX),
2001 insn->off + 4);
2002 if (dstk)
2003 EMIT3(0x89,
2004 add_2reg(0x40, IA32_EBP,
2005 IA32_EDX),
2006 STACK_VAR(dst_hi));
2007 else
2008 EMIT2(0x89,
2009 add_2reg(0xC0, dst_hi, IA32_EDX));
2010 break;
2011 default:
2012 break;
2013 }
2014 break;
2015 /* call */
2016 case BPF_JMP | BPF_CALL:
2017 {
2018 const u8 *r1 = bpf2ia32[BPF_REG_1];
2019 const u8 *r2 = bpf2ia32[BPF_REG_2];
2020 const u8 *r3 = bpf2ia32[BPF_REG_3];
2021 const u8 *r4 = bpf2ia32[BPF_REG_4];
2022 const u8 *r5 = bpf2ia32[BPF_REG_5];
2023
2024 if (insn->src_reg == BPF_PSEUDO_CALL)
2025 goto notyet;
2026
2027 func = (u8 *) __bpf_call_base + imm32;
2028 jmp_offset = func - (image + addrs[i]);
2029
2030 if (!imm32 || !is_simm32(jmp_offset)) {
2031 pr_err("unsupported BPF func %d addr %p image %p\n",
2032 imm32, func, image);
2033 return -EINVAL;
2034 }
2035
2036 /* mov eax,dword ptr [ebp+off] */
2037 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2038 STACK_VAR(r1[0]));
2039 /* mov edx,dword ptr [ebp+off] */
2040 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
2041 STACK_VAR(r1[1]));
2042
2043 emit_push_r64(r5, &prog);
2044 emit_push_r64(r4, &prog);
2045 emit_push_r64(r3, &prog);
2046 emit_push_r64(r2, &prog);
2047
2048 EMIT1_off32(0xE8, jmp_offset + 9);
2049
2050 /* mov dword ptr [ebp+off],eax */
2051 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
2052 STACK_VAR(r0[0]));
2053 /* mov dword ptr [ebp+off],edx */
2054 EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
2055 STACK_VAR(r0[1]));
2056
2057 /* add esp,32 */
2058 EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
2059 break;
2060 }
2061 case BPF_JMP | BPF_TAIL_CALL:
2062 emit_bpf_tail_call(&prog);
2063 break;
2064
2065 /* cond jump */
2066 case BPF_JMP | BPF_JEQ | BPF_X:
2067 case BPF_JMP | BPF_JNE | BPF_X:
2068 case BPF_JMP | BPF_JGT | BPF_X:
2069 case BPF_JMP | BPF_JLT | BPF_X:
2070 case BPF_JMP | BPF_JGE | BPF_X:
2071 case BPF_JMP | BPF_JLE | BPF_X:
2072 case BPF_JMP | BPF_JSGT | BPF_X:
2073 case BPF_JMP | BPF_JSLE | BPF_X:
2074 case BPF_JMP | BPF_JSLT | BPF_X:
2075 case BPF_JMP | BPF_JSGE | BPF_X: {
2076 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2077 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2078 u8 sreg_lo = sstk ? IA32_ECX : src_lo;
2079 u8 sreg_hi = sstk ? IA32_EBX : src_hi;
2080
2081 if (dstk) {
2082 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2083 STACK_VAR(dst_lo));
2084 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
2085 STACK_VAR(dst_hi));
2086 }
2087
2088 if (sstk) {
2089 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
2090 STACK_VAR(src_lo));
2091 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
2092 STACK_VAR(src_hi));
2093 }
2094
2095 /* cmp dreg_hi,sreg_hi */
2096 EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
2097 EMIT2(IA32_JNE, 2);
2098 /* cmp dreg_lo,sreg_lo */
2099 EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
2100 goto emit_cond_jmp;
2101 }
2102 case BPF_JMP | BPF_JSET | BPF_X: {
2103 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2104 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2105 u8 sreg_lo = sstk ? IA32_ECX : src_lo;
2106 u8 sreg_hi = sstk ? IA32_EBX : src_hi;
2107
2108 if (dstk) {
2109 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2110 STACK_VAR(dst_lo));
2111 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
2112 STACK_VAR(dst_hi));
2113 }
2114
2115 if (sstk) {
2116 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
2117 STACK_VAR(src_lo));
2118 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
2119 STACK_VAR(src_hi));
2120 }
2121 /* and dreg_lo,sreg_lo */
2122 EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
2123 /* and dreg_hi,sreg_hi */
2124 EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
2125 /* or dreg_lo,dreg_hi */
2126 EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
2127 goto emit_cond_jmp;
2128 }
2129 case BPF_JMP | BPF_JSET | BPF_K: {
2130 u32 hi;
2131 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2132 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2133 u8 sreg_lo = IA32_ECX;
2134 u8 sreg_hi = IA32_EBX;
2135
2136 if (dstk) {
2137 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2138 STACK_VAR(dst_lo));
2139 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
2140 STACK_VAR(dst_hi));
2141 }
2142 hi = imm32 & (1<<31) ? (u32)~0 : 0;
2143
2144 /* mov ecx,imm32 */
2145 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
2146 /* mov ebx,hi */
2147 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
2148
2149 /* and dreg_lo,sreg_lo */
2150 EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
2151 /* and dreg_hi,sreg_hi */
2152 EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
2153 /* or dreg_lo,dreg_hi */
2154 EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
2155 goto emit_cond_jmp;
2156 }
2157 case BPF_JMP | BPF_JEQ | BPF_K:
2158 case BPF_JMP | BPF_JNE | BPF_K:
2159 case BPF_JMP | BPF_JGT | BPF_K:
2160 case BPF_JMP | BPF_JLT | BPF_K:
2161 case BPF_JMP | BPF_JGE | BPF_K:
2162 case BPF_JMP | BPF_JLE | BPF_K:
2163 case BPF_JMP | BPF_JSGT | BPF_K:
2164 case BPF_JMP | BPF_JSLE | BPF_K:
2165 case BPF_JMP | BPF_JSLT | BPF_K:
2166 case BPF_JMP | BPF_JSGE | BPF_K: {
2167 u32 hi;
2168 u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
2169 u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
2170 u8 sreg_lo = IA32_ECX;
2171 u8 sreg_hi = IA32_EBX;
2172
2173 if (dstk) {
2174 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
2175 STACK_VAR(dst_lo));
2176 EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
2177 STACK_VAR(dst_hi));
2178 }
2179
2180 hi = imm32 & (1<<31) ? (u32)~0 : 0;
2181 /* mov ecx,imm32 */
2182 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
2183 /* mov ebx,hi */
2184 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
2185
2186 /* cmp dreg_hi,sreg_hi */
2187 EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
2188 EMIT2(IA32_JNE, 2);
2189 /* cmp dreg_lo,sreg_lo */
2190 EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
2191
2192emit_cond_jmp: /* Convert BPF opcode to x86 */
2193 switch (BPF_OP(code)) {
2194 case BPF_JEQ:
2195 jmp_cond = IA32_JE;
2196 break;
2197 case BPF_JSET:
2198 case BPF_JNE:
2199 jmp_cond = IA32_JNE;
2200 break;
2201 case BPF_JGT:
2202 /* GT is unsigned '>', JA in x86 */
2203 jmp_cond = IA32_JA;
2204 break;
2205 case BPF_JLT:
2206 /* LT is unsigned '<', JB in x86 */
2207 jmp_cond = IA32_JB;
2208 break;
2209 case BPF_JGE:
2210 /* GE is unsigned '>=', JAE in x86 */
2211 jmp_cond = IA32_JAE;
2212 break;
2213 case BPF_JLE:
2214 /* LE is unsigned '<=', JBE in x86 */
2215 jmp_cond = IA32_JBE;
2216 break;
2217 case BPF_JSGT:
2218 /* Signed '>', GT in x86 */
2219 jmp_cond = IA32_JG;
2220 break;
2221 case BPF_JSLT:
2222 /* Signed '<', LT in x86 */
2223 jmp_cond = IA32_JL;
2224 break;
2225 case BPF_JSGE:
2226 /* Signed '>=', GE in x86 */
2227 jmp_cond = IA32_JGE;
2228 break;
2229 case BPF_JSLE:
2230 /* Signed '<=', LE in x86 */
2231 jmp_cond = IA32_JLE;
2232 break;
2233 default: /* to silence GCC warning */
2234 return -EFAULT;
2235 }
2236 jmp_offset = addrs[i + insn->off] - addrs[i];
2237 if (is_imm8(jmp_offset)) {
2238 EMIT2(jmp_cond, jmp_offset);
2239 } else if (is_simm32(jmp_offset)) {
2240 EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
2241 } else {
2242 pr_err("cond_jmp gen bug %llx\n", jmp_offset);
2243 return -EFAULT;
2244 }
2245
2246 break;
2247 }
2248 case BPF_JMP | BPF_JA:
2249 if (insn->off == -1)
2250 /* -1 jmp instructions will always jump
2251 * backwards two bytes. Explicitly handling
2252 * this case avoids wasting too many passes
2253 * when there are long sequences of replaced
2254 * dead code.
2255 */
2256 jmp_offset = -2;
2257 else
2258 jmp_offset = addrs[i + insn->off] - addrs[i];
2259
2260 if (!jmp_offset)
2261 /* Optimize out nop jumps */
2262 break;
2263emit_jmp:
2264 if (is_imm8(jmp_offset)) {
2265 EMIT2(0xEB, jmp_offset);
2266 } else if (is_simm32(jmp_offset)) {
2267 EMIT1_off32(0xE9, jmp_offset);
2268 } else {
2269 pr_err("jmp gen bug %llx\n", jmp_offset);
2270 return -EFAULT;
2271 }
2272 break;
2273 /* STX XADD: lock *(u32 *)(dst + off) += src */
2274 case BPF_STX | BPF_XADD | BPF_W:
2275 /* STX XADD: lock *(u64 *)(dst + off) += src */
2276 case BPF_STX | BPF_XADD | BPF_DW:
2277 goto notyet;
2278 case BPF_JMP | BPF_EXIT:
2279 if (seen_exit) {
2280 jmp_offset = ctx->cleanup_addr - addrs[i];
2281 goto emit_jmp;
2282 }
2283 seen_exit = true;
2284 /* Update cleanup_addr */
2285 ctx->cleanup_addr = proglen;
2286 emit_epilogue(&prog, bpf_prog->aux->stack_depth);
2287 break;
2288notyet:
2289 pr_info_once("*** NOT YET: opcode %02x ***\n", code);
2290 return -EFAULT;
2291 default:
2292 /*
2293 * This error will be seen if a new instruction was added
2294 * to the interpreter but not to the JIT, or if there is
2295 * junk in bpf_prog
2296 */
2297 pr_err("bpf_jit: unknown opcode %02x\n", code);
2298 return -EINVAL;
2299 }
2300
2301 ilen = prog - temp;
2302 if (ilen > BPF_MAX_INSN_SIZE) {
2303 pr_err("bpf_jit: fatal insn size error\n");
2304 return -EFAULT;
2305 }
2306
2307 if (image) {
2308 if (unlikely(proglen + ilen > oldproglen)) {
2309 pr_err("bpf_jit: fatal error\n");
2310 return -EFAULT;
2311 }
2312 memcpy(image + proglen, temp, ilen);
2313 }
2314 proglen += ilen;
2315 addrs[i] = proglen;
2316 prog = temp;
2317 }
2318 return proglen;
2319}
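
The conditional-jump cases above split every 64-bit compare into two 32-bit ones: the high words are compared first and, only when they are equal, the low words decide. A minimal C sketch of that decision for the unsigned BPF_JGT case (illustrative only, not part of the JIT):

    /* Semantics of "cmp dreg_hi,sreg_hi; jne; cmp dreg_lo,sreg_lo; ja" */
    static bool jgt64(u32 dst_lo, u32 dst_hi, u32 src_lo, u32 src_hi)
    {
            if (dst_hi != src_hi)
                    return dst_hi > src_hi; /* high words differ: they decide */
            return dst_lo > src_lo;         /* otherwise the low words decide */
    }
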
2320
2321struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
2322{
2323 struct bpf_binary_header *header = NULL;
2324 struct bpf_prog *tmp, *orig_prog = prog;
2325 int proglen, oldproglen = 0;
2326 struct jit_context ctx = {};
2327 bool tmp_blinded = false;
2328 u8 *image = NULL;
2329 int *addrs;
2330 int pass;
2331 int i;
2332
2333 if (!prog->jit_requested)
2334 return orig_prog;
2335
2336 tmp = bpf_jit_blind_constants(prog);
2337 /*
2338 * If blinding was requested and we failed during blinding,
2339 * we must fall back to the interpreter.
2340 */
2341 if (IS_ERR(tmp))
2342 return orig_prog;
2343 if (tmp != prog) {
2344 tmp_blinded = true;
2345 prog = tmp;
2346 }
2347
2348 addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
2349 if (!addrs) {
2350 prog = orig_prog;
2351 goto out;
2352 }
2353
2354 /*
2355 * Before the first pass, make a rough estimation of addrs[]:
2356 * each BPF instruction is translated to less than 64 bytes.
2357 */
2358 for (proglen = 0, i = 0; i < prog->len; i++) {
2359 proglen += 64;
2360 addrs[i] = proglen;
2361 }
2362 ctx.cleanup_addr = proglen;
2363
2364 /*
2365 * JITed image shrinks with every pass and the loop iterates
2366 * until the image stops shrinking. Very large BPF programs
2367 * may converge on the last pass. In such a case, do one more
2368 * pass to emit the final image.
2369 */
2370 for (pass = 0; pass < 20 || image; pass++) {
2371 proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
2372 if (proglen <= 0) {
2373out_image:
2374 image = NULL;
2375 if (header)
2376 bpf_jit_binary_free(header);
2377 prog = orig_prog;
2378 goto out_addrs;
2379 }
2380 if (image) {
2381 if (proglen != oldproglen) {
2382 pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
2383 proglen, oldproglen);
2384 goto out_image;
2385 }
2386 break;
2387 }
2388 if (proglen == oldproglen) {
2389 header = bpf_jit_binary_alloc(proglen, &image,
2390 1, jit_fill_hole);
2391 if (!header) {
2392 prog = orig_prog;
2393 goto out_addrs;
2394 }
2395 }
2396 oldproglen = proglen;
2397 cond_resched();
2398 }
2399
2400 if (bpf_jit_enable > 1)
2401 bpf_jit_dump(prog->len, proglen, pass + 1, image);
2402
2403 if (image) {
2404 bpf_jit_binary_lock_ro(header);
2405 prog->bpf_func = (void *)image;
2406 prog->jited = 1;
2407 prog->jited_len = proglen;
2408 } else {
2409 prog = orig_prog;
2410 }
2411
2412out_addrs:
2413 kfree(addrs);
2414out:
2415 if (tmp_blinded)
2416 bpf_jit_prog_release_other(prog, prog == orig_prog ?
2417 tmp : orig_prog);
2418 return prog;
2419}
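
bpf_int_jit_compile() seeds addrs[] with a 64-bytes-per-instruction estimate and lets do_jit() refine it on every pass until proglen stops shrinking; addrs[i] always holds the byte offset just past instruction i's emitted code, so branch displacements are plain differences of addrs[] entries. A small worked example under that convention (hedged, not taken from the kernel):

    /* Three BPF insns emitted to 2, 5 and 3 bytes respectively. */
    int addrs[3] = { 2, 7, 10 };
    /* A cond jump in insn 0 with insn->off == 1 targets insn 2, so
     * jmp_offset = addrs[0 + 1] - addrs[0] = 5: the displacement from
     * the byte after the jcc (offset 2) to the start of insn 2 (offset 7).
     */
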
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
index 7e298148ca26..cb87fccb9f6a 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -102,6 +102,15 @@ nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n)
102 return nfp_bpf_cmsg_alloc(bpf, size); 102 return nfp_bpf_cmsg_alloc(bpf, size);
103} 103}
104 104
105static u8 nfp_bpf_cmsg_get_type(struct sk_buff *skb)
106{
107 struct cmsg_hdr *hdr;
108
109 hdr = (struct cmsg_hdr *)skb->data;
110
111 return hdr->type;
112}
113
105static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb) 114static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb)
106{ 115{
107 struct cmsg_hdr *hdr; 116 struct cmsg_hdr *hdr;
@@ -431,6 +440,11 @@ void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb)
431 goto err_free; 440 goto err_free;
432 } 441 }
433 442
443 if (nfp_bpf_cmsg_get_type(skb) == CMSG_TYPE_BPF_EVENT) {
444 nfp_bpf_event_output(bpf, skb);
445 return;
446 }
447
434 nfp_ctrl_lock(bpf->app->ctrl); 448 nfp_ctrl_lock(bpf->app->ctrl);
435 449
436 tag = nfp_bpf_cmsg_get_tag(skb); 450 tag = nfp_bpf_cmsg_get_tag(skb);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index 39639ac28b01..3dbc21653ce5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -37,6 +37,14 @@
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/types.h> 38#include <linux/types.h>
39 39
40/* Kernel's enum bpf_reg_type is not uABI so people may change it breaking
41 * our FW ABI. In that case we will do translation in the driver.
42 */
43#define NFP_BPF_SCALAR_VALUE 1
44#define NFP_BPF_MAP_VALUE 4
45#define NFP_BPF_STACK 6
46#define NFP_BPF_PACKET_DATA 8
47
40enum bpf_cap_tlv_type { 48enum bpf_cap_tlv_type {
41 NFP_BPF_CAP_TYPE_FUNC = 1, 49 NFP_BPF_CAP_TYPE_FUNC = 1,
42 NFP_BPF_CAP_TYPE_ADJUST_HEAD = 2, 50 NFP_BPF_CAP_TYPE_ADJUST_HEAD = 2,
@@ -81,6 +89,7 @@ enum nfp_bpf_cmsg_type {
81 CMSG_TYPE_MAP_DELETE = 5, 89 CMSG_TYPE_MAP_DELETE = 5,
82 CMSG_TYPE_MAP_GETNEXT = 6, 90 CMSG_TYPE_MAP_GETNEXT = 6,
83 CMSG_TYPE_MAP_GETFIRST = 7, 91 CMSG_TYPE_MAP_GETFIRST = 7,
92 CMSG_TYPE_BPF_EVENT = 8,
84 __CMSG_TYPE_MAP_MAX, 93 __CMSG_TYPE_MAP_MAX,
85}; 94};
86 95
@@ -155,4 +164,13 @@ struct cmsg_reply_map_op {
155 __be32 resv; 164 __be32 resv;
156 struct cmsg_key_value_pair elem[0]; 165 struct cmsg_key_value_pair elem[0];
157}; 166};
167
168struct cmsg_bpf_event {
169 struct cmsg_hdr hdr;
170 __be32 cpu_id;
171 __be64 map_ptr;
172 __be32 data_size;
173 __be32 pkt_size;
174 u8 data[0];
175};
158#endif 176#endif
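
The new cmsg_bpf_event message carries the packet bytes first and the perf-event metadata after them, aligned to a 4-byte boundary; nfp_bpf_event_output() in offload.c walks it exactly that way. A hedged parsing sketch (not driver code, field accesses only):

    u32 pkt_size  = be32_to_cpu(cbe->pkt_size);
    u32 data_size = be32_to_cpu(cbe->data_size);
    const u8 *pkt  = cbe->data;                          /* packet bytes   */
    const u8 *meta = cbe->data + round_up(pkt_size, 4);  /* event metadata */
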
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 65f0791cae0c..326a2085d650 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2016-2017 Netronome Systems, Inc. 2 * Copyright (C) 2016-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -1395,15 +1395,9 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1395static int 1395static int
1396map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1396map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1397{ 1397{
1398 struct bpf_offloaded_map *offmap;
1399 struct nfp_bpf_map *nfp_map;
1400 bool load_lm_ptr; 1398 bool load_lm_ptr;
1401 u32 ret_tgt; 1399 u32 ret_tgt;
1402 s64 lm_off; 1400 s64 lm_off;
1403 swreg tid;
1404
1405 offmap = (struct bpf_offloaded_map *)meta->arg1.map_ptr;
1406 nfp_map = offmap->dev_priv;
1407 1401
1408 /* We only have to reload LM0 if the key is not at start of stack */ 1402 /* We only have to reload LM0 if the key is not at start of stack */
1409 lm_off = nfp_prog->stack_depth; 1403 lm_off = nfp_prog->stack_depth;
@@ -1416,17 +1410,12 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1416 if (meta->func_id == BPF_FUNC_map_update_elem) 1410 if (meta->func_id == BPF_FUNC_map_update_elem)
1417 emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2); 1411 emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1418 1412
1419 /* Load map ID into a register, it should actually fit as an immediate
1420 * but in case it doesn't deal with it here, not in the delay slots.
1421 */
1422 tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog));
1423
1424 emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id, 1413 emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1425 2, RELO_BR_HELPER); 1414 2, RELO_BR_HELPER);
1426 ret_tgt = nfp_prog_current_offset(nfp_prog) + 2; 1415 ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1427 1416
1428 /* Load map ID into A0 */ 1417 /* Load map ID into A0 */
1429 wrp_mov(nfp_prog, reg_a(0), tid); 1418 wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1430 1419
1431 /* Load the return address into B0 */ 1420 /* Load the return address into B0 */
1432 wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL); 1421 wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
@@ -1456,6 +1445,31 @@ nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1456 return 0; 1445 return 0;
1457} 1446}
1458 1447
1448static int
1449nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1450{
1451 swreg ptr_type;
1452 u32 ret_tgt;
1453
1454 ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1455
1456 ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1457
1458 emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1459 2, RELO_BR_HELPER);
1460
1461 /* Load ptr type into A1 */
1462 wrp_mov(nfp_prog, reg_a(1), ptr_type);
1463
1464 /* Load the return address into B0 */
1465 wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1466
1467 if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1468 return -EINVAL;
1469
1470 return 0;
1471}
1472
1459/* --- Callbacks --- */ 1473/* --- Callbacks --- */
1460static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1474static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1461{ 1475{
@@ -2411,6 +2425,8 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2411 return map_call_stack_common(nfp_prog, meta); 2425 return map_call_stack_common(nfp_prog, meta);
2412 case BPF_FUNC_get_prandom_u32: 2426 case BPF_FUNC_get_prandom_u32:
2413 return nfp_get_prandom_u32(nfp_prog, meta); 2427 return nfp_get_prandom_u32(nfp_prog, meta);
2428 case BPF_FUNC_perf_event_output:
2429 return nfp_perf_event_output(nfp_prog, meta);
2414 default: 2430 default:
2415 WARN_ONCE(1, "verifier allowed unsupported function\n"); 2431 WARN_ONCE(1, "verifier allowed unsupported function\n");
2416 return -EOPNOTSUPP; 2432 return -EOPNOTSUPP;
@@ -3227,6 +3243,33 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
3227 return 0; 3243 return 0;
3228} 3244}
3229 3245
3246static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
3247{
3248 struct nfp_insn_meta *meta1, *meta2;
3249 struct nfp_bpf_map *nfp_map;
3250 struct bpf_map *map;
3251
3252 nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3253 if (meta1->skip || meta2->skip)
3254 continue;
3255
3256 if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
3257 meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
3258 continue;
3259
3260 map = (void *)(unsigned long)((u32)meta1->insn.imm |
3261 (u64)meta2->insn.imm << 32);
3262 if (bpf_map_offload_neutral(map))
3263 continue;
3264 nfp_map = map_to_offmap(map)->dev_priv;
3265
3266 meta1->insn.imm = nfp_map->tid;
3267 meta2->insn.imm = 0;
3268 }
3269
3270 return 0;
3271}
3272
3230static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len) 3273static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
3231{ 3274{
3232 __le64 *ustore = (__force __le64 *)prog; 3275 __le64 *ustore = (__force __le64 *)prog;
@@ -3263,6 +3306,10 @@ int nfp_bpf_jit(struct nfp_prog *nfp_prog)
3263{ 3306{
3264 int ret; 3307 int ret;
3265 3308
3309 ret = nfp_bpf_replace_map_ptrs(nfp_prog);
3310 if (ret)
3311 return ret;
3312
3266 ret = nfp_bpf_optimize(nfp_prog); 3313 ret = nfp_bpf_optimize(nfp_prog);
3267 if (ret) 3314 if (ret)
3268 return ret; 3315 return ret;
@@ -3353,6 +3400,9 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
3353 case BPF_FUNC_map_delete_elem: 3400 case BPF_FUNC_map_delete_elem:
3354 val = nfp_prog->bpf->helpers.map_delete; 3401 val = nfp_prog->bpf->helpers.map_delete;
3355 break; 3402 break;
3403 case BPF_FUNC_perf_event_output:
3404 val = nfp_prog->bpf->helpers.perf_event_output;
3405 break;
3356 default: 3406 default:
3357 pr_err("relocation of unknown helper %d\n", 3407 pr_err("relocation of unknown helper %d\n",
3358 val); 3408 val);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 1dc424685f4e..d72f9e7f42da 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -43,6 +43,14 @@
43#include "fw.h" 43#include "fw.h"
44#include "main.h" 44#include "main.h"
45 45
46const struct rhashtable_params nfp_bpf_maps_neutral_params = {
47 .nelem_hint = 4,
48 .key_len = FIELD_SIZEOF(struct nfp_bpf_neutral_map, ptr),
49 .key_offset = offsetof(struct nfp_bpf_neutral_map, ptr),
50 .head_offset = offsetof(struct nfp_bpf_neutral_map, l),
51 .automatic_shrinking = true,
52};
53
46static bool nfp_net_ebpf_capable(struct nfp_net *nn) 54static bool nfp_net_ebpf_capable(struct nfp_net *nn)
47{ 55{
48#ifdef __LITTLE_ENDIAN 56#ifdef __LITTLE_ENDIAN
@@ -290,6 +298,9 @@ nfp_bpf_parse_cap_func(struct nfp_app_bpf *bpf, void __iomem *value, u32 length)
290 case BPF_FUNC_map_delete_elem: 298 case BPF_FUNC_map_delete_elem:
291 bpf->helpers.map_delete = readl(&cap->func_addr); 299 bpf->helpers.map_delete = readl(&cap->func_addr);
292 break; 300 break;
301 case BPF_FUNC_perf_event_output:
302 bpf->helpers.perf_event_output = readl(&cap->func_addr);
303 break;
293 } 304 }
294 305
295 return 0; 306 return 0;
@@ -401,17 +412,28 @@ static int nfp_bpf_init(struct nfp_app *app)
401 init_waitqueue_head(&bpf->cmsg_wq); 412 init_waitqueue_head(&bpf->cmsg_wq);
402 INIT_LIST_HEAD(&bpf->map_list); 413 INIT_LIST_HEAD(&bpf->map_list);
403 414
404 err = nfp_bpf_parse_capabilities(app); 415 err = rhashtable_init(&bpf->maps_neutral, &nfp_bpf_maps_neutral_params);
405 if (err) 416 if (err)
406 goto err_free_bpf; 417 goto err_free_bpf;
407 418
419 err = nfp_bpf_parse_capabilities(app);
420 if (err)
421 goto err_free_neutral_maps;
422
408 return 0; 423 return 0;
409 424
425err_free_neutral_maps:
426 rhashtable_destroy(&bpf->maps_neutral);
410err_free_bpf: 427err_free_bpf:
411 kfree(bpf); 428 kfree(bpf);
412 return err; 429 return err;
413} 430}
414 431
432static void nfp_check_rhashtable_empty(void *ptr, void *arg)
433{
434 WARN_ON_ONCE(1);
435}
436
415static void nfp_bpf_clean(struct nfp_app *app) 437static void nfp_bpf_clean(struct nfp_app *app)
416{ 438{
417 struct nfp_app_bpf *bpf = app->priv; 439 struct nfp_app_bpf *bpf = app->priv;
@@ -419,6 +441,8 @@ static void nfp_bpf_clean(struct nfp_app *app)
419 WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); 441 WARN_ON(!skb_queue_empty(&bpf->cmsg_replies));
420 WARN_ON(!list_empty(&bpf->map_list)); 442 WARN_ON(!list_empty(&bpf->map_list));
421 WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use); 443 WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use);
444 rhashtable_free_and_destroy(&bpf->maps_neutral,
445 nfp_check_rhashtable_empty, NULL);
422 kfree(bpf); 446 kfree(bpf);
423} 447}
424 448
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 68b5d326483d..82682378d57f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2016-2017 Netronome Systems, Inc. 2 * Copyright (C) 2016-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -39,6 +39,7 @@
39#include <linux/bpf_verifier.h> 39#include <linux/bpf_verifier.h>
40#include <linux/kernel.h> 40#include <linux/kernel.h>
41#include <linux/list.h> 41#include <linux/list.h>
42#include <linux/rhashtable.h>
42#include <linux/skbuff.h> 43#include <linux/skbuff.h>
43#include <linux/types.h> 44#include <linux/types.h>
44#include <linux/wait.h> 45#include <linux/wait.h>
@@ -114,6 +115,8 @@ enum pkt_vec {
114 * @maps_in_use: number of currently offloaded maps 115 * @maps_in_use: number of currently offloaded maps
115 * @map_elems_in_use: number of elements allocated to offloaded maps 116 * @map_elems_in_use: number of elements allocated to offloaded maps
116 * 117 *
118 * @maps_neutral: hash table of offload-neutral maps (on pointer)
119 *
117 * @adjust_head: adjust head capability 120 * @adjust_head: adjust head capability
118 * @adjust_head.flags: extra flags for adjust head 121 * @adjust_head.flags: extra flags for adjust head
119 * @adjust_head.off_min: minimal packet offset within buffer required 122 * @adjust_head.off_min: minimal packet offset within buffer required
@@ -133,6 +136,7 @@ enum pkt_vec {
133 * @helpers.map_lookup: map lookup helper address 136 * @helpers.map_lookup: map lookup helper address
134 * @helpers.map_update: map update helper address 137 * @helpers.map_update: map update helper address
135 * @helpers.map_delete: map delete helper address 138 * @helpers.map_delete: map delete helper address
139 * @helpers.perf_event_output: output perf event to a ring buffer
136 * 140 *
137 * @pseudo_random: FW initialized the pseudo-random machinery (CSRs) 141 * @pseudo_random: FW initialized the pseudo-random machinery (CSRs)
138 */ 142 */
@@ -150,6 +154,8 @@ struct nfp_app_bpf {
150 unsigned int maps_in_use; 154 unsigned int maps_in_use;
151 unsigned int map_elems_in_use; 155 unsigned int map_elems_in_use;
152 156
157 struct rhashtable maps_neutral;
158
153 struct nfp_bpf_cap_adjust_head { 159 struct nfp_bpf_cap_adjust_head {
154 u32 flags; 160 u32 flags;
155 int off_min; 161 int off_min;
@@ -171,6 +177,7 @@ struct nfp_app_bpf {
171 u32 map_lookup; 177 u32 map_lookup;
172 u32 map_update; 178 u32 map_update;
173 u32 map_delete; 179 u32 map_delete;
180 u32 perf_event_output;
174 } helpers; 181 } helpers;
175 182
176 bool pseudo_random; 183 bool pseudo_random;
@@ -199,6 +206,14 @@ struct nfp_bpf_map {
199 enum nfp_bpf_map_use use_map[]; 206 enum nfp_bpf_map_use use_map[];
200}; 207};
201 208
209struct nfp_bpf_neutral_map {
210 struct rhash_head l;
211 struct bpf_map *ptr;
212 u32 count;
213};
214
215extern const struct rhashtable_params nfp_bpf_maps_neutral_params;
216
202struct nfp_prog; 217struct nfp_prog;
203struct nfp_insn_meta; 218struct nfp_insn_meta;
204typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); 219typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
@@ -367,6 +382,8 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
367 * @error: error code if something went wrong 382 * @error: error code if something went wrong
368 * @stack_depth: max stack depth from the verifier 383 * @stack_depth: max stack depth from the verifier
369 * @adjust_head_location: if program has single adjust head call - the insn no. 384 * @adjust_head_location: if program has single adjust head call - the insn no.
385 * @map_records_cnt: the number of map pointers recorded for this prog
386 * @map_records: the map record pointers from bpf->maps_neutral
370 * @insns: list of BPF instruction wrappers (struct nfp_insn_meta) 387 * @insns: list of BPF instruction wrappers (struct nfp_insn_meta)
371 */ 388 */
372struct nfp_prog { 389struct nfp_prog {
@@ -390,6 +407,9 @@ struct nfp_prog {
390 unsigned int stack_depth; 407 unsigned int stack_depth;
391 unsigned int adjust_head_location; 408 unsigned int adjust_head_location;
392 409
410 unsigned int map_records_cnt;
411 struct nfp_bpf_neutral_map **map_records;
412
393 struct list_head insns; 413 struct list_head insns;
394}; 414};
395 415
@@ -440,5 +460,7 @@ int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap,
440int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap, 460int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap,
441 void *key, void *next_key); 461 void *key, void *next_key);
442 462
463int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb);
464
443void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); 465void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb);
444#endif 466#endif
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 42d98792bd25..4db0ac1e42a8 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2016-2017 Netronome Systems, Inc. 2 * Copyright (C) 2016-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -57,6 +57,126 @@
57#include "../nfp_net.h" 57#include "../nfp_net.h"
58 58
59static int 59static int
60nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
61 struct bpf_map *map)
62{
63 struct nfp_bpf_neutral_map *record;
64 int err;
65
66 /* Map record paths are entered via ndo, update side is protected. */
67 ASSERT_RTNL();
68
69 /* Reuse path - other offloaded program is already tracking this map. */
70 record = rhashtable_lookup_fast(&bpf->maps_neutral, &map,
71 nfp_bpf_maps_neutral_params);
72 if (record) {
73 nfp_prog->map_records[nfp_prog->map_records_cnt++] = record;
74 record->count++;
75 return 0;
76 }
77
78 /* Grab a single ref to the map for our record. The prog destroy ndo
79 * happens after free_used_maps().
80 */
81 map = bpf_map_inc(map, false);
82 if (IS_ERR(map))
83 return PTR_ERR(map);
84
85 record = kmalloc(sizeof(*record), GFP_KERNEL);
86 if (!record) {
87 err = -ENOMEM;
88 goto err_map_put;
89 }
90
91 record->ptr = map;
92 record->count = 1;
93
94 err = rhashtable_insert_fast(&bpf->maps_neutral, &record->l,
95 nfp_bpf_maps_neutral_params);
96 if (err)
97 goto err_free_rec;
98
99 nfp_prog->map_records[nfp_prog->map_records_cnt++] = record;
100
101 return 0;
102
103err_free_rec:
104 kfree(record);
105err_map_put:
106 bpf_map_put(map);
107 return err;
108}
109
110static void
111nfp_map_ptrs_forget(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog)
112{
113 bool freed = false;
114 int i;
115
116 ASSERT_RTNL();
117
118 for (i = 0; i < nfp_prog->map_records_cnt; i++) {
119 if (--nfp_prog->map_records[i]->count) {
120 nfp_prog->map_records[i] = NULL;
121 continue;
122 }
123
124 WARN_ON(rhashtable_remove_fast(&bpf->maps_neutral,
125 &nfp_prog->map_records[i]->l,
126 nfp_bpf_maps_neutral_params));
127 freed = true;
128 }
129
130 if (freed) {
131 synchronize_rcu();
132
133 for (i = 0; i < nfp_prog->map_records_cnt; i++)
134 if (nfp_prog->map_records[i]) {
135 bpf_map_put(nfp_prog->map_records[i]->ptr);
136 kfree(nfp_prog->map_records[i]);
137 }
138 }
139
140 kfree(nfp_prog->map_records);
141 nfp_prog->map_records = NULL;
142 nfp_prog->map_records_cnt = 0;
143}
144
145static int
146nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
147 struct bpf_prog *prog)
148{
149 int i, cnt, err;
150
151 /* Quickly count the maps we will have to remember */
152 cnt = 0;
153 for (i = 0; i < prog->aux->used_map_cnt; i++)
154 if (bpf_map_offload_neutral(prog->aux->used_maps[i]))
155 cnt++;
156 if (!cnt)
157 return 0;
158
159 nfp_prog->map_records = kmalloc_array(cnt,
160 sizeof(nfp_prog->map_records[0]),
161 GFP_KERNEL);
162 if (!nfp_prog->map_records)
163 return -ENOMEM;
164
165 for (i = 0; i < prog->aux->used_map_cnt; i++)
166 if (bpf_map_offload_neutral(prog->aux->used_maps[i])) {
167 err = nfp_map_ptr_record(bpf, nfp_prog,
168 prog->aux->used_maps[i]);
169 if (err) {
170 nfp_map_ptrs_forget(bpf, nfp_prog);
171 return err;
172 }
173 }
174 WARN_ON(cnt != nfp_prog->map_records_cnt);
175
176 return 0;
177}
178
179static int
60nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, 180nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
61 unsigned int cnt) 181 unsigned int cnt)
62{ 182{
@@ -151,7 +271,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
151 prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64); 271 prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64);
152 prog->aux->offload->jited_image = nfp_prog->prog; 272 prog->aux->offload->jited_image = nfp_prog->prog;
153 273
154 return 0; 274 return nfp_map_ptrs_record(nfp_prog->bpf, nfp_prog, prog);
155} 275}
156 276
157static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) 277static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
@@ -159,6 +279,7 @@ static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
159 struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; 279 struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
160 280
161 kvfree(nfp_prog->prog); 281 kvfree(nfp_prog->prog);
282 nfp_map_ptrs_forget(nfp_prog->bpf, nfp_prog);
162 nfp_prog_free(nfp_prog); 283 nfp_prog_free(nfp_prog);
163 284
164 return 0; 285 return 0;
@@ -320,6 +441,53 @@ int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf)
320 } 441 }
321} 442}
322 443
444static unsigned long
445nfp_bpf_perf_event_copy(void *dst, const void *src,
446 unsigned long off, unsigned long len)
447{
448 memcpy(dst, src + off, len);
449 return 0;
450}
451
452int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb)
453{
454 struct cmsg_bpf_event *cbe = (void *)skb->data;
455 u32 pkt_size, data_size;
456 struct bpf_map *map;
457
458 if (skb->len < sizeof(struct cmsg_bpf_event))
459 goto err_drop;
460
461 pkt_size = be32_to_cpu(cbe->pkt_size);
462 data_size = be32_to_cpu(cbe->data_size);
463 map = (void *)(unsigned long)be64_to_cpu(cbe->map_ptr);
464
465 if (skb->len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size)
466 goto err_drop;
467 if (cbe->hdr.ver != CMSG_MAP_ABI_VERSION)
468 goto err_drop;
469
470 rcu_read_lock();
471 if (!rhashtable_lookup_fast(&bpf->maps_neutral, &map,
472 nfp_bpf_maps_neutral_params)) {
473 rcu_read_unlock();
474 pr_warn("perf event: dest map pointer %px not recognized, dropping event\n",
475 map);
476 goto err_drop;
477 }
478
479 bpf_event_output(map, be32_to_cpu(cbe->cpu_id),
480 &cbe->data[round_up(pkt_size, 4)], data_size,
481 cbe->data, pkt_size, nfp_bpf_perf_event_copy);
482 rcu_read_unlock();
483
484 dev_consume_skb_any(skb);
485 return 0;
486err_drop:
487 dev_kfree_skb_any(skb);
488 return -EINVAL;
489}
490
323static int 491static int
324nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog, 492nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
325 struct netlink_ext_ack *extack) 493 struct netlink_ext_ack *extack)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 06ad53ce4ad9..e163f3cfa47d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2016-2017 Netronome Systems, Inc. 2 * Copyright (C) 2016-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -36,6 +36,8 @@
36#include <linux/kernel.h> 36#include <linux/kernel.h>
37#include <linux/pkt_cls.h> 37#include <linux/pkt_cls.h>
38 38
39#include "../nfp_app.h"
40#include "../nfp_main.h"
39#include "fw.h" 41#include "fw.h"
40#include "main.h" 42#include "main.h"
41 43
@@ -149,15 +151,6 @@ nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env,
149 return false; 151 return false;
150 } 152 }
151 153
152 /* Rest of the checks is only if we re-parse the same insn */
153 if (!meta->func_id)
154 return true;
155
156 if (meta->arg1.map_ptr != reg1->map_ptr) {
157 pr_vlog(env, "%s: called for different map\n", fname);
158 return false;
159 }
160
161 return true; 154 return true;
162} 155}
163 156
@@ -216,6 +209,71 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
216 pr_vlog(env, "bpf_get_prandom_u32(): FW doesn't support random number generation\n"); 209 pr_vlog(env, "bpf_get_prandom_u32(): FW doesn't support random number generation\n");
217 return -EOPNOTSUPP; 210 return -EOPNOTSUPP;
218 211
212 case BPF_FUNC_perf_event_output:
213 BUILD_BUG_ON(NFP_BPF_SCALAR_VALUE != SCALAR_VALUE ||
214 NFP_BPF_MAP_VALUE != PTR_TO_MAP_VALUE ||
215 NFP_BPF_STACK != PTR_TO_STACK ||
216 NFP_BPF_PACKET_DATA != PTR_TO_PACKET);
217
218 if (!bpf->helpers.perf_event_output) {
219 pr_vlog(env, "event_output: not supported by FW\n");
220 return -EOPNOTSUPP;
221 }
222
223 /* Force current CPU to make sure we can report the event
224 * wherever we get the control message from FW.
225 */
226 if (reg3->var_off.mask & BPF_F_INDEX_MASK ||
227 (reg3->var_off.value & BPF_F_INDEX_MASK) !=
228 BPF_F_CURRENT_CPU) {
229 char tn_buf[48];
230
231 tnum_strn(tn_buf, sizeof(tn_buf), reg3->var_off);
232 pr_vlog(env, "event_output: must use BPF_F_CURRENT_CPU, var_off: %s\n",
233 tn_buf);
234 return -EOPNOTSUPP;
235 }
236
237 /* To save space in meta we only care about the 4th
238 * argument, so shove it into arg1.
239 */
240 reg1 = cur_regs(env) + BPF_REG_4;
241
242 if (reg1->type != SCALAR_VALUE /* NULL ptr */ &&
243 reg1->type != PTR_TO_STACK &&
244 reg1->type != PTR_TO_MAP_VALUE &&
245 reg1->type != PTR_TO_PACKET) {
246 pr_vlog(env, "event_output: unsupported ptr type: %d\n",
247 reg1->type);
248 return -EOPNOTSUPP;
249 }
250
251 if (reg1->type == PTR_TO_STACK &&
252 !nfp_bpf_stack_arg_ok("event_output", env, reg1, NULL))
253 return -EOPNOTSUPP;
254
255 /* Warn user that on offload NFP may return success even if map
256 * is not going to accept the event, since the event output is
257 * fully async and device won't know the state of the map.
258 * There is also FW limitation on the event length.
259 *
260 * Lost events will not show up on the perf ring, driver
261 * won't see them at all. Events may also get reordered.
262 */
263 dev_warn_once(&nfp_prog->bpf->app->pf->pdev->dev,
264 "bpf: note: return codes and behavior of bpf_event_output() helper differs for offloaded programs!\n");
265 pr_vlog(env, "warning: return codes and behavior of event_output helper differ for offload!\n");
266
267 if (!meta->func_id)
268 break;
269
270 if (reg1->type != meta->arg1.type) {
271 pr_vlog(env, "event_output: ptr type changed: %d %d\n",
272 meta->arg1.type, reg1->type);
273 return -EINVAL;
274 }
275 break;
276
219 default: 277 default:
220 pr_vlog(env, "unsupported function id: %d\n", func_id); 278 pr_vlog(env, "unsupported function id: %d\n", func_id);
221 return -EOPNOTSUPP; 279 return -EOPNOTSUPP;
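
The check added above requires the helper's flags argument to be exactly BPF_F_CURRENT_CPU, since the NFP cannot target an arbitrary CPU index for the event. On the program side that is simply the usual call form; a minimal hedged example (the "events" map and "sample" struct are assumptions, not from this patch):

    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                          &sample, sizeof(sample));
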
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c
index 6aedef0ad433..0e0253c7e17b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc61ed99..321969da67b7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -110,6 +110,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
110 return container_of(map, struct bpf_offloaded_map, map); 110 return container_of(map, struct bpf_offloaded_map, map);
111} 111}
112 112
113static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
114{
115 return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY;
116}
117
113static inline bool bpf_map_support_seq_show(const struct bpf_map *map) 118static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
114{ 119{
115 return map->ops->map_seq_show_elem && map->ops->map_check_btf; 120 return map->ops->map_seq_show_elem && map->ops->map_check_btf;
@@ -235,6 +240,8 @@ struct bpf_verifier_ops {
235 struct bpf_insn_access_aux *info); 240 struct bpf_insn_access_aux *info);
236 int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, 241 int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
237 const struct bpf_prog *prog); 242 const struct bpf_prog *prog);
243 int (*gen_ld_abs)(const struct bpf_insn *orig,
244 struct bpf_insn *insn_buf);
238 u32 (*convert_ctx_access)(enum bpf_access_type type, 245 u32 (*convert_ctx_access)(enum bpf_access_type type,
239 const struct bpf_insn *src, 246 const struct bpf_insn *src,
240 struct bpf_insn *dst, 247 struct bpf_insn *dst,
@@ -676,6 +683,31 @@ static inline int sock_map_prog(struct bpf_map *map,
676} 683}
677#endif 684#endif
678 685
686#if defined(CONFIG_XDP_SOCKETS)
687struct xdp_sock;
688struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
689int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
690 struct xdp_sock *xs);
691void __xsk_map_flush(struct bpf_map *map);
692#else
693struct xdp_sock;
694static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
695 u32 key)
696{
697 return NULL;
698}
699
700static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
701 struct xdp_sock *xs)
702{
703 return -EOPNOTSUPP;
704}
705
706static inline void __xsk_map_flush(struct bpf_map *map)
707{
708}
709#endif
710
679/* verifier prototypes for helper functions called from eBPF programs */ 711/* verifier prototypes for helper functions called from eBPF programs */
680extern const struct bpf_func_proto bpf_map_lookup_elem_proto; 712extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
681extern const struct bpf_func_proto bpf_map_update_elem_proto; 713extern const struct bpf_func_proto bpf_map_update_elem_proto;
@@ -689,9 +721,8 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
689extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; 721extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
690extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; 722extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
691extern const struct bpf_func_proto bpf_get_current_comm_proto; 723extern const struct bpf_func_proto bpf_get_current_comm_proto;
692extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
693extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
694extern const struct bpf_func_proto bpf_get_stackid_proto; 724extern const struct bpf_func_proto bpf_get_stackid_proto;
725extern const struct bpf_func_proto bpf_get_stack_proto;
695extern const struct bpf_func_proto bpf_sock_map_update_proto; 726extern const struct bpf_func_proto bpf_sock_map_update_proto;
696 727
697/* Shared helpers among cBPF and eBPF. */ 728/* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index e6fe98ae3794..ddf896abcfb6 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -2,7 +2,6 @@
2#ifndef __LINUX_BPF_TRACE_H__ 2#ifndef __LINUX_BPF_TRACE_H__
3#define __LINUX_BPF_TRACE_H__ 3#define __LINUX_BPF_TRACE_H__
4 4
5#include <trace/events/bpf.h>
6#include <trace/events/xdp.h> 5#include <trace/events/xdp.h>
7 6
8#endif /* __LINUX_BPF_TRACE_H__ */ 7#endif /* __LINUX_BPF_TRACE_H__ */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b28fcf6f6ae..d7df1b323082 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
49BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) 49BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
50#endif 50#endif
51BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) 51BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
52#if defined(CONFIG_XDP_SOCKETS)
53BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
54#endif
52#endif 55#endif
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c395fddf..8f70dc181e23 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -173,6 +173,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
173 173
174#define BPF_MAX_SUBPROGS 256 174#define BPF_MAX_SUBPROGS 256
175 175
176struct bpf_subprog_info {
177 u32 start; /* insn idx of function entry point */
178 u16 stack_depth; /* max. stack depth used by this function */
179};
180
176/* single container for all structs 181/* single container for all structs
177 * one verifier_env per bpf_check() call 182 * one verifier_env per bpf_check() call
178 */ 183 */
@@ -191,9 +196,7 @@ struct bpf_verifier_env {
191 bool seen_direct_write; 196 bool seen_direct_write;
192 struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ 197 struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
193 struct bpf_verifier_log log; 198 struct bpf_verifier_log log;
194 u32 subprog_starts[BPF_MAX_SUBPROGS]; 199 struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
195 /* computes the stack depth of each bpf function */
196 u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
197 u32 subprog_cnt; 200 u32 subprog_cnt;
198}; 201};
199 202
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b2308174..da7e16523128 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -47,7 +47,9 @@ struct xdp_buff;
47/* Additional register mappings for converted user programs. */ 47/* Additional register mappings for converted user programs. */
48#define BPF_REG_A BPF_REG_0 48#define BPF_REG_A BPF_REG_0
49#define BPF_REG_X BPF_REG_7 49#define BPF_REG_X BPF_REG_7
50#define BPF_REG_TMP BPF_REG_8 50#define BPF_REG_TMP BPF_REG_2 /* scratch reg */
51#define BPF_REG_D BPF_REG_8 /* data, callee-saved */
52#define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */
51 53
52/* Kernel hidden auxiliary/helper register for hardening step. 54/* Kernel hidden auxiliary/helper register for hardening step.
53 * Only used by eBPF JITs. It's nothing more than a temporary 55 * Only used by eBPF JITs. It's nothing more than a temporary
@@ -468,7 +470,8 @@ struct bpf_prog {
468 dst_needed:1, /* Do we need dst entry? */ 470 dst_needed:1, /* Do we need dst entry? */
469 blinded:1, /* Was blinded */ 471 blinded:1, /* Was blinded */
470 is_func:1, /* program is a bpf function */ 472 is_func:1, /* program is a bpf function */
471 kprobe_override:1; /* Do we override a kprobe? */ 473 kprobe_override:1, /* Do we override a kprobe? */
474 has_callchain_buf:1; /* callchain buffer allocated? */
472 enum bpf_prog_type type; /* Type of BPF program */ 475 enum bpf_prog_type type; /* Type of BPF program */
473 enum bpf_attach_type expected_attach_type; /* For some prog types */ 476 enum bpf_attach_type expected_attach_type; /* For some prog types */
474 u32 len; /* Number of filter blocks */ 477 u32 len; /* Number of filter blocks */
@@ -759,7 +762,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
759 * This does not appear to be a real limitation for existing software. 762 * This does not appear to be a real limitation for existing software.
760 */ 763 */
761int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 764int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
762 struct bpf_prog *prog); 765 struct xdp_buff *xdp, struct bpf_prog *prog);
763int xdp_do_redirect(struct net_device *dev, 766int xdp_do_redirect(struct net_device *dev,
764 struct xdp_buff *xdp, 767 struct xdp_buff *xdp,
765 struct bpf_prog *prog); 768 struct bpf_prog *prog);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46dcb5f7522f..03ed492c4e14 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2510,6 +2510,7 @@ void dev_disable_lro(struct net_device *dev);
2510int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); 2510int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
2511int dev_queue_xmit(struct sk_buff *skb); 2511int dev_queue_xmit(struct sk_buff *skb);
2512int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); 2512int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
2513int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
2513int register_netdevice(struct net_device *dev); 2514int register_netdevice(struct net_device *dev);
2514void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); 2515void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
2515void unregister_netdevice_many(struct list_head *head); 2516void unregister_netdevice_many(struct list_head *head);
diff --git a/include/linux/socket.h b/include/linux/socket.h
index ea50f4a65816..7ed4713d5337 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -207,8 +207,9 @@ struct ucred {
207 * PF_SMC protocol family that 207 * PF_SMC protocol family that
208 * reuses AF_INET address family 208 * reuses AF_INET address family
209 */ 209 */
210#define AF_XDP 44 /* XDP sockets */
210 211
211#define AF_MAX 44 /* For now.. */ 212#define AF_MAX 45 /* For now.. */
212 213
213/* Protocol families, same as address families. */ 214/* Protocol families, same as address families. */
214#define PF_UNSPEC AF_UNSPEC 215#define PF_UNSPEC AF_UNSPEC
@@ -257,6 +258,7 @@ struct ucred {
257#define PF_KCM AF_KCM 258#define PF_KCM AF_KCM
258#define PF_QIPCRTR AF_QIPCRTR 259#define PF_QIPCRTR AF_QIPCRTR
259#define PF_SMC AF_SMC 260#define PF_SMC AF_SMC
261#define PF_XDP AF_XDP
260#define PF_MAX AF_MAX 262#define PF_MAX AF_MAX
261 263
262/* Maximum queue length specifiable by listen. */ 264/* Maximum queue length specifiable by listen. */
@@ -338,6 +340,7 @@ struct ucred {
338#define SOL_NFC 280 340#define SOL_NFC 280
339#define SOL_KCM 281 341#define SOL_KCM 281
340#define SOL_TLS 282 342#define SOL_TLS 282
343#define SOL_XDP 283
341 344
342/* IPX options */ 345/* IPX options */
343#define IPX_TYPE 1 346#define IPX_TYPE 1
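
With the AF_XDP/PF_XDP family and the SOL_XDP option level defined, user space opens the new socket type like any other raw socket. A minimal hedged userspace sketch (values mirror the definitions above):

    #include <sys/socket.h>

    #ifndef AF_XDP
    #define AF_XDP 44               /* as added above */
    #endif

    int xsk_fd = socket(AF_XDP, SOCK_RAW, 0);
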
diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 0d2d3da46139..c7dc2b5902c0 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max);
23/* Arithmetic and logical ops */ 23/* Arithmetic and logical ops */
24/* Shift a tnum left (by a fixed shift) */ 24/* Shift a tnum left (by a fixed shift) */
25struct tnum tnum_lshift(struct tnum a, u8 shift); 25struct tnum tnum_lshift(struct tnum a, u8 shift);
26/* Shift a tnum right (by a fixed shift) */ 26/* Shift (rsh) a tnum right (by a fixed shift) */
27struct tnum tnum_rshift(struct tnum a, u8 shift); 27struct tnum tnum_rshift(struct tnum a, u8 shift);
28/* Shift (arsh) a tnum right (by a fixed min_shift) */
29struct tnum tnum_arshift(struct tnum a, u8 min_shift);
28/* Add two tnums, return @a + @b */ 30/* Add two tnums, return @a + @b */
29struct tnum tnum_add(struct tnum a, struct tnum b); 31struct tnum tnum_add(struct tnum a, struct tnum b);
30/* Subtract two tnums, return @a - @b */ 32/* Subtract two tnums, return @a - @b */
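
tnum_arshift() is the arithmetic counterpart of tnum_rshift(): vacated bit positions are filled with copies of the sign bit rather than zeros, matching the BPF_ARSH operation the verifier has to model. A hedged illustration on plain integers (not tnum internals):

    u32 x = 0x80000004;
    u32 logical    = x >> 2;              /* 0x20000001: zeros shifted in */
    u32 arithmetic = (u32)((s32)x >> 2);  /* 0xE0000001: sign bit copied  */
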
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 137ad5f9f40f..0b689cf561c7 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
104} 104}
105 105
106void xdp_return_frame(struct xdp_frame *xdpf); 106void xdp_return_frame(struct xdp_frame *xdpf);
107void xdp_return_buff(struct xdp_buff *xdp);
107 108
108int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, 109int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
109 struct net_device *dev, u32 queue_index); 110 struct net_device *dev, u32 queue_index);
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
new file mode 100644
index 000000000000..185f4928fbda
--- /dev/null
+++ b/include/net/xdp_sock.h
@@ -0,0 +1,66 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * AF_XDP internal functions
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef _LINUX_XDP_SOCK_H
16#define _LINUX_XDP_SOCK_H
17
18#include <linux/mutex.h>
19#include <net/sock.h>
20
21struct net_device;
22struct xsk_queue;
23struct xdp_umem;
24
25struct xdp_sock {
26 /* struct sock must be the first member of struct xdp_sock */
27 struct sock sk;
28 struct xsk_queue *rx;
29 struct net_device *dev;
30 struct xdp_umem *umem;
31 struct list_head flush_node;
32 u16 queue_id;
33 struct xsk_queue *tx ____cacheline_aligned_in_smp;
34 /* Protects multiple processes in the control path */
35 struct mutex mutex;
36 u64 rx_dropped;
37};
38
39struct xdp_buff;
40#ifdef CONFIG_XDP_SOCKETS
41int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
42int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
43void xsk_flush(struct xdp_sock *xs);
44bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
45#else
46static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
47{
48 return -ENOTSUPP;
49}
50
51static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
52{
53 return -ENOTSUPP;
54}
55
56static inline void xsk_flush(struct xdp_sock *xs)
57{
58}
59
60static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
61{
62 return false;
63}
64#endif /* CONFIG_XDP_SOCKETS */
65
66#endif /* _LINUX_XDP_SOCK_H */
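
The receive-side hooks declared above are meant for the XDP redirect path: xsk_rcv()/xsk_generic_rcv() queue a frame into a socket's RX ring and xsk_flush() publishes the new producer position to user space. A hedged sketch of a caller (illustrative only, not from this patch):

    static int deliver_to_xsk(struct xdp_sock *xs, struct xdp_buff *xdp)
    {
            int err = xsk_rcv(xs, xdp);     /* copy frame into the RX ring */

            if (!err)
                    xsk_flush(xs);          /* make it visible to user space */
            return err;
    }
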
diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
deleted file mode 100644
index 150185647e6b..000000000000
--- a/include/trace/events/bpf.h
+++ /dev/null
@@ -1,355 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#undef TRACE_SYSTEM
3#define TRACE_SYSTEM bpf
4
5#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ)
6#define _TRACE_BPF_H
7
8/* These are only used within the BPF_SYSCALL code */
9#ifdef CONFIG_BPF_SYSCALL
10
11#include <linux/filter.h>
12#include <linux/bpf.h>
13#include <linux/fs.h>
14#include <linux/tracepoint.h>
15
16#define __PROG_TYPE_MAP(FN) \
17 FN(SOCKET_FILTER) \
18 FN(KPROBE) \
19 FN(SCHED_CLS) \
20 FN(SCHED_ACT) \
21 FN(TRACEPOINT) \
22 FN(XDP) \
23 FN(PERF_EVENT) \
24 FN(CGROUP_SKB) \
25 FN(CGROUP_SOCK) \
26 FN(LWT_IN) \
27 FN(LWT_OUT) \
28 FN(LWT_XMIT)
29
30#define __MAP_TYPE_MAP(FN) \
31 FN(HASH) \
32 FN(ARRAY) \
33 FN(PROG_ARRAY) \
34 FN(PERF_EVENT_ARRAY) \
35 FN(PERCPU_HASH) \
36 FN(PERCPU_ARRAY) \
37 FN(STACK_TRACE) \
38 FN(CGROUP_ARRAY) \
39 FN(LRU_HASH) \
40 FN(LRU_PERCPU_HASH) \
41 FN(LPM_TRIE)
42
43#define __PROG_TYPE_TP_FN(x) \
44 TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x);
45#define __PROG_TYPE_SYM_FN(x) \
46 { BPF_PROG_TYPE_##x, #x },
47#define __PROG_TYPE_SYM_TAB \
48 __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 }
49__PROG_TYPE_MAP(__PROG_TYPE_TP_FN)
50
51#define __MAP_TYPE_TP_FN(x) \
52 TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x);
53#define __MAP_TYPE_SYM_FN(x) \
54 { BPF_MAP_TYPE_##x, #x },
55#define __MAP_TYPE_SYM_TAB \
56 __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 }
57__MAP_TYPE_MAP(__MAP_TYPE_TP_FN)
58
59DECLARE_EVENT_CLASS(bpf_prog_event,
60
61 TP_PROTO(const struct bpf_prog *prg),
62
63 TP_ARGS(prg),
64
65 TP_STRUCT__entry(
66 __array(u8, prog_tag, 8)
67 __field(u32, type)
68 ),
69
70 TP_fast_assign(
71 BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
72 memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
73 __entry->type = prg->type;
74 ),
75
76 TP_printk("prog=%s type=%s",
77 __print_hex_str(__entry->prog_tag, 8),
78 __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB))
79);
80
81DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type,
82
83 TP_PROTO(const struct bpf_prog *prg),
84
85 TP_ARGS(prg)
86);
87
88DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu,
89
90 TP_PROTO(const struct bpf_prog *prg),
91
92 TP_ARGS(prg)
93);
94
95TRACE_EVENT(bpf_prog_load,
96
97 TP_PROTO(const struct bpf_prog *prg, int ufd),
98
99 TP_ARGS(prg, ufd),
100
101 TP_STRUCT__entry(
102 __array(u8, prog_tag, 8)
103 __field(u32, type)
104 __field(int, ufd)
105 ),
106
107 TP_fast_assign(
108 BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
109 memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
110 __entry->type = prg->type;
111 __entry->ufd = ufd;
112 ),
113
114 TP_printk("prog=%s type=%s ufd=%d",
115 __print_hex_str(__entry->prog_tag, 8),
116 __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB),
117 __entry->ufd)
118);
119
120TRACE_EVENT(bpf_map_create,
121
122 TP_PROTO(const struct bpf_map *map, int ufd),
123
124 TP_ARGS(map, ufd),
125
126 TP_STRUCT__entry(
127 __field(u32, type)
128 __field(u32, size_key)
129 __field(u32, size_value)
130 __field(u32, max_entries)
131 __field(u32, flags)
132 __field(int, ufd)
133 ),
134
135 TP_fast_assign(
136 __entry->type = map->map_type;
137 __entry->size_key = map->key_size;
138 __entry->size_value = map->value_size;
139 __entry->max_entries = map->max_entries;
140 __entry->flags = map->map_flags;
141 __entry->ufd = ufd;
142 ),
143
144 TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x",
145 __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
146 __entry->ufd, __entry->size_key, __entry->size_value,
147 __entry->max_entries, __entry->flags)
148);
149
150DECLARE_EVENT_CLASS(bpf_obj_prog,
151
152 TP_PROTO(const struct bpf_prog *prg, int ufd,
153 const struct filename *pname),
154
155 TP_ARGS(prg, ufd, pname),
156
157 TP_STRUCT__entry(
158 __array(u8, prog_tag, 8)
159 __field(int, ufd)
160 __string(path, pname->name)
161 ),
162
163 TP_fast_assign(
164 BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
165 memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
166 __assign_str(path, pname->name);
167 __entry->ufd = ufd;
168 ),
169
170 TP_printk("prog=%s path=%s ufd=%d",
171 __print_hex_str(__entry->prog_tag, 8),
172 __get_str(path), __entry->ufd)
173);
174
175DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog,
176
177 TP_PROTO(const struct bpf_prog *prg, int ufd,
178 const struct filename *pname),
179
180 TP_ARGS(prg, ufd, pname)
181);
182
183DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog,
184
185 TP_PROTO(const struct bpf_prog *prg, int ufd,
186 const struct filename *pname),
187
188 TP_ARGS(prg, ufd, pname)
189);
190
191DECLARE_EVENT_CLASS(bpf_obj_map,
192
193 TP_PROTO(const struct bpf_map *map, int ufd,
194 const struct filename *pname),
195
196 TP_ARGS(map, ufd, pname),
197
198 TP_STRUCT__entry(
199 __field(u32, type)
200 __field(int, ufd)
201 __string(path, pname->name)
202 ),
203
204 TP_fast_assign(
205 __assign_str(path, pname->name);
206 __entry->type = map->map_type;
207 __entry->ufd = ufd;
208 ),
209
210 TP_printk("map type=%s ufd=%d path=%s",
211 __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
212 __entry->ufd, __get_str(path))
213);
214
215DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map,
216
217 TP_PROTO(const struct bpf_map *map, int ufd,
218 const struct filename *pname),
219
220 TP_ARGS(map, ufd, pname)
221);
222
223DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map,
224
225 TP_PROTO(const struct bpf_map *map, int ufd,
226 const struct filename *pname),
227
228 TP_ARGS(map, ufd, pname)
229);
230
231DECLARE_EVENT_CLASS(bpf_map_keyval,
232
233 TP_PROTO(const struct bpf_map *map, int ufd,
234 const void *key, const void *val),
235
236 TP_ARGS(map, ufd, key, val),
237
238 TP_STRUCT__entry(
239 __field(u32, type)
240 __field(u32, key_len)
241 __dynamic_array(u8, key, map->key_size)
242 __field(bool, key_trunc)
243 __field(u32, val_len)
244 __dynamic_array(u8, val, map->value_size)
245 __field(bool, val_trunc)
246 __field(int, ufd)
247 ),
248
249 TP_fast_assign(
250 memcpy(__get_dynamic_array(key), key, map->key_size);
251 memcpy(__get_dynamic_array(val), val, map->value_size);
252 __entry->type = map->map_type;
253 __entry->key_len = min(map->key_size, 16U);
254 __entry->key_trunc = map->key_size != __entry->key_len;
255 __entry->val_len = min(map->value_size, 16U);
256 __entry->val_trunc = map->value_size != __entry->val_len;
257 __entry->ufd = ufd;
258 ),
259
260 TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]",
261 __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
262 __entry->ufd,
263 __print_hex(__get_dynamic_array(key), __entry->key_len),
264 __entry->key_trunc ? " ..." : "",
265 __print_hex(__get_dynamic_array(val), __entry->val_len),
266 __entry->val_trunc ? " ..." : "")
267);
268
269DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem,
270
271 TP_PROTO(const struct bpf_map *map, int ufd,
272 const void *key, const void *val),
273
274 TP_ARGS(map, ufd, key, val)
275);
276
277DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem,
278
279 TP_PROTO(const struct bpf_map *map, int ufd,
280 const void *key, const void *val),
281
282 TP_ARGS(map, ufd, key, val)
283);
284
285TRACE_EVENT(bpf_map_delete_elem,
286
287 TP_PROTO(const struct bpf_map *map, int ufd,
288 const void *key),
289
290 TP_ARGS(map, ufd, key),
291
292 TP_STRUCT__entry(
293 __field(u32, type)
294 __field(u32, key_len)
295 __dynamic_array(u8, key, map->key_size)
296 __field(bool, key_trunc)
297 __field(int, ufd)
298 ),
299
300 TP_fast_assign(
301 memcpy(__get_dynamic_array(key), key, map->key_size);
302 __entry->type = map->map_type;
303 __entry->key_len = min(map->key_size, 16U);
304 __entry->key_trunc = map->key_size != __entry->key_len;
305 __entry->ufd = ufd;
306 ),
307
308 TP_printk("map type=%s ufd=%d key=[%s%s]",
309 __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
310 __entry->ufd,
311 __print_hex(__get_dynamic_array(key), __entry->key_len),
312 __entry->key_trunc ? " ..." : "")
313);
314
315TRACE_EVENT(bpf_map_next_key,
316
317 TP_PROTO(const struct bpf_map *map, int ufd,
318 const void *key, const void *key_next),
319
320 TP_ARGS(map, ufd, key, key_next),
321
322 TP_STRUCT__entry(
323 __field(u32, type)
324 __field(u32, key_len)
325 __dynamic_array(u8, key, map->key_size)
326 __dynamic_array(u8, nxt, map->key_size)
327 __field(bool, key_trunc)
328 __field(bool, key_null)
329 __field(int, ufd)
330 ),
331
332 TP_fast_assign(
333 if (key)
334 memcpy(__get_dynamic_array(key), key, map->key_size);
335 __entry->key_null = !key;
336 memcpy(__get_dynamic_array(nxt), key_next, map->key_size);
337 __entry->type = map->map_type;
338 __entry->key_len = min(map->key_size, 16U);
339 __entry->key_trunc = map->key_size != __entry->key_len;
340 __entry->ufd = ufd;
341 ),
342
343 TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]",
344 __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
345 __entry->ufd,
346 __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key),
347 __entry->key_len),
348 __entry->key_trunc && !__entry->key_null ? " ..." : "",
349 __print_hex(__get_dynamic_array(nxt), __entry->key_len),
350 __entry->key_trunc ? " ..." : "")
351);
352#endif /* CONFIG_BPF_SYSCALL */
353#endif /* _TRACE_BPF_H */
354
355#include <trace/define_trace.h>
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a9388947..93d5a4eeec2a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
116 BPF_MAP_TYPE_DEVMAP, 116 BPF_MAP_TYPE_DEVMAP,
117 BPF_MAP_TYPE_SOCKMAP, 117 BPF_MAP_TYPE_SOCKMAP,
118 BPF_MAP_TYPE_CPUMAP, 118 BPF_MAP_TYPE_CPUMAP,
119 BPF_MAP_TYPE_XSKMAP,
119}; 120};
120 121
121enum bpf_prog_type { 122enum bpf_prog_type {
@@ -828,12 +829,12 @@ union bpf_attr {
828 * 829 *
829 * Also, be aware that the newer helper 830 * Also, be aware that the newer helper
830 * **bpf_perf_event_read_value**\ () is recommended over 831 * **bpf_perf_event_read_value**\ () is recommended over
831 * **bpf_perf_event_read*\ () in general. The latter has some ABI 832 * **bpf_perf_event_read**\ () in general. The latter has some ABI
832 * quirks where error and counter value are used as a return code 833 * quirks where error and counter value are used as a return code
833 * (which is wrong to do since ranges may overlap). This issue is 834 * (which is wrong to do since ranges may overlap). This issue is
834 * fixed with bpf_perf_event_read_value(), which at the same time 835 * fixed with **bpf_perf_event_read_value**\ (), which at the same
835 * provides more features over the **bpf_perf_event_read**\ () 836 * time provides more features over the **bpf_perf_event_read**\
836 * interface. Please refer to the description of 837 * () interface. Please refer to the description of
837 * **bpf_perf_event_read_value**\ () for details. 838 * **bpf_perf_event_read_value**\ () for details.
838 * Return 839 * Return
839 * The value of the perf event counter read from the map, or a 840 * The value of the perf event counter read from the map, or a
@@ -1361,7 +1362,7 @@ union bpf_attr {
1361 * Return 1362 * Return
1362 * 0 1363 * 0
1363 * 1364 *
1364 * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) 1365 * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
1365 * Description 1366 * Description
1366 * Emulate a call to **setsockopt()** on the socket associated to 1367 * Emulate a call to **setsockopt()** on the socket associated to
1367 * *bpf_socket*, which must be a full socket. The *level* at 1368 * *bpf_socket*, which must be a full socket. The *level* at
@@ -1435,7 +1436,7 @@ union bpf_attr {
1435 * Return 1436 * Return
1436 * **SK_PASS** on success, or **SK_DROP** on error. 1437 * **SK_PASS** on success, or **SK_DROP** on error.
1437 * 1438 *
1438 * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) 1439 * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
1439 * Description 1440 * Description
1440 * Add an entry to, or update a *map* referencing sockets. The 1441 * Add an entry to, or update a *map* referencing sockets. The
1441 * *skops* is used as a new value for the entry associated to 1442 * *skops* is used as a new value for the entry associated to
@@ -1533,7 +1534,7 @@ union bpf_attr {
1533 * Return 1534 * Return
1534 * 0 on success, or a negative error in case of failure. 1535 * 0 on success, or a negative error in case of failure.
1535 * 1536 *
1536 * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) 1537 * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
1537 * Description 1538 * Description
 1538 * For an eBPF program attached to a perf event, retrieve the 1539 * For an eBPF program attached to a perf event, retrieve the
1539 * value of the event counter associated to *ctx* and store it in 1540 * value of the event counter associated to *ctx* and store it in
@@ -1544,7 +1545,7 @@ union bpf_attr {
1544 * Return 1545 * Return
1545 * 0 on success, or a negative error in case of failure. 1546 * 0 on success, or a negative error in case of failure.
1546 * 1547 *
1547 * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) 1548 * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
1548 * Description 1549 * Description
1549 * Emulate a call to **getsockopt()** on the socket associated to 1550 * Emulate a call to **getsockopt()** on the socket associated to
1550 * *bpf_socket*, which must be a full socket. The *level* at 1551 * *bpf_socket*, which must be a full socket. The *level* at
@@ -1588,7 +1589,7 @@ union bpf_attr {
1588 * Return 1589 * Return
1589 * 0 1590 * 0
1590 * 1591 *
1591 * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) 1592 * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
1592 * Description 1593 * Description
1593 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field 1594 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field
1594 * for the full TCP socket associated to *bpf_sock_ops* to 1595 * for the full TCP socket associated to *bpf_sock_ops* to
@@ -1721,7 +1722,7 @@ union bpf_attr {
1721 * Return 1722 * Return
1722 * 0 on success, or a negative error in case of failure. 1723 * 0 on success, or a negative error in case of failure.
1723 * 1724 *
1724 * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) 1725 * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
1725 * Description 1726 * Description
1726 * Bind the socket associated to *ctx* to the address pointed by 1727 * Bind the socket associated to *ctx* to the address pointed by
1727 * *addr*, of length *addr_len*. This allows for making outgoing 1728 * *addr*, of length *addr_len*. This allows for making outgoing
@@ -1767,6 +1768,64 @@ union bpf_attr {
1767 * **CONFIG_XFRM** configuration option. 1768 * **CONFIG_XFRM** configuration option.
1768 * Return 1769 * Return
1769 * 0 on success, or a negative error in case of failure. 1770 * 0 on success, or a negative error in case of failure.
1771 *
1772 * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
1773 * Description
 1774 * Return a user or a kernel stack in the buffer provided by the
 1775 * BPF program. To achieve this, the helper needs *regs*, a
 1776 * pointer to the context on which the tracing program is executed.
 1777 * To store the stacktrace, the BPF program provides *buf* with
 1778 * a nonnegative *size*.
1779 *
1780 * The last argument, *flags*, holds the number of stack frames to
1781 * skip (from 0 to 255), masked with
1782 * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
1783 * the following flags:
1784 *
1785 * **BPF_F_USER_STACK**
1786 * Collect a user space stack instead of a kernel stack.
1787 * **BPF_F_USER_BUILD_ID**
1788 * Collect buildid+offset instead of ips for user stack,
1789 * only valid if **BPF_F_USER_STACK** is also specified.
1790 *
1791 * **bpf_get_stack**\ () can collect up to
 1792 * **PERF_MAX_STACK_DEPTH** kernel and user frames, provided the
 1793 * buffer is sufficiently large. Note that
1794 * this limit can be controlled with the **sysctl** program, and
1795 * that it should be manually increased in order to profile long
1796 * user stacks (such as stacks for Java programs). To do so, use:
1797 *
1798 * ::
1799 *
1800 * # sysctl kernel.perf_event_max_stack=<new value>
1801 *
1802 * Return
 1803 * A non-negative value equal to or less than *size* on success,
 1804 * or a negative error in case of failure.
1805 *
1806 * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
1807 * Description
1808 * This helper is similar to **bpf_skb_load_bytes**\ () in that
1809 * it provides an easy way to load *len* bytes from *offset*
 1810 * from the packet associated to *skb*, into the buffer pointed to
1811 * by *to*. The difference to **bpf_skb_load_bytes**\ () is that
1812 * a fifth argument *start_header* exists in order to select a
1813 * base offset to start from. *start_header* can be one of:
1814 *
1815 * **BPF_HDR_START_MAC**
1816 * Base offset to load data from is *skb*'s mac header.
1817 * **BPF_HDR_START_NET**
1818 * Base offset to load data from is *skb*'s network header.
1819 *
1820 * In general, "direct packet access" is the preferred method to
 1821 * access packet data; however, this helper is particularly useful
1822 * in socket filters where *skb*\ **->data** does not always point
1823 * to the start of the mac header and where "direct packet access"
1824 * is not available.
1825 *
1826 * Return
1827 * 0 on success, or a negative error in case of failure.
1828 *
1770 */ 1829 */
1771#define __BPF_FUNC_MAPPER(FN) \ 1830#define __BPF_FUNC_MAPPER(FN) \
1772 FN(unspec), \ 1831 FN(unspec), \
@@ -1835,7 +1894,9 @@ union bpf_attr {
1835 FN(msg_pull_data), \ 1894 FN(msg_pull_data), \
1836 FN(bind), \ 1895 FN(bind), \
1837 FN(xdp_adjust_tail), \ 1896 FN(xdp_adjust_tail), \
1838 FN(skb_get_xfrm_state), 1897 FN(skb_get_xfrm_state), \
1898 FN(get_stack), \
1899 FN(skb_load_bytes_relative),
1839 1900
1840/* integer value in 'imm' field of BPF_CALL instruction selects which helper 1901/* integer value in 'imm' field of BPF_CALL instruction selects which helper
1841 * function eBPF program intends to call 1902 * function eBPF program intends to call
@@ -1869,11 +1930,14 @@ enum bpf_func_id {
1869/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ 1930/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
1870#define BPF_F_TUNINFO_IPV6 (1ULL << 0) 1931#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
1871 1932
1872/* BPF_FUNC_get_stackid flags. */ 1933/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
1873#define BPF_F_SKIP_FIELD_MASK 0xffULL 1934#define BPF_F_SKIP_FIELD_MASK 0xffULL
1874#define BPF_F_USER_STACK (1ULL << 8) 1935#define BPF_F_USER_STACK (1ULL << 8)
1936/* flags used by BPF_FUNC_get_stackid only. */
1875#define BPF_F_FAST_STACK_CMP (1ULL << 9) 1937#define BPF_F_FAST_STACK_CMP (1ULL << 9)
1876#define BPF_F_REUSE_STACKID (1ULL << 10) 1938#define BPF_F_REUSE_STACKID (1ULL << 10)
1939/* flags used by BPF_FUNC_get_stack only. */
1940#define BPF_F_USER_BUILD_ID (1ULL << 11)
1877 1941
1878/* BPF_FUNC_skb_set_tunnel_key flags. */ 1942/* BPF_FUNC_skb_set_tunnel_key flags. */
1879#define BPF_F_ZERO_CSUM_TX (1ULL << 1) 1943#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
@@ -1893,6 +1957,12 @@ enum bpf_adj_room_mode {
1893 BPF_ADJ_ROOM_NET, 1957 BPF_ADJ_ROOM_NET,
1894}; 1958};
1895 1959
1960/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
1961enum bpf_hdr_start_off {
1962 BPF_HDR_START_MAC,
1963 BPF_HDR_START_NET,
1964};
1965
1896/* user accessible mirror of in-kernel sk_buff. 1966/* user accessible mirror of in-kernel sk_buff.
1897 * new fields can only be added to the end of this structure 1967 * new fields can only be added to the end of this structure
1898 */ 1968 */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
new file mode 100644
index 000000000000..77b88c4efe98
--- /dev/null
+++ b/include/uapi/linux/if_xdp.h
@@ -0,0 +1,87 @@
1/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
2 *
3 * if_xdp: XDP socket user-space interface
4 * Copyright(c) 2018 Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * Author(s): Björn Töpel <bjorn.topel@intel.com>
16 * Magnus Karlsson <magnus.karlsson@intel.com>
17 */
18
19#ifndef _LINUX_IF_XDP_H
20#define _LINUX_IF_XDP_H
21
22#include <linux/types.h>
23
24/* Options for the sxdp_flags field */
25#define XDP_SHARED_UMEM 1
26
27struct sockaddr_xdp {
28 __u16 sxdp_family;
29 __u32 sxdp_ifindex;
30 __u32 sxdp_queue_id;
31 __u32 sxdp_shared_umem_fd;
32 __u16 sxdp_flags;
33};
34
35/* XDP socket options */
36#define XDP_RX_RING 1
37#define XDP_TX_RING 2
38#define XDP_UMEM_REG 3
39#define XDP_UMEM_FILL_RING 4
40#define XDP_UMEM_COMPLETION_RING 5
41#define XDP_STATISTICS 6
42
43struct xdp_umem_reg {
44 __u64 addr; /* Start of packet data area */
45 __u64 len; /* Length of packet data area */
46 __u32 frame_size; /* Frame size */
47 __u32 frame_headroom; /* Frame head room */
48};
49
50struct xdp_statistics {
51 __u64 rx_dropped; /* Dropped for reasons other than invalid desc */
52 __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
53 __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
54};
55
56/* Pgoff for mmaping the rings */
57#define XDP_PGOFF_RX_RING 0
58#define XDP_PGOFF_TX_RING 0x80000000
59#define XDP_UMEM_PGOFF_FILL_RING 0x100000000
60#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000
61
62struct xdp_desc {
63 __u32 idx;
64 __u32 len;
65 __u16 offset;
66 __u8 flags;
67 __u8 padding[5];
68};
69
70struct xdp_ring {
71 __u32 producer __attribute__((aligned(64)));
72 __u32 consumer __attribute__((aligned(64)));
73};
74
75/* Used for the RX and TX queues for packets */
76struct xdp_rxtx_ring {
77 struct xdp_ring ptrs;
78 struct xdp_desc desc[0] __attribute__((aligned(64)));
79};
80
81/* Used for the fill and completion queues for buffers */
82struct xdp_umem_ring {
83 struct xdp_ring ptrs;
84 __u32 desc[0] __attribute__((aligned(64)));
85};
86
87#endif /* _LINUX_IF_XDP_H */
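
Taken together, these definitions describe the whole user-space bring-up of an AF_XDP socket: create the socket, register a UMEM, size the rings via setsockopt(), then bind to one queue of an interface. The sketch below is a hypothetical illustration with all error handling omitted; AF_XDP and SOL_XDP are the address family and socket level added to include/linux/socket.h elsewhere in this series, the frame and ring sizes are arbitrary, and the mmap() of the rings is only indicated in a comment.

	/* Hypothetical AF_XDP bring-up (user space); error checks omitted. */
	#include <sys/socket.h>
	#include <linux/if_xdp.h>

	#ifndef AF_XDP
	#define AF_XDP	44		/* as added in include/linux/socket.h */
	#endif
	#ifndef SOL_XDP
	#define SOL_XDP	283
	#endif

	static int xsk_bringup(void *umem_area, __u64 umem_len,
			       int ifindex, __u32 queue_id)
	{
		struct xdp_umem_reg reg = {
			.addr = (__u64)(unsigned long)umem_area,
			.len = umem_len,
			.frame_size = 2048,		/* arbitrary */
			.frame_headroom = 0,
		};
		struct sockaddr_xdp sxdp = {
			.sxdp_family = AF_XDP,
			.sxdp_ifindex = ifindex,
			.sxdp_queue_id = queue_id,
		};
		int descs = 1024;			/* ring entries, arbitrary */
		int fd = socket(AF_XDP, SOCK_RAW, 0);

		setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &reg, sizeof(reg));
		setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &descs, sizeof(descs));
		setsockopt(fd, SOL_XDP, XDP_RX_RING, &descs, sizeof(descs));
		bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
		/* Next step: mmap() the fill, completion, RX (and optionally TX)
		 * rings at the XDP_PGOFF_* / XDP_UMEM_PGOFF_* offsets above.
		 */
		return fd;
	}
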
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 35c485fa9ea3..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o
8ifeq ($(CONFIG_NET),y) 8ifeq ($(CONFIG_NET),y)
9obj-$(CONFIG_BPF_SYSCALL) += devmap.o 9obj-$(CONFIG_BPF_SYSCALL) += devmap.o
10obj-$(CONFIG_BPF_SYSCALL) += cpumap.o 10obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
11ifeq ($(CONFIG_XDP_SOCKETS),y)
12obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
13endif
11obj-$(CONFIG_BPF_SYSCALL) += offload.o 14obj-$(CONFIG_BPF_SYSCALL) += offload.o
12ifeq ($(CONFIG_STREAM_PARSER),y) 15ifeq ($(CONFIG_STREAM_PARSER),y)
13ifeq ($(CONFIG_INET),y) 16ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..d0d7d9462368 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
31#include <linux/rbtree_latch.h> 31#include <linux/rbtree_latch.h>
32#include <linux/kallsyms.h> 32#include <linux/kallsyms.h>
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/perf_event.h>
34 35
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36 37
@@ -633,23 +634,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
633 *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); 634 *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
634 break; 635 break;
635 636
636 case BPF_LD | BPF_ABS | BPF_W:
637 case BPF_LD | BPF_ABS | BPF_H:
638 case BPF_LD | BPF_ABS | BPF_B:
639 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
640 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
641 *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
642 break;
643
644 case BPF_LD | BPF_IND | BPF_W:
645 case BPF_LD | BPF_IND | BPF_H:
646 case BPF_LD | BPF_IND | BPF_B:
647 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
648 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
649 *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
650 *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
651 break;
652
653 case BPF_LD | BPF_IMM | BPF_DW: 637 case BPF_LD | BPF_IMM | BPF_DW:
654 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); 638 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
655 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); 639 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -890,14 +874,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
890 INSN_3(LDX, MEM, W), \ 874 INSN_3(LDX, MEM, W), \
891 INSN_3(LDX, MEM, DW), \ 875 INSN_3(LDX, MEM, DW), \
892 /* Immediate based. */ \ 876 /* Immediate based. */ \
893 INSN_3(LD, IMM, DW), \ 877 INSN_3(LD, IMM, DW)
894 /* Misc (old cBPF carry-over). */ \
895 INSN_3(LD, ABS, B), \
896 INSN_3(LD, ABS, H), \
897 INSN_3(LD, ABS, W), \
898 INSN_3(LD, IND, B), \
899 INSN_3(LD, IND, H), \
900 INSN_3(LD, IND, W)
901 878
902bool bpf_opcode_in_insntable(u8 code) 879bool bpf_opcode_in_insntable(u8 code)
903{ 880{
@@ -907,6 +884,13 @@ bool bpf_opcode_in_insntable(u8 code)
907 [0 ... 255] = false, 884 [0 ... 255] = false,
908 /* Now overwrite non-defaults ... */ 885 /* Now overwrite non-defaults ... */
909 BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), 886 BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
887 /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
888 [BPF_LD | BPF_ABS | BPF_B] = true,
889 [BPF_LD | BPF_ABS | BPF_H] = true,
890 [BPF_LD | BPF_ABS | BPF_W] = true,
891 [BPF_LD | BPF_IND | BPF_B] = true,
892 [BPF_LD | BPF_IND | BPF_H] = true,
893 [BPF_LD | BPF_IND | BPF_W] = true,
910 }; 894 };
911#undef BPF_INSN_3_TBL 895#undef BPF_INSN_3_TBL
912#undef BPF_INSN_2_TBL 896#undef BPF_INSN_2_TBL
@@ -937,8 +921,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
937#undef BPF_INSN_3_LBL 921#undef BPF_INSN_3_LBL
938#undef BPF_INSN_2_LBL 922#undef BPF_INSN_2_LBL
939 u32 tail_call_cnt = 0; 923 u32 tail_call_cnt = 0;
940 void *ptr;
941 int off;
942 924
943#define CONT ({ insn++; goto select_insn; }) 925#define CONT ({ insn++; goto select_insn; })
944#define CONT_JMP ({ insn++; goto select_insn; }) 926#define CONT_JMP ({ insn++; goto select_insn; })
@@ -1265,67 +1247,6 @@ out:
1265 atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) 1247 atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
1266 (DST + insn->off)); 1248 (DST + insn->off));
1267 CONT; 1249 CONT;
1268 LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
1269 off = IMM;
1270load_word:
1271 /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
1272 * appearing in the programs where ctx == skb
1273 * (see may_access_skb() in the verifier). All programs
1274 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
1275 * bpf_convert_filter() saves it in BPF_R6, internal BPF
1276 * verifier will check that BPF_R6 == ctx.
1277 *
1278 * BPF_ABS and BPF_IND are wrappers of function calls,
1279 * so they scratch BPF_R1-BPF_R5 registers, preserve
1280 * BPF_R6-BPF_R9, and store return value into BPF_R0.
1281 *
1282 * Implicit input:
1283 * ctx == skb == BPF_R6 == CTX
1284 *
1285 * Explicit input:
1286 * SRC == any register
1287 * IMM == 32-bit immediate
1288 *
1289 * Output:
1290 * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
1291 */
1292
1293 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
1294 if (likely(ptr != NULL)) {
1295 BPF_R0 = get_unaligned_be32(ptr);
1296 CONT;
1297 }
1298
1299 return 0;
1300 LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
1301 off = IMM;
1302load_half:
1303 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
1304 if (likely(ptr != NULL)) {
1305 BPF_R0 = get_unaligned_be16(ptr);
1306 CONT;
1307 }
1308
1309 return 0;
1310 LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
1311 off = IMM;
1312load_byte:
1313 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
1314 if (likely(ptr != NULL)) {
1315 BPF_R0 = *(u8 *)ptr;
1316 CONT;
1317 }
1318
1319 return 0;
1320 LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
1321 off = IMM + SRC;
1322 goto load_word;
1323 LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
1324 off = IMM + SRC;
1325 goto load_half;
1326 LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
1327 off = IMM + SRC;
1328 goto load_byte;
1329 1250
1330 default_label: 1251 default_label:
1331 /* If we ever reach this, we have a bug somewhere. Die hard here 1252 /* If we ever reach this, we have a bug somewhere. Die hard here
@@ -1722,6 +1643,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
1722 aux = container_of(work, struct bpf_prog_aux, work); 1643 aux = container_of(work, struct bpf_prog_aux, work);
1723 if (bpf_prog_is_dev_bound(aux)) 1644 if (bpf_prog_is_dev_bound(aux))
1724 bpf_prog_offload_destroy(aux->prog); 1645 bpf_prog_offload_destroy(aux->prog);
1646#ifdef CONFIG_PERF_EVENTS
1647 if (aux->prog->has_callchain_buf)
1648 put_callchain_buffers();
1649#endif
1725 for (i = 0; i < aux->func_cnt; i++) 1650 for (i = 0; i < aux->func_cnt; i++)
1726 bpf_jit_free(aux->func[i]); 1651 bpf_jit_free(aux->func[i]);
1727 if (aux->func_cnt) { 1652 if (aux->func_cnt) {
@@ -1794,6 +1719,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
1794{ 1719{
1795 return -ENOTSUPP; 1720 return -ENOTSUPP;
1796} 1721}
1722EXPORT_SYMBOL_GPL(bpf_event_output);
1797 1723
1798/* Always built-in helper functions. */ 1724/* Always built-in helper functions. */
1799const struct bpf_func_proto bpf_tail_call_proto = { 1725const struct bpf_func_proto bpf_tail_call_proto = {
@@ -1840,9 +1766,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
1840#include <linux/bpf_trace.h> 1766#include <linux/bpf_trace.h>
1841 1767
1842EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); 1768EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
1843
1844/* These are only used within the BPF_SYSCALL code */
1845#ifdef CONFIG_BPF_SYSCALL
1846EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
1847EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
1848#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index a41343009ccc..ed13645bd80c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -429,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
429 ret = bpf_obj_do_pin(pname, raw, type); 429 ret = bpf_obj_do_pin(pname, raw, type);
430 if (ret != 0) 430 if (ret != 0)
431 bpf_any_put(raw, type); 431 bpf_any_put(raw, type);
432 if ((trace_bpf_obj_pin_prog_enabled() ||
433 trace_bpf_obj_pin_map_enabled()) && !ret) {
434 if (type == BPF_TYPE_PROG)
435 trace_bpf_obj_pin_prog(raw, ufd, pname);
436 if (type == BPF_TYPE_MAP)
437 trace_bpf_obj_pin_map(raw, ufd, pname);
438 }
439out: 432out:
440 putname(pname); 433 putname(pname);
441 return ret; 434 return ret;
@@ -502,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
502 else 495 else
503 goto out; 496 goto out;
504 497
505 if (ret < 0) { 498 if (ret < 0)
506 bpf_any_put(raw, type); 499 bpf_any_put(raw, type);
507 } else if (trace_bpf_obj_get_prog_enabled() ||
508 trace_bpf_obj_get_map_enabled()) {
509 if (type == BPF_TYPE_PROG)
510 trace_bpf_obj_get_prog(raw, ret, pname);
511 if (type == BPF_TYPE_MAP)
512 trace_bpf_obj_get_map(raw, ret, pname);
513 }
514out: 500out:
515 putname(pname); 501 putname(pname);
516 return ret; 502 return ret;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index c9401075b58c..ac747d5cf7c6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is licensed under the GNU General License Version 2, 4 * This software is licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
474 struct bpf_prog_offload *offload; 474 struct bpf_prog_offload *offload;
475 bool ret; 475 bool ret;
476 476
477 if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) 477 if (!bpf_prog_is_dev_bound(prog->aux))
478 return false; 478 return false;
479 if (!bpf_map_is_dev_bound(map))
480 return bpf_map_offload_neutral(map);
479 481
480 down_read(&bpf_devs_lock); 482 down_read(&bpf_devs_lock);
481 offload = prog->aux->offload; 483 offload = prog->aux->offload;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb1234b67..3ba102b41512 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ out:
262 return ret; 262 return ret;
263} 263}
264 264
265static void stack_map_get_build_id_offset(struct bpf_map *map, 265static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
266 struct stack_map_bucket *bucket,
267 u64 *ips, u32 trace_nr, bool user) 266 u64 *ips, u32 trace_nr, bool user)
268{ 267{
269 int i; 268 int i;
270 struct vm_area_struct *vma; 269 struct vm_area_struct *vma;
271 struct bpf_stack_build_id *id_offs;
272
273 bucket->nr = trace_nr;
274 id_offs = (struct bpf_stack_build_id *)bucket->data;
275 270
276 /* 271 /*
277 * We cannot do up_read() in nmi context, so build_id lookup is 272 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
361 pcpu_freelist_pop(&smap->freelist); 356 pcpu_freelist_pop(&smap->freelist);
362 if (unlikely(!new_bucket)) 357 if (unlikely(!new_bucket))
363 return -ENOMEM; 358 return -ENOMEM;
364 stack_map_get_build_id_offset(map, new_bucket, ips, 359 new_bucket->nr = trace_nr;
365 trace_nr, user); 360 stack_map_get_build_id_offset(
361 (struct bpf_stack_build_id *)new_bucket->data,
362 ips, trace_nr, user);
366 trace_len = trace_nr * sizeof(struct bpf_stack_build_id); 363 trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
367 if (hash_matches && bucket->nr == trace_nr && 364 if (hash_matches && bucket->nr == trace_nr &&
368 memcmp(bucket->data, new_bucket->data, trace_len) == 0) { 365 memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -405,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
405 .arg3_type = ARG_ANYTHING, 402 .arg3_type = ARG_ANYTHING,
406}; 403};
407 404
405BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
406 u64, flags)
407{
408 u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
409 bool user_build_id = flags & BPF_F_USER_BUILD_ID;
410 u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
411 bool user = flags & BPF_F_USER_STACK;
412 struct perf_callchain_entry *trace;
413 bool kernel = !user;
414 int err = -EINVAL;
415 u64 *ips;
416
417 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
418 BPF_F_USER_BUILD_ID)))
419 goto clear;
420 if (kernel && user_build_id)
421 goto clear;
422
423 elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
424 : sizeof(u64);
425 if (unlikely(size % elem_size))
426 goto clear;
427
428 num_elem = size / elem_size;
429 if (sysctl_perf_event_max_stack < num_elem)
430 init_nr = 0;
431 else
432 init_nr = sysctl_perf_event_max_stack - num_elem;
433 trace = get_perf_callchain(regs, init_nr, kernel, user,
434 sysctl_perf_event_max_stack, false, false);
435 if (unlikely(!trace))
436 goto err_fault;
437
438 trace_nr = trace->nr - init_nr;
439 if (trace_nr < skip)
440 goto err_fault;
441
442 trace_nr -= skip;
443 trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
444 copy_len = trace_nr * elem_size;
445 ips = trace->ip + skip + init_nr;
446 if (user && user_build_id)
447 stack_map_get_build_id_offset(buf, ips, trace_nr, user);
448 else
449 memcpy(buf, ips, copy_len);
450
451 if (size > copy_len)
452 memset(buf + copy_len, 0, size - copy_len);
453 return copy_len;
454
455err_fault:
456 err = -EFAULT;
457clear:
458 memset(buf, 0, size);
459 return err;
460}
461
462const struct bpf_func_proto bpf_get_stack_proto = {
463 .func = bpf_get_stack,
464 .gpl_only = true,
465 .ret_type = RET_INTEGER,
466 .arg1_type = ARG_PTR_TO_CTX,
467 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
468 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
469 .arg4_type = ARG_ANYTHING,
470};
471
408/* Called from eBPF program */ 472/* Called from eBPF program */
409static void *stack_map_lookup_elem(struct bpf_map *map, void *key) 473static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
410{ 474{
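
To make the sizing logic in bpf_get_stack() above concrete, assume kernel.perf_event_max_stack is at its default of 127 and a program passes a 512-byte buffer with only BPF_F_USER_STACK set: elem_size is 8, num_elem is 64 and init_nr becomes 63, so get_perf_callchain() records at most the 64 innermost frames. If the user stack turns out to be only 20 frames deep, copy_len is 160, the remaining 352 bytes of the buffer are zeroed, and the helper returns 160.
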
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0bd2944eafb9..9b87198deea2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -282,6 +282,7 @@ void bpf_map_put(struct bpf_map *map)
282{ 282{
283 __bpf_map_put(map, true); 283 __bpf_map_put(map, true);
284} 284}
285EXPORT_SYMBOL_GPL(bpf_map_put);
285 286
286void bpf_map_put_with_uref(struct bpf_map *map) 287void bpf_map_put_with_uref(struct bpf_map *map)
287{ 288{
@@ -503,7 +504,6 @@ static int map_create(union bpf_attr *attr)
503 return err; 504 return err;
504 } 505 }
505 506
506 trace_bpf_map_create(map, err);
507 return err; 507 return err;
508 508
509free_map: 509free_map:
@@ -544,6 +544,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
544 atomic_inc(&map->usercnt); 544 atomic_inc(&map->usercnt);
545 return map; 545 return map;
546} 546}
547EXPORT_SYMBOL_GPL(bpf_map_inc);
547 548
548struct bpf_map *bpf_map_get_with_uref(u32 ufd) 549struct bpf_map *bpf_map_get_with_uref(u32 ufd)
549{ 550{
@@ -663,7 +664,6 @@ static int map_lookup_elem(union bpf_attr *attr)
663 if (copy_to_user(uvalue, value, value_size) != 0) 664 if (copy_to_user(uvalue, value, value_size) != 0)
664 goto free_value; 665 goto free_value;
665 666
666 trace_bpf_map_lookup_elem(map, ufd, key, value);
667 err = 0; 667 err = 0;
668 668
669free_value: 669free_value:
@@ -760,8 +760,6 @@ static int map_update_elem(union bpf_attr *attr)
760 __this_cpu_dec(bpf_prog_active); 760 __this_cpu_dec(bpf_prog_active);
761 preempt_enable(); 761 preempt_enable();
762out: 762out:
763 if (!err)
764 trace_bpf_map_update_elem(map, ufd, key, value);
765free_value: 763free_value:
766 kfree(value); 764 kfree(value);
767free_key: 765free_key:
@@ -814,8 +812,6 @@ static int map_delete_elem(union bpf_attr *attr)
814 __this_cpu_dec(bpf_prog_active); 812 __this_cpu_dec(bpf_prog_active);
815 preempt_enable(); 813 preempt_enable();
816out: 814out:
817 if (!err)
818 trace_bpf_map_delete_elem(map, ufd, key);
819 kfree(key); 815 kfree(key);
820err_put: 816err_put:
821 fdput(f); 817 fdput(f);
@@ -879,7 +875,6 @@ out:
879 if (copy_to_user(unext_key, next_key, map->key_size) != 0) 875 if (copy_to_user(unext_key, next_key, map->key_size) != 0)
880 goto free_next_key; 876 goto free_next_key;
881 877
882 trace_bpf_map_next_key(map, ufd, key, next_key);
883 err = 0; 878 err = 0;
884 879
885free_next_key: 880free_next_key:
@@ -1027,7 +1022,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
1027 if (atomic_dec_and_test(&prog->aux->refcnt)) { 1022 if (atomic_dec_and_test(&prog->aux->refcnt)) {
1028 int i; 1023 int i;
1029 1024
1030 trace_bpf_prog_put_rcu(prog);
1031 /* bpf_prog_free_id() must be called first */ 1025 /* bpf_prog_free_id() must be called first */
1032 bpf_prog_free_id(prog, do_idr_lock); 1026 bpf_prog_free_id(prog, do_idr_lock);
1033 1027
@@ -1194,11 +1188,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
1194struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, 1188struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
1195 bool attach_drv) 1189 bool attach_drv)
1196{ 1190{
1197 struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); 1191 return __bpf_prog_get(ufd, &type, attach_drv);
1198
1199 if (!IS_ERR(prog))
1200 trace_bpf_prog_get_type(prog);
1201 return prog;
1202} 1192}
1203EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); 1193EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
1204 1194
@@ -1373,7 +1363,6 @@ static int bpf_prog_load(union bpf_attr *attr)
1373 } 1363 }
1374 1364
1375 bpf_prog_kallsyms_add(prog); 1365 bpf_prog_kallsyms_add(prog);
1376 trace_bpf_prog_load(prog, err);
1377 return err; 1366 return err;
1378 1367
1379free_used_maps: 1368free_used_maps:
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68c12db..938d41211be7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
43 return TNUM(a.value >> shift, a.mask >> shift); 43 return TNUM(a.value >> shift, a.mask >> shift);
44} 44}
45 45
46struct tnum tnum_arshift(struct tnum a, u8 min_shift)
47{
 48	/* If a.value is negative, arithmetic shifting by the minimum shift
 49	 * yields the most negative result of any permitted shift amount.
 50	 * If a.value is nonnegative, shifting by the minimum shift yields
 51	 * the largest positive result of any permitted shift amount.
52 */
53 return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
54}
55
46struct tnum tnum_add(struct tnum a, struct tnum b) 56struct tnum tnum_add(struct tnum a, struct tnum b)
47{ 57{
48 u64 sm, sv, sigma, chi, mu; 58 u64 sm, sv, sigma, chi, mu;
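
A quick worked example of tnum_arshift(): take a tnum with value 0xffffffffffffff00 and mask 0xff, i.e. a value known to be negative and confined to [-256, -1]. Arithmetic-shifting both halves right by 4 gives value 0xfffffffffffffff0 and mask 0x0f, exactly the tnum for [-16, -1] — the set of possible results of shifting any member of the original range. A logical tnum_rshift() would instead clear the top bits and lose the sign information.
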
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596aebd3..d5e1a6c4165d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
22#include <linux/stringify.h> 22#include <linux/stringify.h>
23#include <linux/bsearch.h> 23#include <linux/bsearch.h>
24#include <linux/sort.h> 24#include <linux/sort.h>
25#include <linux/perf_event.h>
25 26
26#include "disasm.h" 27#include "disasm.h"
27 28
@@ -164,6 +165,8 @@ struct bpf_call_arg_meta {
164 bool pkt_access; 165 bool pkt_access;
165 int regno; 166 int regno;
166 int access_size; 167 int access_size;
168 s64 msize_smax_value;
169 u64 msize_umax_value;
167}; 170};
168 171
169static DEFINE_MUTEX(bpf_verifier_lock); 172static DEFINE_MUTEX(bpf_verifier_lock);
@@ -738,18 +741,19 @@ enum reg_arg_type {
738 741
739static int cmp_subprogs(const void *a, const void *b) 742static int cmp_subprogs(const void *a, const void *b)
740{ 743{
741 return *(int *)a - *(int *)b; 744 return ((struct bpf_subprog_info *)a)->start -
745 ((struct bpf_subprog_info *)b)->start;
742} 746}
743 747
744static int find_subprog(struct bpf_verifier_env *env, int off) 748static int find_subprog(struct bpf_verifier_env *env, int off)
745{ 749{
746 u32 *p; 750 struct bpf_subprog_info *p;
747 751
748 p = bsearch(&off, env->subprog_starts, env->subprog_cnt, 752 p = bsearch(&off, env->subprog_info, env->subprog_cnt,
749 sizeof(env->subprog_starts[0]), cmp_subprogs); 753 sizeof(env->subprog_info[0]), cmp_subprogs);
750 if (!p) 754 if (!p)
751 return -ENOENT; 755 return -ENOENT;
752 return p - env->subprog_starts; 756 return p - env->subprog_info;
753 757
754} 758}
755 759
@@ -769,18 +773,24 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
769 verbose(env, "too many subprograms\n"); 773 verbose(env, "too many subprograms\n");
770 return -E2BIG; 774 return -E2BIG;
771 } 775 }
772 env->subprog_starts[env->subprog_cnt++] = off; 776 env->subprog_info[env->subprog_cnt++].start = off;
773 sort(env->subprog_starts, env->subprog_cnt, 777 sort(env->subprog_info, env->subprog_cnt,
774 sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); 778 sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
775 return 0; 779 return 0;
776} 780}
777 781
778static int check_subprogs(struct bpf_verifier_env *env) 782static int check_subprogs(struct bpf_verifier_env *env)
779{ 783{
780 int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; 784 int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
785 struct bpf_subprog_info *subprog = env->subprog_info;
781 struct bpf_insn *insn = env->prog->insnsi; 786 struct bpf_insn *insn = env->prog->insnsi;
782 int insn_cnt = env->prog->len; 787 int insn_cnt = env->prog->len;
783 788
789 /* Add entry function. */
790 ret = add_subprog(env, 0);
791 if (ret < 0)
792 return ret;
793
784 /* determine subprog starts. The end is one before the next starts */ 794 /* determine subprog starts. The end is one before the next starts */
785 for (i = 0; i < insn_cnt; i++) { 795 for (i = 0; i < insn_cnt; i++) {
786 if (insn[i].code != (BPF_JMP | BPF_CALL)) 796 if (insn[i].code != (BPF_JMP | BPF_CALL))
@@ -800,16 +810,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
800 return ret; 810 return ret;
801 } 811 }
802 812
813 /* Add a fake 'exit' subprog which could simplify subprog iteration
814 * logic. 'subprog_cnt' should not be increased.
815 */
816 subprog[env->subprog_cnt].start = insn_cnt;
817
803 if (env->log.level > 1) 818 if (env->log.level > 1)
804 for (i = 0; i < env->subprog_cnt; i++) 819 for (i = 0; i < env->subprog_cnt; i++)
805 verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); 820 verbose(env, "func#%d @%d\n", i, subprog[i].start);
806 821
807 /* now check that all jumps are within the same subprog */ 822 /* now check that all jumps are within the same subprog */
808 subprog_start = 0; 823 subprog_start = subprog[cur_subprog].start;
809 if (env->subprog_cnt == cur_subprog) 824 subprog_end = subprog[cur_subprog + 1].start;
810 subprog_end = insn_cnt;
811 else
812 subprog_end = env->subprog_starts[cur_subprog++];
813 for (i = 0; i < insn_cnt; i++) { 825 for (i = 0; i < insn_cnt; i++) {
814 u8 code = insn[i].code; 826 u8 code = insn[i].code;
815 827
@@ -834,10 +846,9 @@ next:
834 return -EINVAL; 846 return -EINVAL;
835 } 847 }
836 subprog_start = subprog_end; 848 subprog_start = subprog_end;
837 if (env->subprog_cnt == cur_subprog) 849 cur_subprog++;
838 subprog_end = insn_cnt; 850 if (cur_subprog < env->subprog_cnt)
839 else 851 subprog_end = subprog[cur_subprog + 1].start;
840 subprog_end = env->subprog_starts[cur_subprog++];
841 } 852 }
842 } 853 }
843 return 0; 854 return 0;
@@ -1470,13 +1481,13 @@ static int update_stack_depth(struct bpf_verifier_env *env,
1470 const struct bpf_func_state *func, 1481 const struct bpf_func_state *func,
1471 int off) 1482 int off)
1472{ 1483{
1473 u16 stack = env->subprog_stack_depth[func->subprogno]; 1484 u16 stack = env->subprog_info[func->subprogno].stack_depth;
1474 1485
1475 if (stack >= -off) 1486 if (stack >= -off)
1476 return 0; 1487 return 0;
1477 1488
1478 /* update known max for given subprogram */ 1489 /* update known max for given subprogram */
1479 env->subprog_stack_depth[func->subprogno] = -off; 1490 env->subprog_info[func->subprogno].stack_depth = -off;
1480 return 0; 1491 return 0;
1481} 1492}
1482 1493
@@ -1488,9 +1499,9 @@ static int update_stack_depth(struct bpf_verifier_env *env,
1488 */ 1499 */
1489static int check_max_stack_depth(struct bpf_verifier_env *env) 1500static int check_max_stack_depth(struct bpf_verifier_env *env)
1490{ 1501{
1491 int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; 1502 int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
1503 struct bpf_subprog_info *subprog = env->subprog_info;
1492 struct bpf_insn *insn = env->prog->insnsi; 1504 struct bpf_insn *insn = env->prog->insnsi;
1493 int insn_cnt = env->prog->len;
1494 int ret_insn[MAX_CALL_FRAMES]; 1505 int ret_insn[MAX_CALL_FRAMES];
1495 int ret_prog[MAX_CALL_FRAMES]; 1506 int ret_prog[MAX_CALL_FRAMES];
1496 1507
@@ -1498,17 +1509,14 @@ process_func:
1498 /* round up to 32-bytes, since this is granularity 1509 /* round up to 32-bytes, since this is granularity
1499 * of interpreter stack size 1510 * of interpreter stack size
1500 */ 1511 */
1501 depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); 1512 depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
1502 if (depth > MAX_BPF_STACK) { 1513 if (depth > MAX_BPF_STACK) {
1503 verbose(env, "combined stack size of %d calls is %d. Too large\n", 1514 verbose(env, "combined stack size of %d calls is %d. Too large\n",
1504 frame + 1, depth); 1515 frame + 1, depth);
1505 return -EACCES; 1516 return -EACCES;
1506 } 1517 }
1507continue_func: 1518continue_func:
1508 if (env->subprog_cnt == subprog) 1519 subprog_end = subprog[idx + 1].start;
1509 subprog_end = insn_cnt;
1510 else
1511 subprog_end = env->subprog_starts[subprog];
1512 for (; i < subprog_end; i++) { 1520 for (; i < subprog_end; i++) {
1513 if (insn[i].code != (BPF_JMP | BPF_CALL)) 1521 if (insn[i].code != (BPF_JMP | BPF_CALL))
1514 continue; 1522 continue;
@@ -1516,17 +1524,16 @@ continue_func:
1516 continue; 1524 continue;
1517 /* remember insn and function to return to */ 1525 /* remember insn and function to return to */
1518 ret_insn[frame] = i + 1; 1526 ret_insn[frame] = i + 1;
1519 ret_prog[frame] = subprog; 1527 ret_prog[frame] = idx;
1520 1528
1521 /* find the callee */ 1529 /* find the callee */
1522 i = i + insn[i].imm + 1; 1530 i = i + insn[i].imm + 1;
1523 subprog = find_subprog(env, i); 1531 idx = find_subprog(env, i);
1524 if (subprog < 0) { 1532 if (idx < 0) {
1525 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", 1533 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
1526 i); 1534 i);
1527 return -EFAULT; 1535 return -EFAULT;
1528 } 1536 }
1529 subprog++;
1530 frame++; 1537 frame++;
1531 if (frame >= MAX_CALL_FRAMES) { 1538 if (frame >= MAX_CALL_FRAMES) {
1532 WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); 1539 WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1539,10 +1546,10 @@ continue_func:
1539 */ 1546 */
1540 if (frame == 0) 1547 if (frame == 0)
1541 return 0; 1548 return 0;
1542 depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); 1549 depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
1543 frame--; 1550 frame--;
1544 i = ret_insn[frame]; 1551 i = ret_insn[frame];
1545 subprog = ret_prog[frame]; 1552 idx = ret_prog[frame];
1546 goto continue_func; 1553 goto continue_func;
1547} 1554}
1548 1555
@@ -1558,8 +1565,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
1558 start); 1565 start);
1559 return -EFAULT; 1566 return -EFAULT;
1560 } 1567 }
1561 subprog++; 1568 return env->subprog_info[subprog].stack_depth;
1562 return env->subprog_stack_depth[subprog];
1563} 1569}
1564#endif 1570#endif
1565 1571
@@ -1984,6 +1990,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1984 } else if (arg_type_is_mem_size(arg_type)) { 1990 } else if (arg_type_is_mem_size(arg_type)) {
1985 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); 1991 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
1986 1992
1993 /* remember the mem_size which may be used later
1994 * to refine return values.
1995 */
1996 meta->msize_smax_value = reg->smax_value;
1997 meta->msize_umax_value = reg->umax_value;
1998
1987 /* The register is SCALAR_VALUE; the access check 1999 /* The register is SCALAR_VALUE; the access check
1988 * happens using its boundaries. 2000 * happens using its boundaries.
1989 */ 2001 */
@@ -2061,8 +2073,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
2061 if (func_id != BPF_FUNC_redirect_map) 2073 if (func_id != BPF_FUNC_redirect_map)
2062 goto error; 2074 goto error;
2063 break; 2075 break;
2064 /* Restrict bpf side of cpumap, open when use-cases appear */ 2076 /* Restrict bpf side of cpumap and xskmap, open when use-cases
2077 * appear.
2078 */
2065 case BPF_MAP_TYPE_CPUMAP: 2079 case BPF_MAP_TYPE_CPUMAP:
2080 case BPF_MAP_TYPE_XSKMAP:
2066 if (func_id != BPF_FUNC_redirect_map) 2081 if (func_id != BPF_FUNC_redirect_map)
2067 goto error; 2082 goto error;
2068 break; 2083 break;
@@ -2087,7 +2102,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
2087 case BPF_FUNC_tail_call: 2102 case BPF_FUNC_tail_call:
2088 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) 2103 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
2089 goto error; 2104 goto error;
2090 if (env->subprog_cnt) { 2105 if (env->subprog_cnt > 1) {
2091 verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); 2106 verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
2092 return -EINVAL; 2107 return -EINVAL;
2093 } 2108 }
@@ -2109,7 +2124,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
2109 break; 2124 break;
2110 case BPF_FUNC_redirect_map: 2125 case BPF_FUNC_redirect_map:
2111 if (map->map_type != BPF_MAP_TYPE_DEVMAP && 2126 if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
2112 map->map_type != BPF_MAP_TYPE_CPUMAP) 2127 map->map_type != BPF_MAP_TYPE_CPUMAP &&
2128 map->map_type != BPF_MAP_TYPE_XSKMAP)
2113 goto error; 2129 goto error;
2114 break; 2130 break;
2115 case BPF_FUNC_sk_redirect_map: 2131 case BPF_FUNC_sk_redirect_map:
@@ -2259,7 +2275,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
2259 /* remember the callsite, it will be used by bpf_exit */ 2275 /* remember the callsite, it will be used by bpf_exit */
2260 *insn_idx /* callsite */, 2276 *insn_idx /* callsite */,
2261 state->curframe + 1 /* frameno within this callchain */, 2277 state->curframe + 1 /* frameno within this callchain */,
2262 subprog + 1 /* subprog number within this prog */); 2278 subprog /* subprog number within this prog */);
2263 2279
2264 /* copy r1 - r5 args that callee can access */ 2280 /* copy r1 - r5 args that callee can access */
2265 for (i = BPF_REG_1; i <= BPF_REG_5; i++) 2281 for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -2323,6 +2339,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
2323 return 0; 2339 return 0;
2324} 2340}
2325 2341
2342static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
2343 int func_id,
2344 struct bpf_call_arg_meta *meta)
2345{
2346 struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
2347
2348 if (ret_type != RET_INTEGER ||
2349 (func_id != BPF_FUNC_get_stack &&
2350 func_id != BPF_FUNC_probe_read_str))
2351 return;
2352
2353 ret_reg->smax_value = meta->msize_smax_value;
2354 ret_reg->umax_value = meta->msize_umax_value;
2355 __reg_deduce_bounds(ret_reg);
2356 __reg_bound_offset(ret_reg);
2357}
2358
2326static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) 2359static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
2327{ 2360{
2328 const struct bpf_func_proto *fn = NULL; 2361 const struct bpf_func_proto *fn = NULL;
@@ -2446,10 +2479,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
2446 return -EINVAL; 2479 return -EINVAL;
2447 } 2480 }
2448 2481
2482 do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
2483
2449 err = check_map_func_compatibility(env, meta.map_ptr, func_id); 2484 err = check_map_func_compatibility(env, meta.map_ptr, func_id);
2450 if (err) 2485 if (err)
2451 return err; 2486 return err;
2452 2487
2488 if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
2489 const char *err_str;
2490
2491#ifdef CONFIG_PERF_EVENTS
2492 err = get_callchain_buffers(sysctl_perf_event_max_stack);
2493 err_str = "cannot get callchain buffer for func %s#%d\n";
2494#else
2495 err = -ENOTSUPP;
2496 err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
2497#endif
2498 if (err) {
2499 verbose(env, err_str, func_id_name(func_id), func_id);
2500 return err;
2501 }
2502
2503 env->prog->has_callchain_buf = true;
2504 }
2505
2453 if (changes_data) 2506 if (changes_data)
2454 clear_all_pkt_pointers(env); 2507 clear_all_pkt_pointers(env);
2455 return 0; 2508 return 0;
@@ -2894,10 +2947,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2894 dst_reg->umin_value <<= umin_val; 2947 dst_reg->umin_value <<= umin_val;
2895 dst_reg->umax_value <<= umax_val; 2948 dst_reg->umax_value <<= umax_val;
2896 } 2949 }
2897 if (src_known) 2950 dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
2898 dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
2899 else
2900 dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
2901 /* We may learn something more from the var_off */ 2951 /* We may learn something more from the var_off */
2902 __update_reg_bounds(dst_reg); 2952 __update_reg_bounds(dst_reg);
2903 break; 2953 break;
@@ -2925,16 +2975,35 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2925 */ 2975 */
2926 dst_reg->smin_value = S64_MIN; 2976 dst_reg->smin_value = S64_MIN;
2927 dst_reg->smax_value = S64_MAX; 2977 dst_reg->smax_value = S64_MAX;
2928 if (src_known) 2978 dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
2929 dst_reg->var_off = tnum_rshift(dst_reg->var_off,
2930 umin_val);
2931 else
2932 dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
2933 dst_reg->umin_value >>= umax_val; 2979 dst_reg->umin_value >>= umax_val;
2934 dst_reg->umax_value >>= umin_val; 2980 dst_reg->umax_value >>= umin_val;
2935 /* We may learn something more from the var_off */ 2981 /* We may learn something more from the var_off */
2936 __update_reg_bounds(dst_reg); 2982 __update_reg_bounds(dst_reg);
2937 break; 2983 break;
2984 case BPF_ARSH:
2985 if (umax_val >= insn_bitness) {
2986 /* Shifts greater than 31 or 63 are undefined.
2987 * This includes shifts by a negative number.
2988 */
2989 mark_reg_unknown(env, regs, insn->dst_reg);
2990 break;
2991 }
2992
2993 /* Upon reaching here, src_known is true and
2994 * umax_val is equal to umin_val.
2995 */
2996 dst_reg->smin_value >>= umin_val;
2997 dst_reg->smax_value >>= umin_val;
2998 dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
2999
3000 /* blow away the dst_reg umin_value/umax_value and rely on
3001 * dst_reg var_off to refine the result.
3002 */
3003 dst_reg->umin_value = 0;
3004 dst_reg->umax_value = U64_MAX;
3005 __update_reg_bounds(dst_reg);
3006 break;
2938 default: 3007 default:
2939 mark_reg_unknown(env, regs, insn->dst_reg); 3008 mark_reg_unknown(env, regs, insn->dst_reg);
2940 break; 3009 break;
@@ -3818,7 +3887,12 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
3818 return -EINVAL; 3887 return -EINVAL;
3819 } 3888 }
3820 3889
3821 if (env->subprog_cnt) { 3890 if (!env->ops->gen_ld_abs) {
3891 verbose(env, "bpf verifier is misconfigured\n");
3892 return -EINVAL;
3893 }
3894
3895 if (env->subprog_cnt > 1) {
3822 /* when program has LD_ABS insn JITs and interpreter assume 3896 /* when program has LD_ABS insn JITs and interpreter assume
3823 * that r1 == ctx == skb which is not the case for callees 3897 * that r1 == ctx == skb which is not the case for callees
3824 * that can have arbitrary arguments. It's problematic 3898 * that can have arbitrary arguments. It's problematic
@@ -4849,15 +4923,15 @@ process_bpf_exit:
4849 4923
4850 verbose(env, "processed %d insns (limit %d), stack depth ", 4924 verbose(env, "processed %d insns (limit %d), stack depth ",
4851 insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); 4925 insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
4852 for (i = 0; i < env->subprog_cnt + 1; i++) { 4926 for (i = 0; i < env->subprog_cnt; i++) {
4853 u32 depth = env->subprog_stack_depth[i]; 4927 u32 depth = env->subprog_info[i].stack_depth;
4854 4928
4855 verbose(env, "%d", depth); 4929 verbose(env, "%d", depth);
4856 if (i + 1 < env->subprog_cnt + 1) 4930 if (i + 1 < env->subprog_cnt)
4857 verbose(env, "+"); 4931 verbose(env, "+");
4858 } 4932 }
4859 verbose(env, "\n"); 4933 verbose(env, "\n");
4860 env->prog->aux->stack_depth = env->subprog_stack_depth[0]; 4934 env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
4861 return 0; 4935 return 0;
4862} 4936}
4863 4937
@@ -4981,7 +5055,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
4981 /* hold the map. If the program is rejected by verifier, 5055 /* hold the map. If the program is rejected by verifier,
4982 * the map will be released by release_maps() or it 5056 * the map will be released by release_maps() or it
4983 * will be used by the valid program until it's unloaded 5057 * will be used by the valid program until it's unloaded
4984 * and all maps are released in free_bpf_prog_info() 5058 * and all maps are released in free_used_maps()
4985 */ 5059 */
4986 map = bpf_map_inc(map, false); 5060 map = bpf_map_inc(map, false);
4987 if (IS_ERR(map)) { 5061 if (IS_ERR(map)) {
@@ -5063,10 +5137,11 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
5063 5137
5064 if (len == 1) 5138 if (len == 1)
5065 return; 5139 return;
5066 for (i = 0; i < env->subprog_cnt; i++) { 5140 /* NOTE: fake 'exit' subprog should be updated as well. */
5067 if (env->subprog_starts[i] < off) 5141 for (i = 0; i <= env->subprog_cnt; i++) {
5142 if (env->subprog_info[i].start < off)
5068 continue; 5143 continue;
5069 env->subprog_starts[i] += len - 1; 5144 env->subprog_info[i].start += len - 1;
5070 } 5145 }
5071} 5146}
5072 5147
@@ -5230,7 +5305,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5230 void *old_bpf_func; 5305 void *old_bpf_func;
5231 int err = -ENOMEM; 5306 int err = -ENOMEM;
5232 5307
5233 if (env->subprog_cnt == 0) 5308 if (env->subprog_cnt <= 1)
5234 return 0; 5309 return 0;
5235 5310
5236 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { 5311 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5246,7 +5321,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5246 /* temporarily remember subprog id inside insn instead of 5321 /* temporarily remember subprog id inside insn instead of
5247 * aux_data, since next loop will split up all insns into funcs 5322 * aux_data, since next loop will split up all insns into funcs
5248 */ 5323 */
5249 insn->off = subprog + 1; 5324 insn->off = subprog;
5250 /* remember original imm in case JIT fails and fallback 5325 /* remember original imm in case JIT fails and fallback
5251 * to interpreter will be needed 5326 * to interpreter will be needed
5252 */ 5327 */
@@ -5255,16 +5330,13 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5255 insn->imm = 1; 5330 insn->imm = 1;
5256 } 5331 }
5257 5332
5258 func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); 5333 func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL);
5259 if (!func) 5334 if (!func)
5260 return -ENOMEM; 5335 return -ENOMEM;
5261 5336
5262 for (i = 0; i <= env->subprog_cnt; i++) { 5337 for (i = 0; i < env->subprog_cnt; i++) {
5263 subprog_start = subprog_end; 5338 subprog_start = subprog_end;
5264 if (env->subprog_cnt == i) 5339 subprog_end = env->subprog_info[i + 1].start;
5265 subprog_end = prog->len;
5266 else
5267 subprog_end = env->subprog_starts[i];
5268 5340
5269 len = subprog_end - subprog_start; 5341 len = subprog_end - subprog_start;
5270 func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); 5342 func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5281,7 +5353,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5281 * Long term would need debug info to populate names 5353 * Long term would need debug info to populate names
5282 */ 5354 */
5283 func[i]->aux->name[0] = 'F'; 5355 func[i]->aux->name[0] = 'F';
5284 func[i]->aux->stack_depth = env->subprog_stack_depth[i]; 5356 func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
5285 func[i]->jit_requested = 1; 5357 func[i]->jit_requested = 1;
5286 func[i] = bpf_int_jit_compile(func[i]); 5358 func[i] = bpf_int_jit_compile(func[i]);
5287 if (!func[i]->jited) { 5359 if (!func[i]->jited) {
@@ -5294,7 +5366,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5294 * now populate all bpf_calls with correct addresses and 5366 * now populate all bpf_calls with correct addresses and
5295 * run last pass of JIT 5367 * run last pass of JIT
5296 */ 5368 */
5297 for (i = 0; i <= env->subprog_cnt; i++) { 5369 for (i = 0; i < env->subprog_cnt; i++) {
5298 insn = func[i]->insnsi; 5370 insn = func[i]->insnsi;
5299 for (j = 0; j < func[i]->len; j++, insn++) { 5371 for (j = 0; j < func[i]->len; j++, insn++) {
5300 if (insn->code != (BPF_JMP | BPF_CALL) || 5372 if (insn->code != (BPF_JMP | BPF_CALL) ||
@@ -5307,7 +5379,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5307 __bpf_call_base; 5379 __bpf_call_base;
5308 } 5380 }
5309 } 5381 }
5310 for (i = 0; i <= env->subprog_cnt; i++) { 5382 for (i = 0; i < env->subprog_cnt; i++) {
5311 old_bpf_func = func[i]->bpf_func; 5383 old_bpf_func = func[i]->bpf_func;
5312 tmp = bpf_int_jit_compile(func[i]); 5384 tmp = bpf_int_jit_compile(func[i]);
5313 if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { 5385 if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5321,7 +5393,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5321 /* finally lock prog and jit images for all functions and 5393 /* finally lock prog and jit images for all functions and
5322 * populate kallsysm 5394 * populate kallsysm
5323 */ 5395 */
5324 for (i = 0; i <= env->subprog_cnt; i++) { 5396 for (i = 0; i < env->subprog_cnt; i++) {
5325 bpf_prog_lock_ro(func[i]); 5397 bpf_prog_lock_ro(func[i]);
5326 bpf_prog_kallsyms_add(func[i]); 5398 bpf_prog_kallsyms_add(func[i]);
5327 } 5399 }
@@ -5338,7 +5410,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5338 continue; 5410 continue;
5339 insn->off = env->insn_aux_data[i].call_imm; 5411 insn->off = env->insn_aux_data[i].call_imm;
5340 subprog = find_subprog(env, i + insn->off + 1); 5412 subprog = find_subprog(env, i + insn->off + 1);
5341 addr = (unsigned long)func[subprog + 1]->bpf_func; 5413 addr = (unsigned long)func[subprog]->bpf_func;
5342 addr &= PAGE_MASK; 5414 addr &= PAGE_MASK;
5343 insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) 5415 insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
5344 addr - __bpf_call_base; 5416 addr - __bpf_call_base;
@@ -5347,10 +5419,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5347 prog->jited = 1; 5419 prog->jited = 1;
5348 prog->bpf_func = func[0]->bpf_func; 5420 prog->bpf_func = func[0]->bpf_func;
5349 prog->aux->func = func; 5421 prog->aux->func = func;
5350 prog->aux->func_cnt = env->subprog_cnt + 1; 5422 prog->aux->func_cnt = env->subprog_cnt;
5351 return 0; 5423 return 0;
5352out_free: 5424out_free:
5353 for (i = 0; i <= env->subprog_cnt; i++) 5425 for (i = 0; i < env->subprog_cnt; i++)
5354 if (func[i]) 5426 if (func[i])
5355 bpf_jit_free(func[i]); 5427 bpf_jit_free(func[i]);
5356 kfree(func); 5428 kfree(func);
@@ -5453,6 +5525,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
5453 continue; 5525 continue;
5454 } 5526 }
5455 5527
5528 if (BPF_CLASS(insn->code) == BPF_LD &&
5529 (BPF_MODE(insn->code) == BPF_ABS ||
5530 BPF_MODE(insn->code) == BPF_IND)) {
5531 cnt = env->ops->gen_ld_abs(insn, insn_buf);
5532 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
5533 verbose(env, "bpf verifier is misconfigured\n");
5534 return -EINVAL;
5535 }
5536
5537 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
5538 if (!new_prog)
5539 return -ENOMEM;
5540
5541 delta += cnt - 1;
5542 env->prog = prog = new_prog;
5543 insn = new_prog->insnsi + i + delta;
5544 continue;
5545 }
5546
5456 if (insn->code != (BPF_JMP | BPF_CALL)) 5547 if (insn->code != (BPF_JMP | BPF_CALL))
5457 continue; 5548 continue;
5458 if (insn->src_reg == BPF_PSEUDO_CALL) 5549 if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -5650,16 +5741,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
5650 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) 5741 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
5651 env->strict_alignment = true; 5742 env->strict_alignment = true;
5652 5743
5744 ret = replace_map_fd_with_map_ptr(env);
5745 if (ret < 0)
5746 goto skip_full_check;
5747
5653 if (bpf_prog_is_dev_bound(env->prog->aux)) { 5748 if (bpf_prog_is_dev_bound(env->prog->aux)) {
5654 ret = bpf_prog_offload_verifier_prep(env); 5749 ret = bpf_prog_offload_verifier_prep(env);
5655 if (ret) 5750 if (ret)
5656 goto err_unlock; 5751 goto skip_full_check;
5657 } 5752 }
5658 5753
5659 ret = replace_map_fd_with_map_ptr(env);
5660 if (ret < 0)
5661 goto skip_full_check;
5662
5663 env->explored_states = kcalloc(env->prog->len, 5754 env->explored_states = kcalloc(env->prog->len,
5664 sizeof(struct bpf_verifier_state_list *), 5755 sizeof(struct bpf_verifier_state_list *),
5665 GFP_USER); 5756 GFP_USER);
@@ -5730,7 +5821,7 @@ skip_full_check:
5730err_release_maps: 5821err_release_maps:
5731 if (!env->prog->aux->used_maps) 5822 if (!env->prog->aux->used_maps)
5732 /* if we didn't copy map pointers into bpf_prog_info, release 5823 /* if we didn't copy map pointers into bpf_prog_info, release
5733 * them now. Otherwise free_bpf_prog_info() will release them. 5824 * them now. Otherwise free_used_maps() will release them.
5734 */ 5825 */
5735 release_maps(env); 5826 release_maps(env);
5736 *prog = env->prog; 5827 *prog = env->prog;
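Taken together, do_refine_retval_range() and the get_callchain_buffers() setup above are what allow a program to feed bpf_get_stack()'s return value straight back into a variable-length access. A rough BPF-C sketch of that pattern follows; the map, the attach point and the samples/bpf-style bpf_helpers.h header are assumptions for illustration, not part of this patch.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical kprobe program; "events" and the attach point are
 * illustrative only. */
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"		/* assumed samples/bpf-style helpers */

#define MAX_STACK	(32 * sizeof(__u64))

struct bpf_map_def SEC("maps") events = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(__u32),
	.max_entries = 64,
};

SEC("kprobe/some_kernel_function")	/* assumed attach point */
int kprobe_dump_stack(struct pt_regs *ctx)
{
	char buf[MAX_STACK];
	int len;

	/* bpf_get_stack() returns the number of bytes written (<= size);
	 * do_refine_retval_range() is what teaches the verifier that. */
	len = bpf_get_stack(ctx, buf, sizeof(buf), 0);
	if (len < 0)
		return 0;

	/* Variable-length output bounded by the refined return value. */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, buf, len);
	return 0;
}

char _license[] SEC("license") = "GPL";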
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..cb3a12137404
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,241 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XSKMAP used for AF_XDP sockets
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/bpf.h>
16#include <linux/capability.h>
17#include <net/xdp_sock.h>
18#include <linux/slab.h>
19#include <linux/sched.h>
20
21struct xsk_map {
22 struct bpf_map map;
23 struct xdp_sock **xsk_map;
24 struct list_head __percpu *flush_list;
25};
26
27static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
28{
29 int cpu, err = -EINVAL;
30 struct xsk_map *m;
31 u64 cost;
32
33 if (!capable(CAP_NET_ADMIN))
34 return ERR_PTR(-EPERM);
35
36 if (attr->max_entries == 0 || attr->key_size != 4 ||
37 attr->value_size != 4 ||
38 attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
39 return ERR_PTR(-EINVAL);
40
41 m = kzalloc(sizeof(*m), GFP_USER);
42 if (!m)
43 return ERR_PTR(-ENOMEM);
44
45 bpf_map_init_from_attr(&m->map, attr);
46
47 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
48 cost += sizeof(struct list_head) * num_possible_cpus();
49 if (cost >= U32_MAX - PAGE_SIZE)
50 goto free_m;
51
52 m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
53
 54 /* Notice: returns -EPERM if map size is larger than memlock limit */
55 err = bpf_map_precharge_memlock(m->map.pages);
56 if (err)
57 goto free_m;
58
59 err = -ENOMEM;
60
61 m->flush_list = alloc_percpu(struct list_head);
62 if (!m->flush_list)
63 goto free_m;
64
65 for_each_possible_cpu(cpu)
66 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
67
68 m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
69 sizeof(struct xdp_sock *),
70 m->map.numa_node);
71 if (!m->xsk_map)
72 goto free_percpu;
73 return &m->map;
74
75free_percpu:
76 free_percpu(m->flush_list);
77free_m:
78 kfree(m);
79 return ERR_PTR(err);
80}
81
82static void xsk_map_free(struct bpf_map *map)
83{
84 struct xsk_map *m = container_of(map, struct xsk_map, map);
85 int i;
86
87 synchronize_net();
88
89 for (i = 0; i < map->max_entries; i++) {
90 struct xdp_sock *xs;
91
92 xs = m->xsk_map[i];
93 if (!xs)
94 continue;
95
96 sock_put((struct sock *)xs);
97 }
98
99 free_percpu(m->flush_list);
100 bpf_map_area_free(m->xsk_map);
101 kfree(m);
102}
103
104static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
105{
106 struct xsk_map *m = container_of(map, struct xsk_map, map);
107 u32 index = key ? *(u32 *)key : U32_MAX;
108 u32 *next = next_key;
109
110 if (index >= m->map.max_entries) {
111 *next = 0;
112 return 0;
113 }
114
115 if (index == m->map.max_entries - 1)
116 return -ENOENT;
117 *next = index + 1;
118 return 0;
119}
120
121struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
122{
123 struct xsk_map *m = container_of(map, struct xsk_map, map);
124 struct xdp_sock *xs;
125
126 if (key >= map->max_entries)
127 return NULL;
128
129 xs = READ_ONCE(m->xsk_map[key]);
130 return xs;
131}
132
133int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
134 struct xdp_sock *xs)
135{
136 struct xsk_map *m = container_of(map, struct xsk_map, map);
137 struct list_head *flush_list = this_cpu_ptr(m->flush_list);
138 int err;
139
140 err = xsk_rcv(xs, xdp);
141 if (err)
142 return err;
143
144 if (!xs->flush_node.prev)
145 list_add(&xs->flush_node, flush_list);
146
147 return 0;
148}
149
150void __xsk_map_flush(struct bpf_map *map)
151{
152 struct xsk_map *m = container_of(map, struct xsk_map, map);
153 struct list_head *flush_list = this_cpu_ptr(m->flush_list);
154 struct xdp_sock *xs, *tmp;
155
156 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
157 xsk_flush(xs);
158 __list_del(xs->flush_node.prev, xs->flush_node.next);
159 xs->flush_node.prev = NULL;
160 }
161}
162
163static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
164{
165 return NULL;
166}
167
168static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
169 u64 map_flags)
170{
171 struct xsk_map *m = container_of(map, struct xsk_map, map);
172 u32 i = *(u32 *)key, fd = *(u32 *)value;
173 struct xdp_sock *xs, *old_xs;
174 struct socket *sock;
175 int err;
176
177 if (unlikely(map_flags > BPF_EXIST))
178 return -EINVAL;
179 if (unlikely(i >= m->map.max_entries))
180 return -E2BIG;
181 if (unlikely(map_flags == BPF_NOEXIST))
182 return -EEXIST;
183
184 sock = sockfd_lookup(fd, &err);
185 if (!sock)
186 return err;
187
188 if (sock->sk->sk_family != PF_XDP) {
189 sockfd_put(sock);
190 return -EOPNOTSUPP;
191 }
192
193 xs = (struct xdp_sock *)sock->sk;
194
195 if (!xsk_is_setup_for_bpf_map(xs)) {
196 sockfd_put(sock);
197 return -EOPNOTSUPP;
198 }
199
200 sock_hold(sock->sk);
201
202 old_xs = xchg(&m->xsk_map[i], xs);
203 if (old_xs) {
204 /* Make sure we've flushed everything. */
205 synchronize_net();
206 sock_put((struct sock *)old_xs);
207 }
208
209 sockfd_put(sock);
210 return 0;
211}
212
213static int xsk_map_delete_elem(struct bpf_map *map, void *key)
214{
215 struct xsk_map *m = container_of(map, struct xsk_map, map);
216 struct xdp_sock *old_xs;
217 int k = *(u32 *)key;
218
219 if (k >= map->max_entries)
220 return -EINVAL;
221
222 old_xs = xchg(&m->xsk_map[k], NULL);
223 if (old_xs) {
224 /* Make sure we've flushed everything. */
225 synchronize_net();
226 sock_put((struct sock *)old_xs);
227 }
228
229 return 0;
230}
231
232const struct bpf_map_ops xsk_map_ops = {
233 .map_alloc = xsk_map_alloc,
234 .map_free = xsk_map_free,
235 .map_get_next_key = xsk_map_get_next_key,
236 .map_lookup_elem = xsk_map_lookup_elem,
237 .map_update_elem = xsk_map_update_elem,
238 .map_delete_elem = xsk_map_delete_elem,
239};
240
241
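The intended consumer of this map type is an XDP program that calls bpf_redirect_map() to steer frames into AF_XDP sockets which user space has installed via xsk_map_update_elem() above; on success the XDP core delivers the frame through __xsk_map_lookup_elem()/__xsk_map_redirect(). A minimal, illustrative sketch; the map name, the fixed key and the bpf_helpers.h header are assumptions:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: a minimal XDP program steering traffic into an
 * AF_XDP socket that user space has inserted into slot 0 of an XSKMAP. */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"		/* assumed helper header */

struct bpf_map_def SEC("maps") xsks_map = {
	.type        = BPF_MAP_TYPE_XSKMAP,
	.key_size    = sizeof(int),
	.value_size  = sizeof(int),
	.max_entries = 4,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	int key = 0;			/* assumed: socket bound at slot 0 */

	/* Returns XDP_REDIRECT when an AF_XDP socket is present at 'key'. */
	return bpf_redirect_map(&xsks_map, key, 0);
}

char _license[] SEC("license") = "GPL";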
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 56ba0f2a01db..ce2cbbff27e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
20#include "trace.h" 20#include "trace.h"
21 21
22u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); 22u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
23u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
23 24
24/** 25/**
25 * trace_call_bpf - invoke BPF program 26 * trace_call_bpf - invoke BPF program
@@ -474,8 +475,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
474 struct bpf_array *array = container_of(map, struct bpf_array, map); 475 struct bpf_array *array = container_of(map, struct bpf_array, map);
475 struct cgroup *cgrp; 476 struct cgroup *cgrp;
476 477
477 if (unlikely(in_interrupt()))
478 return -EINVAL;
479 if (unlikely(idx >= array->map.max_entries)) 478 if (unlikely(idx >= array->map.max_entries))
480 return -E2BIG; 479 return -E2BIG;
481 480
@@ -577,6 +576,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
577 return &bpf_perf_event_output_proto; 576 return &bpf_perf_event_output_proto;
578 case BPF_FUNC_get_stackid: 577 case BPF_FUNC_get_stackid:
579 return &bpf_get_stackid_proto; 578 return &bpf_get_stackid_proto;
579 case BPF_FUNC_get_stack:
580 return &bpf_get_stack_proto;
580 case BPF_FUNC_perf_event_read_value: 581 case BPF_FUNC_perf_event_read_value:
581 return &bpf_perf_event_read_value_proto; 582 return &bpf_perf_event_read_value_proto;
582#ifdef CONFIG_BPF_KPROBE_OVERRIDE 583#ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +665,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
664 .arg3_type = ARG_ANYTHING, 665 .arg3_type = ARG_ANYTHING,
665}; 666};
666 667
668BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
669 u64, flags)
670{
671 struct pt_regs *regs = *(struct pt_regs **)tp_buff;
672
673 return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
674 (unsigned long) size, flags, 0);
675}
676
677static const struct bpf_func_proto bpf_get_stack_proto_tp = {
678 .func = bpf_get_stack_tp,
679 .gpl_only = true,
680 .ret_type = RET_INTEGER,
681 .arg1_type = ARG_PTR_TO_CTX,
682 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
683 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
684 .arg4_type = ARG_ANYTHING,
685};
686
667static const struct bpf_func_proto * 687static const struct bpf_func_proto *
668tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 688tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
669{ 689{
@@ -672,6 +692,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
672 return &bpf_perf_event_output_proto_tp; 692 return &bpf_perf_event_output_proto_tp;
673 case BPF_FUNC_get_stackid: 693 case BPF_FUNC_get_stackid:
674 return &bpf_get_stackid_proto_tp; 694 return &bpf_get_stackid_proto_tp;
695 case BPF_FUNC_get_stack:
696 return &bpf_get_stack_proto_tp;
675 default: 697 default:
676 return tracing_func_proto(func_id, prog); 698 return tracing_func_proto(func_id, prog);
677 } 699 }
@@ -734,6 +756,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
734 return &bpf_perf_event_output_proto_tp; 756 return &bpf_perf_event_output_proto_tp;
735 case BPF_FUNC_get_stackid: 757 case BPF_FUNC_get_stackid:
736 return &bpf_get_stackid_proto_tp; 758 return &bpf_get_stackid_proto_tp;
759 case BPF_FUNC_get_stack:
760 return &bpf_get_stack_proto_tp;
737 case BPF_FUNC_perf_prog_read_value: 761 case BPF_FUNC_perf_prog_read_value:
738 return &bpf_perf_prog_read_value_proto; 762 return &bpf_perf_prog_read_value_proto;
739 default: 763 default:
@@ -744,7 +768,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
744/* 768/*
745 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp 769 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
746 * to avoid potential recursive reuse issue when/if tracepoints are added 770 * to avoid potential recursive reuse issue when/if tracepoints are added
747 * inside bpf_*_event_output and/or bpf_get_stack_id 771 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
748 */ 772 */
749static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); 773static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
750BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, 774BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +811,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
787 .arg3_type = ARG_ANYTHING, 811 .arg3_type = ARG_ANYTHING,
788}; 812};
789 813
814BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
815 void *, buf, u32, size, u64, flags)
816{
817 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
818
819 perf_fetch_caller_regs(regs);
820 return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
821 (unsigned long) size, flags, 0);
822}
823
824static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
825 .func = bpf_get_stack_raw_tp,
826 .gpl_only = true,
827 .ret_type = RET_INTEGER,
828 .arg1_type = ARG_PTR_TO_CTX,
829 .arg2_type = ARG_PTR_TO_MEM,
830 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
831 .arg4_type = ARG_ANYTHING,
832};
833
790static const struct bpf_func_proto * 834static const struct bpf_func_proto *
791raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 835raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
792{ 836{
@@ -795,6 +839,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
795 return &bpf_perf_event_output_proto_raw_tp; 839 return &bpf_perf_event_output_proto_raw_tp;
796 case BPF_FUNC_get_stackid: 840 case BPF_FUNC_get_stackid:
797 return &bpf_get_stackid_proto_raw_tp; 841 return &bpf_get_stackid_proto_raw_tp;
842 case BPF_FUNC_get_stack:
843 return &bpf_get_stack_proto_raw_tp;
798 default: 844 default:
799 return tracing_func_proto(func_id, prog); 845 return tracing_func_proto(func_id, prog);
800 } 846 }
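Note the asymmetry visible above: the kprobe/tracepoint proto declares its buffer as ARG_PTR_TO_UNINIT_MEM, while the raw tracepoint proto uses ARG_PTR_TO_MEM, so there the buffer must already be initialized before the call. A hedged sketch of a raw tracepoint user, with the attach point and the helper header assumed for illustration:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative raw tracepoint user of bpf_get_stack().  The buffer is
 * zeroed first because the raw_tp proto above takes ARG_PTR_TO_MEM. */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"		/* assumed helper header */

#define MAX_STACK	(32 * sizeof(__u64))

SEC("raw_tracepoint/sched_switch")	/* assumed attach point */
int get_stack_raw_tp(void *ctx)
{
	char buf[MAX_STACK] = {0};
	int len;

	len = bpf_get_stack(ctx, buf, sizeof(buf), BPF_F_USER_STACK);
	if (len < 0)
		return 0;
	/* ... consume the captured user stack, e.g. via a map ... */
	return 0;
}

char _license[] SEC("license") = "GPL";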
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8e157806df7a..317f231462d4 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -386,116 +386,6 @@ static int bpf_fill_ld_abs_get_processor_id(struct bpf_test *self)
386 return 0; 386 return 0;
387} 387}
388 388
389#define PUSH_CNT 68
390/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */
391static int bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self)
392{
393 unsigned int len = BPF_MAXINSNS;
394 struct bpf_insn *insn;
395 int i = 0, j, k = 0;
396
397 insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
398 if (!insn)
399 return -ENOMEM;
400
401 insn[i++] = BPF_MOV64_REG(R6, R1);
402loop:
403 for (j = 0; j < PUSH_CNT; j++) {
404 insn[i++] = BPF_LD_ABS(BPF_B, 0);
405 insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2);
406 i++;
407 insn[i++] = BPF_MOV64_REG(R1, R6);
408 insn[i++] = BPF_MOV64_IMM(R2, 1);
409 insn[i++] = BPF_MOV64_IMM(R3, 2);
410 insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
411 bpf_skb_vlan_push_proto.func - __bpf_call_base);
412 insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2);
413 i++;
414 }
415
416 for (j = 0; j < PUSH_CNT; j++) {
417 insn[i++] = BPF_LD_ABS(BPF_B, 0);
418 insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2);
419 i++;
420 insn[i++] = BPF_MOV64_REG(R1, R6);
421 insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
422 bpf_skb_vlan_pop_proto.func - __bpf_call_base);
423 insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2);
424 i++;
425 }
426 if (++k < 5)
427 goto loop;
428
429 for (; i < len - 1; i++)
430 insn[i] = BPF_ALU32_IMM(BPF_MOV, R0, 0xbef);
431
432 insn[len - 1] = BPF_EXIT_INSN();
433
434 self->u.ptr.insns = insn;
435 self->u.ptr.len = len;
436
437 return 0;
438}
439
440static int bpf_fill_ld_abs_vlan_push_pop2(struct bpf_test *self)
441{
442 struct bpf_insn *insn;
443
444 insn = kmalloc_array(16, sizeof(*insn), GFP_KERNEL);
445 if (!insn)
446 return -ENOMEM;
447
448 /* Due to func address being non-const, we need to
449 * assemble this here.
450 */
451 insn[0] = BPF_MOV64_REG(R6, R1);
452 insn[1] = BPF_LD_ABS(BPF_B, 0);
453 insn[2] = BPF_LD_ABS(BPF_H, 0);
454 insn[3] = BPF_LD_ABS(BPF_W, 0);
455 insn[4] = BPF_MOV64_REG(R7, R6);
456 insn[5] = BPF_MOV64_IMM(R6, 0);
457 insn[6] = BPF_MOV64_REG(R1, R7);
458 insn[7] = BPF_MOV64_IMM(R2, 1);
459 insn[8] = BPF_MOV64_IMM(R3, 2);
460 insn[9] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
461 bpf_skb_vlan_push_proto.func - __bpf_call_base);
462 insn[10] = BPF_MOV64_REG(R6, R7);
463 insn[11] = BPF_LD_ABS(BPF_B, 0);
464 insn[12] = BPF_LD_ABS(BPF_H, 0);
465 insn[13] = BPF_LD_ABS(BPF_W, 0);
466 insn[14] = BPF_MOV64_IMM(R0, 42);
467 insn[15] = BPF_EXIT_INSN();
468
469 self->u.ptr.insns = insn;
470 self->u.ptr.len = 16;
471
472 return 0;
473}
474
475static int bpf_fill_jump_around_ld_abs(struct bpf_test *self)
476{
477 unsigned int len = BPF_MAXINSNS;
478 struct bpf_insn *insn;
479 int i = 0;
480
481 insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
482 if (!insn)
483 return -ENOMEM;
484
485 insn[i++] = BPF_MOV64_REG(R6, R1);
486 insn[i++] = BPF_LD_ABS(BPF_B, 0);
487 insn[i] = BPF_JMP_IMM(BPF_JEQ, R0, 10, len - i - 2);
488 i++;
489 while (i < len - 1)
490 insn[i++] = BPF_LD_ABS(BPF_B, 1);
491 insn[i] = BPF_EXIT_INSN();
492
493 self->u.ptr.insns = insn;
494 self->u.ptr.len = len;
495
496 return 0;
497}
498
499static int __bpf_fill_stxdw(struct bpf_test *self, int size) 389static int __bpf_fill_stxdw(struct bpf_test *self, int size)
500{ 390{
501 unsigned int len = BPF_MAXINSNS; 391 unsigned int len = BPF_MAXINSNS;
@@ -1988,40 +1878,6 @@ static struct bpf_test tests[] = {
1988 { { 0, -1 } } 1878 { { 0, -1 } }
1989 }, 1879 },
1990 { 1880 {
1991 "INT: DIV + ABS",
1992 .u.insns_int = {
1993 BPF_ALU64_REG(BPF_MOV, R6, R1),
1994 BPF_LD_ABS(BPF_B, 3),
1995 BPF_ALU64_IMM(BPF_MOV, R2, 2),
1996 BPF_ALU32_REG(BPF_DIV, R0, R2),
1997 BPF_ALU64_REG(BPF_MOV, R8, R0),
1998 BPF_LD_ABS(BPF_B, 4),
1999 BPF_ALU64_REG(BPF_ADD, R8, R0),
2000 BPF_LD_IND(BPF_B, R8, -70),
2001 BPF_EXIT_INSN(),
2002 },
2003 INTERNAL,
2004 { 10, 20, 30, 40, 50 },
2005 { { 4, 0 }, { 5, 10 } }
2006 },
2007 {
2008 /* This one doesn't go through verifier, but is just raw insn
2009 * as opposed to cBPF tests from here. Thus div by 0 tests are
2010 * done in test_verifier in BPF kselftests.
2011 */
2012 "INT: DIV by -1",
2013 .u.insns_int = {
2014 BPF_ALU64_REG(BPF_MOV, R6, R1),
2015 BPF_ALU64_IMM(BPF_MOV, R7, -1),
2016 BPF_LD_ABS(BPF_B, 3),
2017 BPF_ALU32_REG(BPF_DIV, R0, R7),
2018 BPF_EXIT_INSN(),
2019 },
2020 INTERNAL,
2021 { 10, 20, 30, 40, 50 },
2022 { { 3, 0 }, { 4, 0 } }
2023 },
2024 {
2025 "check: missing ret", 1881 "check: missing ret",
2026 .u.insns = { 1882 .u.insns = {
2027 BPF_STMT(BPF_LD | BPF_IMM, 1), 1883 BPF_STMT(BPF_LD | BPF_IMM, 1),
@@ -2383,50 +2239,6 @@ static struct bpf_test tests[] = {
2383 { }, 2239 { },
2384 { { 0, 1 } } 2240 { { 0, 1 } }
2385 }, 2241 },
2386 {
2387 "nmap reduced",
2388 .u.insns_int = {
2389 BPF_MOV64_REG(R6, R1),
2390 BPF_LD_ABS(BPF_H, 12),
2391 BPF_JMP_IMM(BPF_JNE, R0, 0x806, 28),
2392 BPF_LD_ABS(BPF_H, 12),
2393 BPF_JMP_IMM(BPF_JNE, R0, 0x806, 26),
2394 BPF_MOV32_IMM(R0, 18),
2395 BPF_STX_MEM(BPF_W, R10, R0, -64),
2396 BPF_LDX_MEM(BPF_W, R7, R10, -64),
2397 BPF_LD_IND(BPF_W, R7, 14),
2398 BPF_STX_MEM(BPF_W, R10, R0, -60),
2399 BPF_MOV32_IMM(R0, 280971478),
2400 BPF_STX_MEM(BPF_W, R10, R0, -56),
2401 BPF_LDX_MEM(BPF_W, R7, R10, -56),
2402 BPF_LDX_MEM(BPF_W, R0, R10, -60),
2403 BPF_ALU32_REG(BPF_SUB, R0, R7),
2404 BPF_JMP_IMM(BPF_JNE, R0, 0, 15),
2405 BPF_LD_ABS(BPF_H, 12),
2406 BPF_JMP_IMM(BPF_JNE, R0, 0x806, 13),
2407 BPF_MOV32_IMM(R0, 22),
2408 BPF_STX_MEM(BPF_W, R10, R0, -56),
2409 BPF_LDX_MEM(BPF_W, R7, R10, -56),
2410 BPF_LD_IND(BPF_H, R7, 14),
2411 BPF_STX_MEM(BPF_W, R10, R0, -52),
2412 BPF_MOV32_IMM(R0, 17366),
2413 BPF_STX_MEM(BPF_W, R10, R0, -48),
2414 BPF_LDX_MEM(BPF_W, R7, R10, -48),
2415 BPF_LDX_MEM(BPF_W, R0, R10, -52),
2416 BPF_ALU32_REG(BPF_SUB, R0, R7),
2417 BPF_JMP_IMM(BPF_JNE, R0, 0, 2),
2418 BPF_MOV32_IMM(R0, 256),
2419 BPF_EXIT_INSN(),
2420 BPF_MOV32_IMM(R0, 0),
2421 BPF_EXIT_INSN(),
2422 },
2423 INTERNAL,
2424 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, 0,
2425 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2426 0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6},
2427 { { 38, 256 } },
2428 .stack_depth = 64,
2429 },
2430 /* BPF_ALU | BPF_MOV | BPF_X */ 2242 /* BPF_ALU | BPF_MOV | BPF_X */
2431 { 2243 {
2432 "ALU_MOV_X: dst = 2", 2244 "ALU_MOV_X: dst = 2",
@@ -5485,22 +5297,6 @@ static struct bpf_test tests[] = {
5485 { { 1, 0xbee } }, 5297 { { 1, 0xbee } },
5486 .fill_helper = bpf_fill_ld_abs_get_processor_id, 5298 .fill_helper = bpf_fill_ld_abs_get_processor_id,
5487 }, 5299 },
5488 {
5489 "BPF_MAXINSNS: ld_abs+vlan_push/pop",
5490 { },
5491 INTERNAL,
5492 { 0x34 },
5493 { { ETH_HLEN, 0xbef } },
5494 .fill_helper = bpf_fill_ld_abs_vlan_push_pop,
5495 },
5496 {
5497 "BPF_MAXINSNS: jump around ld_abs",
5498 { },
5499 INTERNAL,
5500 { 10, 11 },
5501 { { 2, 10 } },
5502 .fill_helper = bpf_fill_jump_around_ld_abs,
5503 },
5504 /* 5300 /*
5505 * LD_IND / LD_ABS on fragmented SKBs 5301 * LD_IND / LD_ABS on fragmented SKBs
5506 */ 5302 */
@@ -5683,6 +5479,53 @@ static struct bpf_test tests[] = {
5683 { {0x40, 0x05 } }, 5479 { {0x40, 0x05 } },
5684 }, 5480 },
5685 { 5481 {
5482 "LD_IND byte positive offset, all ff",
5483 .u.insns = {
5484 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5485 BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1),
5486 BPF_STMT(BPF_RET | BPF_A, 0x0),
5487 },
5488 CLASSIC,
5489 { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff },
5490 { {0x40, 0xff } },
5491 },
5492 {
5493 "LD_IND byte positive offset, out of bounds",
5494 .u.insns = {
5495 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5496 BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1),
5497 BPF_STMT(BPF_RET | BPF_A, 0x0),
5498 },
5499 CLASSIC,
5500 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5501 { {0x3f, 0 }, },
5502 },
5503 {
5504 "LD_IND byte negative offset, out of bounds",
5505 .u.insns = {
5506 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5507 BPF_STMT(BPF_LD | BPF_IND | BPF_B, -0x3f),
5508 BPF_STMT(BPF_RET | BPF_A, 0x0),
5509 },
5510 CLASSIC,
5511 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5512 { {0x3f, 0 } },
5513 },
5514 {
5515 "LD_IND byte negative offset, multiple calls",
5516 .u.insns = {
5517 BPF_STMT(BPF_LDX | BPF_IMM, 0x3b),
5518 BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 1),
5519 BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 2),
5520 BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 3),
5521 BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 4),
5522 BPF_STMT(BPF_RET | BPF_A, 0x0),
5523 },
5524 CLASSIC,
5525 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5526 { {0x40, 0x82 }, },
5527 },
5528 {
5686 "LD_IND halfword positive offset", 5529 "LD_IND halfword positive offset",
5687 .u.insns = { 5530 .u.insns = {
5688 BPF_STMT(BPF_LDX | BPF_IMM, 0x20), 5531 BPF_STMT(BPF_LDX | BPF_IMM, 0x20),
@@ -5731,6 +5574,39 @@ static struct bpf_test tests[] = {
5731 { {0x40, 0x66cc } }, 5574 { {0x40, 0x66cc } },
5732 }, 5575 },
5733 { 5576 {
5577 "LD_IND halfword positive offset, all ff",
5578 .u.insns = {
5579 BPF_STMT(BPF_LDX | BPF_IMM, 0x3d),
5580 BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1),
5581 BPF_STMT(BPF_RET | BPF_A, 0x0),
5582 },
5583 CLASSIC,
5584 { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff },
5585 { {0x40, 0xffff } },
5586 },
5587 {
5588 "LD_IND halfword positive offset, out of bounds",
5589 .u.insns = {
5590 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5591 BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1),
5592 BPF_STMT(BPF_RET | BPF_A, 0x0),
5593 },
5594 CLASSIC,
5595 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5596 { {0x3f, 0 }, },
5597 },
5598 {
5599 "LD_IND halfword negative offset, out of bounds",
5600 .u.insns = {
5601 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5602 BPF_STMT(BPF_LD | BPF_IND | BPF_H, -0x3f),
5603 BPF_STMT(BPF_RET | BPF_A, 0x0),
5604 },
5605 CLASSIC,
5606 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5607 { {0x3f, 0 } },
5608 },
5609 {
5734 "LD_IND word positive offset", 5610 "LD_IND word positive offset",
5735 .u.insns = { 5611 .u.insns = {
5736 BPF_STMT(BPF_LDX | BPF_IMM, 0x20), 5612 BPF_STMT(BPF_LDX | BPF_IMM, 0x20),
@@ -5821,6 +5697,39 @@ static struct bpf_test tests[] = {
5821 { {0x40, 0x66cc77dd } }, 5697 { {0x40, 0x66cc77dd } },
5822 }, 5698 },
5823 { 5699 {
5700 "LD_IND word positive offset, all ff",
5701 .u.insns = {
5702 BPF_STMT(BPF_LDX | BPF_IMM, 0x3b),
5703 BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1),
5704 BPF_STMT(BPF_RET | BPF_A, 0x0),
5705 },
5706 CLASSIC,
5707 { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff },
5708 { {0x40, 0xffffffff } },
5709 },
5710 {
5711 "LD_IND word positive offset, out of bounds",
5712 .u.insns = {
5713 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5714 BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1),
5715 BPF_STMT(BPF_RET | BPF_A, 0x0),
5716 },
5717 CLASSIC,
5718 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5719 { {0x3f, 0 }, },
5720 },
5721 {
5722 "LD_IND word negative offset, out of bounds",
5723 .u.insns = {
5724 BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
5725 BPF_STMT(BPF_LD | BPF_IND | BPF_W, -0x3f),
5726 BPF_STMT(BPF_RET | BPF_A, 0x0),
5727 },
5728 CLASSIC,
5729 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5730 { {0x3f, 0 } },
5731 },
5732 {
5824 "LD_ABS byte", 5733 "LD_ABS byte",
5825 .u.insns = { 5734 .u.insns = {
5826 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x20), 5735 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x20),
@@ -5838,6 +5747,68 @@ static struct bpf_test tests[] = {
5838 { {0x40, 0xcc } }, 5747 { {0x40, 0xcc } },
5839 }, 5748 },
5840 { 5749 {
5750 "LD_ABS byte positive offset, all ff",
5751 .u.insns = {
5752 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f),
5753 BPF_STMT(BPF_RET | BPF_A, 0x0),
5754 },
5755 CLASSIC,
5756 { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff },
5757 { {0x40, 0xff } },
5758 },
5759 {
5760 "LD_ABS byte positive offset, out of bounds",
5761 .u.insns = {
5762 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f),
5763 BPF_STMT(BPF_RET | BPF_A, 0x0),
5764 },
5765 CLASSIC,
5766 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5767 { {0x3f, 0 }, },
5768 },
5769 {
5770 "LD_ABS byte negative offset, out of bounds load",
5771 .u.insns = {
5772 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, -1),
5773 BPF_STMT(BPF_RET | BPF_A, 0x0),
5774 },
5775 CLASSIC | FLAG_EXPECTED_FAIL,
5776 .expected_errcode = -EINVAL,
5777 },
5778 {
5779 "LD_ABS byte negative offset, in bounds",
5780 .u.insns = {
5781 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f),
5782 BPF_STMT(BPF_RET | BPF_A, 0x0),
5783 },
5784 CLASSIC,
5785 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5786 { {0x40, 0x82 }, },
5787 },
5788 {
5789 "LD_ABS byte negative offset, out of bounds",
5790 .u.insns = {
5791 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f),
5792 BPF_STMT(BPF_RET | BPF_A, 0x0),
5793 },
5794 CLASSIC,
5795 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5796 { {0x3f, 0 }, },
5797 },
5798 {
5799 "LD_ABS byte negative offset, multiple calls",
5800 .u.insns = {
5801 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3c),
5802 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3d),
5803 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3e),
5804 BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f),
5805 BPF_STMT(BPF_RET | BPF_A, 0x0),
5806 },
5807 CLASSIC,
5808 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5809 { {0x40, 0x82 }, },
5810 },
5811 {
5841 "LD_ABS halfword", 5812 "LD_ABS halfword",
5842 .u.insns = { 5813 .u.insns = {
5843 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x22), 5814 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x22),
@@ -5872,6 +5843,55 @@ static struct bpf_test tests[] = {
5872 { {0x40, 0x99ff } }, 5843 { {0x40, 0x99ff } },
5873 }, 5844 },
5874 { 5845 {
5846 "LD_ABS halfword positive offset, all ff",
5847 .u.insns = {
5848 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3e),
5849 BPF_STMT(BPF_RET | BPF_A, 0x0),
5850 },
5851 CLASSIC,
5852 { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff },
5853 { {0x40, 0xffff } },
5854 },
5855 {
5856 "LD_ABS halfword positive offset, out of bounds",
5857 .u.insns = {
5858 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3f),
5859 BPF_STMT(BPF_RET | BPF_A, 0x0),
5860 },
5861 CLASSIC,
5862 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5863 { {0x3f, 0 }, },
5864 },
5865 {
5866 "LD_ABS halfword negative offset, out of bounds load",
5867 .u.insns = {
5868 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, -1),
5869 BPF_STMT(BPF_RET | BPF_A, 0x0),
5870 },
5871 CLASSIC | FLAG_EXPECTED_FAIL,
5872 .expected_errcode = -EINVAL,
5873 },
5874 {
5875 "LD_ABS halfword negative offset, in bounds",
5876 .u.insns = {
5877 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e),
5878 BPF_STMT(BPF_RET | BPF_A, 0x0),
5879 },
5880 CLASSIC,
5881 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5882 { {0x40, 0x1982 }, },
5883 },
5884 {
5885 "LD_ABS halfword negative offset, out of bounds",
5886 .u.insns = {
5887 BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e),
5888 BPF_STMT(BPF_RET | BPF_A, 0x0),
5889 },
5890 CLASSIC,
5891 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5892 { {0x3f, 0 }, },
5893 },
5894 {
5875 "LD_ABS word", 5895 "LD_ABS word",
5876 .u.insns = { 5896 .u.insns = {
5877 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x1c), 5897 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x1c),
@@ -5939,6 +5959,140 @@ static struct bpf_test tests[] = {
5939 }, 5959 },
5940 { {0x40, 0x88ee99ff } }, 5960 { {0x40, 0x88ee99ff } },
5941 }, 5961 },
5962 {
5963 "LD_ABS word positive offset, all ff",
5964 .u.insns = {
5965 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3c),
5966 BPF_STMT(BPF_RET | BPF_A, 0x0),
5967 },
5968 CLASSIC,
5969 { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff },
5970 { {0x40, 0xffffffff } },
5971 },
5972 {
5973 "LD_ABS word positive offset, out of bounds",
5974 .u.insns = {
5975 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3f),
5976 BPF_STMT(BPF_RET | BPF_A, 0x0),
5977 },
5978 CLASSIC,
5979 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5980 { {0x3f, 0 }, },
5981 },
5982 {
5983 "LD_ABS word negative offset, out of bounds load",
5984 .u.insns = {
5985 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, -1),
5986 BPF_STMT(BPF_RET | BPF_A, 0x0),
5987 },
5988 CLASSIC | FLAG_EXPECTED_FAIL,
5989 .expected_errcode = -EINVAL,
5990 },
5991 {
5992 "LD_ABS word negative offset, in bounds",
5993 .u.insns = {
5994 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c),
5995 BPF_STMT(BPF_RET | BPF_A, 0x0),
5996 },
5997 CLASSIC,
5998 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
5999 { {0x40, 0x25051982 }, },
6000 },
6001 {
6002 "LD_ABS word negative offset, out of bounds",
6003 .u.insns = {
6004 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c),
6005 BPF_STMT(BPF_RET | BPF_A, 0x0),
6006 },
6007 CLASSIC,
6008 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6009 { {0x3f, 0 }, },
6010 },
6011 {
6012 "LDX_MSH standalone, preserved A",
6013 .u.insns = {
6014 BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
6015 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c),
6016 BPF_STMT(BPF_RET | BPF_A, 0x0),
6017 },
6018 CLASSIC,
6019 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6020 { {0x40, 0xffeebbaa }, },
6021 },
6022 {
6023 "LDX_MSH standalone, preserved A 2",
6024 .u.insns = {
6025 BPF_STMT(BPF_LD | BPF_IMM, 0x175e9d63),
6026 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c),
6027 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3d),
6028 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e),
6029 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3f),
6030 BPF_STMT(BPF_RET | BPF_A, 0x0),
6031 },
6032 CLASSIC,
6033 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6034 { {0x40, 0x175e9d63 }, },
6035 },
6036 {
6037 "LDX_MSH standalone, test result 1",
6038 .u.insns = {
6039 BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
6040 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c),
6041 BPF_STMT(BPF_MISC | BPF_TXA, 0),
6042 BPF_STMT(BPF_RET | BPF_A, 0x0),
6043 },
6044 CLASSIC,
6045 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6046 { {0x40, 0x14 }, },
6047 },
6048 {
6049 "LDX_MSH standalone, test result 2",
6050 .u.insns = {
6051 BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
6052 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e),
6053 BPF_STMT(BPF_MISC | BPF_TXA, 0),
6054 BPF_STMT(BPF_RET | BPF_A, 0x0),
6055 },
6056 CLASSIC,
6057 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6058 { {0x40, 0x24 }, },
6059 },
6060 {
6061 "LDX_MSH standalone, negative offset",
6062 .u.insns = {
6063 BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
6064 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, -1),
6065 BPF_STMT(BPF_MISC | BPF_TXA, 0),
6066 BPF_STMT(BPF_RET | BPF_A, 0x0),
6067 },
6068 CLASSIC,
6069 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6070 { {0x40, 0 }, },
6071 },
6072 {
6073 "LDX_MSH standalone, negative offset 2",
6074 .u.insns = {
6075 BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
6076 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, SKF_LL_OFF + 0x3e),
6077 BPF_STMT(BPF_MISC | BPF_TXA, 0),
6078 BPF_STMT(BPF_RET | BPF_A, 0x0),
6079 },
6080 CLASSIC,
6081 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6082 { {0x40, 0x24 }, },
6083 },
6084 {
6085 "LDX_MSH standalone, out of bounds",
6086 .u.insns = {
6087 BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
6088 BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x40),
6089 BPF_STMT(BPF_MISC | BPF_TXA, 0),
6090 BPF_STMT(BPF_RET | BPF_A, 0x0),
6091 },
6092 CLASSIC,
6093 { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 },
6094 { {0x40, 0 }, },
6095 },
5942 /* 6096 /*
5943 * verify that the interpreter or JIT correctly sets A and X 6097 * verify that the interpreter or JIT correctly sets A and X
5944 * to 0. 6098 * to 0.
@@ -6127,14 +6281,6 @@ static struct bpf_test tests[] = {
6127 {}, 6281 {},
6128 { {0x1, 0x42 } }, 6282 { {0x1, 0x42 } },
6129 }, 6283 },
6130 {
6131 "LD_ABS with helper changing skb data",
6132 { },
6133 INTERNAL,
6134 { 0x34 },
6135 { { ETH_HLEN, 42 } },
6136 .fill_helper = bpf_fill_ld_abs_vlan_push_pop2,
6137 },
6138 /* Checking interpreter vs JIT wrt signed extended imms. */ 6284 /* Checking interpreter vs JIT wrt signed extended imms. */
6139 { 6285 {
6140 "JNE signed compare, test 1", 6286 "JNE signed compare, test 1",
diff --git a/net/Kconfig b/net/Kconfig
index b62089fb1332..df8d45ef47d8 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
59source "net/xfrm/Kconfig" 59source "net/xfrm/Kconfig"
60source "net/iucv/Kconfig" 60source "net/iucv/Kconfig"
61source "net/smc/Kconfig" 61source "net/smc/Kconfig"
62source "net/xdp/Kconfig"
62 63
63config INET 64config INET
64 bool "TCP/IP networking" 65 bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..77aaddedbd29 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -85,3 +85,4 @@ obj-y += l3mdev/
85endif 85endif
86obj-$(CONFIG_QRTR) += qrtr/ 86obj-$(CONFIG_QRTR) += qrtr/
87obj-$(CONFIG_NET_NCSI) += ncsi/ 87obj-$(CONFIG_NET_NCSI) += ncsi/
88obj-$(CONFIG_XDP_SOCKETS) += xdp/
diff --git a/net/core/dev.c b/net/core/dev.c
index bb81a6e1d354..29bf39174900 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3627,6 +3627,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3627} 3627}
3628EXPORT_SYMBOL(dev_queue_xmit_accel); 3628EXPORT_SYMBOL(dev_queue_xmit_accel);
3629 3629
3630int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
3631{
3632 struct net_device *dev = skb->dev;
3633 struct sk_buff *orig_skb = skb;
3634 struct netdev_queue *txq;
3635 int ret = NETDEV_TX_BUSY;
3636 bool again = false;
3637
3638 if (unlikely(!netif_running(dev) ||
3639 !netif_carrier_ok(dev)))
3640 goto drop;
3641
3642 skb = validate_xmit_skb_list(skb, dev, &again);
3643 if (skb != orig_skb)
3644 goto drop;
3645
3646 skb_set_queue_mapping(skb, queue_id);
3647 txq = skb_get_tx_queue(dev, skb);
3648
3649 local_bh_disable();
3650
3651 HARD_TX_LOCK(dev, txq, smp_processor_id());
3652 if (!netif_xmit_frozen_or_drv_stopped(txq))
3653 ret = netdev_start_xmit(skb, dev, txq, false);
3654 HARD_TX_UNLOCK(dev, txq);
3655
3656 local_bh_enable();
3657
3658 if (!dev_xmit_complete(ret))
3659 kfree_skb(skb);
3660
3661 return ret;
3662drop:
3663 atomic_long_inc(&dev->tx_dropped);
3664 kfree_skb_list(skb);
3665 return NET_XMIT_DROP;
3666}
3667EXPORT_SYMBOL(dev_direct_xmit);
3630 3668
3631/************************************************************************* 3669/*************************************************************************
3632 * Receiver routines 3670 * Receiver routines
@@ -3996,12 +4034,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
3996} 4034}
3997 4035
3998static u32 netif_receive_generic_xdp(struct sk_buff *skb, 4036static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4037 struct xdp_buff *xdp,
3999 struct bpf_prog *xdp_prog) 4038 struct bpf_prog *xdp_prog)
4000{ 4039{
4001 struct netdev_rx_queue *rxqueue; 4040 struct netdev_rx_queue *rxqueue;
4002 void *orig_data, *orig_data_end; 4041 void *orig_data, *orig_data_end;
4003 u32 metalen, act = XDP_DROP; 4042 u32 metalen, act = XDP_DROP;
4004 struct xdp_buff xdp;
4005 int hlen, off; 4043 int hlen, off;
4006 u32 mac_len; 4044 u32 mac_len;
4007 4045
@@ -4036,19 +4074,19 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4036 */ 4074 */
4037 mac_len = skb->data - skb_mac_header(skb); 4075 mac_len = skb->data - skb_mac_header(skb);
4038 hlen = skb_headlen(skb) + mac_len; 4076 hlen = skb_headlen(skb) + mac_len;
4039 xdp.data = skb->data - mac_len; 4077 xdp->data = skb->data - mac_len;
4040 xdp.data_meta = xdp.data; 4078 xdp->data_meta = xdp->data;
4041 xdp.data_end = xdp.data + hlen; 4079 xdp->data_end = xdp->data + hlen;
4042 xdp.data_hard_start = skb->data - skb_headroom(skb); 4080 xdp->data_hard_start = skb->data - skb_headroom(skb);
4043 orig_data_end = xdp.data_end; 4081 orig_data_end = xdp->data_end;
4044 orig_data = xdp.data; 4082 orig_data = xdp->data;
4045 4083
4046 rxqueue = netif_get_rxqueue(skb); 4084 rxqueue = netif_get_rxqueue(skb);
4047 xdp.rxq = &rxqueue->xdp_rxq; 4085 xdp->rxq = &rxqueue->xdp_rxq;
4048 4086
4049 act = bpf_prog_run_xdp(xdp_prog, &xdp); 4087 act = bpf_prog_run_xdp(xdp_prog, xdp);
4050 4088
4051 off = xdp.data - orig_data; 4089 off = xdp->data - orig_data;
4052 if (off > 0) 4090 if (off > 0)
4053 __skb_pull(skb, off); 4091 __skb_pull(skb, off);
4054 else if (off < 0) 4092 else if (off < 0)
@@ -4058,10 +4096,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4058 /* check if bpf_xdp_adjust_tail was used. it can only "shrink" 4096 /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4059 * pckt. 4097 * pckt.
4060 */ 4098 */
4061 off = orig_data_end - xdp.data_end; 4099 off = orig_data_end - xdp->data_end;
4062 if (off != 0) { 4100 if (off != 0) {
4063 skb_set_tail_pointer(skb, xdp.data_end - xdp.data); 4101 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4064 skb->len -= off; 4102 skb->len -= off;
4103
4065 } 4104 }
4066 4105
4067 switch (act) { 4106 switch (act) {
@@ -4070,7 +4109,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4070 __skb_push(skb, mac_len); 4109 __skb_push(skb, mac_len);
4071 break; 4110 break;
4072 case XDP_PASS: 4111 case XDP_PASS:
4073 metalen = xdp.data - xdp.data_meta; 4112 metalen = xdp->data - xdp->data_meta;
4074 if (metalen) 4113 if (metalen)
4075 skb_metadata_set(skb, metalen); 4114 skb_metadata_set(skb, metalen);
4076 break; 4115 break;
@@ -4120,17 +4159,19 @@ static struct static_key generic_xdp_needed __read_mostly;
4120int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) 4159int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4121{ 4160{
4122 if (xdp_prog) { 4161 if (xdp_prog) {
4123 u32 act = netif_receive_generic_xdp(skb, xdp_prog); 4162 struct xdp_buff xdp;
4163 u32 act;
4124 int err; 4164 int err;
4125 4165
4166 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4126 if (act != XDP_PASS) { 4167 if (act != XDP_PASS) {
4127 switch (act) { 4168 switch (act) {
4128 case XDP_REDIRECT: 4169 case XDP_REDIRECT:
4129 err = xdp_do_generic_redirect(skb->dev, skb, 4170 err = xdp_do_generic_redirect(skb->dev, skb,
4130 xdp_prog); 4171 &xdp, xdp_prog);
4131 if (err) 4172 if (err)
4132 goto out_redir; 4173 goto out_redir;
4133 /* fallthru to submit skb */ 4174 break;
4134 case XDP_TX: 4175 case XDP_TX:
4135 generic_xdp_tx(skb, xdp_prog); 4176 generic_xdp_tx(skb, xdp_prog);
4136 break; 4177 break;
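dev_direct_xmit() above gives a caller that already owns a fully built skb a way to place it on a specific TX queue while bypassing the qdisc layer (the AF_XDP transmit path added elsewhere in this series is its user). A rough sketch of a caller, with the wrapper name and error mapping invented for illustration:

/* Illustrative kernel-side caller of dev_direct_xmit(); not taken from
 * this patch.  Assumes 'skb' is fully built and skb->dev is set. */
#include <linux/errno.h>
#include <linux/netdevice.h>

static int try_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
	int err;

	err = dev_direct_xmit(skb, queue_id);
	/* dev_direct_xmit() consumes the skb in every case, so the caller
	 * only has to map the return code. */
	if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY)
		return -EBUSY;
	return 0;
}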
diff --git a/net/core/filter.c b/net/core/filter.c
index d3781daa26ab..6877426c23a6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,6 +59,7 @@
59#include <net/tcp.h> 59#include <net/tcp.h>
60#include <net/xfrm.h> 60#include <net/xfrm.h>
61#include <linux/bpf_trace.h> 61#include <linux/bpf_trace.h>
62#include <net/xdp_sock.h>
62 63
63/** 64/**
64 * sk_filter_trim_cap - run a packet through a socket filter 65 * sk_filter_trim_cap - run a packet through a socket filter
@@ -112,12 +113,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
112} 113}
113EXPORT_SYMBOL(sk_filter_trim_cap); 114EXPORT_SYMBOL(sk_filter_trim_cap);
114 115
115BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) 116BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
116{ 117{
117 return skb_get_poff(skb); 118 return skb_get_poff(skb);
118} 119}
119 120
120BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) 121BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
121{ 122{
122 struct nlattr *nla; 123 struct nlattr *nla;
123 124
@@ -137,7 +138,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
137 return 0; 138 return 0;
138} 139}
139 140
140BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) 141BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
141{ 142{
142 struct nlattr *nla; 143 struct nlattr *nla;
143 144
@@ -161,13 +162,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
161 return 0; 162 return 0;
162} 163}
163 164
164BPF_CALL_0(__get_raw_cpu_id) 165BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
166 data, int, headlen, int, offset)
167{
168 u8 tmp, *ptr;
169 const int len = sizeof(tmp);
170
171 if (offset >= 0) {
172 if (headlen - offset >= len)
173 return *(u8 *)(data + offset);
174 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
175 return tmp;
176 } else {
177 ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
178 if (likely(ptr))
179 return *(u8 *)ptr;
180 }
181
182 return -EFAULT;
183}
184
185BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
186 int, offset)
187{
188 return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
189 offset);
190}
191
192BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
193 data, int, headlen, int, offset)
194{
195 u16 tmp, *ptr;
196 const int len = sizeof(tmp);
197
198 if (offset >= 0) {
199 if (headlen - offset >= len)
200 return get_unaligned_be16(data + offset);
201 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
202 return be16_to_cpu(tmp);
203 } else {
204 ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
205 if (likely(ptr))
206 return get_unaligned_be16(ptr);
207 }
208
209 return -EFAULT;
210}
211
212BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
213 int, offset)
214{
215 return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
216 offset);
217}
218
219BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
220 data, int, headlen, int, offset)
221{
222 u32 tmp, *ptr;
223 const int len = sizeof(tmp);
224
225 if (likely(offset >= 0)) {
226 if (headlen - offset >= len)
227 return get_unaligned_be32(data + offset);
228 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
229 return be32_to_cpu(tmp);
230 } else {
231 ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
232 if (likely(ptr))
233 return get_unaligned_be32(ptr);
234 }
235
236 return -EFAULT;
237}
238
239BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
240 int, offset)
241{
242 return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
243 offset);
244}
245
246BPF_CALL_0(bpf_get_raw_cpu_id)
165{ 247{
166 return raw_smp_processor_id(); 248 return raw_smp_processor_id();
167} 249}
168 250
169static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { 251static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
170 .func = __get_raw_cpu_id, 252 .func = bpf_get_raw_cpu_id,
171 .gpl_only = false, 253 .gpl_only = false,
172 .ret_type = RET_INTEGER, 254 .ret_type = RET_INTEGER,
173}; 255};
@@ -317,16 +399,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
317 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 399 /* Emit call(arg1=CTX, arg2=A, arg3=X) */
318 switch (fp->k) { 400 switch (fp->k) {
319 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 401 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
320 *insn = BPF_EMIT_CALL(__skb_get_pay_offset); 402 *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
321 break; 403 break;
322 case SKF_AD_OFF + SKF_AD_NLATTR: 404 case SKF_AD_OFF + SKF_AD_NLATTR:
323 *insn = BPF_EMIT_CALL(__skb_get_nlattr); 405 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
324 break; 406 break;
325 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 407 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
326 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); 408 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
327 break; 409 break;
328 case SKF_AD_OFF + SKF_AD_CPU: 410 case SKF_AD_OFF + SKF_AD_CPU:
329 *insn = BPF_EMIT_CALL(__get_raw_cpu_id); 411 *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
330 break; 412 break;
331 case SKF_AD_OFF + SKF_AD_RANDOM: 413 case SKF_AD_OFF + SKF_AD_RANDOM:
332 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 414 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
@@ -353,26 +435,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
353 return true; 435 return true;
354} 436}
355 437
438static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
439{
440 const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
441 int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
442 bool endian = BPF_SIZE(fp->code) == BPF_H ||
443 BPF_SIZE(fp->code) == BPF_W;
444 bool indirect = BPF_MODE(fp->code) == BPF_IND;
445 const int ip_align = NET_IP_ALIGN;
446 struct bpf_insn *insn = *insnp;
447 int offset = fp->k;
448
449 if (!indirect &&
450 ((unaligned_ok && offset >= 0) ||
451 (!unaligned_ok && offset >= 0 &&
452 offset + ip_align >= 0 &&
453 offset + ip_align % size == 0))) {
454 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
455 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
456 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
457 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
458 offset);
459 if (endian)
460 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
461 *insn++ = BPF_JMP_A(8);
462 }
463
464 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
465 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
466 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
467 if (!indirect) {
468 *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
469 } else {
470 *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
471 if (fp->k)
472 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
473 }
474
475 switch (BPF_SIZE(fp->code)) {
476 case BPF_B:
477 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
478 break;
479 case BPF_H:
480 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
481 break;
482 case BPF_W:
483 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
484 break;
485 default:
486 return false;
487 }
488
489 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
490 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
491 *insn = BPF_EXIT_INSN();
492
493 *insnp = insn;
494 return true;
495}
496
356/** 497/**
357 * bpf_convert_filter - convert filter program 498 * bpf_convert_filter - convert filter program
358 * @prog: the user passed filter program 499 * @prog: the user passed filter program
359 * @len: the length of the user passed filter program 500 * @len: the length of the user passed filter program
360 * @new_prog: allocated 'struct bpf_prog' or NULL 501 * @new_prog: allocated 'struct bpf_prog' or NULL
361 * @new_len: pointer to store length of converted program 502 * @new_len: pointer to store length of converted program
503 * @seen_ld_abs: bool whether we've seen ld_abs/ind
362 * 504 *
363 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' 505 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
364 * style extended BPF (eBPF). 506 * style extended BPF (eBPF).
365 * Conversion workflow: 507 * Conversion workflow:
366 * 508 *
367 * 1) First pass for calculating the new program length: 509 * 1) First pass for calculating the new program length:
368 * bpf_convert_filter(old_prog, old_len, NULL, &new_len) 510 * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
369 * 511 *
370 * 2) 2nd pass to remap in two passes: 1st pass finds new 512 * 2) 2nd pass to remap in two passes: 1st pass finds new
371 * jump offsets, 2nd pass remapping: 513 * jump offsets, 2nd pass remapping:
372 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); 514 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
373 */ 515 */
374static int bpf_convert_filter(struct sock_filter *prog, int len, 516static int bpf_convert_filter(struct sock_filter *prog, int len,
375 struct bpf_prog *new_prog, int *new_len) 517 struct bpf_prog *new_prog, int *new_len,
518 bool *seen_ld_abs)
376{ 519{
377 int new_flen = 0, pass = 0, target, i, stack_off; 520 int new_flen = 0, pass = 0, target, i, stack_off;
378 struct bpf_insn *new_insn, *first_insn = NULL; 521 struct bpf_insn *new_insn, *first_insn = NULL;
@@ -411,12 +554,27 @@ do_pass:
411 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 554 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
412 */ 555 */
413 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 556 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
557 if (*seen_ld_abs) {
558 /* For packet access in classic BPF, cache skb->data
559 * in callee-saved BPF R8 and skb->len - skb->data_len
560 * (headlen) in BPF R9. Since classic BPF is read-only
561 * on CTX, we only need to cache it once.
562 */
563 *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
564 BPF_REG_D, BPF_REG_CTX,
565 offsetof(struct sk_buff, data));
566 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
567 offsetof(struct sk_buff, len));
568 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
569 offsetof(struct sk_buff, data_len));
570 *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
571 }
414 } else { 572 } else {
415 new_insn += 3; 573 new_insn += 3;
416 } 574 }
417 575
418 for (i = 0; i < len; fp++, i++) { 576 for (i = 0; i < len; fp++, i++) {
419 struct bpf_insn tmp_insns[6] = { }; 577 struct bpf_insn tmp_insns[32] = { };
420 struct bpf_insn *insn = tmp_insns; 578 struct bpf_insn *insn = tmp_insns;
421 579
422 if (addrs) 580 if (addrs)
@@ -459,6 +617,11 @@ do_pass:
459 BPF_MODE(fp->code) == BPF_ABS && 617 BPF_MODE(fp->code) == BPF_ABS &&
460 convert_bpf_extensions(fp, &insn)) 618 convert_bpf_extensions(fp, &insn))
461 break; 619 break;
620 if (BPF_CLASS(fp->code) == BPF_LD &&
621 convert_bpf_ld_abs(fp, &insn)) {
622 *seen_ld_abs = true;
623 break;
624 }
462 625
463 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || 626 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
464 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { 627 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
@@ -561,21 +724,31 @@ jmp_rest:
561 break; 724 break;
562 725
 563 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ 726 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
564 case BPF_LDX | BPF_MSH | BPF_B: 727 case BPF_LDX | BPF_MSH | BPF_B: {
565 /* tmp = A */ 728 struct sock_filter tmp = {
566 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); 729 .code = BPF_LD | BPF_ABS | BPF_B,
730 .k = fp->k,
731 };
732
733 *seen_ld_abs = true;
734
735 /* X = A */
736 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
567 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 737 /* A = BPF_R0 = *(u8 *) (skb->data + K) */
568 *insn++ = BPF_LD_ABS(BPF_B, fp->k); 738 convert_bpf_ld_abs(&tmp, &insn);
739 insn++;
569 /* A &= 0xf */ 740 /* A &= 0xf */
570 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 741 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
571 /* A <<= 2 */ 742 /* A <<= 2 */
572 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 743 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
744 /* tmp = X */
745 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
573 /* X = A */ 746 /* X = A */
574 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 747 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
575 /* A = tmp */ 748 /* A = tmp */
576 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 749 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
577 break; 750 break;
578 751 }
 579 /* RET_K is remapped into 2 insns. RET_A case doesn't need an 752 /* RET_K is remapped into 2 insns. RET_A case doesn't need an
580 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 753 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
581 */ 754 */
@@ -657,6 +830,8 @@ jmp_rest:
657 if (!new_prog) { 830 if (!new_prog) {
658 /* Only calculating new length. */ 831 /* Only calculating new length. */
659 *new_len = new_insn - first_insn; 832 *new_len = new_insn - first_insn;
833 if (*seen_ld_abs)
834 *new_len += 4; /* Prologue bits. */
660 return 0; 835 return 0;
661 } 836 }
662 837
@@ -1018,6 +1193,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1018 struct sock_filter *old_prog; 1193 struct sock_filter *old_prog;
1019 struct bpf_prog *old_fp; 1194 struct bpf_prog *old_fp;
1020 int err, new_len, old_len = fp->len; 1195 int err, new_len, old_len = fp->len;
1196 bool seen_ld_abs = false;
1021 1197
1022 /* We are free to overwrite insns et al right here as it 1198 /* We are free to overwrite insns et al right here as it
1023 * won't be used at this point in time anymore internally 1199 * won't be used at this point in time anymore internally
@@ -1039,7 +1215,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1039 } 1215 }
1040 1216
1041 /* 1st pass: calculate the new program length. */ 1217 /* 1st pass: calculate the new program length. */
1042 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 1218 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
1219 &seen_ld_abs);
1043 if (err) 1220 if (err)
1044 goto out_err_free; 1221 goto out_err_free;
1045 1222
@@ -1058,7 +1235,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1058 fp->len = new_len; 1235 fp->len = new_len;
1059 1236
1060 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 1237 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1061 err = bpf_convert_filter(old_prog, old_len, fp, &new_len); 1238 err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
1239 &seen_ld_abs);
1062 if (err) 1240 if (err)
1063 /* 2nd bpf_convert_filter() can fail only if it fails 1241 /* 2nd bpf_convert_filter() can fail only if it fails
1064 * to allocate memory, remapping must succeed. Note, 1242 * to allocate memory, remapping must succeed. Note,
@@ -1506,6 +1684,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1506 .arg4_type = ARG_CONST_SIZE, 1684 .arg4_type = ARG_CONST_SIZE,
1507}; 1685};
1508 1686
1687BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1688 u32, offset, void *, to, u32, len, u32, start_header)
1689{
1690 u8 *ptr;
1691
1692 if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
1693 goto err_clear;
1694
1695 switch (start_header) {
1696 case BPF_HDR_START_MAC:
1697 ptr = skb_mac_header(skb) + offset;
1698 break;
1699 case BPF_HDR_START_NET:
1700 ptr = skb_network_header(skb) + offset;
1701 break;
1702 default:
1703 goto err_clear;
1704 }
1705
1706 if (likely(ptr >= skb_mac_header(skb) &&
1707 ptr + len <= skb_tail_pointer(skb))) {
1708 memcpy(to, ptr, len);
1709 return 0;
1710 }
1711
1712err_clear:
1713 memset(to, 0, len);
1714 return -EFAULT;
1715}
1716
1717static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
1718 .func = bpf_skb_load_bytes_relative,
1719 .gpl_only = false,
1720 .ret_type = RET_INTEGER,
1721 .arg1_type = ARG_PTR_TO_CTX,
1722 .arg2_type = ARG_ANYTHING,
1723 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1724 .arg4_type = ARG_CONST_SIZE,
1725 .arg5_type = ARG_ANYTHING,
1726};
1727
1509BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1728BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1510{ 1729{
1511 /* Idea is the following: should the needed direct read/write 1730 /* Idea is the following: should the needed direct read/write
@@ -2180,7 +2399,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
2180 return ret; 2399 return ret;
2181} 2400}
2182 2401
2183const struct bpf_func_proto bpf_skb_vlan_push_proto = { 2402static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
2184 .func = bpf_skb_vlan_push, 2403 .func = bpf_skb_vlan_push,
2185 .gpl_only = false, 2404 .gpl_only = false,
2186 .ret_type = RET_INTEGER, 2405 .ret_type = RET_INTEGER,
@@ -2188,7 +2407,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
2188 .arg2_type = ARG_ANYTHING, 2407 .arg2_type = ARG_ANYTHING,
2189 .arg3_type = ARG_ANYTHING, 2408 .arg3_type = ARG_ANYTHING,
2190}; 2409};
2191EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
2192 2410
2193BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) 2411BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
2194{ 2412{
@@ -2202,13 +2420,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
2202 return ret; 2420 return ret;
2203} 2421}
2204 2422
2205const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 2423static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
2206 .func = bpf_skb_vlan_pop, 2424 .func = bpf_skb_vlan_pop,
2207 .gpl_only = false, 2425 .gpl_only = false,
2208 .ret_type = RET_INTEGER, 2426 .ret_type = RET_INTEGER,
2209 .arg1_type = ARG_PTR_TO_CTX, 2427 .arg1_type = ARG_PTR_TO_CTX,
2210}; 2428};
2211EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
2212 2429
2213static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 2430static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
2214{ 2431{
@@ -2801,7 +3018,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
2801{ 3018{
2802 int err; 3019 int err;
2803 3020
2804 if (map->map_type == BPF_MAP_TYPE_DEVMAP) { 3021 switch (map->map_type) {
3022 case BPF_MAP_TYPE_DEVMAP: {
2805 struct net_device *dev = fwd; 3023 struct net_device *dev = fwd;
2806 struct xdp_frame *xdpf; 3024 struct xdp_frame *xdpf;
2807 3025
@@ -2819,14 +3037,25 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
2819 if (err) 3037 if (err)
2820 return err; 3038 return err;
2821 __dev_map_insert_ctx(map, index); 3039 __dev_map_insert_ctx(map, index);
2822 3040 break;
2823 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 3041 }
3042 case BPF_MAP_TYPE_CPUMAP: {
2824 struct bpf_cpu_map_entry *rcpu = fwd; 3043 struct bpf_cpu_map_entry *rcpu = fwd;
2825 3044
2826 err = cpu_map_enqueue(rcpu, xdp, dev_rx); 3045 err = cpu_map_enqueue(rcpu, xdp, dev_rx);
2827 if (err) 3046 if (err)
2828 return err; 3047 return err;
2829 __cpu_map_insert_ctx(map, index); 3048 __cpu_map_insert_ctx(map, index);
3049 break;
3050 }
3051 case BPF_MAP_TYPE_XSKMAP: {
3052 struct xdp_sock *xs = fwd;
3053
3054 err = __xsk_map_redirect(map, xdp, xs);
3055 return err;
3056 }
3057 default:
3058 break;
2830 } 3059 }
2831 return 0; 3060 return 0;
2832} 3061}
@@ -2845,6 +3074,9 @@ void xdp_do_flush_map(void)
2845 case BPF_MAP_TYPE_CPUMAP: 3074 case BPF_MAP_TYPE_CPUMAP:
2846 __cpu_map_flush(map); 3075 __cpu_map_flush(map);
2847 break; 3076 break;
3077 case BPF_MAP_TYPE_XSKMAP:
3078 __xsk_map_flush(map);
3079 break;
2848 default: 3080 default:
2849 break; 3081 break;
2850 } 3082 }
@@ -2859,6 +3091,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
2859 return __dev_map_lookup_elem(map, index); 3091 return __dev_map_lookup_elem(map, index);
2860 case BPF_MAP_TYPE_CPUMAP: 3092 case BPF_MAP_TYPE_CPUMAP:
2861 return __cpu_map_lookup_elem(map, index); 3093 return __cpu_map_lookup_elem(map, index);
3094 case BPF_MAP_TYPE_XSKMAP:
3095 return __xsk_map_lookup_elem(map, index);
2862 default: 3096 default:
2863 return NULL; 3097 return NULL;
2864 } 3098 }
@@ -2956,13 +3190,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
2956 3190
2957static int xdp_do_generic_redirect_map(struct net_device *dev, 3191static int xdp_do_generic_redirect_map(struct net_device *dev,
2958 struct sk_buff *skb, 3192 struct sk_buff *skb,
3193 struct xdp_buff *xdp,
2959 struct bpf_prog *xdp_prog) 3194 struct bpf_prog *xdp_prog)
2960{ 3195{
2961 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3196 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
2962 unsigned long map_owner = ri->map_owner; 3197 unsigned long map_owner = ri->map_owner;
2963 struct bpf_map *map = ri->map; 3198 struct bpf_map *map = ri->map;
2964 struct net_device *fwd = NULL;
2965 u32 index = ri->ifindex; 3199 u32 index = ri->ifindex;
3200 void *fwd = NULL;
2966 int err = 0; 3201 int err = 0;
2967 3202
2968 ri->ifindex = 0; 3203 ri->ifindex = 0;
@@ -2984,6 +3219,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
2984 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 3219 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
2985 goto err; 3220 goto err;
2986 skb->dev = fwd; 3221 skb->dev = fwd;
3222 generic_xdp_tx(skb, xdp_prog);
3223 } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
3224 struct xdp_sock *xs = fwd;
3225
3226 err = xsk_generic_rcv(xs, xdp);
3227 if (err)
3228 goto err;
3229 consume_skb(skb);
2987 } else { 3230 } else {
2988 /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ 3231 /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
2989 err = -EBADRQC; 3232 err = -EBADRQC;
@@ -2998,7 +3241,7 @@ err:
2998} 3241}
2999 3242
3000int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 3243int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
3001 struct bpf_prog *xdp_prog) 3244 struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
3002{ 3245{
3003 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3246 struct redirect_info *ri = this_cpu_ptr(&redirect_info);
3004 u32 index = ri->ifindex; 3247 u32 index = ri->ifindex;
@@ -3006,7 +3249,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
3006 int err = 0; 3249 int err = 0;
3007 3250
3008 if (ri->map) 3251 if (ri->map)
3009 return xdp_do_generic_redirect_map(dev, skb, xdp_prog); 3252 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
3010 3253
3011 ri->ifindex = 0; 3254 ri->ifindex = 0;
3012 fwd = dev_get_by_index_rcu(dev_net(dev), index); 3255 fwd = dev_get_by_index_rcu(dev_net(dev), index);
@@ -3020,6 +3263,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
3020 3263
3021 skb->dev = fwd; 3264 skb->dev = fwd;
3022 _trace_xdp_redirect(dev, xdp_prog, index); 3265 _trace_xdp_redirect(dev, xdp_prog, index);
3266 generic_xdp_tx(skb, xdp_prog);
3023 return 0; 3267 return 0;
3024err: 3268err:
3025 _trace_xdp_redirect_err(dev, xdp_prog, index, err); 3269 _trace_xdp_redirect_err(dev, xdp_prog, index, err);
@@ -3858,6 +4102,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3858 switch (func_id) { 4102 switch (func_id) {
3859 case BPF_FUNC_skb_load_bytes: 4103 case BPF_FUNC_skb_load_bytes:
3860 return &bpf_skb_load_bytes_proto; 4104 return &bpf_skb_load_bytes_proto;
4105 case BPF_FUNC_skb_load_bytes_relative:
4106 return &bpf_skb_load_bytes_relative_proto;
3861 case BPF_FUNC_get_socket_cookie: 4107 case BPF_FUNC_get_socket_cookie:
3862 return &bpf_get_socket_cookie_proto; 4108 return &bpf_get_socket_cookie_proto;
3863 case BPF_FUNC_get_socket_uid: 4109 case BPF_FUNC_get_socket_uid:
@@ -3875,6 +4121,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3875 return &bpf_skb_store_bytes_proto; 4121 return &bpf_skb_store_bytes_proto;
3876 case BPF_FUNC_skb_load_bytes: 4122 case BPF_FUNC_skb_load_bytes:
3877 return &bpf_skb_load_bytes_proto; 4123 return &bpf_skb_load_bytes_proto;
4124 case BPF_FUNC_skb_load_bytes_relative:
4125 return &bpf_skb_load_bytes_relative_proto;
3878 case BPF_FUNC_skb_pull_data: 4126 case BPF_FUNC_skb_pull_data:
3879 return &bpf_skb_pull_data_proto; 4127 return &bpf_skb_pull_data_proto;
3880 case BPF_FUNC_csum_diff: 4128 case BPF_FUNC_csum_diff:
@@ -4304,6 +4552,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
4304 return insn - insn_buf; 4552 return insn - insn_buf;
4305} 4553}
4306 4554
4555static int bpf_gen_ld_abs(const struct bpf_insn *orig,
4556 struct bpf_insn *insn_buf)
4557{
4558 bool indirect = BPF_MODE(orig->code) == BPF_IND;
4559 struct bpf_insn *insn = insn_buf;
4560
4561 /* We're guaranteed here that CTX is in R6. */
4562 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
4563 if (!indirect) {
4564 *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
4565 } else {
4566 *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
4567 if (orig->imm)
4568 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
4569 }
4570
4571 switch (BPF_SIZE(orig->code)) {
4572 case BPF_B:
4573 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
4574 break;
4575 case BPF_H:
4576 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
4577 break;
4578 case BPF_W:
4579 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
4580 break;
4581 }
4582
4583 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
4584 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
4585 *insn++ = BPF_EXIT_INSN();
4586
4587 return insn - insn_buf;
4588}
4589
4307static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, 4590static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
4308 const struct bpf_prog *prog) 4591 const struct bpf_prog *prog)
4309{ 4592{
@@ -5573,6 +5856,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
5573 .get_func_proto = sk_filter_func_proto, 5856 .get_func_proto = sk_filter_func_proto,
5574 .is_valid_access = sk_filter_is_valid_access, 5857 .is_valid_access = sk_filter_is_valid_access,
5575 .convert_ctx_access = bpf_convert_ctx_access, 5858 .convert_ctx_access = bpf_convert_ctx_access,
5859 .gen_ld_abs = bpf_gen_ld_abs,
5576}; 5860};
5577 5861
5578const struct bpf_prog_ops sk_filter_prog_ops = { 5862const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5584,6 +5868,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
5584 .is_valid_access = tc_cls_act_is_valid_access, 5868 .is_valid_access = tc_cls_act_is_valid_access,
5585 .convert_ctx_access = tc_cls_act_convert_ctx_access, 5869 .convert_ctx_access = tc_cls_act_convert_ctx_access,
5586 .gen_prologue = tc_cls_act_prologue, 5870 .gen_prologue = tc_cls_act_prologue,
5871 .gen_ld_abs = bpf_gen_ld_abs,
5587}; 5872};
5588 5873
5589const struct bpf_prog_ops tc_cls_act_prog_ops = { 5874const struct bpf_prog_ops tc_cls_act_prog_ops = {
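
The filter.c hunks above also introduce the bpf_skb_load_bytes_relative() helper, which reads packet bytes at an offset relative to the MAC or network header and zeroes the destination buffer when the read fails. A hedged sketch of how a socket-filter program could call it, assuming the usual samples/bpf-style helper declaration keyed by BPF_FUNC_skb_load_bytes_relative; the section name and return convention here are illustrative only.

/* Hedged sketch of a cBPF-replacement socket filter using the new helper. */
#include <stddef.h>
#include <linux/bpf.h>
#include <linux/ip.h>

static int (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset,
					  void *to, __u32 len,
					  __u32 start_header) =
	(void *) BPF_FUNC_skb_load_bytes_relative;

__attribute__((section("socket"), used))
int filter_prog(struct __sk_buff *skb)
{
	__u32 saddr = 0;

	/* Offset is relative to the network header, not to skb->data. */
	if (bpf_skb_load_bytes_relative(skb, offsetof(struct iphdr, saddr),
					&saddr, sizeof(saddr),
					BPF_HDR_START_NET))
		return 0;		/* load failed; 'saddr' was zeroed */

	return saddr ? 0xffff : 0;	/* keep packets with a non-zero source */
}
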
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db169ca1..e7d8b6c955c6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" 229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
230 x "AF_MAX"
230 231
231static const char *const af_family_key_strings[AF_MAX+1] = { 232static const char *const af_family_key_strings[AF_MAX+1] = {
232 _sock_locks("sk_lock-") 233 _sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , 263 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
263 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , 264 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
264 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , 265 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
265 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" 266 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
267 "rlock-AF_MAX"
266}; 268};
267static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 269static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
268 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 270 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
279 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , 281 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
280 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , 282 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
281 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , 283 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
282 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" 284 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
285 "wlock-AF_MAX"
283}; 286};
284static const char *const af_family_elock_key_strings[AF_MAX+1] = { 287static const char *const af_family_elock_key_strings[AF_MAX+1] = {
285 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 288 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
296 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , 299 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
297 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , 300 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
298 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , 301 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
299 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" 302 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
303 "elock-AF_MAX"
300}; 304};
301 305
302/* 306/*
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 0c86b53a3a63..bf6758f74339 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,11 +308,9 @@ err:
308} 308}
309EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); 309EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
310 310
311void xdp_return_frame(struct xdp_frame *xdpf) 311static void xdp_return(void *data, struct xdp_mem_info *mem)
312{ 312{
313 struct xdp_mem_info *mem = &xdpf->mem;
314 struct xdp_mem_allocator *xa; 313 struct xdp_mem_allocator *xa;
315 void *data = xdpf->data;
316 struct page *page; 314 struct page *page;
317 315
318 switch (mem->type) { 316 switch (mem->type) {
@@ -339,4 +337,15 @@ void xdp_return_frame(struct xdp_frame *xdpf)
339 break; 337 break;
340 } 338 }
341} 339}
340
341void xdp_return_frame(struct xdp_frame *xdpf)
342{
343 xdp_return(xdpf->data, &xdpf->mem);
344}
342EXPORT_SYMBOL_GPL(xdp_return_frame); 345EXPORT_SYMBOL_GPL(xdp_return_frame);
346
347void xdp_return_buff(struct xdp_buff *xdp)
348{
349 xdp_return(xdp->data, &xdp->rxq->mem);
350}
351EXPORT_SYMBOL_GPL(xdp_return_buff);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 01f3515cada0..611a26d5235c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *,
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *, 209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *); 210 struct tpacket3_hdr *);
211static void packet_flush_mclist(struct sock *sk); 211static void packet_flush_mclist(struct sock *sk);
212static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
213 213
214struct packet_skb_cb { 214struct packet_skb_cb {
215 union { 215 union {
@@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
243 243
244static int packet_direct_xmit(struct sk_buff *skb) 244static int packet_direct_xmit(struct sk_buff *skb)
245{ 245{
246 struct net_device *dev = skb->dev; 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
247 struct sk_buff *orig_skb = skb;
248 struct netdev_queue *txq;
249 int ret = NETDEV_TX_BUSY;
250 bool again = false;
251
252 if (unlikely(!netif_running(dev) ||
253 !netif_carrier_ok(dev)))
254 goto drop;
255
256 skb = validate_xmit_skb_list(skb, dev, &again);
257 if (skb != orig_skb)
258 goto drop;
259
260 packet_pick_tx_queue(dev, skb);
261 txq = skb_get_tx_queue(dev, skb);
262
263 local_bh_disable();
264
265 HARD_TX_LOCK(dev, txq, smp_processor_id());
266 if (!netif_xmit_frozen_or_drv_stopped(txq))
267 ret = netdev_start_xmit(skb, dev, txq, false);
268 HARD_TX_UNLOCK(dev, txq);
269
270 local_bh_enable();
271
272 if (!dev_xmit_complete(ret))
273 kfree_skb(skb);
274
275 return ret;
276drop:
277 atomic_long_inc(&dev->tx_dropped);
278 kfree_skb_list(skb);
279 return NET_XMIT_DROP;
280} 247}
281 248
282static struct net_device *packet_cached_dev_get(struct packet_sock *po) 249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
@@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
313 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; 280 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
314} 281}
315 282
316static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) 283static u16 packet_pick_tx_queue(struct sk_buff *skb)
317{ 284{
285 struct net_device *dev = skb->dev;
318 const struct net_device_ops *ops = dev->netdev_ops; 286 const struct net_device_ops *ops = dev->netdev_ops;
319 u16 queue_index; 287 u16 queue_index;
320 288
@@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
326 queue_index = __packet_pick_tx_queue(dev, skb); 294 queue_index = __packet_pick_tx_queue(dev, skb);
327 } 295 }
328 296
329 skb_set_queue_mapping(skb, queue_index); 297 return queue_index;
330} 298}
331 299
332/* __register_prot_hook must be invoked through register_prot_hook 300/* __register_prot_hook must be invoked through register_prot_hook
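
packet_direct_xmit() above is reduced to a call to the new dev_direct_xmit() core helper (added to net/core/dev.c in this series), which now carries the queue selection, validation and transmit locking that used to live here. A hedged, kernel-context sketch (not a standalone program) of the calling convention, mirroring how the AF_XDP transmit path later in this merge treats the return codes; dev_direct_xmit() consumes the skb on the paths shown, so the caller only interprets the result.

/* Kernel-context sketch: how callers in this merge use dev_direct_xmit();
 * see also xsk_generic_xmit() in net/xdp/xsk.c further down.
 */
static int queue_one_frame(struct sk_buff *skb, u16 queue_id)
{
	int err = dev_direct_xmit(skb, queue_id);

	/* The skb is consumed by dev_direct_xmit() even on failure. */
	if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY)
		return -EAGAIN;

	/* NET_XMIT_CN means the frame may still have been sent, so it is
	 * treated as success, as in xsk_generic_xmit().
	 */
	return 0;
}
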
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 000000000000..90e4a7152854
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,7 @@
1config XDP_SOCKETS
2 bool "XDP sockets"
3 depends on BPF_SYSCALL
4 default n
5 help
 6 XDP sockets allow a channel between XDP programs and
7 userspace applications.
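
With CONFIG_XDP_SOCKETS enabled, the new family can be opened from user space; xsk_create() in net/xdp/xsk.c below only accepts SOCK_RAW with protocol 0 and requires CAP_NET_RAW. A minimal user-space sketch, assuming the AF_XDP constant added to socket.h in this series (44 if the headers in use do not define it yet):

/* Minimal user-space sketch: open an AF_XDP socket. Requires a kernel
 * built with CONFIG_XDP_SOCKETS=y and CAP_NET_RAW (see xsk_create()).
 */
#include <stdio.h>
#include <sys/socket.h>

#ifndef AF_XDP
#define AF_XDP 44	/* assumption: value from this series' socket.h update */
#endif

int main(void)
{
	int fd = socket(AF_XDP, SOCK_RAW, 0);

	if (fd < 0) {
		perror("socket(AF_XDP, SOCK_RAW, 0)");
		return 1;
	}
	printf("AF_XDP socket fd=%d\n", fd);
	return 0;
}
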
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..074fb2b2d51c
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
2
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..881dfdefe235
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,260 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/init.h>
16#include <linux/sched/mm.h>
17#include <linux/sched/signal.h>
18#include <linux/sched/task.h>
19#include <linux/uaccess.h>
20#include <linux/slab.h>
21#include <linux/bpf.h>
22#include <linux/mm.h>
23
24#include "xdp_umem.h"
25
26#define XDP_UMEM_MIN_FRAME_SIZE 2048
27
28int xdp_umem_create(struct xdp_umem **umem)
29{
30 *umem = kzalloc(sizeof(**umem), GFP_KERNEL);
31
32 if (!(*umem))
33 return -ENOMEM;
34
35 return 0;
36}
37
38static void xdp_umem_unpin_pages(struct xdp_umem *umem)
39{
40 unsigned int i;
41
42 if (umem->pgs) {
43 for (i = 0; i < umem->npgs; i++) {
44 struct page *page = umem->pgs[i];
45
46 set_page_dirty_lock(page);
47 put_page(page);
48 }
49
50 kfree(umem->pgs);
51 umem->pgs = NULL;
52 }
53}
54
55static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
56{
57 if (umem->user) {
58 atomic_long_sub(umem->npgs, &umem->user->locked_vm);
59 free_uid(umem->user);
60 }
61}
62
63static void xdp_umem_release(struct xdp_umem *umem)
64{
65 struct task_struct *task;
66 struct mm_struct *mm;
67
68 if (umem->fq) {
69 xskq_destroy(umem->fq);
70 umem->fq = NULL;
71 }
72
73 if (umem->cq) {
74 xskq_destroy(umem->cq);
75 umem->cq = NULL;
76 }
77
78 if (umem->pgs) {
79 xdp_umem_unpin_pages(umem);
80
81 task = get_pid_task(umem->pid, PIDTYPE_PID);
82 put_pid(umem->pid);
83 if (!task)
84 goto out;
85 mm = get_task_mm(task);
86 put_task_struct(task);
87 if (!mm)
88 goto out;
89
90 mmput(mm);
91 umem->pgs = NULL;
92 }
93
94 xdp_umem_unaccount_pages(umem);
95out:
96 kfree(umem);
97}
98
99static void xdp_umem_release_deferred(struct work_struct *work)
100{
101 struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
102
103 xdp_umem_release(umem);
104}
105
106void xdp_get_umem(struct xdp_umem *umem)
107{
108 atomic_inc(&umem->users);
109}
110
111void xdp_put_umem(struct xdp_umem *umem)
112{
113 if (!umem)
114 return;
115
116 if (atomic_dec_and_test(&umem->users)) {
117 INIT_WORK(&umem->work, xdp_umem_release_deferred);
118 schedule_work(&umem->work);
119 }
120}
121
122static int xdp_umem_pin_pages(struct xdp_umem *umem)
123{
124 unsigned int gup_flags = FOLL_WRITE;
125 long npgs;
126 int err;
127
128 umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
129 if (!umem->pgs)
130 return -ENOMEM;
131
132 down_write(&current->mm->mmap_sem);
133 npgs = get_user_pages(umem->address, umem->npgs,
134 gup_flags, &umem->pgs[0], NULL);
135 up_write(&current->mm->mmap_sem);
136
137 if (npgs != umem->npgs) {
138 if (npgs >= 0) {
139 umem->npgs = npgs;
140 err = -ENOMEM;
141 goto out_pin;
142 }
143 err = npgs;
144 goto out_pgs;
145 }
146 return 0;
147
148out_pin:
149 xdp_umem_unpin_pages(umem);
150out_pgs:
151 kfree(umem->pgs);
152 umem->pgs = NULL;
153 return err;
154}
155
156static int xdp_umem_account_pages(struct xdp_umem *umem)
157{
158 unsigned long lock_limit, new_npgs, old_npgs;
159
160 if (capable(CAP_IPC_LOCK))
161 return 0;
162
163 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
164 umem->user = get_uid(current_user());
165
166 do {
167 old_npgs = atomic_long_read(&umem->user->locked_vm);
168 new_npgs = old_npgs + umem->npgs;
169 if (new_npgs > lock_limit) {
170 free_uid(umem->user);
171 umem->user = NULL;
172 return -ENOBUFS;
173 }
174 } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
175 new_npgs) != old_npgs);
176 return 0;
177}
178
179int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
180{
181 u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
182 u64 addr = mr->addr, size = mr->len;
183 unsigned int nframes, nfpp;
184 int size_chk, err;
185
186 if (!umem)
187 return -EINVAL;
188
189 if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
190 /* Strictly speaking we could support this, if:
 191 * - huge pages, or
192 * - using an IOMMU, or
193 * - making sure the memory area is consecutive
194 * but for now, we simply say "computer says no".
195 */
196 return -EINVAL;
197 }
198
199 if (!is_power_of_2(frame_size))
200 return -EINVAL;
201
202 if (!PAGE_ALIGNED(addr)) {
203 /* Memory area has to be page size aligned. For
204 * simplicity, this might change.
205 */
206 return -EINVAL;
207 }
208
209 if ((addr + size) < addr)
210 return -EINVAL;
211
212 nframes = size / frame_size;
213 if (nframes == 0 || nframes > UINT_MAX)
214 return -EINVAL;
215
216 nfpp = PAGE_SIZE / frame_size;
217 if (nframes < nfpp || nframes % nfpp)
218 return -EINVAL;
219
220 frame_headroom = ALIGN(frame_headroom, 64);
221
222 size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
223 if (size_chk < 0)
224 return -EINVAL;
225
226 umem->pid = get_task_pid(current, PIDTYPE_PID);
227 umem->size = (size_t)size;
228 umem->address = (unsigned long)addr;
229 umem->props.frame_size = frame_size;
230 umem->props.nframes = nframes;
231 umem->frame_headroom = frame_headroom;
232 umem->npgs = size / PAGE_SIZE;
233 umem->pgs = NULL;
234 umem->user = NULL;
235
236 umem->frame_size_log2 = ilog2(frame_size);
237 umem->nfpp_mask = nfpp - 1;
238 umem->nfpplog2 = ilog2(nfpp);
239 atomic_set(&umem->users, 1);
240
241 err = xdp_umem_account_pages(umem);
242 if (err)
243 goto out;
244
245 err = xdp_umem_pin_pages(umem);
246 if (err)
247 goto out_account;
248 return 0;
249
250out_account:
251 xdp_umem_unaccount_pages(umem);
252out:
253 put_pid(umem->pid);
254 return err;
255}
256
257bool xdp_umem_validate_queues(struct xdp_umem *umem)
258{
259 return (umem->fq && umem->cq);
260}
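
xdp_umem_reg() above spells out what a valid UMEM looks like: a page-aligned area whose frame_size is a power of two between XDP_UMEM_MIN_FRAME_SIZE (2048) and PAGE_SIZE, and whose frames fill whole pages. A hedged user-space sketch of registering such an area through the XDP_UMEM_REG option handled later in xsk_setsockopt(); the struct xdp_umem_reg field names follow their use in xdp_umem_reg().

/* Hedged sketch: register a 16-frame, 2048-byte-frame UMEM on an AF_XDP
 * socket 'fd'. SOL_XDP and XDP_UMEM_REG come from <linux/if_xdp.h> and
 * <sys/socket.h>; the frame count and sizes are illustrative.
 */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static int register_umem(int fd)
{
	struct xdp_umem_reg mr;
	void *area;
	size_t len = 16 * 2048;		/* 32 KiB, i.e. 8 pages with 4 KiB pages */

	/* The area must be page aligned (checked in xdp_umem_reg()). */
	if (posix_memalign(&area, (size_t)sysconf(_SC_PAGESIZE), len))
		return -1;

	memset(&mr, 0, sizeof(mr));
	mr.addr = (unsigned long)area;
	mr.len = len;
	mr.frame_size = 2048;		/* power of two, >= 2048, <= PAGE_SIZE */
	mr.frame_headroom = 0;

	return setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
}
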
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..7e0b2fab8522
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,67 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef XDP_UMEM_H_
16#define XDP_UMEM_H_
17
18#include <linux/mm.h>
19#include <linux/if_xdp.h>
20#include <linux/workqueue.h>
21
22#include "xsk_queue.h"
23#include "xdp_umem_props.h"
24
25struct xdp_umem {
26 struct xsk_queue *fq;
27 struct xsk_queue *cq;
28 struct page **pgs;
29 struct xdp_umem_props props;
30 u32 npgs;
31 u32 frame_headroom;
32 u32 nfpp_mask;
33 u32 nfpplog2;
34 u32 frame_size_log2;
35 struct user_struct *user;
36 struct pid *pid;
37 unsigned long address;
38 size_t size;
39 atomic_t users;
40 struct work_struct work;
41};
42
43static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx)
44{
45 u64 pg, off;
46 char *data;
47
48 pg = idx >> umem->nfpplog2;
49 off = (idx & umem->nfpp_mask) << umem->frame_size_log2;
50
51 data = page_address(umem->pgs[pg]);
52 return data + off;
53}
54
55static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem,
56 u32 idx)
57{
58 return xdp_umem_get_data(umem, idx) + umem->frame_headroom;
59}
60
61bool xdp_umem_validate_queues(struct xdp_umem *umem);
62int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
63void xdp_get_umem(struct xdp_umem *umem);
64void xdp_put_umem(struct xdp_umem *umem);
65int xdp_umem_create(struct xdp_umem **umem);
66
67#endif /* XDP_UMEM_H_ */
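
xdp_umem_get_data() above maps a frame index to an address with two shifts and a mask: the high bits select the pinned page, the low bits select the frame inside that page. A worked example, assuming 4 KiB pages and 2048-byte frames, so nfpplog2 = 1, nfpp_mask = 1 and frame_size_log2 = 11:

#include <stdio.h>

int main(void)
{
	unsigned int idx = 5;
	unsigned int pg  = idx >> 1;		/* idx >> nfpplog2 */
	unsigned int off = (idx & 1) << 11;	/* (idx & nfpp_mask) << frame_size_log2 */

	/* Frame 5 lives 2048 bytes into the third pinned page (pg == 2). */
	printf("frame %u -> page %u, offset %u\n", idx, pg, off);
	return 0;
}
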
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..77fb5daf29f3
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,23 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * XDP user-space packet buffer
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef XDP_UMEM_PROPS_H_
16#define XDP_UMEM_PROPS_H_
17
18struct xdp_umem_props {
19 u32 frame_size;
20 u32 nframes;
21};
22
23#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..009c5af5bba5
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,656 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XDP sockets
3 *
 4 * AF_XDP sockets allow a channel between XDP programs and userspace
5 * applications.
6 * Copyright(c) 2018 Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * Author(s): Björn Töpel <bjorn.topel@intel.com>
18 * Magnus Karlsson <magnus.karlsson@intel.com>
19 */
20
21#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
22
23#include <linux/if_xdp.h>
24#include <linux/init.h>
25#include <linux/sched/mm.h>
26#include <linux/sched/signal.h>
27#include <linux/sched/task.h>
28#include <linux/socket.h>
29#include <linux/file.h>
30#include <linux/uaccess.h>
31#include <linux/net.h>
32#include <linux/netdevice.h>
33#include <net/xdp_sock.h>
34#include <net/xdp.h>
35
36#include "xsk_queue.h"
37#include "xdp_umem.h"
38
39#define TX_BATCH_SIZE 16
40
41static struct xdp_sock *xdp_sk(struct sock *sk)
42{
43 return (struct xdp_sock *)sk;
44}
45
46bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
47{
48 return !!xs->rx;
49}
50
51static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
52{
53 u32 *id, len = xdp->data_end - xdp->data;
54 void *buffer;
55 int err = 0;
56
57 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
58 return -EINVAL;
59
60 id = xskq_peek_id(xs->umem->fq);
61 if (!id)
62 return -ENOSPC;
63
64 buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
65 memcpy(buffer, xdp->data, len);
66 err = xskq_produce_batch_desc(xs->rx, *id, len,
67 xs->umem->frame_headroom);
68 if (!err)
69 xskq_discard_id(xs->umem->fq);
70
71 return err;
72}
73
74int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
75{
76 int err;
77
78 err = __xsk_rcv(xs, xdp);
79 if (likely(!err))
80 xdp_return_buff(xdp);
81 else
82 xs->rx_dropped++;
83
84 return err;
85}
86
87void xsk_flush(struct xdp_sock *xs)
88{
89 xskq_produce_flush_desc(xs->rx);
90 xs->sk.sk_data_ready(&xs->sk);
91}
92
93int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
94{
95 int err;
96
97 err = __xsk_rcv(xs, xdp);
98 if (!err)
99 xsk_flush(xs);
100 else
101 xs->rx_dropped++;
102
103 return err;
104}
105
106static void xsk_destruct_skb(struct sk_buff *skb)
107{
108 u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
109 struct xdp_sock *xs = xdp_sk(skb->sk);
110
111 WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));
112
113 sock_wfree(skb);
114}
115
116static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
117 size_t total_len)
118{
119 bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
120 u32 max_batch = TX_BATCH_SIZE;
121 struct xdp_sock *xs = xdp_sk(sk);
122 bool sent_frame = false;
123 struct xdp_desc desc;
124 struct sk_buff *skb;
125 int err = 0;
126
127 if (unlikely(!xs->tx))
128 return -ENOBUFS;
129 if (need_wait)
130 return -EOPNOTSUPP;
131
132 mutex_lock(&xs->mutex);
133
134 while (xskq_peek_desc(xs->tx, &desc)) {
135 char *buffer;
136 u32 id, len;
137
138 if (max_batch-- == 0) {
139 err = -EAGAIN;
140 goto out;
141 }
142
143 if (xskq_reserve_id(xs->umem->cq)) {
144 err = -EAGAIN;
145 goto out;
146 }
147
148 len = desc.len;
149 if (unlikely(len > xs->dev->mtu)) {
150 err = -EMSGSIZE;
151 goto out;
152 }
153
154 skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
155 if (unlikely(!skb)) {
156 err = -EAGAIN;
157 goto out;
158 }
159
160 skb_put(skb, len);
161 id = desc.idx;
162 buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
163 err = skb_store_bits(skb, 0, buffer, len);
164 if (unlikely(err)) {
165 kfree_skb(skb);
166 goto out;
167 }
168
169 skb->dev = xs->dev;
170 skb->priority = sk->sk_priority;
171 skb->mark = sk->sk_mark;
172 skb_shinfo(skb)->destructor_arg = (void *)(long)id;
173 skb->destructor = xsk_destruct_skb;
174
175 err = dev_direct_xmit(skb, xs->queue_id);
176 /* Ignore NET_XMIT_CN as packet might have been sent */
177 if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
178 err = -EAGAIN;
179 /* SKB consumed by dev_direct_xmit() */
180 goto out;
181 }
182
183 sent_frame = true;
184 xskq_discard_desc(xs->tx);
185 }
186
187out:
188 if (sent_frame)
189 sk->sk_write_space(sk);
190
191 mutex_unlock(&xs->mutex);
192 return err;
193}
194
195static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
196{
197 struct sock *sk = sock->sk;
198 struct xdp_sock *xs = xdp_sk(sk);
199
200 if (unlikely(!xs->dev))
201 return -ENXIO;
202 if (unlikely(!(xs->dev->flags & IFF_UP)))
203 return -ENETDOWN;
204
205 return xsk_generic_xmit(sk, m, total_len);
206}
207
208static unsigned int xsk_poll(struct file *file, struct socket *sock,
209 struct poll_table_struct *wait)
210{
211 unsigned int mask = datagram_poll(file, sock, wait);
212 struct sock *sk = sock->sk;
213 struct xdp_sock *xs = xdp_sk(sk);
214
215 if (xs->rx && !xskq_empty_desc(xs->rx))
216 mask |= POLLIN | POLLRDNORM;
217 if (xs->tx && !xskq_full_desc(xs->tx))
218 mask |= POLLOUT | POLLWRNORM;
219
220 return mask;
221}
222
223static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
224 bool umem_queue)
225{
226 struct xsk_queue *q;
227
228 if (entries == 0 || *queue || !is_power_of_2(entries))
229 return -EINVAL;
230
231 q = xskq_create(entries, umem_queue);
232 if (!q)
233 return -ENOMEM;
234
235 *queue = q;
236 return 0;
237}
238
239static void __xsk_release(struct xdp_sock *xs)
240{
241 /* Wait for driver to stop using the xdp socket. */
242 synchronize_net();
243
244 dev_put(xs->dev);
245}
246
247static int xsk_release(struct socket *sock)
248{
249 struct sock *sk = sock->sk;
250 struct xdp_sock *xs = xdp_sk(sk);
251 struct net *net;
252
253 if (!sk)
254 return 0;
255
256 net = sock_net(sk);
257
258 local_bh_disable();
259 sock_prot_inuse_add(net, sk->sk_prot, -1);
260 local_bh_enable();
261
262 if (xs->dev) {
263 __xsk_release(xs);
264 xs->dev = NULL;
265 }
266
267 sock_orphan(sk);
268 sock->sk = NULL;
269
270 sk_refcnt_debug_release(sk);
271 sock_put(sk);
272
273 return 0;
274}
275
276static struct socket *xsk_lookup_xsk_from_fd(int fd)
277{
278 struct socket *sock;
279 int err;
280
281 sock = sockfd_lookup(fd, &err);
282 if (!sock)
283 return ERR_PTR(-ENOTSOCK);
284
285 if (sock->sk->sk_family != PF_XDP) {
286 sockfd_put(sock);
287 return ERR_PTR(-ENOPROTOOPT);
288 }
289
290 return sock;
291}
292
293static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
294{
295 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
296 struct sock *sk = sock->sk;
297 struct net_device *dev, *dev_curr;
298 struct xdp_sock *xs = xdp_sk(sk);
299 struct xdp_umem *old_umem = NULL;
300 int err = 0;
301
302 if (addr_len < sizeof(struct sockaddr_xdp))
303 return -EINVAL;
304 if (sxdp->sxdp_family != AF_XDP)
305 return -EINVAL;
306
307 mutex_lock(&xs->mutex);
308 dev_curr = xs->dev;
309 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
310 if (!dev) {
311 err = -ENODEV;
312 goto out_release;
313 }
314
315 if (!xs->rx && !xs->tx) {
316 err = -EINVAL;
317 goto out_unlock;
318 }
319
320 if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
321 err = -EINVAL;
322 goto out_unlock;
323 }
324
325 if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
326 struct xdp_sock *umem_xs;
327 struct socket *sock;
328
329 if (xs->umem) {
 330 /* We already have our own. */
331 err = -EINVAL;
332 goto out_unlock;
333 }
334
335 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
336 if (IS_ERR(sock)) {
337 err = PTR_ERR(sock);
338 goto out_unlock;
339 }
340
341 umem_xs = xdp_sk(sock->sk);
342 if (!umem_xs->umem) {
343 /* No umem to inherit. */
344 err = -EBADF;
345 sockfd_put(sock);
346 goto out_unlock;
347 } else if (umem_xs->dev != dev ||
348 umem_xs->queue_id != sxdp->sxdp_queue_id) {
349 err = -EINVAL;
350 sockfd_put(sock);
351 goto out_unlock;
352 }
353
354 xdp_get_umem(umem_xs->umem);
355 old_umem = xs->umem;
356 xs->umem = umem_xs->umem;
357 sockfd_put(sock);
358 } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
359 err = -EINVAL;
360 goto out_unlock;
361 } else {
362 /* This xsk has its own umem. */
363 xskq_set_umem(xs->umem->fq, &xs->umem->props);
364 xskq_set_umem(xs->umem->cq, &xs->umem->props);
365 }
366
367 /* Rebind? */
368 if (dev_curr && (dev_curr != dev ||
369 xs->queue_id != sxdp->sxdp_queue_id)) {
370 __xsk_release(xs);
371 if (old_umem)
372 xdp_put_umem(old_umem);
373 }
374
375 xs->dev = dev;
376 xs->queue_id = sxdp->sxdp_queue_id;
377
378 xskq_set_umem(xs->rx, &xs->umem->props);
379 xskq_set_umem(xs->tx, &xs->umem->props);
380
381out_unlock:
382 if (err)
383 dev_put(dev);
384out_release:
385 mutex_unlock(&xs->mutex);
386 return err;
387}
388
389static int xsk_setsockopt(struct socket *sock, int level, int optname,
390 char __user *optval, unsigned int optlen)
391{
392 struct sock *sk = sock->sk;
393 struct xdp_sock *xs = xdp_sk(sk);
394 int err;
395
396 if (level != SOL_XDP)
397 return -ENOPROTOOPT;
398
399 switch (optname) {
400 case XDP_RX_RING:
401 case XDP_TX_RING:
402 {
403 struct xsk_queue **q;
404 int entries;
405
406 if (optlen < sizeof(entries))
407 return -EINVAL;
408 if (copy_from_user(&entries, optval, sizeof(entries)))
409 return -EFAULT;
410
411 mutex_lock(&xs->mutex);
412 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
413 err = xsk_init_queue(entries, q, false);
414 mutex_unlock(&xs->mutex);
415 return err;
416 }
417 case XDP_UMEM_REG:
418 {
419 struct xdp_umem_reg mr;
420 struct xdp_umem *umem;
421
422 if (xs->umem)
423 return -EBUSY;
424
425 if (copy_from_user(&mr, optval, sizeof(mr)))
426 return -EFAULT;
427
428 mutex_lock(&xs->mutex);
429 err = xdp_umem_create(&umem);
430
431 err = xdp_umem_reg(umem, &mr);
432 if (err) {
433 kfree(umem);
434 mutex_unlock(&xs->mutex);
435 return err;
436 }
437
438 /* Make sure umem is ready before it can be seen by others */
439 smp_wmb();
440
441 xs->umem = umem;
442 mutex_unlock(&xs->mutex);
443 return 0;
444 }
445 case XDP_UMEM_FILL_RING:
446 case XDP_UMEM_COMPLETION_RING:
447 {
448 struct xsk_queue **q;
449 int entries;
450
451 if (!xs->umem)
452 return -EINVAL;
453
454 if (copy_from_user(&entries, optval, sizeof(entries)))
455 return -EFAULT;
456
457 mutex_lock(&xs->mutex);
458 q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
459 &xs->umem->cq;
460 err = xsk_init_queue(entries, q, true);
461 mutex_unlock(&xs->mutex);
462 return err;
463 }
464 default:
465 break;
466 }
467
468 return -ENOPROTOOPT;
469}
470
471static int xsk_getsockopt(struct socket *sock, int level, int optname,
472 char __user *optval, int __user *optlen)
473{
474 struct sock *sk = sock->sk;
475 struct xdp_sock *xs = xdp_sk(sk);
476 int len;
477
478 if (level != SOL_XDP)
479 return -ENOPROTOOPT;
480
481 if (get_user(len, optlen))
482 return -EFAULT;
483 if (len < 0)
484 return -EINVAL;
485
486 switch (optname) {
487 case XDP_STATISTICS:
488 {
489 struct xdp_statistics stats;
490
491 if (len < sizeof(stats))
492 return -EINVAL;
493
494 mutex_lock(&xs->mutex);
495 stats.rx_dropped = xs->rx_dropped;
496 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
497 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
498 mutex_unlock(&xs->mutex);
499
500 if (copy_to_user(optval, &stats, sizeof(stats)))
501 return -EFAULT;
502 if (put_user(sizeof(stats), optlen))
503 return -EFAULT;
504
505 return 0;
506 }
507 default:
508 break;
509 }
510
511 return -EOPNOTSUPP;
512}
513
514static int xsk_mmap(struct file *file, struct socket *sock,
515 struct vm_area_struct *vma)
516{
517 unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
518 unsigned long size = vma->vm_end - vma->vm_start;
519 struct xdp_sock *xs = xdp_sk(sock->sk);
520 struct xsk_queue *q = NULL;
521 unsigned long pfn;
522 struct page *qpg;
523
524 if (offset == XDP_PGOFF_RX_RING) {
525 q = xs->rx;
526 } else if (offset == XDP_PGOFF_TX_RING) {
527 q = xs->tx;
528 } else {
529 if (!xs->umem)
530 return -EINVAL;
531
532 if (offset == XDP_UMEM_PGOFF_FILL_RING)
533 q = xs->umem->fq;
534 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
535 q = xs->umem->cq;
536 }
537
538 if (!q)
539 return -EINVAL;
540
541 qpg = virt_to_head_page(q->ring);
542 if (size > (PAGE_SIZE << compound_order(qpg)))
543 return -EINVAL;
544
545 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
546 return remap_pfn_range(vma, vma->vm_start, pfn,
547 size, vma->vm_page_prot);
548}
549
550static struct proto xsk_proto = {
551 .name = "XDP",
552 .owner = THIS_MODULE,
553 .obj_size = sizeof(struct xdp_sock),
554};
555
556static const struct proto_ops xsk_proto_ops = {
557 .family = PF_XDP,
558 .owner = THIS_MODULE,
559 .release = xsk_release,
560 .bind = xsk_bind,
561 .connect = sock_no_connect,
562 .socketpair = sock_no_socketpair,
563 .accept = sock_no_accept,
564 .getname = sock_no_getname,
565 .poll = xsk_poll,
566 .ioctl = sock_no_ioctl,
567 .listen = sock_no_listen,
568 .shutdown = sock_no_shutdown,
569 .setsockopt = xsk_setsockopt,
570 .getsockopt = xsk_getsockopt,
571 .sendmsg = xsk_sendmsg,
572 .recvmsg = sock_no_recvmsg,
573 .mmap = xsk_mmap,
574 .sendpage = sock_no_sendpage,
575};
576
577static void xsk_destruct(struct sock *sk)
578{
579 struct xdp_sock *xs = xdp_sk(sk);
580
581 if (!sock_flag(sk, SOCK_DEAD))
582 return;
583
584 xskq_destroy(xs->rx);
585 xskq_destroy(xs->tx);
586 xdp_put_umem(xs->umem);
587
588 sk_refcnt_debug_dec(sk);
589}
590
591static int xsk_create(struct net *net, struct socket *sock, int protocol,
592 int kern)
593{
594 struct sock *sk;
595 struct xdp_sock *xs;
596
597 if (!ns_capable(net->user_ns, CAP_NET_RAW))
598 return -EPERM;
599 if (sock->type != SOCK_RAW)
600 return -ESOCKTNOSUPPORT;
601
602 if (protocol)
603 return -EPROTONOSUPPORT;
604
605 sock->state = SS_UNCONNECTED;
606
607 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
608 if (!sk)
609 return -ENOBUFS;
610
611 sock->ops = &xsk_proto_ops;
612
613 sock_init_data(sock, sk);
614
615 sk->sk_family = PF_XDP;
616
617 sk->sk_destruct = xsk_destruct;
618 sk_refcnt_debug_inc(sk);
619
620 xs = xdp_sk(sk);
621 mutex_init(&xs->mutex);
622
623 local_bh_disable();
624 sock_prot_inuse_add(net, &xsk_proto, 1);
625 local_bh_enable();
626
627 return 0;
628}
629
630static const struct net_proto_family xsk_family_ops = {
631 .family = PF_XDP,
632 .create = xsk_create,
633 .owner = THIS_MODULE,
634};
635
636static int __init xsk_init(void)
637{
638 int err;
639
640 err = proto_register(&xsk_proto, 0 /* no slab */);
641 if (err)
642 goto out;
643
644 err = sock_register(&xsk_family_ops);
645 if (err)
646 goto out_proto;
647
648 return 0;
649
650out_proto:
651 proto_unregister(&xsk_proto);
652out:
653 return err;
654}
655
656fs_initcall(xsk_init);
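
From user space, the pieces of xsk.c above fit together as follows: size the rings with setsockopt() (xsk_setsockopt()), map each ring at its fixed page offset with mmap() (xsk_mmap()), then bind the socket to a device/queue pair with a struct sockaddr_xdp (xsk_bind(), which insists that at least one of Rx/Tx exists and that both UMEM rings are set up). A hedged sketch; ring sizes must be powers of two (xsk_init_queue()), and the mmap length below is a simplifying assumption, not the exact ring size.

/* Hedged sketch: size, map and bind the rings of an AF_XDP socket 'fd'
 * whose UMEM is already registered. Ring and offset constants come from
 * <linux/if_xdp.h>; the AF_XDP/SOL_XDP fallbacks are assumptions.
 */
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef AF_XDP
#define AF_XDP 44	/* assumption: value added to socket.h in this series */
#endif
#ifndef SOL_XDP
#define SOL_XDP 283	/* assumption: value added to socket.h in this series */
#endif

static int setup_and_bind(int fd, int ifindex, int queue_id)
{
	int ndescs = 1024;	/* must be a power of two (xsk_init_queue()) */
	struct sockaddr_xdp sxdp;
	void *rx_ring;

	if (setsockopt(fd, SOL_XDP, XDP_RX_RING, &ndescs, sizeof(ndescs)) ||
	    setsockopt(fd, SOL_XDP, XDP_TX_RING, &ndescs, sizeof(ndescs)) ||
	    setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ndescs, sizeof(ndescs)) ||
	    setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ndescs, sizeof(ndescs)))
		return -1;

	/* Each ring has its own fixed page offset (see xsk_mmap()); a real
	 * application maps all four and computes the exact lengths.
	 */
	rx_ring = mmap(NULL, 4096 /* assumption */, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, XDP_PGOFF_RX_RING);
	if (rx_ring == MAP_FAILED)
		return -1;

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = ifindex;
	sxdp.sxdp_queue_id = queue_id;	/* must be below the device Rx queue count */

	return bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}
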
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..d012e5e23591
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,73 @@
1// SPDX-License-Identifier: GPL-2.0
2/* XDP user-space ring structure
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/slab.h>
16
17#include "xsk_queue.h"
18
19void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
20{
21 if (!q)
22 return;
23
24 q->umem_props = *umem_props;
25}
26
27static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
28{
29 return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
30}
31
32static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
33{
34 return (sizeof(struct xdp_ring) +
35 q->nentries * sizeof(struct xdp_desc));
36}
37
38struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
39{
40 struct xsk_queue *q;
41 gfp_t gfp_flags;
42 size_t size;
43
44 q = kzalloc(sizeof(*q), GFP_KERNEL);
45 if (!q)
46 return NULL;
47
48 q->nentries = nentries;
49 q->ring_mask = nentries - 1;
50
51 gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
52 __GFP_COMP | __GFP_NORETRY;
53 size = umem_queue ? xskq_umem_get_ring_size(q) :
54 xskq_rxtx_get_ring_size(q);
55
56 q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
57 get_order(size));
58 if (!q->ring) {
59 kfree(q);
60 return NULL;
61 }
62
63 return q;
64}
65
66void xskq_destroy(struct xsk_queue *q)
67{
68 if (!q)
69 return;
70
71 page_frag_free(q->ring);
72 kfree(q);
73}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..7aa9a535db0e
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,247 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * XDP user-space ring structure
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef _LINUX_XSK_QUEUE_H
16#define _LINUX_XSK_QUEUE_H
17
18#include <linux/types.h>
19#include <linux/if_xdp.h>
20
21#include "xdp_umem_props.h"
22
23#define RX_BATCH_SIZE 16
24
25struct xsk_queue {
26 struct xdp_umem_props umem_props;
27 u32 ring_mask;
28 u32 nentries;
29 u32 prod_head;
30 u32 prod_tail;
31 u32 cons_head;
32 u32 cons_tail;
33 struct xdp_ring *ring;
34 u64 invalid_descs;
35};
36
37/* Common functions operating for both RXTX and umem queues */
38
39static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
40{
41 return q ? q->invalid_descs : 0;
42}
43
44static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
45{
46 u32 entries = q->prod_tail - q->cons_tail;
47
48 if (entries == 0) {
49 /* Refresh the local pointer */
50 q->prod_tail = READ_ONCE(q->ring->producer);
51 entries = q->prod_tail - q->cons_tail;
52 }
53
54 return (entries > dcnt) ? dcnt : entries;
55}
56
57static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
58{
59 u32 free_entries = q->nentries - (producer - q->cons_tail);
60
61 if (free_entries >= dcnt)
62 return free_entries;
63
64 /* Refresh the local tail pointer */
65 q->cons_tail = READ_ONCE(q->ring->consumer);
66 return q->nentries - (producer - q->cons_tail);
67}
68
69/* UMEM queue */
70
71static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
72{
73 if (unlikely(idx >= q->umem_props.nframes)) {
74 q->invalid_descs++;
75 return false;
76 }
77 return true;
78}
79
80static inline u32 *xskq_validate_id(struct xsk_queue *q)
81{
82 while (q->cons_tail != q->cons_head) {
83 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
84 unsigned int idx = q->cons_tail & q->ring_mask;
85
86 if (xskq_is_valid_id(q, ring->desc[idx]))
87 return &ring->desc[idx];
88
89 q->cons_tail++;
90 }
91
92 return NULL;
93}
94
95static inline u32 *xskq_peek_id(struct xsk_queue *q)
96{
97 struct xdp_umem_ring *ring;
98
99 if (q->cons_tail == q->cons_head) {
100 WRITE_ONCE(q->ring->consumer, q->cons_tail);
101 q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
102
103 /* Order consumer and data */
104 smp_rmb();
105
106 return xskq_validate_id(q);
107 }
108
109 ring = (struct xdp_umem_ring *)q->ring;
110 return &ring->desc[q->cons_tail & q->ring_mask];
111}
112
113static inline void xskq_discard_id(struct xsk_queue *q)
114{
115 q->cons_tail++;
116 (void)xskq_validate_id(q);
117}
118
119static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
120{
121 struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
122
123 ring->desc[q->prod_tail++ & q->ring_mask] = id;
124
125 /* Order producer and data */
126 smp_wmb();
127
128 WRITE_ONCE(q->ring->producer, q->prod_tail);
129 return 0;
130}
131
132static inline int xskq_reserve_id(struct xsk_queue *q)
133{
134 if (xskq_nb_free(q, q->prod_head, 1) == 0)
135 return -ENOSPC;
136
137 q->prod_head++;
138 return 0;
139}
140
141/* Rx/Tx queue */
142
143static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
144{
145 u32 buff_len;
146
147 if (unlikely(d->idx >= q->umem_props.nframes)) {
148 q->invalid_descs++;
149 return false;
150 }
151
152 buff_len = q->umem_props.frame_size;
153 if (unlikely(d->len > buff_len || d->len == 0 ||
154 d->offset > buff_len || d->offset + d->len > buff_len)) {
155 q->invalid_descs++;
156 return false;
157 }
158
159 return true;
160}
161
162static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
163 struct xdp_desc *desc)
164{
165 while (q->cons_tail != q->cons_head) {
166 struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
167 unsigned int idx = q->cons_tail & q->ring_mask;
168
169 if (xskq_is_valid_desc(q, &ring->desc[idx])) {
170 if (desc)
171 *desc = ring->desc[idx];
172 return desc;
173 }
174
175 q->cons_tail++;
176 }
177
178 return NULL;
179}
180
181static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
182 struct xdp_desc *desc)
183{
184 struct xdp_rxtx_ring *ring;
185
186 if (q->cons_tail == q->cons_head) {
187 WRITE_ONCE(q->ring->consumer, q->cons_tail);
188 q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
189
190 /* Order consumer and data */
191 smp_rmb();
192
193 return xskq_validate_desc(q, desc);
194 }
195
196 ring = (struct xdp_rxtx_ring *)q->ring;
197 *desc = ring->desc[q->cons_tail & q->ring_mask];
198 return desc;
199}
200
201static inline void xskq_discard_desc(struct xsk_queue *q)
202{
203 q->cons_tail++;
204 (void)xskq_validate_desc(q, NULL);
205}
206
207static inline int xskq_produce_batch_desc(struct xsk_queue *q,
208 u32 id, u32 len, u16 offset)
209{
210 struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
211 unsigned int idx;
212
213 if (xskq_nb_free(q, q->prod_head, 1) == 0)
214 return -ENOSPC;
215
216 idx = (q->prod_head++) & q->ring_mask;
217 ring->desc[idx].idx = id;
218 ring->desc[idx].len = len;
219 ring->desc[idx].offset = offset;
220
221 return 0;
222}
223
224static inline void xskq_produce_flush_desc(struct xsk_queue *q)
225{
226 /* Order producer and data */
227 smp_wmb();
228
229 q->prod_tail = q->prod_head;
230 WRITE_ONCE(q->ring->producer, q->prod_tail);
231}
232
233static inline bool xskq_full_desc(struct xsk_queue *q)
234{
235 return (xskq_nb_avail(q, q->nentries) == q->nentries);
236}
237
238static inline bool xskq_empty_desc(struct xsk_queue *q)
239{
240 return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries);
241}
242
243void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
244struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
245void xskq_destroy(struct xsk_queue *q_ops);
246
247#endif /* _LINUX_XSK_QUEUE_H */
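
The prod_head/prod_tail and cons_head/cons_tail fields above are free-running 32-bit counters: they are only masked with ring_mask when indexing the descriptor array, so differences such as prod_tail - cons_tail in xskq_nb_avail() and xskq_nb_free() stay correct even after the counters wrap. A self-contained user-space illustration of that property (the variable names here are illustrative, not part of the kernel API):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t nentries = 8, ring_mask = nentries - 1;
	/* Free-running producer/consumer counters close to the u32 wrap. */
	uint32_t prod_tail = UINT32_MAX - 1;
	uint32_t cons_tail = UINT32_MAX - 3;

	/* Entries available to the consumer: wrap-safe in unsigned math,
	 * mirroring xskq_nb_avail(). */
	uint32_t avail = prod_tail - cons_tail;
	/* Slots still free for the producer, as in xskq_nb_free(). */
	uint32_t free_slots = nentries - (prod_tail - cons_tail);

	assert(avail == 2);
	assert(free_slots == 6);

	/* Only the array index is masked, never the counters themselves. */
	printf("first unread slot = %u\n", cons_tail & ring_mask);
	return 0;
}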
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b853581592fd..8e0c7fb6d7cc 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -45,10 +45,12 @@ hostprogs-y += xdp_rxq_info
45hostprogs-y += syscall_tp 45hostprogs-y += syscall_tp
46hostprogs-y += cpustat 46hostprogs-y += cpustat
47hostprogs-y += xdp_adjust_tail 47hostprogs-y += xdp_adjust_tail
48hostprogs-y += xdpsock
48 49
49# Libbpf dependencies 50# Libbpf dependencies
50LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o 51LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
51CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o 52CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
53TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
52 54
53test_lru_dist-objs := test_lru_dist.o $(LIBBPF) 55test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
54sock_example-objs := sock_example.o $(LIBBPF) 56sock_example-objs := sock_example.o $(LIBBPF)
@@ -65,10 +67,10 @@ tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
65tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o 67tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
66load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o 68load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
67test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o 69test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
68trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o 70trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS)
69lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o 71lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
70offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o 72offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
71spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o 73spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
72map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o 74map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
73test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o 75test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
74test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o 76test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
@@ -82,8 +84,8 @@ xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
82xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o 84xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
83test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \ 85test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
84 test_current_task_under_cgroup_user.o 86 test_current_task_under_cgroup_user.o
85trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o 87trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS)
86sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o 88sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS)
87tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o 89tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
88lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o 90lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
89xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o 91xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
@@ -97,6 +99,7 @@ xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
97syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o 99syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
98cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o 100cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
99xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o 101xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
102xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o
100 103
101# Tell kbuild to always build the programs 104# Tell kbuild to always build the programs
102always := $(hostprogs-y) 105always := $(hostprogs-y)
@@ -150,6 +153,7 @@ always += xdp2skb_meta_kern.o
150always += syscall_tp_kern.o 153always += syscall_tp_kern.o
151always += cpustat_kern.o 154always += cpustat_kern.o
152always += xdp_adjust_tail_kern.o 155always += xdp_adjust_tail_kern.o
156always += xdpsock_kern.o
153 157
154HOSTCFLAGS += -I$(objtree)/usr/include 158HOSTCFLAGS += -I$(objtree)/usr/include
155HOSTCFLAGS += -I$(srctree)/tools/lib/ 159HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -196,6 +200,7 @@ HOSTLOADLIBES_xdp_rxq_info += -lelf
196HOSTLOADLIBES_syscall_tp += -lelf 200HOSTLOADLIBES_syscall_tp += -lelf
197HOSTLOADLIBES_cpustat += -lelf 201HOSTLOADLIBES_cpustat += -lelf
198HOSTLOADLIBES_xdp_adjust_tail += -lelf 202HOSTLOADLIBES_xdp_adjust_tail += -lelf
203HOSTLOADLIBES_xdpsock += -lelf -pthread
199 204
200# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: 205# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
201# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang 206# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index feca497d6afd..da9bccfaf391 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -145,6 +145,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
145 } 145 }
146 146
147 if (is_kprobe || is_kretprobe) { 147 if (is_kprobe || is_kretprobe) {
148 bool need_normal_check = true;
149 const char *event_prefix = "";
150
148 if (is_kprobe) 151 if (is_kprobe)
149 event += 7; 152 event += 7;
150 else 153 else
@@ -158,18 +161,33 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
158 if (isdigit(*event)) 161 if (isdigit(*event))
159 return populate_prog_array(event, fd); 162 return populate_prog_array(event, fd);
160 163
161 snprintf(buf, sizeof(buf), 164#ifdef __x86_64__
162 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", 165 if (strncmp(event, "sys_", 4) == 0) {
163 is_kprobe ? 'p' : 'r', event, event); 166 snprintf(buf, sizeof(buf),
164 err = system(buf); 167 "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events",
165 if (err < 0) { 168 is_kprobe ? 'p' : 'r', event, event);
166 printf("failed to create kprobe '%s' error '%s'\n", 169 err = system(buf);
167 event, strerror(errno)); 170 if (err >= 0) {
168 return -1; 171 need_normal_check = false;
172 event_prefix = "__x64_";
173 }
174 }
175#endif
176 if (need_normal_check) {
177 snprintf(buf, sizeof(buf),
178 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
179 is_kprobe ? 'p' : 'r', event, event);
180 err = system(buf);
181 if (err < 0) {
182 printf("failed to create kprobe '%s' error '%s'\n",
183 event, strerror(errno));
184 return -1;
185 }
169 } 186 }
170 187
171 strcpy(buf, DEBUGFS); 188 strcpy(buf, DEBUGFS);
172 strcat(buf, "events/kprobes/"); 189 strcat(buf, "events/kprobes/");
190 strcat(buf, event_prefix);
173 strcat(buf, event); 191 strcat(buf, event);
174 strcat(buf, "/id"); 192 strcat(buf, "/id");
175 } else if (is_tracepoint) { 193 } else if (is_tracepoint) {
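
Since Linux 4.17, x86_64 syscall entry points carry an __x64_ prefix, which is why the branch above first tries a prefixed probe and only falls back to the plain symbol name if that fails. A standalone sketch that merely reproduces the generated kprobe_events command for a hypothetical sys_write probe (not part of the sample itself):

#include <stdio.h>

int main(void)
{
	char buf[256];
	const char *event = "sys_write";	/* hypothetical example event */
	int is_kprobe = 1;

	/* Mirrors the __x86_64__ branch added above. */
	snprintf(buf, sizeof(buf),
		 "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events",
		 is_kprobe ? 'p' : 'r', event, event);
	puts(buf);
	/* Prints:
	 * echo 'p:__x64_sys_write __x64_sys_write' >> /sys/kernel/debug/tracing/kprobe_events
	 */
	return 0;
}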
@@ -648,66 +666,3 @@ void read_trace_pipe(void)
648 } 666 }
649 } 667 }
650} 668}
651
652#define MAX_SYMS 300000
653static struct ksym syms[MAX_SYMS];
654static int sym_cnt;
655
656static int ksym_cmp(const void *p1, const void *p2)
657{
658 return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
659}
660
661int load_kallsyms(void)
662{
663 FILE *f = fopen("/proc/kallsyms", "r");
664 char func[256], buf[256];
665 char symbol;
666 void *addr;
667 int i = 0;
668
669 if (!f)
670 return -ENOENT;
671
672 while (!feof(f)) {
673 if (!fgets(buf, sizeof(buf), f))
674 break;
675 if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
676 break;
677 if (!addr)
678 continue;
679 syms[i].addr = (long) addr;
680 syms[i].name = strdup(func);
681 i++;
682 }
683 sym_cnt = i;
684 qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
685 return 0;
686}
687
688struct ksym *ksym_search(long key)
689{
690 int start = 0, end = sym_cnt;
691 int result;
692
693 while (start < end) {
694 size_t mid = start + (end - start) / 2;
695
696 result = key - syms[mid].addr;
697 if (result < 0)
698 end = mid;
699 else if (result > 0)
700 start = mid + 1;
701 else
702 return &syms[mid];
703 }
704
705 if (start >= 1 && syms[start - 1].addr < key &&
706 key < syms[start].addr)
707 /* valid ksym */
708 return &syms[start - 1];
709
710 /* out of range. return _stext */
711 return &syms[0];
712}
713
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200b389b..2c3d0b448632 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -54,12 +54,5 @@ int load_bpf_file(char *path);
54int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); 54int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
55 55
56void read_trace_pipe(void); 56void read_trace_pipe(void);
57struct ksym {
58 long addr;
59 char *name;
60};
61
62int load_kallsyms(void);
63struct ksym *ksym_search(long key);
64int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); 57int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
65#endif 58#endif
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 512f87a5fd20..f06063af9fcb 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -17,6 +17,7 @@
17#include <sys/resource.h> 17#include <sys/resource.h>
18#include "libbpf.h" 18#include "libbpf.h"
19#include "bpf_load.h" 19#include "bpf_load.h"
20#include "trace_helpers.h"
20 21
21#define PRINT_RAW_ADDR 0 22#define PRINT_RAW_ADDR 0
22 23
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4ed690b907ff..60c2b73d1b4d 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -22,6 +22,7 @@
22#include "libbpf.h" 22#include "libbpf.h"
23#include "bpf_load.h" 23#include "bpf_load.h"
24#include "perf-sys.h" 24#include "perf-sys.h"
25#include "trace_helpers.h"
25 26
26#define DEFAULT_FREQ 99 27#define DEFAULT_FREQ 99
27#define DEFAULT_SECS 5 28#define DEFAULT_SECS 5
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index 3d736219a31c..8d3e9cfa1909 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -7,6 +7,7 @@
7#include <sys/resource.h> 7#include <sys/resource.h>
8#include "libbpf.h" 8#include "libbpf.h"
9#include "bpf_load.h" 9#include "bpf_load.h"
10#include "trace_helpers.h"
10 11
11int main(int ac, char **argv) 12int main(int ac, char **argv)
12{ 13{
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 56f7a259a7c9..1fa1becfa641 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -21,6 +21,7 @@
21#include "libbpf.h" 21#include "libbpf.h"
22#include "bpf_load.h" 22#include "bpf_load.h"
23#include "perf-sys.h" 23#include "perf-sys.h"
24#include "trace_helpers.h"
24 25
25#define SAMPLE_FREQ 50 26#define SAMPLE_FREQ 50
26 27
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e348017..5e78c2ecd08d 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -21,100 +21,10 @@
21#include "libbpf.h" 21#include "libbpf.h"
22#include "bpf_load.h" 22#include "bpf_load.h"
23#include "perf-sys.h" 23#include "perf-sys.h"
24#include "trace_helpers.h"
24 25
25static int pmu_fd; 26static int pmu_fd;
26 27
27int page_size;
28int page_cnt = 8;
29volatile struct perf_event_mmap_page *header;
30
31typedef void (*print_fn)(void *data, int size);
32
33static int perf_event_mmap(int fd)
34{
35 void *base;
36 int mmap_size;
37
38 page_size = getpagesize();
39 mmap_size = page_size * (page_cnt + 1);
40
41 base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
42 if (base == MAP_FAILED) {
43 printf("mmap err\n");
44 return -1;
45 }
46
47 header = base;
48 return 0;
49}
50
51static int perf_event_poll(int fd)
52{
53 struct pollfd pfd = { .fd = fd, .events = POLLIN };
54
55 return poll(&pfd, 1, 1000);
56}
57
58struct perf_event_sample {
59 struct perf_event_header header;
60 __u32 size;
61 char data[];
62};
63
64static void perf_event_read(print_fn fn)
65{
66 __u64 data_tail = header->data_tail;
67 __u64 data_head = header->data_head;
68 __u64 buffer_size = page_cnt * page_size;
69 void *base, *begin, *end;
70 char buf[256];
71
72 asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
73 if (data_head == data_tail)
74 return;
75
76 base = ((char *)header) + page_size;
77
78 begin = base + data_tail % buffer_size;
79 end = base + data_head % buffer_size;
80
81 while (begin != end) {
82 struct perf_event_sample *e;
83
84 e = begin;
85 if (begin + e->header.size > base + buffer_size) {
86 long len = base + buffer_size - begin;
87
88 assert(len < e->header.size);
89 memcpy(buf, begin, len);
90 memcpy(buf + len, base, e->header.size - len);
91 e = (void *) buf;
92 begin = base + e->header.size - len;
93 } else if (begin + e->header.size == base + buffer_size) {
94 begin = base;
95 } else {
96 begin += e->header.size;
97 }
98
99 if (e->header.type == PERF_RECORD_SAMPLE) {
100 fn(e->data, e->size);
101 } else if (e->header.type == PERF_RECORD_LOST) {
102 struct {
103 struct perf_event_header header;
104 __u64 id;
105 __u64 lost;
106 } *lost = (void *) e;
107 printf("lost %lld events\n", lost->lost);
108 } else {
109 printf("unknown event type=%d size=%d\n",
110 e->header.type, e->header.size);
111 }
112 }
113
114 __sync_synchronize(); /* smp_mb() */
115 header->data_tail = data_head;
116}
117
118static __u64 time_get_ns(void) 28static __u64 time_get_ns(void)
119{ 29{
120 struct timespec ts; 30 struct timespec ts;
@@ -127,7 +37,7 @@ static __u64 start_time;
127 37
128#define MAX_CNT 100000ll 38#define MAX_CNT 100000ll
129 39
130static void print_bpf_output(void *data, int size) 40static int print_bpf_output(void *data, int size)
131{ 41{
132 static __u64 cnt; 42 static __u64 cnt;
133 struct { 43 struct {
@@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size)
138 if (e->cookie != 0x12345678) { 48 if (e->cookie != 0x12345678) {
139 printf("BUG pid %llx cookie %llx sized %d\n", 49 printf("BUG pid %llx cookie %llx sized %d\n",
140 e->pid, e->cookie, size); 50 e->pid, e->cookie, size);
141 kill(0, SIGINT); 51 return PERF_EVENT_ERROR;
142 } 52 }
143 53
144 cnt++; 54 cnt++;
@@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size)
146 if (cnt == MAX_CNT) { 56 if (cnt == MAX_CNT) {
147 printf("recv %lld events per sec\n", 57 printf("recv %lld events per sec\n",
148 MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); 58 MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
149 kill(0, SIGINT); 59 return PERF_EVENT_DONE;
150 } 60 }
61
62 return PERF_EVENT_CONT;
151} 63}
152 64
153static void test_bpf_perf_event(void) 65static void test_bpf_perf_event(void)
@@ -170,6 +82,7 @@ int main(int argc, char **argv)
170{ 82{
171 char filename[256]; 83 char filename[256];
172 FILE *f; 84 FILE *f;
85 int ret;
173 86
174 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 87 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
175 88
@@ -187,10 +100,7 @@ int main(int argc, char **argv)
187 (void) f; 100 (void) f;
188 101
189 start_time = time_get_ns(); 102 start_time = time_get_ns();
190 for (;;) { 103 ret = perf_event_poller(pmu_fd, print_bpf_output);
191 perf_event_poll(pmu_fd); 104 kill(0, SIGINT);
192 perf_event_read(print_bpf_output); 105 return ret;
193 }
194
195 return 0;
196} 106}
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
new file mode 100644
index 000000000000..533ab81adfa1
--- /dev/null
+++ b/samples/bpf/xdpsock.h
@@ -0,0 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef XDPSOCK_H_
3#define XDPSOCK_H_
4
5/* Power-of-2 number of sockets */
6#define MAX_SOCKS 4
7
8/* Round-robin receive */
9#define RR_LB 0
10
11#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
new file mode 100644
index 000000000000..d8806c41362e
--- /dev/null
+++ b/samples/bpf/xdpsock_kern.c
@@ -0,0 +1,56 @@
1// SPDX-License-Identifier: GPL-2.0
2#define KBUILD_MODNAME "foo"
3#include <uapi/linux/bpf.h>
4#include "bpf_helpers.h"
5
6#include "xdpsock.h"
7
8struct bpf_map_def SEC("maps") qidconf_map = {
9 .type = BPF_MAP_TYPE_ARRAY,
10 .key_size = sizeof(int),
11 .value_size = sizeof(int),
12 .max_entries = 1,
13};
14
15struct bpf_map_def SEC("maps") xsks_map = {
16 .type = BPF_MAP_TYPE_XSKMAP,
17 .key_size = sizeof(int),
18 .value_size = sizeof(int),
19 .max_entries = 4,
20};
21
22struct bpf_map_def SEC("maps") rr_map = {
23 .type = BPF_MAP_TYPE_PERCPU_ARRAY,
24 .key_size = sizeof(int),
25 .value_size = sizeof(unsigned int),
26 .max_entries = 1,
27};
28
29SEC("xdp_sock")
30int xdp_sock_prog(struct xdp_md *ctx)
31{
32 int *qidconf, key = 0, idx;
33 unsigned int *rr;
34
35 qidconf = bpf_map_lookup_elem(&qidconf_map, &key);
36 if (!qidconf)
37 return XDP_ABORTED;
38
39 if (*qidconf != ctx->rx_queue_index)
40 return XDP_PASS;
41
42#if RR_LB /* NB! RR_LB is configured in xdpsock.h */
43 rr = bpf_map_lookup_elem(&rr_map, &key);
44 if (!rr)
45 return XDP_ABORTED;
46
47 *rr = (*rr + 1) & (MAX_SOCKS - 1);
48 idx = *rr;
49#else
50 idx = 0;
51#endif
52
53 return bpf_redirect_map(&xsks_map, idx, 0);
54}
55
56char _license[] SEC("license") = "GPL";
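
The round-robin branch relies on MAX_SOCKS being a power of two (as the comment in xdpsock.h notes): masking with MAX_SOCKS - 1 is then equivalent to a modulo, so the computed index always stays within the bounds of xsks_map. A small user-space check of that equivalence, for illustration only:

#include <assert.h>
#include <stdio.h>

#define MAX_SOCKS 4	/* must remain a power of two, as in xdpsock.h */

int main(void)
{
	unsigned int rr = 0, i;

	for (i = 0; i < 10; i++) {
		unsigned int masked = (rr + 1) & (MAX_SOCKS - 1);

		/* Masking and modulo agree because MAX_SOCKS is 2^n. */
		assert(masked == (rr + 1) % MAX_SOCKS);
		rr = masked;
		printf("packet %u -> socket index %u\n", i, rr);
	}
	return 0;
}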
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
new file mode 100644
index 000000000000..4b8a7cf3e63b
--- /dev/null
+++ b/samples/bpf/xdpsock_user.c
@@ -0,0 +1,948 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright(c) 2017 - 2018 Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include <assert.h>
15#include <errno.h>
16#include <getopt.h>
17#include <libgen.h>
18#include <linux/bpf.h>
19#include <linux/if_link.h>
20#include <linux/if_xdp.h>
21#include <linux/if_ether.h>
22#include <net/if.h>
23#include <signal.h>
24#include <stdbool.h>
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <net/ethernet.h>
29#include <sys/resource.h>
30#include <sys/socket.h>
31#include <sys/mman.h>
32#include <time.h>
33#include <unistd.h>
34#include <pthread.h>
35#include <locale.h>
36#include <sys/types.h>
37#include <poll.h>
38
39#include "bpf_load.h"
40#include "bpf_util.h"
41#include "libbpf.h"
42
43#include "xdpsock.h"
44
45#ifndef SOL_XDP
46#define SOL_XDP 283
47#endif
48
49#ifndef AF_XDP
50#define AF_XDP 44
51#endif
52
53#ifndef PF_XDP
54#define PF_XDP AF_XDP
55#endif
56
57#define NUM_FRAMES 131072
58#define FRAME_HEADROOM 0
59#define FRAME_SIZE 2048
60#define NUM_DESCS 1024
61#define BATCH_SIZE 16
62
63#define FQ_NUM_DESCS 1024
64#define CQ_NUM_DESCS 1024
65
66#define DEBUG_HEXDUMP 0
67
68typedef __u32 u32;
69
70static unsigned long prev_time;
71
72enum benchmark_type {
73 BENCH_RXDROP = 0,
74 BENCH_TXONLY = 1,
75 BENCH_L2FWD = 2,
76};
77
78static enum benchmark_type opt_bench = BENCH_RXDROP;
79static u32 opt_xdp_flags;
80static const char *opt_if = "";
81static int opt_ifindex;
82static int opt_queue;
83static int opt_poll;
84static int opt_shared_packet_buffer;
85static int opt_interval = 1;
86
87struct xdp_umem_uqueue {
88 u32 cached_prod;
89 u32 cached_cons;
90 u32 mask;
91 u32 size;
92 struct xdp_umem_ring *ring;
93};
94
95struct xdp_umem {
96 char (*frames)[FRAME_SIZE];
97 struct xdp_umem_uqueue fq;
98 struct xdp_umem_uqueue cq;
99 int fd;
100};
101
102struct xdp_uqueue {
103 u32 cached_prod;
104 u32 cached_cons;
105 u32 mask;
106 u32 size;
107 struct xdp_rxtx_ring *ring;
108};
109
110struct xdpsock {
111 struct xdp_uqueue rx;
112 struct xdp_uqueue tx;
113 int sfd;
114 struct xdp_umem *umem;
115 u32 outstanding_tx;
116 unsigned long rx_npkts;
117 unsigned long tx_npkts;
118 unsigned long prev_rx_npkts;
119 unsigned long prev_tx_npkts;
120};
121
122#define MAX_SOCKS 4
123static int num_socks;
124struct xdpsock *xsks[MAX_SOCKS];
125
126static unsigned long get_nsecs(void)
127{
128 struct timespec ts;
129
130 clock_gettime(CLOCK_MONOTONIC, &ts);
131 return ts.tv_sec * 1000000000UL + ts.tv_nsec;
132}
133
134static void dump_stats(void);
135
136#define lassert(expr) \
137 do { \
138 if (!(expr)) { \
139 fprintf(stderr, "%s:%s:%i: Assertion failed: " \
140 #expr ": errno: %d/\"%s\"\n", \
141 __FILE__, __func__, __LINE__, \
142 errno, strerror(errno)); \
143 dump_stats(); \
144 exit(EXIT_FAILURE); \
145 } \
146 } while (0)
147
148#define barrier() __asm__ __volatile__("": : :"memory")
149#define u_smp_rmb() barrier()
150#define u_smp_wmb() barrier()
151#define likely(x) __builtin_expect(!!(x), 1)
152#define unlikely(x) __builtin_expect(!!(x), 0)
153
154static const char pkt_data[] =
155 "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
156 "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
157 "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
158 "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
159
160static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
161{
162 u32 free_entries = q->size - (q->cached_prod - q->cached_cons);
163
164 if (free_entries >= nb)
165 return free_entries;
166
167 /* Refresh the local tail pointer */
168 q->cached_cons = q->ring->ptrs.consumer;
169
170 return q->size - (q->cached_prod - q->cached_cons);
171}
172
173static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
174{
175 u32 free_entries = q->cached_cons - q->cached_prod;
176
177 if (free_entries >= ndescs)
178 return free_entries;
179
180 /* Refresh the local tail pointer */
181 q->cached_cons = q->ring->ptrs.consumer + q->size;
182 return q->cached_cons - q->cached_prod;
183}
184
185static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
186{
187 u32 entries = q->cached_prod - q->cached_cons;
188
189 if (entries == 0) {
190 q->cached_prod = q->ring->ptrs.producer;
191 entries = q->cached_prod - q->cached_cons;
192 }
193
194 return (entries > nb) ? nb : entries;
195}
196
197static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
198{
199 u32 entries = q->cached_prod - q->cached_cons;
200
201 if (entries == 0) {
202 q->cached_prod = q->ring->ptrs.producer;
203 entries = q->cached_prod - q->cached_cons;
204 }
205
206 return (entries > ndescs) ? ndescs : entries;
207}
208
209static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
210 struct xdp_desc *d,
211 size_t nb)
212{
213 u32 i;
214
215 if (umem_nb_free(fq, nb) < nb)
216 return -ENOSPC;
217
218 for (i = 0; i < nb; i++) {
219 u32 idx = fq->cached_prod++ & fq->mask;
220
221 fq->ring->desc[idx] = d[i].idx;
222 }
223
224 u_smp_wmb();
225
226 fq->ring->ptrs.producer = fq->cached_prod;
227
228 return 0;
229}
230
231static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
232 size_t nb)
233{
234 u32 i;
235
236 if (umem_nb_free(fq, nb) < nb)
237 return -ENOSPC;
238
239 for (i = 0; i < nb; i++) {
240 u32 idx = fq->cached_prod++ & fq->mask;
241
242 fq->ring->desc[idx] = d[i];
243 }
244
245 u_smp_wmb();
246
247 fq->ring->ptrs.producer = fq->cached_prod;
248
249 return 0;
250}
251
252static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
253 u32 *d, size_t nb)
254{
255 u32 idx, i, entries = umem_nb_avail(cq, nb);
256
257 u_smp_rmb();
258
259 for (i = 0; i < entries; i++) {
260 idx = cq->cached_cons++ & cq->mask;
261 d[i] = cq->ring->desc[idx];
262 }
263
264 if (entries > 0) {
265 u_smp_wmb();
266
267 cq->ring->ptrs.consumer = cq->cached_cons;
268 }
269
270 return entries;
271}
272
273static inline void *xq_get_data(struct xdpsock *xsk, __u32 idx, __u32 off)
274{
275 lassert(idx < NUM_FRAMES);
276 return &xsk->umem->frames[idx][off];
277}
278
279static inline int xq_enq(struct xdp_uqueue *uq,
280 const struct xdp_desc *descs,
281 unsigned int ndescs)
282{
283 struct xdp_rxtx_ring *r = uq->ring;
284 unsigned int i;
285
286 if (xq_nb_free(uq, ndescs) < ndescs)
287 return -ENOSPC;
288
289 for (i = 0; i < ndescs; i++) {
290 u32 idx = uq->cached_prod++ & uq->mask;
291
292 r->desc[idx].idx = descs[i].idx;
293 r->desc[idx].len = descs[i].len;
294 r->desc[idx].offset = descs[i].offset;
295 }
296
297 u_smp_wmb();
298
299 r->ptrs.producer = uq->cached_prod;
300 return 0;
301}
302
303static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
304 __u32 idx, unsigned int ndescs)
305{
306 struct xdp_rxtx_ring *q = uq->ring;
307 unsigned int i;
308
309 if (xq_nb_free(uq, ndescs) < ndescs)
310 return -ENOSPC;
311
312 for (i = 0; i < ndescs; i++) {
313 u32 idx = uq->cached_prod++ & uq->mask;
314
315 q->desc[idx].idx = idx + i;
316 q->desc[idx].len = sizeof(pkt_data) - 1;
317 q->desc[idx].offset = 0;
318 }
319
320 u_smp_wmb();
321
322 q->ptrs.producer = uq->cached_prod;
323 return 0;
324}
325
326static inline int xq_deq(struct xdp_uqueue *uq,
327 struct xdp_desc *descs,
328 int ndescs)
329{
330 struct xdp_rxtx_ring *r = uq->ring;
331 unsigned int idx;
332 int i, entries;
333
334 entries = xq_nb_avail(uq, ndescs);
335
336 u_smp_rmb();
337
338 for (i = 0; i < entries; i++) {
339 idx = uq->cached_cons++ & uq->mask;
340 descs[i] = r->desc[idx];
341 }
342
343 if (entries > 0) {
344 u_smp_wmb();
345
346 r->ptrs.consumer = uq->cached_cons;
347 }
348
349 return entries;
350}
351
352static void swap_mac_addresses(void *data)
353{
354 struct ether_header *eth = (struct ether_header *)data;
355 struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
356 struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
357 struct ether_addr tmp;
358
359 tmp = *src_addr;
360 *src_addr = *dst_addr;
361 *dst_addr = tmp;
362}
363
364#if DEBUG_HEXDUMP
365static void hex_dump(void *pkt, size_t length, const char *prefix)
366{
367 int i = 0;
368 const unsigned char *address = (unsigned char *)pkt;
369 const unsigned char *line = address;
370 size_t line_size = 32;
371 unsigned char c;
372
373 printf("length = %zu\n", length);
374 printf("%s | ", prefix);
375 while (length-- > 0) {
376 printf("%02X ", *address++);
377 if (!(++i % line_size) || (length == 0 && i % line_size)) {
378 if (length == 0) {
379 while (i++ % line_size)
380 printf("__ ");
381 }
382 printf(" | "); /* right close */
383 while (line < address) {
384 c = *line++;
385 printf("%c", (c < 33 || c == 255) ? 0x2E : c);
386 }
387 printf("\n");
388 if (length > 0)
389 printf("%s | ", prefix);
390 }
391 }
392 printf("\n");
393}
394#endif
395
396static size_t gen_eth_frame(char *frame)
397{
398 memcpy(frame, pkt_data, sizeof(pkt_data) - 1);
399 return sizeof(pkt_data) - 1;
400}
401
402static struct xdp_umem *xdp_umem_configure(int sfd)
403{
404 int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
405 struct xdp_umem_reg mr;
406 struct xdp_umem *umem;
407 void *bufs;
408
409 umem = calloc(1, sizeof(*umem));
410 lassert(umem);
411
412 lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
413 NUM_FRAMES * FRAME_SIZE) == 0);
414
415 mr.addr = (__u64)bufs;
416 mr.len = NUM_FRAMES * FRAME_SIZE;
417 mr.frame_size = FRAME_SIZE;
418 mr.frame_headroom = FRAME_HEADROOM;
419
420 lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
421 lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
422 sizeof(int)) == 0);
423 lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
424 sizeof(int)) == 0);
425
426 umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) +
427 FQ_NUM_DESCS * sizeof(u32),
428 PROT_READ | PROT_WRITE,
429 MAP_SHARED | MAP_POPULATE, sfd,
430 XDP_UMEM_PGOFF_FILL_RING);
431 lassert(umem->fq.ring != MAP_FAILED);
432
433 umem->fq.mask = FQ_NUM_DESCS - 1;
434 umem->fq.size = FQ_NUM_DESCS;
435
436 umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) +
437 CQ_NUM_DESCS * sizeof(u32),
438 PROT_READ | PROT_WRITE,
439 MAP_SHARED | MAP_POPULATE, sfd,
440 XDP_UMEM_PGOFF_COMPLETION_RING);
441 lassert(umem->cq.ring != MAP_FAILED);
442
443 umem->cq.mask = CQ_NUM_DESCS - 1;
444 umem->cq.size = CQ_NUM_DESCS;
445
446 umem->frames = (char (*)[FRAME_SIZE])bufs;
447 umem->fd = sfd;
448
449 if (opt_bench == BENCH_TXONLY) {
450 int i;
451
452 for (i = 0; i < NUM_FRAMES; i++)
453 (void)gen_eth_frame(&umem->frames[i][0]);
454 }
455
456 return umem;
457}
458
459static struct xdpsock *xsk_configure(struct xdp_umem *umem)
460{
461 struct sockaddr_xdp sxdp = {};
462 int sfd, ndescs = NUM_DESCS;
463 struct xdpsock *xsk;
464 bool shared = true;
465 u32 i;
466
467 sfd = socket(PF_XDP, SOCK_RAW, 0);
468 lassert(sfd >= 0);
469
470 xsk = calloc(1, sizeof(*xsk));
471 lassert(xsk);
472
473 xsk->sfd = sfd;
474 xsk->outstanding_tx = 0;
475
476 if (!umem) {
477 shared = false;
478 xsk->umem = xdp_umem_configure(sfd);
479 } else {
480 xsk->umem = umem;
481 }
482
483 lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING,
484 &ndescs, sizeof(int)) == 0);
485 lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
486 &ndescs, sizeof(int)) == 0);
487
488 /* Rx */
489 xsk->rx.ring = mmap(NULL,
490 sizeof(struct xdp_ring) +
491 NUM_DESCS * sizeof(struct xdp_desc),
492 PROT_READ | PROT_WRITE,
493 MAP_SHARED | MAP_POPULATE, sfd,
494 XDP_PGOFF_RX_RING);
495 lassert(xsk->rx.ring != MAP_FAILED);
496
497 if (!shared) {
498 for (i = 0; i < NUM_DESCS / 2; i++)
499 lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
500 == 0);
501 }
502
503 /* Tx */
504 xsk->tx.ring = mmap(NULL,
505 sizeof(struct xdp_ring) +
506 NUM_DESCS * sizeof(struct xdp_desc),
507 PROT_READ | PROT_WRITE,
508 MAP_SHARED | MAP_POPULATE, sfd,
509 XDP_PGOFF_TX_RING);
510 lassert(xsk->tx.ring != MAP_FAILED);
511
512 xsk->rx.mask = NUM_DESCS - 1;
513 xsk->rx.size = NUM_DESCS;
514
515 xsk->tx.mask = NUM_DESCS - 1;
516 xsk->tx.size = NUM_DESCS;
517
518 sxdp.sxdp_family = PF_XDP;
519 sxdp.sxdp_ifindex = opt_ifindex;
520 sxdp.sxdp_queue_id = opt_queue;
521 if (shared) {
522 sxdp.sxdp_flags = XDP_SHARED_UMEM;
523 sxdp.sxdp_shared_umem_fd = umem->fd;
524 }
525
526 lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
527
528 return xsk;
529}
530
531static void print_benchmark(bool running)
532{
533 const char *bench_str = "INVALID";
534
535 if (opt_bench == BENCH_RXDROP)
536 bench_str = "rxdrop";
537 else if (opt_bench == BENCH_TXONLY)
538 bench_str = "txonly";
539 else if (opt_bench == BENCH_L2FWD)
540 bench_str = "l2fwd";
541
542 printf("%s:%d %s ", opt_if, opt_queue, bench_str);
543 if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
544 printf("xdp-skb ");
545 else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
546 printf("xdp-drv ");
547 else
548 printf(" ");
549
550 if (opt_poll)
551 printf("poll() ");
552
553 if (running) {
554 printf("running...");
555 fflush(stdout);
556 }
557}
558
559static void dump_stats(void)
560{
561 unsigned long now = get_nsecs();
562 long dt = now - prev_time;
563 int i;
564
565 prev_time = now;
566
567 for (i = 0; i < num_socks; i++) {
568 char *fmt = "%-15s %'-11.0f %'-11lu\n";
569 double rx_pps, tx_pps;
570
571 rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
572 1000000000. / dt;
573 tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
574 1000000000. / dt;
575
576 printf("\n sock%d@", i);
577 print_benchmark(false);
578 printf("\n");
579
580 printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
581 dt / 1000000000.);
582 printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
583 printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
584
585 xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
586 xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
587 }
588}
589
590static void *poller(void *arg)
591{
592 (void)arg;
593 for (;;) {
594 sleep(opt_interval);
595 dump_stats();
596 }
597
598 return NULL;
599}
600
601static void int_exit(int sig)
602{
603 (void)sig;
604 dump_stats();
605 bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
606 exit(EXIT_SUCCESS);
607}
608
609static struct option long_options[] = {
610 {"rxdrop", no_argument, 0, 'r'},
611 {"txonly", no_argument, 0, 't'},
612 {"l2fwd", no_argument, 0, 'l'},
613 {"interface", required_argument, 0, 'i'},
614 {"queue", required_argument, 0, 'q'},
615 {"poll", no_argument, 0, 'p'},
616 {"shared-buffer", no_argument, 0, 's'},
617 {"xdp-skb", no_argument, 0, 'S'},
618 {"xdp-native", no_argument, 0, 'N'},
619 {"interval", required_argument, 0, 'n'},
620 {0, 0, 0, 0}
621};
622
623static void usage(const char *prog)
624{
625 const char *str =
626 " Usage: %s [OPTIONS]\n"
627 " Options:\n"
628 " -r, --rxdrop Discard all incoming packets (default)\n"
629 " -t, --txonly Only send packets\n"
630 " -l, --l2fwd MAC swap L2 forwarding\n"
631 " -i, --interface=n Run on interface n\n"
632 " -q, --queue=n Use queue n (default 0)\n"
633 " -p, --poll Use poll syscall\n"
634 " -s, --shared-buffer Use shared packet buffer\n"
635 " -S, --xdp-skb=n Use XDP skb-mode\n"
636 " -N, --xdp-native=n Enforce XDP native mode\n"
637 " -n, --interval=n Specify statistics update interval (default 1 sec).\n"
638 "\n";
639 fprintf(stderr, str, prog);
640 exit(EXIT_FAILURE);
641}
642
643static void parse_command_line(int argc, char **argv)
644{
645 int option_index, c;
646
647 opterr = 0;
648
649 for (;;) {
650 c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
651 &option_index);
652 if (c == -1)
653 break;
654
655 switch (c) {
656 case 'r':
657 opt_bench = BENCH_RXDROP;
658 break;
659 case 't':
660 opt_bench = BENCH_TXONLY;
661 break;
662 case 'l':
663 opt_bench = BENCH_L2FWD;
664 break;
665 case 'i':
666 opt_if = optarg;
667 break;
668 case 'q':
669 opt_queue = atoi(optarg);
670 break;
671 case 's':
672 opt_shared_packet_buffer = 1;
673 break;
674 case 'p':
675 opt_poll = 1;
676 break;
677 case 'S':
678 opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
679 break;
680 case 'N':
681 opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
682 break;
683 case 'n':
684 opt_interval = atoi(optarg);
685 break;
686 default:
687 usage(basename(argv[0]));
688 }
689 }
690
691 opt_ifindex = if_nametoindex(opt_if);
692 if (!opt_ifindex) {
693 fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
694 opt_if);
695 usage(basename(argv[0]));
696 }
697}
698
699static void kick_tx(int fd)
700{
701 int ret;
702
703 ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
704 if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN)
705 return;
706 lassert(0);
707}
708
709static inline void complete_tx_l2fwd(struct xdpsock *xsk)
710{
711 u32 descs[BATCH_SIZE];
712 unsigned int rcvd;
713 size_t ndescs;
714
715 if (!xsk->outstanding_tx)
716 return;
717
718 kick_tx(xsk->sfd);
719 ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
720 xsk->outstanding_tx;
721
722 /* re-add completed Tx buffers */
723 rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs);
724 if (rcvd > 0) {
725 umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd);
726 xsk->outstanding_tx -= rcvd;
727 xsk->tx_npkts += rcvd;
728 }
729}
730
731static inline void complete_tx_only(struct xdpsock *xsk)
732{
733 u32 descs[BATCH_SIZE];
734 unsigned int rcvd;
735
736 if (!xsk->outstanding_tx)
737 return;
738
739 kick_tx(xsk->sfd);
740
741 rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE);
742 if (rcvd > 0) {
743 xsk->outstanding_tx -= rcvd;
744 xsk->tx_npkts += rcvd;
745 }
746}
747
748static void rx_drop(struct xdpsock *xsk)
749{
750 struct xdp_desc descs[BATCH_SIZE];
751 unsigned int rcvd, i;
752
753 rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
754 if (!rcvd)
755 return;
756
757 for (i = 0; i < rcvd; i++) {
758 u32 idx = descs[i].idx;
759
760 lassert(idx < NUM_FRAMES);
761#if DEBUG_HEXDUMP
762 char *pkt;
763 char buf[32];
764
765 pkt = xq_get_data(xsk, idx, descs[i].offset);
766 sprintf(buf, "idx=%d", idx);
767 hex_dump(pkt, descs[i].len, buf);
768#endif
769 }
770
771 xsk->rx_npkts += rcvd;
772
773 umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
774}
775
776static void rx_drop_all(void)
777{
778 struct pollfd fds[MAX_SOCKS + 1];
779 int i, ret, timeout, nfds = 1;
780
781 memset(fds, 0, sizeof(fds));
782
783 for (i = 0; i < num_socks; i++) {
784 fds[i].fd = xsks[i]->sfd;
785 fds[i].events = POLLIN;
786 timeout = 1000; /* 1 second */
787 }
788
789 for (;;) {
790 if (opt_poll) {
791 ret = poll(fds, nfds, timeout);
792 if (ret <= 0)
793 continue;
794 }
795
796 for (i = 0; i < num_socks; i++)
797 rx_drop(xsks[i]);
798 }
799}
800
801static void tx_only(struct xdpsock *xsk)
802{
803 int timeout, ret, nfds = 1;
804 struct pollfd fds[nfds + 1];
805 unsigned int idx = 0;
806
807 memset(fds, 0, sizeof(fds));
808 fds[0].fd = xsk->sfd;
809 fds[0].events = POLLOUT;
810 timeout = 1000; /* 1 second */
811
812 for (;;) {
813 if (opt_poll) {
814 ret = poll(fds, nfds, timeout);
815 if (ret <= 0)
816 continue;
817
818 if (fds[0].fd != xsk->sfd ||
819 !(fds[0].revents & POLLOUT))
820 continue;
821 }
822
823 if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) {
824 lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0);
825
826 xsk->outstanding_tx += BATCH_SIZE;
827 idx += BATCH_SIZE;
828 idx %= NUM_FRAMES;
829 }
830
831 complete_tx_only(xsk);
832 }
833}
834
835static void l2fwd(struct xdpsock *xsk)
836{
837 for (;;) {
838 struct xdp_desc descs[BATCH_SIZE];
839 unsigned int rcvd, i;
840 int ret;
841
842 for (;;) {
843 complete_tx_l2fwd(xsk);
844
845 rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
846 if (rcvd > 0)
847 break;
848 }
849
850 for (i = 0; i < rcvd; i++) {
851 char *pkt = xq_get_data(xsk, descs[i].idx,
852 descs[i].offset);
853
854 swap_mac_addresses(pkt);
855#if DEBUG_HEXDUMP
856 char buf[32];
857 u32 idx = descs[i].idx;
858
859 sprintf(buf, "idx=%d", idx);
860 hex_dump(pkt, descs[i].len, buf);
861#endif
862 }
863
864 xsk->rx_npkts += rcvd;
865
866 ret = xq_enq(&xsk->tx, descs, rcvd);
867 lassert(ret == 0);
868 xsk->outstanding_tx += rcvd;
869 }
870}
871
872int main(int argc, char **argv)
873{
874 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
875 char xdp_filename[256];
876 int i, ret, key = 0;
877 pthread_t pt;
878
879 parse_command_line(argc, argv);
880
881 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
882 fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
883 strerror(errno));
884 exit(EXIT_FAILURE);
885 }
886
887 snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
888
889 if (load_bpf_file(xdp_filename)) {
890 fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf);
891 exit(EXIT_FAILURE);
892 }
893
894 if (!prog_fd[0]) {
895 fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n",
896 strerror(errno));
897 exit(EXIT_FAILURE);
898 }
899
900 if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) {
901 fprintf(stderr, "ERROR: link set xdp fd failed\n");
902 exit(EXIT_FAILURE);
903 }
904
905 ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0);
906 if (ret) {
907 fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n");
908 exit(EXIT_FAILURE);
909 }
910
911 /* Create sockets... */
912 xsks[num_socks++] = xsk_configure(NULL);
913
914#if RR_LB
915 for (i = 0; i < MAX_SOCKS - 1; i++)
916 xsks[num_socks++] = xsk_configure(xsks[0]->umem);
917#endif
918
919 /* ...and insert them into the map. */
920 for (i = 0; i < num_socks; i++) {
921 key = i;
922 ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0);
923 if (ret) {
924 fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
925 exit(EXIT_FAILURE);
926 }
927 }
928
929 signal(SIGINT, int_exit);
930 signal(SIGTERM, int_exit);
931 signal(SIGABRT, int_exit);
932
933 setlocale(LC_ALL, "");
934
935 ret = pthread_create(&pt, NULL, poller, NULL);
936 lassert(ret == 0);
937
938 prev_time = get_nsecs();
939
940 if (opt_bench == BENCH_RXDROP)
941 rx_drop_all();
942 else if (opt_bench == BENCH_TXONLY)
943 tx_only(xsks[0]);
944 else
945 l2fwd(xsks[0]);
946
947 return 0;
948}
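
Distilled from xdp_umem_configure() and xsk_configure() above, the sketch below shows the order of the AF_XDP setup calls: register the umem, size the rings, map at least the Rx ring, then bind to one queue of one netdev. Error handling, the fill/completion/Tx ring mappings and the shared-umem path are deliberately omitted, xsk_setup_sketch() is a hypothetical helper name, and the code assumes the if_xdp.h uapi introduced by this series (later kernels changed this interface); treat it as an outline of the sample, not a drop-in replacement.

#include <linux/if_xdp.h>
#include <net/if.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SOL_XDP
#define SOL_XDP 283
#endif
#ifndef AF_XDP
#define AF_XDP 44
#endif
#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

#define NUM_FRAMES 131072	/* same constants as the sample above */
#define FRAME_SIZE 2048

static int xsk_setup_sketch(const char *ifname, int queue_id)
{
	struct sockaddr_xdp sxdp = {};
	struct xdp_umem_reg mr = {};
	int ring_size = 1024;
	void *bufs, *rx_ring;
	int sfd;

	sfd = socket(PF_XDP, SOCK_RAW, 0);

	/* 1. Register a page-aligned packet buffer area as the umem. */
	posix_memalign(&bufs, getpagesize(), NUM_FRAMES * FRAME_SIZE);
	mr.addr = (__u64)bufs;
	mr.len = NUM_FRAMES * FRAME_SIZE;
	mr.frame_size = FRAME_SIZE;
	mr.frame_headroom = 0;
	setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

	/* 2. Size the fill/completion and Rx/Tx rings. */
	setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_size, sizeof(int));
	setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_size, sizeof(int));
	setsockopt(sfd, SOL_XDP, XDP_RX_RING, &ring_size, sizeof(int));
	setsockopt(sfd, SOL_XDP, XDP_TX_RING, &ring_size, sizeof(int));

	/* 3. Map the rings into user space; only the Rx ring is shown. */
	rx_ring = mmap(NULL, sizeof(struct xdp_ring) +
		       ring_size * sizeof(struct xdp_desc),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		       sfd, XDP_PGOFF_RX_RING);
	(void)rx_ring;

	/* 4. Bind the socket to one queue of one netdev. */
	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = if_nametoindex(ifname);
	sxdp.sxdp_queue_id = queue_id;
	bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp));

	return sfd;
}

int main(int argc, char **argv)
{
	return argc > 1 ? (xsk_setup_sketch(argv[1], 0) < 0) : 1;
}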
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 30ba0fee36e4..8f59897fbda1 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -39,9 +39,9 @@ class Helper(object):
39 Break down helper function protocol into smaller chunks: return type, 39 Break down helper function protocol into smaller chunks: return type,
40 name, distincts arguments. 40 name, distincts arguments.
41 """ 41 """
42 arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$') 42 arg_re = re.compile('((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
43 res = {} 43 res = {}
44 proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') 44 proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
45 45
46 capture = proto_re.match(self.proto) 46 capture = proto_re.match(self.proto)
47 res['ret_type'] = capture.group(1) 47 res['ret_type'] = capture.group(1)
@@ -87,7 +87,7 @@ class HeaderParser(object):
87 # - Same as above, with "const" and/or "struct" in front of type 87 # - Same as above, with "const" and/or "struct" in front of type
88 # - "..." (undefined number of arguments, for bpf_trace_printk()) 88 # - "..." (undefined number of arguments, for bpf_trace_printk())
89 # There is at least one term ("void"), and at most five arguments. 89 # There is at least one term ("void"), and at most five arguments.
90 p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') 90 p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
91 capture = p.match(self.line) 91 capture = p.match(self.line)
92 if not capture: 92 if not capture:
93 raise NoHelperFound 93 raise NoHelperFound
@@ -95,7 +95,7 @@ class HeaderParser(object):
95 return capture.group(1) 95 return capture.group(1)
96 96
97 def parse_desc(self): 97 def parse_desc(self):
98 p = re.compile('^ \* \tDescription$') 98 p = re.compile(' \* ?(?:\t| {6,8})Description$')
99 capture = p.match(self.line) 99 capture = p.match(self.line)
100 if not capture: 100 if not capture:
101 # Helper can have empty description and we might be parsing another 101 # Helper can have empty description and we might be parsing another
@@ -109,7 +109,7 @@ class HeaderParser(object):
109 if self.line == ' *\n': 109 if self.line == ' *\n':
110 desc += '\n' 110 desc += '\n'
111 else: 111 else:
112 p = re.compile('^ \* \t\t(.*)') 112 p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)')
113 capture = p.match(self.line) 113 capture = p.match(self.line)
114 if capture: 114 if capture:
115 desc += capture.group(1) + '\n' 115 desc += capture.group(1) + '\n'
@@ -118,7 +118,7 @@ class HeaderParser(object):
118 return desc 118 return desc
119 119
120 def parse_ret(self): 120 def parse_ret(self):
121 p = re.compile('^ \* \tReturn$') 121 p = re.compile(' \* ?(?:\t| {6,8})Return$')
122 capture = p.match(self.line) 122 capture = p.match(self.line)
123 if not capture: 123 if not capture:
124 # Helper can have empty retval and we might be parsing another 124 # Helper can have empty retval and we might be parsing another
@@ -132,7 +132,7 @@ class HeaderParser(object):
132 if self.line == ' *\n': 132 if self.line == ' *\n':
133 ret += '\n' 133 ret += '\n'
134 else: 134 else:
135 p = re.compile('^ \* \t\t(.*)') 135 p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)')
136 capture = p.match(self.line) 136 capture = p.match(self.line)
137 if capture: 137 if capture:
138 ret += capture.group(1) + '\n' 138 ret += capture.group(1) + '\n'
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..5c508d26b367 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1471,7 +1471,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
1471 return SECCLASS_QIPCRTR_SOCKET; 1471 return SECCLASS_QIPCRTR_SOCKET;
1472 case PF_SMC: 1472 case PF_SMC:
1473 return SECCLASS_SMC_SOCKET; 1473 return SECCLASS_SMC_SOCKET;
1474#if PF_MAX > 44 1474 case PF_XDP:
1475 return SECCLASS_XDP_SOCKET;
1476#if PF_MAX > 45
1475#error New address family defined, please update this function. 1477#error New address family defined, please update this function.
1476#endif 1478#endif
1477 } 1479 }
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 7f0372426494..bd5fe0d3204a 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -240,9 +240,11 @@ struct security_class_mapping secclass_map[] = {
240 { "manage_subnet", NULL } }, 240 { "manage_subnet", NULL } },
241 { "bpf", 241 { "bpf",
242 {"map_create", "map_read", "map_write", "prog_load", "prog_run"} }, 242 {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
243 { "xdp_socket",
244 { COMMON_SOCK_PERMS, NULL } },
243 { NULL } 245 { NULL }
244 }; 246 };
245 247
246#if PF_MAX > 44 248#if PF_MAX > 45
247#error New address family defined, please update secclass_map. 249#error New address family defined, please update secclass_map.
248#endif 250#endif
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 5f512b14bff9..a6258bc8ec4f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -22,17 +22,19 @@ MAP COMMANDS
22============= 22=============
23 23
24| **bpftool** **map { show | list }** [*MAP*] 24| **bpftool** **map { show | list }** [*MAP*]
25| **bpftool** **map dump** *MAP* 25| **bpftool** **map dump** *MAP*
26| **bpftool** **map update** *MAP* **key** [**hex**] *BYTES* **value** [**hex**] *VALUE* [*UPDATE_FLAGS*] 26| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*]
27| **bpftool** **map lookup** *MAP* **key** [**hex**] *BYTES* 27| **bpftool** **map lookup** *MAP* **key** *DATA*
28| **bpftool** **map getnext** *MAP* [**key** [**hex**] *BYTES*] 28| **bpftool** **map getnext** *MAP* [**key** *DATA*]
29| **bpftool** **map delete** *MAP* **key** [**hex**] *BYTES* 29| **bpftool** **map delete** *MAP* **key** *DATA*
30| **bpftool** **map pin** *MAP* *FILE* 30| **bpftool** **map pin** *MAP* *FILE*
31| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
31| **bpftool** **map help** 32| **bpftool** **map help**
32| 33|
33| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } 34| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
35| *DATA* := { [**hex**] *BYTES* }
34| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } 36| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
35| *VALUE* := { *BYTES* | *MAP* | *PROG* } 37| *VALUE* := { *DATA* | *MAP* | *PROG* }
36| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } 38| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
37 39
38DESCRIPTION 40DESCRIPTION
@@ -48,7 +50,7 @@ DESCRIPTION
48 **bpftool map dump** *MAP* 50 **bpftool map dump** *MAP*
49 Dump all entries in a given *MAP*. 51 Dump all entries in a given *MAP*.
50 52
51 **bpftool map update** *MAP* **key** [**hex**] *BYTES* **value** [**hex**] *VALUE* [*UPDATE_FLAGS*] 53 **bpftool map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*]
52 Update map entry for a given *KEY*. 54 Update map entry for a given *KEY*.
53 55
54 *UPDATE_FLAGS* can be one of: **any** update existing entry 56 *UPDATE_FLAGS* can be one of: **any** update existing entry
@@ -61,13 +63,13 @@ DESCRIPTION
61 the bytes are parsed as decimal values, unless a "0x" prefix 63 the bytes are parsed as decimal values, unless a "0x" prefix
62 (for hexadecimal) or a "0" prefix (for octal) is provided. 64 (for hexadecimal) or a "0" prefix (for octal) is provided.
63 65
64 **bpftool map lookup** *MAP* **key** [**hex**] *BYTES* 66 **bpftool map lookup** *MAP* **key** *DATA*
65 Lookup **key** in the map. 67 Lookup **key** in the map.
66 68
67 **bpftool map getnext** *MAP* [**key** [**hex**] *BYTES*] 69 **bpftool map getnext** *MAP* [**key** *DATA*]
68 Get next key. If *key* is not specified, get first key. 70 Get next key. If *key* is not specified, get first key.
69 71
70 **bpftool map delete** *MAP* **key** [**hex**] *BYTES* 72 **bpftool map delete** *MAP* **key** *DATA*
71 Remove entry from the map. 73 Remove entry from the map.
72 74
73 **bpftool map pin** *MAP* *FILE* 75 **bpftool map pin** *MAP* *FILE*
@@ -75,6 +77,22 @@ DESCRIPTION
75 77
76 Note: *FILE* must be located in *bpffs* mount. 78 Note: *FILE* must be located in *bpffs* mount.
77 79
80 **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
81 Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
82
83 Install perf rings into a perf event array map and dump
84 output of any bpf_perf_event_output() call in the kernel.
85 By default, read the number of CPUs on the system and
86 install a perf ring for each CPU at the corresponding index
87 in the array.
88
89 If **cpu** and **index** are specified, install a perf ring
90 for the given **cpu** at **index** in the array (single ring).
91
92 Note that installing a perf ring into an array will silently
93 replace any existing ring; any other application that installed
94 its rings earlier will stop receiving events.
95
78 **bpftool map help** 96 **bpftool map help**
79 Print short help message. 97 Print short help message.
80 98
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 20689a321ffe..564cb0d9692b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -23,7 +23,7 @@ SYNOPSIS
23 23
24 *MAP-COMMANDS* := 24 *MAP-COMMANDS* :=
25 { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** 25 { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
26 | **pin** | **help** } 26 | **pin** | **event_pipe** | **help** }
27 27
28 *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** 28 *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
29 | **load** | **help** } 29 | **load** | **help** }
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 4e69782c4a79..892dbf095bff 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -39,7 +39,12 @@ CC = gcc
39 39
40CFLAGS += -O2 40CFLAGS += -O2
41CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers 41CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
42CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ 42CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
43 -I$(srctree)/kernel/bpf/ \
44 -I$(srctree)/tools/include \
45 -I$(srctree)/tools/include/uapi \
46 -I$(srctree)/tools/lib/bpf \
47 -I$(srctree)/tools/perf
43CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' 48CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
44LIBS = -lelf -lbfd -lopcodes $(LIBBPF) 49LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
45 50
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 852d84a98acd..b301c9b315f1 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -1,6 +1,6 @@
1# bpftool(8) bash completion -*- shell-script -*- 1# bpftool(8) bash completion -*- shell-script -*-
2# 2#
3# Copyright (C) 2017 Netronome Systems, Inc. 3# Copyright (C) 2017-2018 Netronome Systems, Inc.
4# 4#
5# This software is dual licensed under the GNU General License 5# This software is dual licensed under the GNU General License
6# Version 2, June 1991 as shown in the file COPYING in the top-level 6# Version 2, June 1991 as shown in the file COPYING in the top-level
@@ -79,6 +79,14 @@ _bpftool_get_map_ids()
79 command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) 79 command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
80} 80}
81 81
82_bpftool_get_perf_map_ids()
83{
84 COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \
85 command grep -C2 perf_event_array | \
86 command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
87}
88
89
82_bpftool_get_prog_ids() 90_bpftool_get_prog_ids()
83{ 91{
84 COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ 92 COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
@@ -359,10 +367,34 @@ _bpftool()
359 fi 367 fi
360 return 0 368 return 0
361 ;; 369 ;;
370 event_pipe)
371 case $prev in
372 $command)
373 COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
374 return 0
375 ;;
376 id)
377 _bpftool_get_perf_map_ids
378 return 0
379 ;;
380 cpu)
381 return 0
382 ;;
383 index)
384 return 0
385 ;;
386 *)
387 _bpftool_once_attr 'cpu'
388 _bpftool_once_attr 'index'
389 return 0
390 ;;
391 esac
392 ;;
362 *) 393 *)
363 [[ $prev == $object ]] && \ 394 [[ $prev == $object ]] && \
364 COMPREPLY=( $( compgen -W 'delete dump getnext help \ 395 COMPREPLY=( $( compgen -W 'delete dump getnext help \
365 lookup pin show list update' -- "$cur" ) ) 396 lookup pin event_pipe show list update' -- \
397 "$cur" ) )
366 ;; 398 ;;
367 esac 399 esac
368 ;; 400 ;;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 465995281dcd..32f9e397a6c0 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -33,6 +33,7 @@
33 33
34/* Author: Jakub Kicinski <kubakici@wp.pl> */ 34/* Author: Jakub Kicinski <kubakici@wp.pl> */
35 35
36#include <ctype.h>
36#include <errno.h> 37#include <errno.h>
37#include <fcntl.h> 38#include <fcntl.h>
38#include <fts.h> 39#include <fts.h>
@@ -330,6 +331,16 @@ char *get_fdinfo(int fd, const char *key)
330 return NULL; 331 return NULL;
331} 332}
332 333
334void print_data_json(uint8_t *data, size_t len)
335{
336 unsigned int i;
337
338 jsonw_start_array(json_wtr);
339 for (i = 0; i < len; i++)
340 jsonw_printf(json_wtr, "%d", data[i]);
341 jsonw_end_array(json_wtr);
342}
343
333void print_hex_data_json(uint8_t *data, size_t len) 344void print_hex_data_json(uint8_t *data, size_t len)
334{ 345{
335 unsigned int i; 346 unsigned int i;
@@ -420,6 +431,70 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
420 } 431 }
421} 432}
422 433
434unsigned int get_page_size(void)
435{
436 static int result;
437
438 if (!result)
439 result = getpagesize();
440 return result;
441}
442
443unsigned int get_possible_cpus(void)
444{
445 static unsigned int result;
446 char buf[128];
447 long int n;
448 char *ptr;
449 int fd;
450
451 if (result)
452 return result;
453
454 fd = open("/sys/devices/system/cpu/possible", O_RDONLY);
455 if (fd < 0) {
456 p_err("can't open sysfs possible cpus");
457 exit(-1);
458 }
459
460 n = read(fd, buf, sizeof(buf));
461 if (n < 2) {
462 p_err("can't read sysfs possible cpus");
463 exit(-1);
464 }
465 close(fd);
466
467 if (n == sizeof(buf)) {
468 p_err("read sysfs possible cpus overflow");
469 exit(-1);
470 }
471
472 ptr = buf;
473 n = 0;
474 while (*ptr && *ptr != '\n') {
475 unsigned int a, b;
476
477 if (sscanf(ptr, "%u-%u", &a, &b) == 2) {
478 n += b - a + 1;
479
480 ptr = strchr(ptr, '-') + 1;
481 } else if (sscanf(ptr, "%u", &a) == 1) {
482 n++;
483 } else {
484 assert(0);
485 }
486
487 while (isdigit(*ptr))
488 ptr++;
489 if (*ptr == ',')
490 ptr++;
491 }
492
493 result = n;
494
495 return result;
496}
497
423static char * 498static char *
424ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf) 499ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf)
425{ 500{
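
The get_possible_cpus() helper added to common.c above parses /sys/devices/system/cpu/possible, which is a comma-separated list of CPU ranges such as "0-7" or "0-1,3". A minimal standalone sketch of the same parsing approach, useful for sanity-checking the expected counts outside bpftool (the count_cpus() name and the sample string are illustrative, not part of this patch):

#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Count CPUs in a sysfs-style range list such as "0-7" or "0-1,3". */
static unsigned int count_cpus(const char *s)
{
	unsigned int n = 0;

	while (*s && *s != '\n') {
		unsigned int a, b;

		if (sscanf(s, "%u-%u", &a, &b) == 2) {
			n += b - a + 1;
			s = strchr(s, '-') + 1;
		} else if (sscanf(s, "%u", &a) == 1) {
			n++;
		} else {
			assert(0);
		}
		while (isdigit((unsigned char)*s))
			s++;
		if (*s == ',')
			s++;
	}
	return n;
}

int main(void)
{
	printf("%u\n", count_cpus("0-3,8-11\n"));	/* prints 8 */
	return 0;
}
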
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index b8e9584d6246..6173cd997e7a 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -117,14 +117,19 @@ int do_pin_fd(int fd, const char *name);
117 117
118int do_prog(int argc, char **arg); 118int do_prog(int argc, char **arg);
119int do_map(int argc, char **arg); 119int do_map(int argc, char **arg);
120int do_event_pipe(int argc, char **argv);
120int do_cgroup(int argc, char **arg); 121int do_cgroup(int argc, char **arg);
121 122
122int prog_parse_fd(int *argc, char ***argv); 123int prog_parse_fd(int *argc, char ***argv);
124int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
123 125
124void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, 126void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
125 const char *arch); 127 const char *arch);
128void print_data_json(uint8_t *data, size_t len);
126void print_hex_data_json(uint8_t *data, size_t len); 129void print_hex_data_json(uint8_t *data, size_t len);
127 130
131unsigned int get_page_size(void);
132unsigned int get_possible_cpus(void);
128const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); 133const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
129 134
130#endif 135#endif
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index a6cdb640a0d7..af6766e956ba 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2017 Netronome Systems, Inc. 2 * Copyright (C) 2017-2018 Netronome Systems, Inc.
3 * 3 *
4 * This software is dual licensed under the GNU General License Version 2, 4 * This software is dual licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this 5 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -34,7 +34,6 @@
34/* Author: Jakub Kicinski <kubakici@wp.pl> */ 34/* Author: Jakub Kicinski <kubakici@wp.pl> */
35 35
36#include <assert.h> 36#include <assert.h>
37#include <ctype.h>
38#include <errno.h> 37#include <errno.h>
39#include <fcntl.h> 38#include <fcntl.h>
40#include <stdbool.h> 39#include <stdbool.h>
@@ -69,61 +68,6 @@ static const char * const map_type_name[] = {
69 [BPF_MAP_TYPE_CPUMAP] = "cpumap", 68 [BPF_MAP_TYPE_CPUMAP] = "cpumap",
70}; 69};
71 70
72static unsigned int get_possible_cpus(void)
73{
74 static unsigned int result;
75 char buf[128];
76 long int n;
77 char *ptr;
78 int fd;
79
80 if (result)
81 return result;
82
83 fd = open("/sys/devices/system/cpu/possible", O_RDONLY);
84 if (fd < 0) {
85 p_err("can't open sysfs possible cpus");
86 exit(-1);
87 }
88
89 n = read(fd, buf, sizeof(buf));
90 if (n < 2) {
91 p_err("can't read sysfs possible cpus");
92 exit(-1);
93 }
94 close(fd);
95
96 if (n == sizeof(buf)) {
97 p_err("read sysfs possible cpus overflow");
98 exit(-1);
99 }
100
101 ptr = buf;
102 n = 0;
103 while (*ptr && *ptr != '\n') {
104 unsigned int a, b;
105
106 if (sscanf(ptr, "%u-%u", &a, &b) == 2) {
107 n += b - a + 1;
108
109 ptr = strchr(ptr, '-') + 1;
110 } else if (sscanf(ptr, "%u", &a) == 1) {
111 n++;
112 } else {
113 assert(0);
114 }
115
116 while (isdigit(*ptr))
117 ptr++;
118 if (*ptr == ',')
119 ptr++;
120 }
121
122 result = n;
123
124 return result;
125}
126
127static bool map_is_per_cpu(__u32 type) 71static bool map_is_per_cpu(__u32 type)
128{ 72{
129 return type == BPF_MAP_TYPE_PERCPU_HASH || 73 return type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -186,8 +130,7 @@ static int map_parse_fd(int *argc, char ***argv)
186 return -1; 130 return -1;
187} 131}
188 132
189static int 133int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
190map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
191{ 134{
192 int err; 135 int err;
193 int fd; 136 int fd;
@@ -873,23 +816,25 @@ static int do_help(int argc, char **argv)
873 816
874 fprintf(stderr, 817 fprintf(stderr,
875 "Usage: %s %s { show | list } [MAP]\n" 818 "Usage: %s %s { show | list } [MAP]\n"
876 " %s %s dump MAP\n" 819 " %s %s dump MAP\n"
877 " %s %s update MAP key [hex] BYTES value [hex] VALUE [UPDATE_FLAGS]\n" 820 " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n"
878 " %s %s lookup MAP key [hex] BYTES\n" 821 " %s %s lookup MAP key DATA\n"
879 " %s %s getnext MAP [key [hex] BYTES]\n" 822 " %s %s getnext MAP [key DATA]\n"
880 " %s %s delete MAP key [hex] BYTES\n" 823 " %s %s delete MAP key DATA\n"
881 " %s %s pin MAP FILE\n" 824 " %s %s pin MAP FILE\n"
825 " %s %s event_pipe MAP [cpu N index M]\n"
882 " %s %s help\n" 826 " %s %s help\n"
883 "\n" 827 "\n"
884 " MAP := { id MAP_ID | pinned FILE }\n" 828 " MAP := { id MAP_ID | pinned FILE }\n"
829 " DATA := { [hex] BYTES }\n"
885 " " HELP_SPEC_PROGRAM "\n" 830 " " HELP_SPEC_PROGRAM "\n"
886 " VALUE := { BYTES | MAP | PROG }\n" 831 " VALUE := { DATA | MAP | PROG }\n"
887 " UPDATE_FLAGS := { any | exist | noexist }\n" 832 " UPDATE_FLAGS := { any | exist | noexist }\n"
888 " " HELP_SPEC_OPTIONS "\n" 833 " " HELP_SPEC_OPTIONS "\n"
889 "", 834 "",
890 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], 835 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
891 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], 836 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
892 bin_name, argv[-2], bin_name, argv[-2]); 837 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
893 838
894 return 0; 839 return 0;
895} 840}
@@ -904,6 +849,7 @@ static const struct cmd cmds[] = {
904 { "getnext", do_getnext }, 849 { "getnext", do_getnext },
905 { "delete", do_delete }, 850 { "delete", do_delete },
906 { "pin", do_pin }, 851 { "pin", do_pin },
852 { "event_pipe", do_event_pipe },
907 { 0 } 853 { 0 }
908}; 854};
909 855
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
new file mode 100644
index 000000000000..c5a2ced8552d
--- /dev/null
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -0,0 +1,347 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (C) 2018 Netronome Systems, Inc. */
3/* This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <errno.h>
8#include <fcntl.h>
9#include <libbpf.h>
10#include <poll.h>
11#include <signal.h>
12#include <stdbool.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <time.h>
17#include <unistd.h>
18#include <linux/bpf.h>
19#include <linux/perf_event.h>
20#include <sys/ioctl.h>
21#include <sys/mman.h>
22#include <sys/syscall.h>
23
24#include <bpf.h>
25#include <perf-sys.h>
26
27#include "main.h"
28
29#define MMAP_PAGE_CNT 16
30
31static bool stop;
32
33struct event_ring_info {
34 int fd;
35 int key;
36 unsigned int cpu;
37 void *mem;
38};
39
40struct perf_event_sample {
41 struct perf_event_header header;
42 __u32 size;
43 unsigned char data[];
44};
45
46static void int_exit(int signo)
47{
48 fprintf(stderr, "Stopping...\n");
49 stop = true;
50}
51
52static void
53print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
54{
55 struct {
56 struct perf_event_header header;
57 __u64 id;
58 __u64 lost;
59 } *lost = (void *)e;
60 struct timespec ts;
61
62 if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
63 perror("Can't read clock for timestamp");
64 return;
65 }
66
67 if (json_output) {
68 jsonw_start_object(json_wtr);
69 jsonw_name(json_wtr, "timestamp");
70 jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
71 jsonw_name(json_wtr, "type");
72 jsonw_uint(json_wtr, e->header.type);
73 jsonw_name(json_wtr, "cpu");
74 jsonw_uint(json_wtr, ring->cpu);
75 jsonw_name(json_wtr, "index");
76 jsonw_uint(json_wtr, ring->key);
77 if (e->header.type == PERF_RECORD_SAMPLE) {
78 jsonw_name(json_wtr, "data");
79 print_data_json(e->data, e->size);
80 } else if (e->header.type == PERF_RECORD_LOST) {
81 jsonw_name(json_wtr, "lost");
82 jsonw_start_object(json_wtr);
83 jsonw_name(json_wtr, "id");
84 jsonw_uint(json_wtr, lost->id);
85 jsonw_name(json_wtr, "count");
86 jsonw_uint(json_wtr, lost->lost);
87 jsonw_end_object(json_wtr);
88 }
89 jsonw_end_object(json_wtr);
90 } else {
91 if (e->header.type == PERF_RECORD_SAMPLE) {
92 printf("== @%ld.%ld CPU: %d index: %d =====\n",
93 (long)ts.tv_sec, ts.tv_nsec,
94 ring->cpu, ring->key);
95 fprint_hex(stdout, e->data, e->size, " ");
96 printf("\n");
97 } else if (e->header.type == PERF_RECORD_LOST) {
98 printf("lost %lld events\n", lost->lost);
99 } else {
100 printf("unknown event type=%d size=%d\n",
101 e->header.type, e->header.size);
102 }
103 }
104}
105
106static void
107perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
108{
109 volatile struct perf_event_mmap_page *header = ring->mem;
110 __u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
111 __u64 data_tail = header->data_tail;
112 __u64 data_head = header->data_head;
113 void *base, *begin, *end;
114
115 asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
116 if (data_head == data_tail)
117 return;
118
119 base = ((char *)header) + get_page_size();
120
121 begin = base + data_tail % buffer_size;
122 end = base + data_head % buffer_size;
123
124 while (begin != end) {
125 struct perf_event_sample *e;
126
127 e = begin;
128 if (begin + e->header.size > base + buffer_size) {
129 long len = base + buffer_size - begin;
130
131 if (*buf_len < e->header.size) {
132 free(*buf);
133 *buf = malloc(e->header.size);
134 if (!*buf) {
135 fprintf(stderr,
136 "can't allocate memory");
137 stop = true;
138 return;
139 }
140 *buf_len = e->header.size;
141 }
142
143 memcpy(*buf, begin, len);
144 memcpy(*buf + len, base, e->header.size - len);
145 e = (void *)*buf;
146 begin = base + e->header.size - len;
147 } else if (begin + e->header.size == base + buffer_size) {
148 begin = base;
149 } else {
150 begin += e->header.size;
151 }
152
153 print_bpf_output(ring, e);
154 }
155
156 __sync_synchronize(); /* smp_mb() */
157 header->data_tail = data_head;
158}
159
160static int perf_mmap_size(void)
161{
162 return get_page_size() * (MMAP_PAGE_CNT + 1);
163}
164
165static void *perf_event_mmap(int fd)
166{
167 int mmap_size = perf_mmap_size();
168 void *base;
169
170 base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
171 if (base == MAP_FAILED) {
172 p_err("event mmap failed: %s\n", strerror(errno));
173 return NULL;
174 }
175
176 return base;
177}
178
179static void perf_event_unmap(void *mem)
180{
181 if (munmap(mem, perf_mmap_size()))
182 fprintf(stderr, "Can't unmap ring memory!\n");
183}
184
185static int bpf_perf_event_open(int map_fd, int key, int cpu)
186{
187 struct perf_event_attr attr = {
188 .sample_type = PERF_SAMPLE_RAW,
189 .type = PERF_TYPE_SOFTWARE,
190 .config = PERF_COUNT_SW_BPF_OUTPUT,
191 };
192 int pmu_fd;
193
194 pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
195 if (pmu_fd < 0) {
196 p_err("failed to open perf event %d for CPU %d", key, cpu);
197 return -1;
198 }
199
200 if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) {
201 p_err("failed to update map for event %d for CPU %d", key, cpu);
202 goto err_close;
203 }
204 if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
205 p_err("failed to enable event %d for CPU %d", key, cpu);
206 goto err_close;
207 }
208
209 return pmu_fd;
210
211err_close:
212 close(pmu_fd);
213 return -1;
214}
215
216int do_event_pipe(int argc, char **argv)
217{
218 int i, nfds, map_fd, index = -1, cpu = -1;
219 struct bpf_map_info map_info = {};
220 struct event_ring_info *rings;
221 size_t tmp_buf_sz = 0;
222 void *tmp_buf = NULL;
223 struct pollfd *pfds;
224 __u32 map_info_len;
225 bool do_all = true;
226
227 map_info_len = sizeof(map_info);
228 map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
229 if (map_fd < 0)
230 return -1;
231
232 if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
233 p_err("map is not a perf event array");
234 goto err_close_map;
235 }
236
237 while (argc) {
238 if (argc < 2)
239 BAD_ARG();
240
241 if (is_prefix(*argv, "cpu")) {
242 char *endptr;
243
244 NEXT_ARG();
245 cpu = strtoul(*argv, &endptr, 0);
246 if (*endptr) {
247 p_err("can't parse %s as CPU ID", **argv);
248 goto err_close_map;
249 }
250
251 NEXT_ARG();
252 } else if (is_prefix(*argv, "index")) {
253 char *endptr;
254
255 NEXT_ARG();
256 index = strtoul(*argv, &endptr, 0);
257 if (*endptr) {
258 p_err("can't parse %s as index", **argv);
259 goto err_close_map;
260 }
261
262 NEXT_ARG();
263 } else {
264 BAD_ARG();
265 }
266
267 do_all = false;
268 }
269
270 if (!do_all) {
271 if (index == -1 || cpu == -1) {
272 p_err("cpu and index must be specified together");
273 goto err_close_map;
274 }
275
276 nfds = 1;
277 } else {
278 nfds = min(get_possible_cpus(), map_info.max_entries);
279 cpu = 0;
280 index = 0;
281 }
282
283 rings = calloc(nfds, sizeof(rings[0]));
284 if (!rings)
285 goto err_close_map;
286
287 pfds = calloc(nfds, sizeof(pfds[0]));
288 if (!pfds)
289 goto err_free_rings;
290
291 for (i = 0; i < nfds; i++) {
292 rings[i].cpu = cpu + i;
293 rings[i].key = index + i;
294
295 rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key,
296 rings[i].cpu);
297 if (rings[i].fd < 0)
298 goto err_close_fds_prev;
299
300 rings[i].mem = perf_event_mmap(rings[i].fd);
301 if (!rings[i].mem)
302 goto err_close_fds_current;
303
304 pfds[i].fd = rings[i].fd;
305 pfds[i].events = POLLIN;
306 }
307
308 signal(SIGINT, int_exit);
309 signal(SIGHUP, int_exit);
310 signal(SIGTERM, int_exit);
311
312 if (json_output)
313 jsonw_start_array(json_wtr);
314
315 while (!stop) {
316 poll(pfds, nfds, 200);
317 for (i = 0; i < nfds; i++)
318 perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz);
319 }
320 free(tmp_buf);
321
322 if (json_output)
323 jsonw_end_array(json_wtr);
324
325 for (i = 0; i < nfds; i++) {
326 perf_event_unmap(rings[i].mem);
327 close(rings[i].fd);
328 }
329 free(pfds);
330 free(rings);
331 close(map_fd);
332
333 return 0;
334
335err_close_fds_prev:
336 while (i--) {
337 perf_event_unmap(rings[i].mem);
338err_close_fds_current:
339 close(rings[i].fd);
340 }
341 free(pfds);
342err_free_rings:
343 free(rings);
344err_close_map:
345 close(map_fd);
346 return -1;
347}
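
do_event_pipe() above only consumes events; a BPF program must first push records into a BPF_MAP_TYPE_PERF_EVENT_ARRAY with bpf_perf_event_output(). A minimal producer sketch, assuming the selftests' bpf_helpers.h wrappers (the map name, tracepoint and payload are illustrative, not part of this patch). Once the program is loaded and attached, the records can be read with "bpftool map event_pipe id <MAP_ID>", or from a single ring with the "cpu N index M" arguments:

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") my_perf_map = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(__u32),
	.max_entries = 64,		/* >= number of possible CPUs */
};

SEC("tracepoint/syscalls/sys_enter_write")
int emit_event(void *ctx)
{
	__u64 pid_tgid = bpf_get_current_pid_tgid();

	/* BPF_F_CURRENT_CPU selects the per-CPU ring for the current CPU */
	bpf_perf_event_output(ctx, &my_perf_map, BPF_F_CURRENT_CPU,
			      &pid_tgid, sizeof(pid_tgid));
	return 0;
}

char _license[] SEC("license") = "GPL";
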
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index e71a0a11afde..9bdfdf2d3fbe 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -96,7 +96,10 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
96 return; 96 return;
97 } 97 }
98 98
99 strftime(buf, size, "%b %d/%H:%M", &load_tm); 99 if (json_output)
100 strftime(buf, size, "%s", &load_tm);
101 else
102 strftime(buf, size, "%FT%T%z", &load_tm);
100} 103}
101 104
102static int prog_fd_by_tag(unsigned char *tag) 105static int prog_fd_by_tag(unsigned char *tag)
@@ -245,7 +248,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
245 print_boot_time(info->load_time, buf, sizeof(buf)); 248 print_boot_time(info->load_time, buf, sizeof(buf));
246 249
247 /* Piggy back on load_time, since 0 uid is a valid one */ 250 /* Piggy back on load_time, since 0 uid is a valid one */
248 jsonw_string_field(json_wtr, "loaded_at", buf); 251 jsonw_name(json_wtr, "loaded_at");
252 jsonw_printf(json_wtr, "%s", buf);
249 jsonw_uint_field(json_wtr, "uid", info->created_by_uid); 253 jsonw_uint_field(json_wtr, "uid", info->created_by_uid);
250 } 254 }
251 255
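
The hunk above changes how a program's load time is printed: plain output now uses ISO 8601 via "%FT%T%z", while JSON output uses "%s" (seconds since the epoch, a glibc extension) and emits it as a bare number through jsonw_printf(). A small sketch of the two formats, assuming a glibc strftime():

#include <stdio.h>
#include <time.h>

int main(void)
{
	char buf[64];
	time_t now = time(NULL);
	struct tm tm;

	localtime_r(&now, &tm);
	strftime(buf, sizeof(buf), "%FT%T%z", &tm);
	printf("plain: %s\n", buf);		/* e.g. 2018-05-07T23:35:08-0400 */

	strftime(buf, sizeof(buf), "%s", &tm);	/* glibc extension */
	printf("json:  %s\n", buf);		/* e.g. 1525750508 */
	return 0;
}
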
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index da77a9388947..83a95ae388dd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -828,12 +828,12 @@ union bpf_attr {
828 * 828 *
829 * Also, be aware that the newer helper 829 * Also, be aware that the newer helper
830 * **bpf_perf_event_read_value**\ () is recommended over 830 * **bpf_perf_event_read_value**\ () is recommended over
831 * **bpf_perf_event_read*\ () in general. The latter has some ABI 831 * **bpf_perf_event_read**\ () in general. The latter has some ABI
832 * quirks where error and counter value are used as a return code 832 * quirks where error and counter value are used as a return code
833 * (which is wrong to do since ranges may overlap). This issue is 833 * (which is wrong to do since ranges may overlap). This issue is
834 * fixed with bpf_perf_event_read_value(), which at the same time 834 * fixed with **bpf_perf_event_read_value**\ (), which at the same
835 * provides more features over the **bpf_perf_event_read**\ () 835 * time provides more features over the **bpf_perf_event_read**\
836 * interface. Please refer to the description of 836 * () interface. Please refer to the description of
837 * **bpf_perf_event_read_value**\ () for details. 837 * **bpf_perf_event_read_value**\ () for details.
838 * Return 838 * Return
839 * The value of the perf event counter read from the map, or a 839 * The value of the perf event counter read from the map, or a
@@ -1361,7 +1361,7 @@ union bpf_attr {
1361 * Return 1361 * Return
1362 * 0 1362 * 0
1363 * 1363 *
1364 * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) 1364 * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
1365 * Description 1365 * Description
1366 * Emulate a call to **setsockopt()** on the socket associated to 1366 * Emulate a call to **setsockopt()** on the socket associated to
1367 * *bpf_socket*, which must be a full socket. The *level* at 1367 * *bpf_socket*, which must be a full socket. The *level* at
@@ -1435,7 +1435,7 @@ union bpf_attr {
1435 * Return 1435 * Return
1436 * **SK_PASS** on success, or **SK_DROP** on error. 1436 * **SK_PASS** on success, or **SK_DROP** on error.
1437 * 1437 *
1438 * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) 1438 * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
1439 * Description 1439 * Description
1440 * Add an entry to, or update a *map* referencing sockets. The 1440 * Add an entry to, or update a *map* referencing sockets. The
1441 * *skops* is used as a new value for the entry associated to 1441 * *skops* is used as a new value for the entry associated to
@@ -1533,7 +1533,7 @@ union bpf_attr {
1533 * Return 1533 * Return
1534 * 0 on success, or a negative error in case of failure. 1534 * 0 on success, or a negative error in case of failure.
1535 * 1535 *
1536 * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) 1536 * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
1537 * Description 1537 * Description
 1538 * For an eBPF program attached to a perf event, retrieve the 1538 * For an eBPF program attached to a perf event, retrieve the
1539 * value of the event counter associated to *ctx* and store it in 1539 * value of the event counter associated to *ctx* and store it in
@@ -1544,7 +1544,7 @@ union bpf_attr {
1544 * Return 1544 * Return
1545 * 0 on success, or a negative error in case of failure. 1545 * 0 on success, or a negative error in case of failure.
1546 * 1546 *
1547 * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) 1547 * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
1548 * Description 1548 * Description
1549 * Emulate a call to **getsockopt()** on the socket associated to 1549 * Emulate a call to **getsockopt()** on the socket associated to
1550 * *bpf_socket*, which must be a full socket. The *level* at 1550 * *bpf_socket*, which must be a full socket. The *level* at
@@ -1588,7 +1588,7 @@ union bpf_attr {
1588 * Return 1588 * Return
1589 * 0 1589 * 0
1590 * 1590 *
1591 * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) 1591 * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
1592 * Description 1592 * Description
1593 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field 1593 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field
1594 * for the full TCP socket associated to *bpf_sock_ops* to 1594 * for the full TCP socket associated to *bpf_sock_ops* to
@@ -1721,7 +1721,7 @@ union bpf_attr {
1721 * Return 1721 * Return
1722 * 0 on success, or a negative error in case of failure. 1722 * 0 on success, or a negative error in case of failure.
1723 * 1723 *
1724 * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) 1724 * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
1725 * Description 1725 * Description
1726 * Bind the socket associated to *ctx* to the address pointed by 1726 * Bind the socket associated to *ctx* to the address pointed by
1727 * *addr*, of length *addr_len*. This allows for making outgoing 1727 * *addr*, of length *addr_len*. This allows for making outgoing
@@ -1767,6 +1767,64 @@ union bpf_attr {
1767 * **CONFIG_XFRM** configuration option. 1767 * **CONFIG_XFRM** configuration option.
1768 * Return 1768 * Return
1769 * 0 on success, or a negative error in case of failure. 1769 * 0 on success, or a negative error in case of failure.
1770 *
1771 * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
1772 * Description
1773 * Return a user or a kernel stack in bpf program provided buffer.
1774 * To achieve this, the helper needs *ctx*, which is a pointer
1775 * to the context on which the tracing program is executed.
1776 * To store the stacktrace, the bpf program provides *buf* with
1777 * a nonnegative *size*.
1778 *
1779 * The last argument, *flags*, holds the number of stack frames to
1780 * skip (from 0 to 255), masked with
1781 * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
1782 * the following flags:
1783 *
1784 * **BPF_F_USER_STACK**
1785 * Collect a user space stack instead of a kernel stack.
1786 * **BPF_F_USER_BUILD_ID**
1787 * Collect buildid+offset instead of ips for user stack,
1788 * only valid if **BPF_F_USER_STACK** is also specified.
1789 *
1790 * **bpf_get_stack**\ () can collect up to
1791 * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
 1792 * to a sufficiently large buffer size. Note that
1793 * this limit can be controlled with the **sysctl** program, and
1794 * that it should be manually increased in order to profile long
1795 * user stacks (such as stacks for Java programs). To do so, use:
1796 *
1797 * ::
1798 *
1799 * # sysctl kernel.perf_event_max_stack=<new value>
1800 *
1801 * Return
1802 * a non-negative value equal to or less than size on success, or
1803 * a negative error in case of failure.
1804 *
1805 * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
1806 * Description
1807 * This helper is similar to **bpf_skb_load_bytes**\ () in that
1808 * it provides an easy way to load *len* bytes from *offset*
1809 * from the packet associated to *skb*, into the buffer pointed
1810 * by *to*. The difference to **bpf_skb_load_bytes**\ () is that
1811 * a fifth argument *start_header* exists in order to select a
1812 * base offset to start from. *start_header* can be one of:
1813 *
1814 * **BPF_HDR_START_MAC**
1815 * Base offset to load data from is *skb*'s mac header.
1816 * **BPF_HDR_START_NET**
1817 * Base offset to load data from is *skb*'s network header.
1818 *
1819 * In general, "direct packet access" is the preferred method to
1820 * access packet data, however, this helper is in particular useful
1821 * in socket filters where *skb*\ **->data** does not always point
1822 * to the start of the mac header and where "direct packet access"
1823 * is not available.
1824 *
1825 * Return
1826 * 0 on success, or a negative error in case of failure.
1827 *
1770 */ 1828 */
1771#define __BPF_FUNC_MAPPER(FN) \ 1829#define __BPF_FUNC_MAPPER(FN) \
1772 FN(unspec), \ 1830 FN(unspec), \
@@ -1835,7 +1893,9 @@ union bpf_attr {
1835 FN(msg_pull_data), \ 1893 FN(msg_pull_data), \
1836 FN(bind), \ 1894 FN(bind), \
1837 FN(xdp_adjust_tail), \ 1895 FN(xdp_adjust_tail), \
1838 FN(skb_get_xfrm_state), 1896 FN(skb_get_xfrm_state), \
1897 FN(get_stack), \
1898 FN(skb_load_bytes_relative),
1839 1899
1840/* integer value in 'imm' field of BPF_CALL instruction selects which helper 1900/* integer value in 'imm' field of BPF_CALL instruction selects which helper
1841 * function eBPF program intends to call 1901 * function eBPF program intends to call
@@ -1869,11 +1929,14 @@ enum bpf_func_id {
1869/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ 1929/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
1870#define BPF_F_TUNINFO_IPV6 (1ULL << 0) 1930#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
1871 1931
1872/* BPF_FUNC_get_stackid flags. */ 1932/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
1873#define BPF_F_SKIP_FIELD_MASK 0xffULL 1933#define BPF_F_SKIP_FIELD_MASK 0xffULL
1874#define BPF_F_USER_STACK (1ULL << 8) 1934#define BPF_F_USER_STACK (1ULL << 8)
1935/* flags used by BPF_FUNC_get_stackid only. */
1875#define BPF_F_FAST_STACK_CMP (1ULL << 9) 1936#define BPF_F_FAST_STACK_CMP (1ULL << 9)
1876#define BPF_F_REUSE_STACKID (1ULL << 10) 1937#define BPF_F_REUSE_STACKID (1ULL << 10)
1938/* flags used by BPF_FUNC_get_stack only. */
1939#define BPF_F_USER_BUILD_ID (1ULL << 11)
1877 1940
1878/* BPF_FUNC_skb_set_tunnel_key flags. */ 1941/* BPF_FUNC_skb_set_tunnel_key flags. */
1879#define BPF_F_ZERO_CSUM_TX (1ULL << 1) 1942#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
@@ -1893,6 +1956,12 @@ enum bpf_adj_room_mode {
1893 BPF_ADJ_ROOM_NET, 1956 BPF_ADJ_ROOM_NET,
1894}; 1957};
1895 1958
1959/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
1960enum bpf_hdr_start_off {
1961 BPF_HDR_START_MAC,
1962 BPF_HDR_START_NET,
1963};
1964
1896/* user accessible mirror of in-kernel sk_buff. 1965/* user accessible mirror of in-kernel sk_buff.
1897 * new fields can only be added to the end of this structure 1966 * new fields can only be added to the end of this structure
1898 */ 1967 */
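
The bpf_skb_load_bytes_relative() description added above is easiest to see in a socket filter, where skb->data may not point at the MAC header. A minimal sketch, assuming a hand-declared helper wrapper (the selftests' bpf_helpers.h does not ship one in this patch) and the usual SEC()/license conventions; the program name and section are illustrative:

#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/ip.h>
#include "bpf_helpers.h"

/* Hand-rolled wrapper for the new helper; the signature follows the
 * documentation added above.
 */
static int (*bpf_skb_load_bytes_relative)(void *ctx, __u32 offset, void *to,
					  __u32 len, __u32 start_header) =
	(void *) BPF_FUNC_skb_load_bytes_relative;

SEC("socket")
int keep_only_tcp(struct __sk_buff *skb)
{
	struct iphdr iph;

	/* Load the IP header relative to the network header, regardless
	 * of where skb->data currently points.
	 */
	if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph),
					BPF_HDR_START_NET))
		return 0;				/* parse error: drop */

	return iph.protocol == IPPROTO_TCP ? -1 : 0;	/* -1 keeps the packet */
}

char _license[] SEC("license") = "GPL";
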
diff --git a/tools/include/uapi/linux/erspan.h b/tools/include/uapi/linux/erspan.h
new file mode 100644
index 000000000000..841573019ae1
--- /dev/null
+++ b/tools/include/uapi/linux/erspan.h
@@ -0,0 +1,52 @@
1/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2/*
3 * ERSPAN Tunnel Metadata
4 *
5 * Copyright (c) 2018 VMware
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2
9 * as published by the Free Software Foundation.
10 *
11 * Userspace API for metadata mode ERSPAN tunnel
12 */
13#ifndef _UAPI_ERSPAN_H
14#define _UAPI_ERSPAN_H
15
16#include <linux/types.h> /* For __beXX in userspace */
17#include <asm/byteorder.h>
18
19/* ERSPAN version 2 metadata header */
20struct erspan_md2 {
21 __be32 timestamp;
22 __be16 sgt; /* security group tag */
23#if defined(__LITTLE_ENDIAN_BITFIELD)
24 __u8 hwid_upper:2,
25 ft:5,
26 p:1;
27 __u8 o:1,
28 gra:2,
29 dir:1,
30 hwid:4;
31#elif defined(__BIG_ENDIAN_BITFIELD)
32 __u8 p:1,
33 ft:5,
34 hwid_upper:2;
35 __u8 hwid:4,
36 dir:1,
37 gra:2,
38 o:1;
39#else
40#error "Please fix <asm/byteorder.h>"
41#endif
42};
43
44struct erspan_metadata {
45 int version;
46 union {
47 __be32 index; /* Version 1 (type II)*/
48 struct erspan_md2 md2; /* Version 2 (type III) */
49 } u;
50};
51
52#endif /* _UAPI_ERSPAN_H */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b64a7a39cbc8..9d762184b805 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
32 test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ 32 test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
33 sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ 33 sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
34 sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \ 34 sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
35 test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o 35 test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
36 test_get_stack_rawtp.o
36 37
37# Order correspond to 'make run_tests' order 38# Order correspond to 'make run_tests' order
38TEST_PROGS := test_kmod.sh \ 39TEST_PROGS := test_kmod.sh \
@@ -58,6 +59,7 @@ $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
58$(OUTPUT)/test_sock: cgroup_helpers.c 59$(OUTPUT)/test_sock: cgroup_helpers.c
59$(OUTPUT)/test_sock_addr: cgroup_helpers.c 60$(OUTPUT)/test_sock_addr: cgroup_helpers.c
60$(OUTPUT)/test_sockmap: cgroup_helpers.c 61$(OUTPUT)/test_sockmap: cgroup_helpers.c
62$(OUTPUT)/test_progs: trace_helpers.c
61 63
62.PHONY: force 64.PHONY: force
63 65
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 69d7b918e66a..265f8e0e8ada 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -101,6 +101,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
101static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, 101static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
102 int size, int flags) = 102 int size, int flags) =
103 (void *) BPF_FUNC_skb_get_xfrm_state; 103 (void *) BPF_FUNC_skb_get_xfrm_state;
104static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
105 (void *) BPF_FUNC_get_stack;
104 106
105/* llvm builtin functions that eBPF C program may use to 107/* llvm builtin functions that eBPF C program may use to
106 * emit BPF_LD_ABS and BPF_LD_IND instructions 108 * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
new file mode 100644
index 000000000000..f6d9f238e00a
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
@@ -0,0 +1,102 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/bpf.h>
4#include "bpf_helpers.h"
5
6/* Permit pretty deep stack traces */
7#define MAX_STACK_RAWTP 100
8struct stack_trace_t {
9 int pid;
10 int kern_stack_size;
11 int user_stack_size;
12 int user_stack_buildid_size;
13 __u64 kern_stack[MAX_STACK_RAWTP];
14 __u64 user_stack[MAX_STACK_RAWTP];
15 struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
16};
17
18struct bpf_map_def SEC("maps") perfmap = {
19 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
20 .key_size = sizeof(int),
21 .value_size = sizeof(__u32),
22 .max_entries = 2,
23};
24
25struct bpf_map_def SEC("maps") stackdata_map = {
26 .type = BPF_MAP_TYPE_PERCPU_ARRAY,
27 .key_size = sizeof(__u32),
28 .value_size = sizeof(struct stack_trace_t),
29 .max_entries = 1,
30};
31
32/* Allocate per-cpu space twice the needed. For the code below
33 * usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
34 * if (usize < 0)
35 * return 0;
36 * ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
37 *
38 * If we have value_size = MAX_STACK_RAWTP * sizeof(__u64),
39 * verifier will complain that access "raw_data + usize"
40 * with size "max_len - usize" may be out of bound.
41 * The maximum "raw_data + usize" is "raw_data + max_len"
42 * and the maximum "max_len - usize" is "max_len", verifier
43 * concludes that the maximum buffer access range is
44 * "raw_data[0...max_len * 2 - 1]" and hence reject the program.
45 *
46 * Doubling the to-be-used max buffer size can fix this verifier
47 * issue and avoid complicated C programming massaging.
48 * This is an acceptable workaround since there is one entry here.
49 */
50struct bpf_map_def SEC("maps") rawdata_map = {
51 .type = BPF_MAP_TYPE_PERCPU_ARRAY,
52 .key_size = sizeof(__u32),
53 .value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2,
54 .max_entries = 1,
55};
56
57SEC("tracepoint/raw_syscalls/sys_enter")
58int bpf_prog1(void *ctx)
59{
60 int max_len, max_buildid_len, usize, ksize, total_size;
61 struct stack_trace_t *data;
62 void *raw_data;
63 __u32 key = 0;
64
65 data = bpf_map_lookup_elem(&stackdata_map, &key);
66 if (!data)
67 return 0;
68
69 max_len = MAX_STACK_RAWTP * sizeof(__u64);
70 max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id);
71 data->pid = bpf_get_current_pid_tgid();
72 data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
73 max_len, 0);
74 data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
75 BPF_F_USER_STACK);
76 data->user_stack_buildid_size = bpf_get_stack(
77 ctx, data->user_stack_buildid, max_buildid_len,
78 BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
79 bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
80
81 /* write both kernel and user stacks to the same buffer */
82 raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
83 if (!raw_data)
84 return 0;
85
86 usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
87 if (usize < 0)
88 return 0;
89
90 ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
91 if (ksize < 0)
92 return 0;
93
94 total_size = usize + ksize;
95 if (total_size > 0 && total_size <= max_len)
96 bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
97
98 return 0;
99}
100
101char _license[] SEC("license") = "GPL";
102__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
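
To make the rawdata_map sizing comment above concrete: with MAX_STACK_RAWTP = 100, max_len = 100 * sizeof(__u64) = 800 bytes. After the first bpf_get_stack() call the verifier only knows 0 <= usize <= max_len (the program has already rejected negative returns), so for the second call it assumes the worst case of start = raw_data + max_len combined with size = max_len, i.e. an access range ending at raw_data + 2 * max_len = raw_data + 1600. Declaring value_size as MAX_STACK_RAWTP * sizeof(__u64) * 2 = 1600 bytes is exactly what satisfies that bound.
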
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index fac581f1c57f..ed197eef1cfc 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -38,8 +38,10 @@ typedef __u16 __sum16;
38#include "bpf_util.h" 38#include "bpf_util.h"
39#include "bpf_endian.h" 39#include "bpf_endian.h"
40#include "bpf_rlimit.h" 40#include "bpf_rlimit.h"
41#include "trace_helpers.h"
41 42
42static int error_cnt, pass_cnt; 43static int error_cnt, pass_cnt;
44static bool jit_enabled;
43 45
44#define MAGIC_BYTES 123 46#define MAGIC_BYTES 123
45 47
@@ -391,13 +393,30 @@ static inline __u64 ptr_to_u64(const void *ptr)
391 return (__u64) (unsigned long) ptr; 393 return (__u64) (unsigned long) ptr;
392} 394}
393 395
396static bool is_jit_enabled(void)
397{
398 const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
399 bool enabled = false;
400 int sysctl_fd;
401
402 sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
403 if (sysctl_fd != -1) {
404 char tmpc;
405
406 if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
407 enabled = (tmpc != '0');
408 close(sysctl_fd);
409 }
410
411 return enabled;
412}
413
394static void test_bpf_obj_id(void) 414static void test_bpf_obj_id(void)
395{ 415{
396 const __u64 array_magic_value = 0xfaceb00c; 416 const __u64 array_magic_value = 0xfaceb00c;
397 const __u32 array_key = 0; 417 const __u32 array_key = 0;
398 const int nr_iters = 2; 418 const int nr_iters = 2;
399 const char *file = "./test_obj_id.o"; 419 const char *file = "./test_obj_id.o";
400 const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
401 const char *expected_prog_name = "test_obj_id"; 420 const char *expected_prog_name = "test_obj_id";
402 const char *expected_map_name = "test_map_id"; 421 const char *expected_map_name = "test_map_id";
403 const __u64 nsec_per_sec = 1000000000; 422 const __u64 nsec_per_sec = 1000000000;
@@ -414,20 +433,11 @@ static void test_bpf_obj_id(void)
414 char jited_insns[128], xlated_insns[128], zeros[128]; 433 char jited_insns[128], xlated_insns[128], zeros[128];
415 __u32 i, next_id, info_len, nr_id_found, duration = 0; 434 __u32 i, next_id, info_len, nr_id_found, duration = 0;
416 struct timespec real_time_ts, boot_time_ts; 435 struct timespec real_time_ts, boot_time_ts;
417 int sysctl_fd, jit_enabled = 0, err = 0; 436 int err = 0;
418 __u64 array_value; 437 __u64 array_value;
419 uid_t my_uid = getuid(); 438 uid_t my_uid = getuid();
420 time_t now, load_time; 439 time_t now, load_time;
421 440
422 sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
423 if (sysctl_fd != -1) {
424 char tmpc;
425
426 if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
427 jit_enabled = (tmpc != '0');
428 close(sysctl_fd);
429 }
430
431 err = bpf_prog_get_fd_by_id(0); 441 err = bpf_prog_get_fd_by_id(0);
432 CHECK(err >= 0 || errno != ENOENT, 442 CHECK(err >= 0 || errno != ENOENT,
433 "get-fd-by-notexist-prog-id", "err %d errno %d\n", err, errno); 443 "get-fd-by-notexist-prog-id", "err %d errno %d\n", err, errno);
@@ -896,11 +906,47 @@ static int compare_map_keys(int map1_fd, int map2_fd)
896 return 0; 906 return 0;
897} 907}
898 908
909static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
910{
911 __u32 key, next_key, *cur_key_p, *next_key_p;
912 char *val_buf1, *val_buf2;
913 int i, err = 0;
914
915 val_buf1 = malloc(stack_trace_len);
916 val_buf2 = malloc(stack_trace_len);
917 cur_key_p = NULL;
918 next_key_p = &key;
919 while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
920 err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
921 if (err)
922 goto out;
923 err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
924 if (err)
925 goto out;
926 for (i = 0; i < stack_trace_len; i++) {
927 if (val_buf1[i] != val_buf2[i]) {
928 err = -1;
929 goto out;
930 }
931 }
932 key = *next_key_p;
933 cur_key_p = &key;
934 next_key_p = &next_key;
935 }
936 if (errno != ENOENT)
937 err = -1;
938
939out:
940 free(val_buf1);
941 free(val_buf2);
942 return err;
943}
944
899static void test_stacktrace_map() 945static void test_stacktrace_map()
900{ 946{
901 int control_map_fd, stackid_hmap_fd, stackmap_fd; 947 int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
902 const char *file = "./test_stacktrace_map.o"; 948 const char *file = "./test_stacktrace_map.o";
903 int bytes, efd, err, pmu_fd, prog_fd; 949 int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
904 struct perf_event_attr attr = {}; 950 struct perf_event_attr attr = {};
905 __u32 key, val, duration = 0; 951 __u32 key, val, duration = 0;
906 struct bpf_object *obj; 952 struct bpf_object *obj;
@@ -956,6 +1002,10 @@ static void test_stacktrace_map()
956 if (stackmap_fd < 0) 1002 if (stackmap_fd < 0)
957 goto disable_pmu; 1003 goto disable_pmu;
958 1004
1005 stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
1006 if (stack_amap_fd < 0)
1007 goto disable_pmu;
1008
959 /* give some time for bpf program run */ 1009 /* give some time for bpf program run */
960 sleep(1); 1010 sleep(1);
961 1011
@@ -977,6 +1027,12 @@ static void test_stacktrace_map()
977 "err %d errno %d\n", err, errno)) 1027 "err %d errno %d\n", err, errno))
978 goto disable_pmu_noerr; 1028 goto disable_pmu_noerr;
979 1029
1030 stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
1031 err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
1032 if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
1033 "err %d errno %d\n", err, errno))
1034 goto disable_pmu_noerr;
1035
980 goto disable_pmu_noerr; 1036 goto disable_pmu_noerr;
981disable_pmu: 1037disable_pmu:
982 error_cnt++; 1038 error_cnt++;
@@ -1070,9 +1126,9 @@ err:
1070 1126
1071static void test_stacktrace_build_id(void) 1127static void test_stacktrace_build_id(void)
1072{ 1128{
1073 int control_map_fd, stackid_hmap_fd, stackmap_fd; 1129 int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
1074 const char *file = "./test_stacktrace_build_id.o"; 1130 const char *file = "./test_stacktrace_build_id.o";
1075 int bytes, efd, err, pmu_fd, prog_fd; 1131 int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
1076 struct perf_event_attr attr = {}; 1132 struct perf_event_attr attr = {};
1077 __u32 key, previous_key, val, duration = 0; 1133 __u32 key, previous_key, val, duration = 0;
1078 struct bpf_object *obj; 1134 struct bpf_object *obj;
@@ -1137,6 +1193,11 @@ static void test_stacktrace_build_id(void)
1137 err, errno)) 1193 err, errno))
1138 goto disable_pmu; 1194 goto disable_pmu;
1139 1195
1196 stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
1197 if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
1198 "err %d errno %d\n", err, errno))
1199 goto disable_pmu;
1200
1140 assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null") 1201 assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
1141 == 0); 1202 == 0);
1142 assert(system("./urandom_read") == 0); 1203 assert(system("./urandom_read") == 0);
@@ -1188,8 +1249,15 @@ static void test_stacktrace_build_id(void)
1188 previous_key = key; 1249 previous_key = key;
1189 } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0); 1250 } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
1190 1251
1191 CHECK(build_id_matches < 1, "build id match", 1252 if (CHECK(build_id_matches < 1, "build id match",
1192 "Didn't find expected build ID from the map\n"); 1253 "Didn't find expected build ID from the map\n"))
1254 goto disable_pmu;
1255
1256 stack_trace_len = PERF_MAX_STACK_DEPTH
1257 * sizeof(struct bpf_stack_build_id);
1258 err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
1259 CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
1260 "err %d errno %d\n", err, errno);
1193 1261
1194disable_pmu: 1262disable_pmu:
1195 ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); 1263 ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
@@ -1204,8 +1272,147 @@ out:
1204 return; 1272 return;
1205} 1273}
1206 1274
1275#define MAX_CNT_RAWTP 10ull
1276#define MAX_STACK_RAWTP 100
1277struct get_stack_trace_t {
1278 int pid;
1279 int kern_stack_size;
1280 int user_stack_size;
1281 int user_stack_buildid_size;
1282 __u64 kern_stack[MAX_STACK_RAWTP];
1283 __u64 user_stack[MAX_STACK_RAWTP];
1284 struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
1285};
1286
1287static int get_stack_print_output(void *data, int size)
1288{
1289 bool good_kern_stack = false, good_user_stack = false;
1290 const char *nonjit_func = "___bpf_prog_run";
1291 struct get_stack_trace_t *e = data;
1292 int i, num_stack;
1293 static __u64 cnt;
1294 struct ksym *ks;
1295
1296 cnt++;
1297
1298 if (size < sizeof(struct get_stack_trace_t)) {
1299 __u64 *raw_data = data;
1300 bool found = false;
1301
1302 num_stack = size / sizeof(__u64);
1303 /* If jit is enabled, we do not have a good way to
1304 * verify the sanity of the kernel stack. So we
1305 * just assume it is good if the stack is not empty.
1306 * This could be improved in the future.
1307 */
1308 if (jit_enabled) {
1309 found = num_stack > 0;
1310 } else {
1311 for (i = 0; i < num_stack; i++) {
1312 ks = ksym_search(raw_data[i]);
1313 if (strcmp(ks->name, nonjit_func) == 0) {
1314 found = true;
1315 break;
1316 }
1317 }
1318 }
1319 if (found) {
1320 good_kern_stack = true;
1321 good_user_stack = true;
1322 }
1323 } else {
1324 num_stack = e->kern_stack_size / sizeof(__u64);
1325 if (jit_enabled) {
1326 good_kern_stack = num_stack > 0;
1327 } else {
1328 for (i = 0; i < num_stack; i++) {
1329 ks = ksym_search(e->kern_stack[i]);
1330 if (strcmp(ks->name, nonjit_func) == 0) {
1331 good_kern_stack = true;
1332 break;
1333 }
1334 }
1335 }
1336 if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0)
1337 good_user_stack = true;
1338 }
1339 if (!good_kern_stack || !good_user_stack)
1340 return PERF_EVENT_ERROR;
1341
1342 if (cnt == MAX_CNT_RAWTP)
1343 return PERF_EVENT_DONE;
1344
1345 return PERF_EVENT_CONT;
1346}
1347
1348static void test_get_stack_raw_tp(void)
1349{
1350 const char *file = "./test_get_stack_rawtp.o";
1351 int i, efd, err, prog_fd, pmu_fd, perfmap_fd;
1352 struct perf_event_attr attr = {};
1353 struct timespec tv = {0, 10};
1354 __u32 key = 0, duration = 0;
1355 struct bpf_object *obj;
1356
1357 err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
1358 if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
1359 return;
1360
1361 efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
1362 if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
1363 goto close_prog;
1364
1365 perfmap_fd = bpf_find_map(__func__, obj, "perfmap");
1366 if (CHECK(perfmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
1367 perfmap_fd, errno))
1368 goto close_prog;
1369
1370 err = load_kallsyms();
1371 if (CHECK(err < 0, "load_kallsyms", "err %d errno %d\n", err, errno))
1372 goto close_prog;
1373
1374 attr.sample_type = PERF_SAMPLE_RAW;
1375 attr.type = PERF_TYPE_SOFTWARE;
1376 attr.config = PERF_COUNT_SW_BPF_OUTPUT;
1377 pmu_fd = syscall(__NR_perf_event_open, &attr, getpid()/*pid*/, -1/*cpu*/,
1378 -1/*group_fd*/, 0);
1379 if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
1380 errno))
1381 goto close_prog;
1382
1383 err = bpf_map_update_elem(perfmap_fd, &key, &pmu_fd, BPF_ANY);
1384 if (CHECK(err < 0, "bpf_map_update_elem", "err %d errno %d\n", err,
1385 errno))
1386 goto close_prog;
1387
1388 err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
1389 if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
1390 err, errno))
1391 goto close_prog;
1392
1393 err = perf_event_mmap(pmu_fd);
1394 if (CHECK(err < 0, "perf_event_mmap", "err %d errno %d\n", err, errno))
1395 goto close_prog;
1396
1397 /* trigger some syscall action */
1398 for (i = 0; i < MAX_CNT_RAWTP; i++)
1399 nanosleep(&tv, NULL);
1400
1401 err = perf_event_poller(pmu_fd, get_stack_print_output);
1402 if (CHECK(err < 0, "perf_event_poller", "err %d errno %d\n", err, errno))
1403 goto close_prog;
1404
1405 goto close_prog_noerr;
1406close_prog:
1407 error_cnt++;
1408close_prog_noerr:
1409 bpf_object__close(obj);
1410}
1411
1207int main(void) 1412int main(void)
1208{ 1413{
1414 jit_enabled = is_jit_enabled();
1415
1209 test_pkt_access(); 1416 test_pkt_access();
1210 test_xdp(); 1417 test_xdp();
1211 test_xdp_adjust_tail(); 1418 test_xdp_adjust_tail();
@@ -1219,6 +1426,7 @@ int main(void)
1219 test_stacktrace_map(); 1426 test_stacktrace_map();
1220 test_stacktrace_build_id(); 1427 test_stacktrace_build_id();
1221 test_stacktrace_map_raw_tp(); 1428 test_stacktrace_map_raw_tp();
1429 test_get_stack_raw_tp();
1222 1430
1223 printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); 1431 printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
1224 return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; 1432 return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
index b755bd783ce5..d86c281e957f 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
@@ -19,7 +19,7 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
19 .type = BPF_MAP_TYPE_HASH, 19 .type = BPF_MAP_TYPE_HASH,
20 .key_size = sizeof(__u32), 20 .key_size = sizeof(__u32),
21 .value_size = sizeof(__u32), 21 .value_size = sizeof(__u32),
22 .max_entries = 10000, 22 .max_entries = 16384,
23}; 23};
24 24
25struct bpf_map_def SEC("maps") stackmap = { 25struct bpf_map_def SEC("maps") stackmap = {
@@ -31,6 +31,14 @@ struct bpf_map_def SEC("maps") stackmap = {
31 .map_flags = BPF_F_STACK_BUILD_ID, 31 .map_flags = BPF_F_STACK_BUILD_ID,
32}; 32};
33 33
34struct bpf_map_def SEC("maps") stack_amap = {
35 .type = BPF_MAP_TYPE_ARRAY,
36 .key_size = sizeof(__u32),
37 .value_size = sizeof(struct bpf_stack_build_id)
38 * PERF_MAX_STACK_DEPTH,
39 .max_entries = 128,
40};
41
34/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ 42/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
35struct random_urandom_args { 43struct random_urandom_args {
36 unsigned long long pad; 44 unsigned long long pad;
@@ -42,7 +50,10 @@ struct random_urandom_args {
42SEC("tracepoint/random/urandom_read") 50SEC("tracepoint/random/urandom_read")
43int oncpu(struct random_urandom_args *args) 51int oncpu(struct random_urandom_args *args)
44{ 52{
53 __u32 max_len = sizeof(struct bpf_stack_build_id)
54 * PERF_MAX_STACK_DEPTH;
45 __u32 key = 0, val = 0, *value_p; 55 __u32 key = 0, val = 0, *value_p;
56 void *stack_p;
46 57
47 value_p = bpf_map_lookup_elem(&control_map, &key); 58 value_p = bpf_map_lookup_elem(&control_map, &key);
48 if (value_p && *value_p) 59 if (value_p && *value_p)
@@ -50,8 +61,13 @@ int oncpu(struct random_urandom_args *args)
50 61
51 /* The size of stackmap and stackid_hmap should be the same */ 62 /* The size of stackmap and stackid_hmap should be the same */
52 key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK); 63 key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
53 if ((int)key >= 0) 64 if ((int)key >= 0) {
54 bpf_map_update_elem(&stackid_hmap, &key, &val, 0); 65 bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
66 stack_p = bpf_map_lookup_elem(&stack_amap, &key);
67 if (stack_p)
68 bpf_get_stack(args, stack_p, max_len,
69 BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
70 }
55 71
56 return 0; 72 return 0;
57} 73}
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
index 76d85c5d08bd..af111af7ca1a 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
19 .type = BPF_MAP_TYPE_HASH, 19 .type = BPF_MAP_TYPE_HASH,
20 .key_size = sizeof(__u32), 20 .key_size = sizeof(__u32),
21 .value_size = sizeof(__u32), 21 .value_size = sizeof(__u32),
22 .max_entries = 10000, 22 .max_entries = 16384,
23}; 23};
24 24
25struct bpf_map_def SEC("maps") stackmap = { 25struct bpf_map_def SEC("maps") stackmap = {
26 .type = BPF_MAP_TYPE_STACK_TRACE, 26 .type = BPF_MAP_TYPE_STACK_TRACE,
27 .key_size = sizeof(__u32), 27 .key_size = sizeof(__u32),
28 .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH, 28 .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
29 .max_entries = 10000, 29 .max_entries = 16384,
30};
31
32struct bpf_map_def SEC("maps") stack_amap = {
33 .type = BPF_MAP_TYPE_ARRAY,
34 .key_size = sizeof(__u32),
35 .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
36 .max_entries = 16384,
30}; 37};
31 38
32/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ 39/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
@@ -44,7 +51,9 @@ struct sched_switch_args {
44SEC("tracepoint/sched/sched_switch") 51SEC("tracepoint/sched/sched_switch")
45int oncpu(struct sched_switch_args *ctx) 52int oncpu(struct sched_switch_args *ctx)
46{ 53{
54 __u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
47 __u32 key = 0, val = 0, *value_p; 55 __u32 key = 0, val = 0, *value_p;
56 void *stack_p;
48 57
49 value_p = bpf_map_lookup_elem(&control_map, &key); 58 value_p = bpf_map_lookup_elem(&control_map, &key);
50 if (value_p && *value_p) 59 if (value_p && *value_p)
@@ -52,8 +61,12 @@ int oncpu(struct sched_switch_args *ctx)
52 61
53 /* The size of stackmap and stackid_hmap should be the same */ 62 /* The size of stackmap and stackid_hmap should be the same */
54 key = bpf_get_stackid(ctx, &stackmap, 0); 63 key = bpf_get_stackid(ctx, &stackmap, 0);
55 if ((int)key >= 0) 64 if ((int)key >= 0) {
56 bpf_map_update_elem(&stackid_hmap, &key, &val, 0); 65 bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
66 stack_p = bpf_map_lookup_elem(&stack_amap, &key);
67 if (stack_p)
68 bpf_get_stack(ctx, stack_p, max_len, 0);
69 }
57 70
58 return 0; 71 return 0;
59} 72}
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 165e9ddfa446..275b4570b5b8 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -47,7 +47,7 @@
47# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 47# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
48#endif 48#endif
49 49
50#define MAX_INSNS 512 50#define MAX_INSNS BPF_MAXINSNS
51#define MAX_FIXUPS 8 51#define MAX_FIXUPS 8
52#define MAX_NR_MAPS 4 52#define MAX_NR_MAPS 4
53#define POINTER_VALUE 0xcafe4all 53#define POINTER_VALUE 0xcafe4all
@@ -77,6 +77,8 @@ struct bpf_test {
77 } result, result_unpriv; 77 } result, result_unpriv;
78 enum bpf_prog_type prog_type; 78 enum bpf_prog_type prog_type;
79 uint8_t flags; 79 uint8_t flags;
80 __u8 data[TEST_DATA_LEN];
81 void (*fill_helper)(struct bpf_test *self);
80}; 82};
81 83
82/* Note we want this to be 64 bit aligned so that the end of our array is 84/* Note we want this to be 64 bit aligned so that the end of our array is
@@ -94,6 +96,62 @@ struct other_val {
94 long long bar; 96 long long bar;
95}; 97};
96 98
+static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self)
+{
+	/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */
+#define PUSH_CNT 51
+	unsigned int len = BPF_MAXINSNS;
+	struct bpf_insn *insn = self->insns;
+	int i = 0, j, k = 0;
+
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+loop:
+	for (j = 0; j < PUSH_CNT; j++) {
+		insn[i++] = BPF_LD_ABS(BPF_B, 0);
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2);
+		i++;
+		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+		insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1);
+		insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2);
+		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					 BPF_FUNC_skb_vlan_push),
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2);
+		i++;
+	}
+
+	for (j = 0; j < PUSH_CNT; j++) {
+		insn[i++] = BPF_LD_ABS(BPF_B, 0);
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2);
+		i++;
+		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					 BPF_FUNC_skb_vlan_pop),
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2);
+		i++;
+	}
+	if (++k < 5)
+		goto loop;
+
+	for (; i < len - 1; i++)
+		insn[i] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 0xbef);
+	insn[len - 1] = BPF_EXIT_INSN();
+}
+
+static void bpf_fill_jump_around_ld_abs(struct bpf_test *self)
+{
+	struct bpf_insn *insn = self->insns;
+	unsigned int len = BPF_MAXINSNS;
+	int i = 0;
+
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	insn[i++] = BPF_LD_ABS(BPF_B, 0);
+	insn[i] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 10, len - i - 2);
+	i++;
+	while (i < len - 1)
+		insn[i++] = BPF_LD_ABS(BPF_B, 1);
+	insn[i] = BPF_EXIT_INSN();
+}
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
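
The two generators above are invoked through the new fill_helper hook (wired up in do_test_fixup() further down) and synthesize a full BPF_MAXINSNS-instruction program directly into test->insns, so the test entry itself can leave .insns = { } empty. A minimal sketch of the pattern, using a hypothetical helper that is not part of this patch:

	/* Hypothetical: pad the program with register moves and finish with "return 1". */
	static void bpf_fill_mov_chain(struct bpf_test *self)
	{
		struct bpf_insn *insn = self->insns;
		unsigned int i;

		for (i = 0; i < BPF_MAXINSNS - 2; i++)
			insn[i] = BPF_MOV64_IMM(BPF_REG_0, 0);
		insn[i++] = BPF_MOV64_IMM(BPF_REG_0, 1);
		insn[i] = BPF_EXIT_INSN();
	}

A test entry would then set .fill_helper = bpf_fill_mov_chain and, if it runs the program, .retval = 1.
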
@@ -11680,6 +11738,242 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_XADD stores into R2 packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_get_stack return R0 within range",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)),
+			BPF_MOV64_IMM(BPF_REG_4, 256),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32),
+			BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5),
+			BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map2 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"ld_abs: invalid op 1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_DW, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "unknown opcode",
+	},
+	{
+		"ld_abs: invalid op 2",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 256),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_IND(BPF_DW, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "unknown opcode",
+	},
+	{
+		"ld_abs: nmap reduced",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_H, 12),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 28),
+			BPF_LD_ABS(BPF_H, 12),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 26),
+			BPF_MOV32_IMM(BPF_REG_0, 18),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -64),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -64),
+			BPF_LD_IND(BPF_W, BPF_REG_7, 14),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -60),
+			BPF_MOV32_IMM(BPF_REG_0, 280971478),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -60),
+			BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 15),
+			BPF_LD_ABS(BPF_H, 12),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 13),
+			BPF_MOV32_IMM(BPF_REG_0, 22),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56),
+			BPF_LD_IND(BPF_H, BPF_REG_7, 14),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -52),
+			BPF_MOV32_IMM(BPF_REG_0, 17366),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -48),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -48),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -52),
+			BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV32_IMM(BPF_REG_0, 256),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0,
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 256,
+	},
+	{
+		"ld_abs: div + abs, test 1",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 3),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0),
+			BPF_LD_ABS(BPF_B, 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+			BPF_LD_IND(BPF_B, BPF_REG_8, -70),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 10,
+	},
+	{
+		"ld_abs: div + abs, test 2",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 3),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0),
+			BPF_LD_ABS(BPF_B, 128),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+			BPF_LD_IND(BPF_B, BPF_REG_8, -70),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"ld_abs: div + abs, test 3",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+			BPF_LD_ABS(BPF_B, 3),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"ld_abs: div + abs, test 4",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+			BPF_LD_ABS(BPF_B, 256),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"ld_abs: vlan + abs, test 1",
+		.insns = { },
+		.data = {
+			0x34,
+		},
+		.fill_helper = bpf_fill_ld_abs_vlan_push_pop,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0xbef,
+	},
+	{
+		"ld_abs: vlan + abs, test 2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_6, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_skb_vlan_push),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			0x34,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 42,
+	},
+	{
+		"ld_abs: jump around ld_abs",
+		.insns = { },
+		.data = {
+			10, 11,
+		},
+		.fill_helper = bpf_fill_jump_around_ld_abs,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 10,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
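
A note on the expected return values in the ld_abs tests above: with eBPF's LD_ABS/LD_IND semantics, a load whose offset falls outside the packet terminates the program immediately with a return value of 0 (hence "div + abs" tests 2 and 4 expect retval 0 even though they never reach their own exit path), and a division by zero leaves 0 in the destination register rather than aborting (hence test 3). A rough sketch of the per-load behaviour, not the actual interpreter code:

	/* Sketch only: what BPF_LD_ABS(BPF_B, imm) amounts to at runtime. */
	static int emulate_ld_abs_b(const __u8 *data, __u32 len, __u32 imm, __u64 *r0)
	{
		if (imm >= len)
			return 0;	/* the whole program returns 0 here */
		*r0 = data[imm];
		return 1;		/* execution continues */
	}
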
@@ -11783,7 +12077,7 @@ static int create_map_in_map(void)
 	return outer_map_fd;
 }
 
-static char bpf_vlog[32768];
+static char bpf_vlog[UINT_MAX >> 8];
 
 static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 			  int *map_fds)
@@ -11794,6 +12088,9 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	int *fixup_prog = test->fixup_prog;
 	int *fixup_map_in_map = test->fixup_map_in_map;
 
+	if (test->fill_helper)
+		test->fill_helper(test);
+
 	/* Allocating HTs with 1 elem is fine here, since we only test
 	 * for verifier and not do a runtime lookup, so the only thing
 	 * that really matters is value size in this case.
@@ -11843,10 +12140,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 			   int *passes, int *errors)
 {
 	int fd_prog, expected_ret, reject_from_alignment;
+	int prog_len, prog_type = test->prog_type;
 	struct bpf_insn *prog = test->insns;
-	int prog_len = probe_filter_length(prog);
-	char data_in[TEST_DATA_LEN] = {};
-	int prog_type = test->prog_type;
 	int map_fds[MAX_NR_MAPS];
 	const char *expected_err;
 	uint32_t retval;
@@ -11856,6 +12151,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 		map_fds[i] = -1;
 
 	do_test_fixup(test, prog, map_fds);
+	prog_len = probe_filter_length(prog);
 
 	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
 				     prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
@@ -11895,8 +12191,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	}
 
 	if (fd_prog >= 0) {
-		err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in),
-					NULL, NULL, &retval, NULL);
+		err = bpf_prog_test_run(fd_prog, 1, test->data,
+					sizeof(test->data), NULL, NULL,
+					&retval, NULL);
 		if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) {
 			printf("Unexpected bpf_prog_test_run error\n");
 			goto fail_log;
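
For reference, the eight arguments in the bpf_prog_test_run() call above correspond to the tools/lib/bpf wrapper roughly as sketched below; the prototype is quoted from memory, so treat it as an assumption rather than the authoritative declaration:

	/* prog_fd, repeat count, input packet and length, optional output
	 * buffer and length, program return value, optional per-run duration.
	 */
	int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
			      void *data_out, __u32 *size_out, __u32 *retval,
			      __u32 *duration);

The harness only cares about retval here, which is why the output pointers are passed as NULL, and prog_len can only be computed after do_test_fixup() has run any fill_helper.
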
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
new file mode 100644
index 000000000000..ad025bd75f1c
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include "trace_helpers.h"
+
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+static int sym_cnt;
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
+}
+
+int load_kallsyms(void)
+{
+	FILE *f = fopen("/proc/kallsyms", "r");
+	char func[256], buf[256];
+	char symbol;
+	void *addr;
+	int i = 0;
+
+	if (!f)
+		return -ENOENT;
+
+	while (!feof(f)) {
+		if (!fgets(buf, sizeof(buf), f))
+			break;
+		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+			break;
+		if (!addr)
+			continue;
+		syms[i].addr = (long) addr;
+		syms[i].name = strdup(func);
+		i++;
+	}
+	sym_cnt = i;
+	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+	return 0;
+}
+
+struct ksym *ksym_search(long key)
+{
+	int start = 0, end = sym_cnt;
+	int result;
+
+	while (start < end) {
+		size_t mid = start + (end - start) / 2;
+
+		result = key - syms[mid].addr;
+		if (result < 0)
+			end = mid;
+		else if (result > 0)
+			start = mid + 1;
+		else
+			return &syms[mid];
+	}
+
+	if (start >= 1 && syms[start - 1].addr < key &&
+	    key < syms[start].addr)
+		/* valid ksym */
+		return &syms[start - 1];
+
+	/* out of range. return _stext */
+	return &syms[0];
+}
+
+static int page_size;
+static int page_cnt = 8;
+static volatile struct perf_event_mmap_page *header;
+
+int perf_event_mmap(int fd)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	header = base;
+	return 0;
+}
+
+static int perf_event_poll(int fd)
+{
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+	return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static int perf_event_read(perf_event_print_fn fn)
+{
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+	int ret;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return PERF_EVENT_CONT;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			ret = fn(e->data, e->size);
+			if (ret != PERF_EVENT_CONT)
+				return ret;
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+	return PERF_EVENT_CONT;
+}
+
+int perf_event_poller(int fd, perf_event_print_fn output_fn)
+{
+	int ret;
+
+	for (;;) {
+		perf_event_poll(fd);
+		ret = perf_event_read(output_fn);
+		if (ret != PERF_EVENT_CONT)
+			return ret;
+	}
+
+	return PERF_EVENT_DONE;
+}
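
A minimal consumer of the perf-buffer half of this helper might look as follows (illustrative only; it assumes 'fd' is a perf event FD that a BPF program feeds via bpf_perf_event_output()):

	#include <stdio.h>
	#include "trace_helpers.h"

	static int print_sample(void *data, int size)
	{
		/* 'data' is the raw payload written by the BPF program */
		printf("sample of %d bytes\n", size);
		return PERF_EVENT_CONT;	/* keep polling */
	}

	static int consume_events(int fd)
	{
		if (perf_event_mmap(fd))
			return PERF_EVENT_ERROR;
		return perf_event_poller(fd, print_sample);	/* runs until DONE/ERROR */
	}
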
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
new file mode 100644
index 000000000000..fe3eefd21e86
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_HELPER_H
+#define __TRACE_HELPER_H
+
+struct ksym {
+	long addr;
+	char *name;
+};
+
+int load_kallsyms(void);
+struct ksym *ksym_search(long key);
+
+typedef int (*perf_event_print_fn)(void *data, int size);
+
+/* return code for perf_event_print_fn */
+#define PERF_EVENT_DONE 0
+#define PERF_EVENT_ERROR -1
+#define PERF_EVENT_CONT -2
+
+int perf_event_mmap(int fd);
+/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */
+int perf_event_poller(int fd, perf_event_print_fn output_fn);
+#endif
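
And a sketch of the kallsyms half as a selftest might use it (illustrative; where the raw stack trace comes from and how long it is are assumptions): load the symbol table once at startup, then resolve instruction pointers to names.

	#include <stdio.h>
	#include <linux/types.h>
	#include "trace_helpers.h"

	/* Resolve and print the symbols of one stack trace; load_kallsyms()
	 * is expected to have been called successfully once beforehand.
	 */
	static void print_stack_trace(const __u64 *ips, int cnt)
	{
		int i;

		for (i = 0; i < cnt && ips[i]; i++)
			printf("  %s\n", ksym_search(ips[i])->name);
	}
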
23#endif