author     Jesper Dangaard Brouer <brouer@redhat.com>  2017-10-16 06:19:28 -0400
committer  David S. Miller <davem@davemloft.net>       2017-10-18 07:12:18 -0400
commit     6710e1126934d8b4372b4d2f9ae1646cd3f151bf (patch)
tree       38c403515fd5afc5f212666ed37c3b29542aa624 /kernel
parent     4b70c62b9eafcee0505b440732d2e00c50f3085d (diff)
bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP
The 'cpumap' is primarily used as a backend map for the XDP BPF helper
call bpf_redirect_map() and the XDP_REDIRECT action, like 'devmap'.

This patch implements the main part of the map. It is not connected to
the XDP redirect system yet, and no SKB allocation is done yet. The
main concern in this patch is to ensure the datapath can run without any
locking. This adds complexity to the setup and tear-down procedure,
whose assumptions are documented extra carefully in the code comments.

V2:
 - make sure array isn't larger than NR_CPUS
 - make sure each CPU added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handles ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all noticed by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry; kthread was started too soon
 - Make sure packets are flushed during tear-down; this involved use of
   rcu_barrier(), and the kthread only exits after the queue is empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and grammar fixes by Edward Cree
 - Fix missing semicolon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
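For orientation, a minimal userspace sketch of how such a map is intended to be created and populated once the series is complete. This is not part of the patch: it assumes the BPF_MAP_TYPE_CPUMAP enum value added by the uapi portion of this series (outside the kernel/ diffstat below), uses the raw bpf(2) syscall, and the function name create_cpumap() plus the CPU index and queue size are purely illustrative.

/* Hypothetical usage sketch (not part of this patch):
 * key = u32 CPU index, value = u32 ptr_ring queue size.
 * A non-zero qsize enables the CPU as a redirect target;
 * qsize == 0 behaves like deleting the entry.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int create_cpumap(void)
{
	union bpf_attr attr;
	__u32 key = 2, qsize = 192;	/* illustrative values */
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_CPUMAP;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u32);
	attr.max_entries = 4;	/* bounded by NR_CPUS in cpu_map_alloc() */

	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	/* Enable remote CPU 2 with a 192 slot queue (cpu_map_update_elem) */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&qsize;
	attr.flags  = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
		return -1;

	return map_fd;
}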
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/bpf/Makefile     1
-rw-r--r--   kernel/bpf/cpumap.c   560
-rw-r--r--   kernel/bpf/syscall.c    8
-rw-r--r--   kernel/bpf/verifier.c   5
4 files changed, 573 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 53fb09f92e3f..e597daae6120 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..e1e25ddba038
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,560 @@
1/* bpf/cpumap.c
2 *
3 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
4 * Released under terms in GPL version 2. See COPYING.
5 */
6
7/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
8 * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
9 *
10 * Unlike devmap which redirects XDP frames out another NIC device,
11 * this map type redirects raw XDP frames to another CPU. The remote
12 * CPU will do SKB-allocation and call the normal network stack.
13 *
 14 * This is a scalability and isolation mechanism that allows
 15 * separating the early driver network XDP layer from the rest of the
 16 * netstack, and assigning dedicated CPUs for this stage.  This
 17 * basically allows for 10G wirespeed pre-filtering via bpf.
18 */
19#include <linux/bpf.h>
20#include <linux/filter.h>
21#include <linux/ptr_ring.h>
22
23#include <linux/sched.h>
24#include <linux/workqueue.h>
25#include <linux/kthread.h>
26#include <linux/capability.h>
27
 28/* General idea: XDP packets that get XDP-redirected to another CPU
 29 * will be stored/queued for at most one driver ->poll() call.  It is
 30 * guaranteed that setting the flush bit and the flush operation happen
 31 * on the same CPU.  Thus, the cpu_map_flush operation can deduce via
 32 * this_cpu_ptr() which queue in bpf_cpu_map_entry contains packets.
 33 */
34
35#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
36struct xdp_bulk_queue {
37 void *q[CPU_MAP_BULK_SIZE];
38 unsigned int count;
39};
40
41/* Struct for every remote "destination" CPU in map */
42struct bpf_cpu_map_entry {
43 u32 qsize; /* Queue size placeholder for map lookup */
44
45 /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
46 struct xdp_bulk_queue __percpu *bulkq;
47
48 /* Queue with potential multi-producers, and single-consumer kthread */
49 struct ptr_ring *queue;
50 struct task_struct *kthread;
51 struct work_struct kthread_stop_wq;
52
53 atomic_t refcnt; /* Control when this struct can be free'ed */
54 struct rcu_head rcu;
55};
56
57struct bpf_cpu_map {
58 struct bpf_map map;
59 /* Below members specific for map type */
60 struct bpf_cpu_map_entry **cpu_map;
61 unsigned long __percpu *flush_needed;
62};
63
64static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
65 struct xdp_bulk_queue *bq);
66
67static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
68{
69 return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
70}
71
72static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
73{
74 struct bpf_cpu_map *cmap;
75 int err = -ENOMEM;
76 u64 cost;
77 int ret;
78
79 if (!capable(CAP_SYS_ADMIN))
80 return ERR_PTR(-EPERM);
81
82 /* check sanity of attributes */
83 if (attr->max_entries == 0 || attr->key_size != 4 ||
84 attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
85 return ERR_PTR(-EINVAL);
86
87 cmap = kzalloc(sizeof(*cmap), GFP_USER);
88 if (!cmap)
89 return ERR_PTR(-ENOMEM);
90
91 /* mandatory map attributes */
92 cmap->map.map_type = attr->map_type;
93 cmap->map.key_size = attr->key_size;
94 cmap->map.value_size = attr->value_size;
95 cmap->map.max_entries = attr->max_entries;
96 cmap->map.map_flags = attr->map_flags;
97 cmap->map.numa_node = bpf_map_attr_numa_node(attr);
98
99 /* Pre-limit array size based on NR_CPUS, not final CPU check */
100 if (cmap->map.max_entries > NR_CPUS) {
101 err = -E2BIG;
102 goto free_cmap;
103 }
104
105 /* make sure page count doesn't overflow */
106 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
107 cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
108 if (cost >= U32_MAX - PAGE_SIZE)
109 goto free_cmap;
110 cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
111
 112	/* Notice: returns -EPERM if map size is larger than memlock limit */
113 ret = bpf_map_precharge_memlock(cmap->map.pages);
114 if (ret) {
115 err = ret;
116 goto free_cmap;
117 }
118
119 /* A per cpu bitfield with a bit per possible CPU in map */
120 cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
121 __alignof__(unsigned long));
122 if (!cmap->flush_needed)
123 goto free_cmap;
124
125 /* Alloc array for possible remote "destination" CPUs */
126 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
127 sizeof(struct bpf_cpu_map_entry *),
128 cmap->map.numa_node);
129 if (!cmap->cpu_map)
130 goto free_percpu;
131
132 return &cmap->map;
133free_percpu:
134 free_percpu(cmap->flush_needed);
135free_cmap:
136 kfree(cmap);
137 return ERR_PTR(err);
138}
139
140void __cpu_map_queue_destructor(void *ptr)
141{
142 /* The tear-down procedure should have made sure that queue is
143 * empty. See __cpu_map_entry_replace() and work-queue
144 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
145 * gracefully and warn once.
146 */
147 if (WARN_ON_ONCE(ptr))
148 page_frag_free(ptr);
149}
150
151static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
152{
153 if (atomic_dec_and_test(&rcpu->refcnt)) {
154 /* The queue should be empty at this point */
155 ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
156 kfree(rcpu->queue);
157 kfree(rcpu);
158 }
159}
160
161static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
162{
163 atomic_inc(&rcpu->refcnt);
164}
165
 166/* called from workqueue, to work around syscall using preempt_disable */
167static void cpu_map_kthread_stop(struct work_struct *work)
168{
169 struct bpf_cpu_map_entry *rcpu;
170
171 rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
172
173 /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
174 * as it waits until all in-flight call_rcu() callbacks complete.
175 */
176 rcu_barrier();
177
178 /* kthread_stop will wake_up_process and wait for it to complete */
179 kthread_stop(rcpu->kthread);
180}
181
182static int cpu_map_kthread_run(void *data)
183{
184 struct bpf_cpu_map_entry *rcpu = data;
185
186 set_current_state(TASK_INTERRUPTIBLE);
187
 188	/* When the kthread is ordered to stop, rcpu has already been
 189	 * disconnected from the map, thus no new packets can enter.
 190	 * Remaining in-flight per-CPU stored packets are flushed to this
 191	 * queue.  Keep honoring the kthread_stop signal until it is empty.
 192	 */
193 while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
194 struct xdp_pkt *xdp_pkt;
195
196 schedule();
197 /* Do work */
198 while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
199 /* For now just "refcnt-free" */
200 page_frag_free(xdp_pkt);
201 }
202 __set_current_state(TASK_INTERRUPTIBLE);
203 }
204 __set_current_state(TASK_RUNNING);
205
206 put_cpu_map_entry(rcpu);
207 return 0;
208}
209
210struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
211{
212 gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
213 struct bpf_cpu_map_entry *rcpu;
214 int numa, err;
215
216 /* Have map->numa_node, but choose node of redirect target CPU */
217 numa = cpu_to_node(cpu);
218
219 rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
220 if (!rcpu)
221 return NULL;
222
223 /* Alloc percpu bulkq */
224 rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
225 sizeof(void *), gfp);
226 if (!rcpu->bulkq)
227 goto free_rcu;
228
229 /* Alloc queue */
230 rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
231 if (!rcpu->queue)
232 goto free_bulkq;
233
234 err = ptr_ring_init(rcpu->queue, qsize, gfp);
235 if (err)
236 goto free_queue;
237
238 rcpu->qsize = qsize;
239
240 /* Setup kthread */
241 rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
242 "cpumap/%d/map:%d", cpu, map_id);
243 if (IS_ERR(rcpu->kthread))
244 goto free_ptr_ring;
245
246 get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
247 get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
248
249 /* Make sure kthread runs on a single CPU */
250 kthread_bind(rcpu->kthread, cpu);
251 wake_up_process(rcpu->kthread);
252
253 return rcpu;
254
255free_ptr_ring:
256 ptr_ring_cleanup(rcpu->queue, NULL);
257free_queue:
258 kfree(rcpu->queue);
259free_bulkq:
260 free_percpu(rcpu->bulkq);
261free_rcu:
262 kfree(rcpu);
263 return NULL;
264}
265
266void __cpu_map_entry_free(struct rcu_head *rcu)
267{
268 struct bpf_cpu_map_entry *rcpu;
269 int cpu;
270
 271	/* This cpu_map_entry has been disconnected from the map and one
 272	 * RCU grace period has elapsed.  Thus, XDP cannot queue any
 273	 * new packets and cannot change/set flush_needed that can
 274	 * find this entry.
 275	 */
276 rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
277
278 /* Flush remaining packets in percpu bulkq */
279 for_each_online_cpu(cpu) {
280 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
281
282 /* No concurrent bq_enqueue can run at this point */
283 bq_flush_to_queue(rcpu, bq);
284 }
285 free_percpu(rcpu->bulkq);
 286	/* Cannot kthread_stop() here, the last put frees rcpu resources */
287 put_cpu_map_entry(rcpu);
288}
289
 290/* After the xchg of the pointer to bpf_cpu_map_entry, use call_rcu() to
 291 * ensure any driver rcu critical sections have completed, but this
 292 * does not guarantee a flush has happened yet, because driver-side
 293 * rcu_read_lock/unlock only protects the running XDP program.  The
 294 * atomic xchg and NULL-ptr check in __cpu_map_flush() make sure a
 295 * pending flush op doesn't fail.
 296 *
 297 * The bpf_cpu_map_entry is still used by the kthread, and there can
 298 * still be pending packets (in queue and percpu bulkq).  A refcnt
 299 * makes sure the last user (kthread_stop vs. call_rcu) frees the
 300 * memory resources.
 301 *
 302 * The rcu callback __cpu_map_entry_free flushes remaining packets in
 303 * the percpu bulkq to the queue.  Because the caller map_delete_elem()
 304 * has preemption disabled, it cannot call kthread_stop() to make sure
 305 * the queue is empty.  Instead a work_queue is started for stopping the
 306 * kthread, cpu_map_kthread_stop, which waits for an RCU grace period
 307 * before stopping the kthread, emptying the queue.
 308 */
309void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
310 u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
311{
312 struct bpf_cpu_map_entry *old_rcpu;
313
314 old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
315 if (old_rcpu) {
316 call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
317 INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
318 schedule_work(&old_rcpu->kthread_stop_wq);
319 }
320}
321
322int cpu_map_delete_elem(struct bpf_map *map, void *key)
323{
324 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
325 u32 key_cpu = *(u32 *)key;
326
327 if (key_cpu >= map->max_entries)
328 return -EINVAL;
329
 330	/* notice caller map_delete_elem() uses preempt_disable() */
331 __cpu_map_entry_replace(cmap, key_cpu, NULL);
332 return 0;
333}
334
335int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
336 u64 map_flags)
337{
338 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
339 struct bpf_cpu_map_entry *rcpu;
340
 341	/* Array index key corresponds to the CPU number */
342 u32 key_cpu = *(u32 *)key;
343 /* Value is the queue size */
344 u32 qsize = *(u32 *)value;
345
346 if (unlikely(map_flags > BPF_EXIST))
347 return -EINVAL;
348 if (unlikely(key_cpu >= cmap->map.max_entries))
349 return -E2BIG;
350 if (unlikely(map_flags == BPF_NOEXIST))
351 return -EEXIST;
352 if (unlikely(qsize > 16384)) /* sanity limit on qsize */
353 return -EOVERFLOW;
354
355 /* Make sure CPU is a valid possible cpu */
356 if (!cpu_possible(key_cpu))
357 return -ENODEV;
358
359 if (qsize == 0) {
360 rcpu = NULL; /* Same as deleting */
361 } else {
 362		/* Updating qsize causes re-allocation of bpf_cpu_map_entry */
363 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
364 if (!rcpu)
365 return -ENOMEM;
366 }
367 rcu_read_lock();
368 __cpu_map_entry_replace(cmap, key_cpu, rcpu);
369 rcu_read_unlock();
370 return 0;
371}
372
373void cpu_map_free(struct bpf_map *map)
374{
375 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
376 int cpu;
377 u32 i;
378
379 /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
380 * so the bpf programs (can be more than one that used this map) were
381 * disconnected from events. Wait for outstanding critical sections in
382 * these programs to complete. The rcu critical section only guarantees
383 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
384 * It does __not__ ensure pending flush operations (if any) are
385 * complete.
386 */
387 synchronize_rcu();
388
 389	/* To ensure all pending flush operations have completed, wait for
 390	 * the flush bitmap to show all flush_needed bits as zero on _all_ cpus.
391 * Because the above synchronize_rcu() ensures the map is disconnected
392 * from the program we can assume no new bits will be set.
393 */
394 for_each_online_cpu(cpu) {
395 unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
396
397 while (!bitmap_empty(bitmap, cmap->map.max_entries))
398 cond_resched();
399 }
400
401 /* For cpu_map the remote CPUs can still be using the entries
402 * (struct bpf_cpu_map_entry).
403 */
404 for (i = 0; i < cmap->map.max_entries; i++) {
405 struct bpf_cpu_map_entry *rcpu;
406
407 rcpu = READ_ONCE(cmap->cpu_map[i]);
408 if (!rcpu)
409 continue;
410
 411		/* bq flush and cleanup happens after RCU grace-period */
412 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
413 }
414 free_percpu(cmap->flush_needed);
415 bpf_map_area_free(cmap->cpu_map);
416 kfree(cmap);
417}
418
419struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
420{
421 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
422 struct bpf_cpu_map_entry *rcpu;
423
424 if (key >= map->max_entries)
425 return NULL;
426
427 rcpu = READ_ONCE(cmap->cpu_map[key]);
428 return rcpu;
429}
430
431static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
432{
433 struct bpf_cpu_map_entry *rcpu =
434 __cpu_map_lookup_elem(map, *(u32 *)key);
435
436 return rcpu ? &rcpu->qsize : NULL;
437}
438
439static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
440{
441 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
442 u32 index = key ? *(u32 *)key : U32_MAX;
443 u32 *next = next_key;
444
445 if (index >= cmap->map.max_entries) {
446 *next = 0;
447 return 0;
448 }
449
450 if (index == cmap->map.max_entries - 1)
451 return -ENOENT;
452 *next = index + 1;
453 return 0;
454}
455
456const struct bpf_map_ops cpu_map_ops = {
457 .map_alloc = cpu_map_alloc,
458 .map_free = cpu_map_free,
459 .map_delete_elem = cpu_map_delete_elem,
460 .map_update_elem = cpu_map_update_elem,
461 .map_lookup_elem = cpu_map_lookup_elem,
462 .map_get_next_key = cpu_map_get_next_key,
463};
464
465static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
466 struct xdp_bulk_queue *bq)
467{
468 struct ptr_ring *q;
469 int i;
470
471 if (unlikely(!bq->count))
472 return 0;
473
474 q = rcpu->queue;
475 spin_lock(&q->producer_lock);
476
477 for (i = 0; i < bq->count; i++) {
478 void *xdp_pkt = bq->q[i];
479 int err;
480
481 err = __ptr_ring_produce(q, xdp_pkt);
482 if (err) {
483 /* Free xdp_pkt */
484 page_frag_free(xdp_pkt);
485 }
486 }
487 bq->count = 0;
488 spin_unlock(&q->producer_lock);
489
490 return 0;
491}
492
493/* Notice: Will change in later patch */
494struct xdp_pkt {
495 void *data;
496 u16 len;
497 u16 headroom;
498};
499
500/* Runs under RCU-read-side, plus in softirq under NAPI protection.
501 * Thus, safe percpu variable access.
502 */
503int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
504{
505 struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
506
507 if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
508 bq_flush_to_queue(rcpu, bq);
509
 510	/* Notice, xdp_buff/page MUST be queued here, long enough for the
 511	 * driver code invoking us to have finished, due to driver
 512	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 513	 *
 514	 * Thus, incoming xdp_pkt is always queued here (else we race
 515	 * with another CPU on page-refcnt and the remaining driver code).
 516	 * Queue time is very short, as the driver will invoke the flush
 517	 * operation when completing its napi->poll call.
 518	 */
519 bq->q[bq->count++] = xdp_pkt;
520 return 0;
521}
522
523void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
524{
525 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
526 unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
527
528 __set_bit(bit, bitmap);
529}
530
531void __cpu_map_flush(struct bpf_map *map)
532{
533 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
534 unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
535 u32 bit;
536
 537	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
 538	 * and __cpu_map_flush() happen on the same CPU.  Thus, the percpu
 539	 * bitmap indicates which percpu bulkq has packets.
540 */
541 for_each_set_bit(bit, bitmap, map->max_entries) {
542 struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
543 struct xdp_bulk_queue *bq;
544
545 /* This is possible if entry is removed by user space
546 * between xdp redirect and flush op.
547 */
548 if (unlikely(!rcpu))
549 continue;
550
551 __clear_bit(bit, bitmap);
552
553 /* Flush all frames in bulkq to real queue */
554 bq = this_cpu_ptr(rcpu->bulkq);
555 bq_flush_to_queue(rcpu, bq);
556
 557		/* If already running, costs spin_lock_irqsave + smp_mb */
558 wake_up_process(rcpu->kthread);
559 }
560}
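For orientation (not part of this patch): a hypothetical XDP program, in the samples/bpf style of this era, showing how the BPF side is intended to use a cpumap once XDP_REDIRECT delivery is wired up by the follow-up patches. The verifier hunk further below only permits the bpf_redirect_map() helper on this map type. The map name cpu_map, the section names, the "bpf_helpers.h" header, and the fixed destination CPU are illustrative assumptions.

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* samples/bpf helper header, assumed available */

/* value is the per-CPU ptr_ring queue size, set from userspace */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,
};

SEC("xdp_redirect_cpu")
int xdp_prog_redirect_cpu(struct xdp_md *ctx)
{
	__u32 cpu_dest = 2;	/* steer all frames to remote CPU 2 */

	/* Verifier restricts BPF_MAP_TYPE_CPUMAP to bpf_redirect_map() */
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

char _license[] SEC("license") = "GPL";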
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d124e702e040..54fba06942f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
+	/* Need to create a kthread, thus must support schedule */
+	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		goto out;
+	}
+
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
@@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr)
 	}
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9755279d94cb..cefa64be9a2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)