Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/core.c     |  12
-rw-r--r--  kernel/bpf/hashtab.c  | 253
-rw-r--r--  kernel/bpf/lpm_trie.c |   6
-rw-r--r--  kernel/bpf/syscall.c  |   8
-rw-r--r--  kernel/bpf/verifier.c |  64
5 files changed, 208 insertions(+), 135 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f45827e205d3..b4f1cb0c5ac7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1162,12 +1162,12 @@ out:
 	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
 		off = IMM;
 load_word:
-		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
-		 * only appearing in the programs where ctx ==
-		 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
-		 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
-		 * internal BPF verifier will check that BPF_R6 ==
-		 * ctx.
+		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
+		 * appearing in the programs where ctx == skb
+		 * (see may_access_skb() in the verifier). All programs
+		 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+		 * bpf_convert_filter() saves it in BPF_R6, internal BPF
+		 * verifier will check that BPF_R6 == ctx.
 		 *
 		 * BPF_ABS and BPF_IND are wrappers of function calls,
 		 * so they scratch BPF_R1-BPF_R5 registers, preserve
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ea87fb19a94..361a69dfe543 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,11 +13,12 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 
 struct bucket {
-	struct hlist_head head;
+	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
 };
 
@@ -29,28 +30,26 @@ struct bpf_htab {
 		struct pcpu_freelist freelist;
 		struct bpf_lru lru;
 	};
-	void __percpu *extra_elems;
+	struct htab_elem *__percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
-enum extra_elem_state {
-	HTAB_NOT_AN_EXTRA_ELEM = 0,
-	HTAB_EXTRA_ELEM_FREE,
-	HTAB_EXTRA_ELEM_USED
-};
-
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
-		struct hlist_node hash_node;
-		struct bpf_htab *htab;
-		struct pcpu_freelist_node fnode;
+		struct hlist_nulls_node hash_node;
+		struct {
+			void *padding;
+			union {
+				struct bpf_htab *htab;
+				struct pcpu_freelist_node fnode;
+			};
+		};
 	};
 	union {
 		struct rcu_head rcu;
-		enum extra_elem_state state;
 		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
@@ -71,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -122,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 
 static int prealloc_init(struct bpf_htab *htab)
 {
+	u32 num_entries = htab->map.max_entries;
 	int err = -ENOMEM, i;
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size *
-					 htab->map.max_entries);
+	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+		num_entries += num_possible_cpus();
+
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
 	if (!htab->elems)
 		return -ENOMEM;
 
 	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
-	for (i = 0; i < htab->map.max_entries; i++) {
+	for (i = 0; i < num_entries; i++) {
 		u32 size = round_up(htab->map.value_size, 8);
 		void __percpu *pptr;
 
@@ -160,10 +167,11 @@ skip_percpu_elems:
 	if (htab_is_lru(htab))
 		bpf_lru_populate(&htab->lru, htab->elems,
 				 offsetof(struct htab_elem, lru_node),
-				 htab->elem_size, htab->map.max_entries);
+				 htab->elem_size, num_entries);
 	else
-		pcpu_freelist_populate(&htab->freelist, htab->elems,
-				       htab->elem_size, htab->map.max_entries);
+		pcpu_freelist_populate(&htab->freelist,
+				       htab->elems + offsetof(struct htab_elem, fnode),
+				       htab->elem_size, num_entries);
 
 	return 0;
 
@@ -184,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
 
 static int alloc_extra_elems(struct bpf_htab *htab)
 {
-	void __percpu *pptr;
+	struct htab_elem *__percpu *pptr, *l_new;
+	struct pcpu_freelist_node *l;
 	int cpu;
 
-	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+				  GFP_USER | __GFP_NOWARN);
 	if (!pptr)
 		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
-		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
-			HTAB_EXTRA_ELEM_FREE;
+		l = pcpu_freelist_pop(&htab->freelist);
+		/* pop will succeed, since prealloc_init()
+		 * preallocated extra num_possible_cpus elements
+		 */
+		l_new = container_of(l, struct htab_elem, fnode);
+		*per_cpu_ptr(pptr, cpu) = l_new;
 	}
 	htab->extra_elems = pptr;
 	return 0;
@@ -217,6 +231,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	int err, i;
 	u64 cost;
 
+	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
+		     offsetof(struct htab_elem, hash_node.pprev));
+	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+		     offsetof(struct htab_elem, hash_node.pprev));
+
 	if (lru && !capable(CAP_SYS_ADMIN))
 		/* LRU implementation is much complicated than other
 		 * maps. Hence, limit to CAP_SYS_ADMIN for now.
@@ -326,29 +345,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	if (!percpu && !lru) {
-		/* lru itself can remove the least used element, so
-		 * there is no need for an extra elem during map_update.
-		 */
-		err = alloc_extra_elems(htab);
-		if (err)
-			goto free_buckets;
-	}
-
 	if (prealloc) {
 		err = prealloc_init(htab);
 		if (err)
-			goto free_extra_elems;
+			goto free_buckets;
+
+		if (!percpu && !lru) {
+			/* lru itself can remove the least used element, so
+			 * there is no need for an extra elem during map_update.
+			 */
+			err = alloc_extra_elems(htab);
+			if (err)
+				goto free_prealloc;
+		}
 	}
 
 	return &htab->map;
 
-free_extra_elems:
-	free_percpu(htab->extra_elems);
+free_prealloc:
+	prealloc_destroy(htab);
 free_buckets:
 	bpf_map_area_free(htab->buckets);
 free_htab:
@@ -366,20 +385,44 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 	return &htab->buckets[hash & (htab->n_buckets - 1)];
 }
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 {
 	return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 					 void *key, u32 key_size)
 {
+	struct hlist_nulls_node *n;
+	struct htab_elem *l;
+
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+					       u32 hash, void *key,
+					       u32 key_size, u32 n_buckets)
+{
+	struct hlist_nulls_node *n;
 	struct htab_elem *l;
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+again:
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
 			return l;
 
+	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+		goto again;
+
 	return NULL;
 }
 
@@ -387,7 +430,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l;
 	u32 hash, key_size;
 
@@ -400,7 +443,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	head = select_bucket(htab, hash);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	return l;
 }
@@ -433,8 +476,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 {
 	struct bpf_htab *htab = (struct bpf_htab *)arg;
-	struct htab_elem *l, *tgt_l;
-	struct hlist_head *head;
+	struct htab_elem *l = NULL, *tgt_l;
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
 	unsigned long flags;
 	struct bucket *b;
 
@@ -444,9 +488,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
-			hlist_del_rcu(&l->hash_node);
+			hlist_nulls_del_rcu(&l->hash_node);
 			break;
 		}
 
@@ -459,7 +503,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l, *next_l;
 	u32 hash, key_size;
 	int i;
@@ -473,7 +517,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	head = select_bucket(htab, hash);
 
 	/* lookup the key */
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	if (!l) {
 		i = 0;
@@ -481,7 +525,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	}
 
 	/* key was found, get next key in the same bucket */
-	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
 				  struct htab_elem, hash_node);
 
 	if (next_l) {
@@ -500,7 +544,7 @@ find_first_elem:
 		head = select_bucket(htab, i);
 
 		/* pick first element in the bucket */
-		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 					  struct htab_elem, hash_node);
 		if (next_l) {
 			/* if it's not empty, just return it */
@@ -538,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (l->state == HTAB_EXTRA_ELEM_USED) {
-		l->state = HTAB_EXTRA_ELEM_FREE;
-		return;
-	}
-
-	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+	if (htab_is_prealloc(htab)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -573,43 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
-					 bool old_elem_exists)
+					 struct htab_elem *old_elem)
 {
 	u32 size = htab->map.value_size;
-	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-	struct htab_elem *l_new;
+	bool prealloc = htab_is_prealloc(htab);
+	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
-	int err = 0;
 
 	if (prealloc) {
-		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
-		if (!l_new)
-			err = -E2BIG;
-	} else {
-		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-			atomic_dec(&htab->count);
-			err = -E2BIG;
+		if (old_elem) {
+			/* if we're updating the existing element,
+			 * use per-cpu extra elems to avoid freelist_pop/push
+			 */
+			pl_new = this_cpu_ptr(htab->extra_elems);
+			l_new = *pl_new;
+			*pl_new = old_elem;
 		} else {
-			l_new = kmalloc(htab->elem_size,
-					GFP_ATOMIC | __GFP_NOWARN);
-			if (!l_new)
-				return ERR_PTR(-ENOMEM);
-		}
-	}
+			struct pcpu_freelist_node *l;
 
-	if (err) {
-		if (!old_elem_exists)
-			return ERR_PTR(err);
-
-		/* if we're updating the existing element and the hash table
-		 * is full, use per-cpu extra elems
-		 */
-		l_new = this_cpu_ptr(htab->extra_elems);
-		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
-			return ERR_PTR(-E2BIG);
-		l_new->state = HTAB_EXTRA_ELEM_USED;
+			l = pcpu_freelist_pop(&htab->freelist);
+			if (!l)
+				return ERR_PTR(-E2BIG);
+			l_new = container_of(l, struct htab_elem, fnode);
+		}
 	} else {
-		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+			if (!old_elem) {
+				/* when map is full and update() is replacing
+				 * old element, it's ok to allocate, since
+				 * old element will be freed immediately.
+				 * Otherwise return an error
+				 */
+				atomic_dec(&htab->count);
+				return ERR_PTR(-E2BIG);
+			}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -661,7 +700,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -690,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto err;
 
 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
-				!!l_old);
+				l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -700,10 +739,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
-		hlist_del_rcu(&l_old->hash_node);
-		free_htab_elem(htab, l_old);
+		hlist_nulls_del_rcu(&l_old->hash_node);
+		if (!htab_is_prealloc(htab))
+			free_htab_elem(htab, l_old);
 	}
 	ret = 0;
 err:
@@ -716,7 +756,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old = NULL;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -757,10 +797,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
-		hlist_del_rcu(&l_old->hash_node);
+		hlist_nulls_del_rcu(&l_old->hash_node);
 	}
 	ret = 0;
 
@@ -781,7 +821,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -815,12 +855,12 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 				value, onallcpus);
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus, false);
+					hash, true, onallcpus, NULL);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
 		}
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	}
 	ret = 0;
 err:
@@ -834,7 +874,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -882,7 +922,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	} else {
 		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -910,7 +950,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -930,7 +970,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		free_htab_elem(htab, l);
 		ret = 0;
 	}
@@ -942,7 +982,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -962,7 +1002,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		ret = 0;
 	}
 
@@ -977,14 +1017,13 @@ static void delete_all_elements(struct bpf_htab *htab)
 	int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
-		struct hlist_node *n;
+		struct hlist_nulls_head *head = select_bucket(htab, i);
+		struct hlist_nulls_node *n;
 		struct htab_elem *l;
 
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
-			hlist_del_rcu(&l->hash_node);
-			if (l->state != HTAB_EXTRA_ELEM_USED)
-				htab_elem_free(htab, l);
+		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_nulls_del_rcu(&l->hash_node);
+			htab_elem_free(htab, l);
 		}
 	}
 }
@@ -1004,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
 	 * not have executed. Wait for them.
 	 */
 	rcu_barrier();
-	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+	if (!htab_is_prealloc(htab))
 		delete_all_elements(htab);
 	else
 		prealloc_destroy(htab);
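For context on the htab_is_prealloc() test used throughout the hashtab changes above: whether a hash map is preallocated is decided by userspace when the map is created. A minimal sketch, assuming only the uapi definitions from <linux/bpf.h> (BPF_MAP_CREATE, union bpf_attr, BPF_F_NO_PREALLOC); bpf_create_hash() is a hypothetical local wrapper, not a kernel or libbpf API:

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* hypothetical helper: create a hash map, optionally with BPF_F_NO_PREALLOC */
static int bpf_create_hash(unsigned int max_entries, unsigned int flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = sizeof(unsigned int);
	attr.value_size  = sizeof(unsigned long long);
	attr.max_entries = max_entries;
	/* 0 (default): elements preallocated, updates pop from the freelist;
	 * BPF_F_NO_PREALLOC: elements kmalloc'ed on each update
	 */
	attr.map_flags   = flags;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}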
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 8bfe0afaee10..b37bd9ab7f57 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -500,9 +500,15 @@ unlock:
 	raw_spin_unlock(&trie->lock);
 }
 
+static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -ENOTSUPP;
+}
+
 static const struct bpf_map_ops trie_ops = {
 	.map_alloc = trie_alloc,
 	.map_free = trie_free,
+	.map_get_next_key = trie_get_next_key,
 	.map_lookup_elem = trie_lookup_elem,
 	.map_update_elem = trie_update_elem,
 	.map_delete_elem = trie_delete_elem,
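For context on the trie_get_next_key() stub above: it gives the LPM trie a well-defined answer to the BPF_MAP_GET_NEXT_KEY command instead of leaving the ->map_get_next_key callback unimplemented. A minimal userspace sketch, assuming the uapi definitions from <linux/bpf.h>; bpf_get_next_key() is a hypothetical local wrapper, not a kernel or libbpf API:

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* hypothetical helper: ask the kernel for the key following 'key' */
static int bpf_get_next_key(int map_fd, const void *key, void *next_key)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd   = map_fd;
	attr.key      = (unsigned long)key;
	attr.next_key = (unsigned long)next_key;

	/* on an LPM trie map this now fails cleanly with errno ENOTSUPP (524)
	 * rather than relying on a missing callback
	 */
	return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}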
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7af0dcc5d755..821f9e807de5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -617,6 +617,14 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 		if (insn->imm == BPF_FUNC_xdp_adjust_head)
 			prog->xdp_adjust_head = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
+			/* If we tail call into other programs, we
+			 * cannot make any assumptions since they
+			 * can be replaced dynamically during runtime
+			 * in the program array.
+			 */
+			prog->cb_access = 1;
+			prog->xdp_adjust_head = 1;
+
 			/* mark bpf_tail_call as different opcode
 			 * to avoid conditional branch in
 			 * interpeter for every normal call
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 796b68d00119..a834068a400e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
-static int check_ptr_alignment(struct bpf_verifier_env *env,
-			       struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+				   int off, int size)
 {
-	if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
-		if (off % size != 0) {
-			verbose("misaligned access off %d size %d\n",
-				off, size);
-			return -EACCES;
-		} else {
-			return 0;
-		}
-	}
-
-	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		/* misaligned access to packet is ok on x86,arm,arm64 */
-		return 0;
-
 	if (reg->id && size != 1) {
-		verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+		verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
 		return -EACCES;
 	}
 
 	/* skb->data is NET_IP_ALIGN-ed */
-	if (reg->type == PTR_TO_PACKET &&
-	    (NET_IP_ALIGN + reg->off + off) % size != 0) {
+	if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
 		verbose("misaligned packet access off %d+%d+%d size %d\n",
 			NET_IP_ALIGN, reg->off, off, size);
 		return -EACCES;
 	}
+
 	return 0;
 }
 
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+				   int size)
+{
+	if (size != 1) {
+		verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+static int check_ptr_alignment(const struct bpf_reg_state *reg,
+			       int off, int size)
+{
+	switch (reg->type) {
+	case PTR_TO_PACKET:
+		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+		       check_pkt_ptr_alignment(reg, off, size);
+	case PTR_TO_MAP_VALUE_ADJ:
+		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+		       check_val_ptr_alignment(reg, size);
+	default:
+		if (off % size != 0) {
+			verbose("misaligned access off %d size %d\n",
+				off, size);
+			return -EACCES;
+		}
+
+		return 0;
+	}
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 	if (size < 0)
 		return size;
 
-	err = check_ptr_alignment(env, reg, off, size);
+	err = check_ptr_alignment(reg, off, size);
 	if (err)
 		return err;
 
@@ -1925,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		 * register as unknown.
 		 */
 		if (env->allow_ptr_leaks &&
+		    BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
 		    (dst_reg->type == PTR_TO_MAP_VALUE ||
 		     dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
 			dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1973,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
-			regs[i].range = dst_reg->off;
+			/* keep the maximum range already checked */
+			regs[i].range = max(regs[i].range, dst_reg->off);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = dst_reg->off;
+			reg->range = max(reg->range, dst_reg->off);
 	}
 }
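For context on the find_good_pkt_pointers() change above: keeping the maximum range matters when a program performs more than one bounds check against data_end, so a later, smaller check does not shrink a packet range the verifier has already proven. A minimal sketch in restricted C for a cls_bpf program (built with clang -target bpf), assuming only the uapi struct __sk_buff from <linux/bpf.h>; the section name and program body are illustrative only:

#include <linux/bpf.h>

__attribute__((section("classifier"), used))
int pkt_range_example(struct __sk_buff *skb)
{
	void *data     = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	if (data + 8 > data_end)	/* proves an 8-byte packet range */
		return 0;
	if (data + 4 > data_end)	/* smaller check; range must stay 8, not drop to 4 */
		return 0;

	/* the 8-byte load is justified by the first check; with max() the
	 * verifier still remembers that range after the second check
	 */
	return *(unsigned long long *)data != 0;
}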
1986 2006