Diffstat (limited to 'kernel/bpf')
-rw-r--r--   kernel/bpf/core.c     |  12
-rw-r--r--   kernel/bpf/hashtab.c  | 253
-rw-r--r--   kernel/bpf/lpm_trie.c |   6
-rw-r--r--   kernel/bpf/syscall.c  |   8
-rw-r--r--   kernel/bpf/verifier.c |  64
5 files changed, 208 insertions, 135 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f45827e205d3..b4f1cb0c5ac7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1162,12 +1162,12 @@ out:
 	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
 		off = IMM;
 load_word:
-		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
-		 * only appearing in the programs where ctx ==
-		 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
-		 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
-		 * internal BPF verifier will check that BPF_R6 ==
-		 * ctx.
+		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
+		 * appearing in the programs where ctx == skb
+		 * (see may_access_skb() in the verifier). All programs
+		 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+		 * bpf_convert_filter() saves it in BPF_R6, internal BPF
+		 * verifier will check that BPF_R6 == ctx.
 		 *
 		 * BPF_ABS and BPF_IND are wrappers of function calls,
 		 * so they scratch BPF_R1-BPF_R5 registers, preserve
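[Editorial note, not part of the commit] The rewrapped comment above restates the LD_ABS/LD_IND contract: these instructions only appear in programs whose context is an sk_buff, typically classic socket filters that bpf_convert_filter() translates to eBPF while keeping the skb in BPF_R6. A minimal user-space sketch of such a classic filter, using the standard <linux/filter.h> macros (offsets and return values are illustrative):

/* Editorial sketch: a classic BPF socket filter built from BPF_LD | BPF_ABS
 * loads. bpf_convert_filter() turns these into eBPF LD_ABS instructions with
 * the skb ("ctx") held in BPF_R6, the invariant the comment above relies on.
 */
#include <linux/filter.h>     /* struct sock_filter, sock_fprog, BPF_STMT, BPF_JUMP */
#include <linux/if_ether.h>   /* ETH_P_IP */

static struct sock_filter ip_only_filter[] = {
	/* A = EtherType, loaded with LD | ABS from offset 12 */
	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),
	/* accept the whole packet if it is IPv4, otherwise drop it */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
	BPF_STMT(BPF_RET | BPF_K, 0),
};

static struct sock_fprog ip_only_prog = {
	.len = sizeof(ip_only_filter) / sizeof(ip_only_filter[0]),
	.filter = ip_only_filter,
};
/* Attached with setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &ip_only_prog,
 * sizeof(ip_only_prog)); the kernel then runs it through bpf_convert_filter().
 */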
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ea87fb19a94..361a69dfe543 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,11 +13,12 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 
 struct bucket {
-	struct hlist_head head;
+	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
 };
 
@@ -29,28 +30,26 @@ struct bpf_htab {
 		struct pcpu_freelist freelist;
 		struct bpf_lru lru;
 	};
-	void __percpu *extra_elems;
+	struct htab_elem *__percpu *extra_elems;
 	atomic_t count; /* number of elements in this hashtable */
 	u32 n_buckets; /* number of hash buckets */
 	u32 elem_size; /* size of each element in bytes */
 };
 
-enum extra_elem_state {
-	HTAB_NOT_AN_EXTRA_ELEM = 0,
-	HTAB_EXTRA_ELEM_FREE,
-	HTAB_EXTRA_ELEM_USED
-};
-
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
-		struct hlist_node hash_node;
-		struct bpf_htab *htab;
-		struct pcpu_freelist_node fnode;
+		struct hlist_nulls_node hash_node;
+		struct {
+			void *padding;
+			union {
+				struct bpf_htab *htab;
+				struct pcpu_freelist_node fnode;
+			};
+		};
 	};
 	union {
 		struct rcu_head rcu;
-		enum extra_elem_state state;
 		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
@@ -71,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -122,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 
 static int prealloc_init(struct bpf_htab *htab)
 {
+	u32 num_entries = htab->map.max_entries;
 	int err = -ENOMEM, i;
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size *
-					 htab->map.max_entries);
+	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+		num_entries += num_possible_cpus();
+
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
 	if (!htab->elems)
 		return -ENOMEM;
 
 	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
-	for (i = 0; i < htab->map.max_entries; i++) {
+	for (i = 0; i < num_entries; i++) {
 		u32 size = round_up(htab->map.value_size, 8);
 		void __percpu *pptr;
 
@@ -160,10 +167,11 @@ skip_percpu_elems:
 	if (htab_is_lru(htab))
 		bpf_lru_populate(&htab->lru, htab->elems,
 				 offsetof(struct htab_elem, lru_node),
-				 htab->elem_size, htab->map.max_entries);
+				 htab->elem_size, num_entries);
 	else
-		pcpu_freelist_populate(&htab->freelist, htab->elems,
-				       htab->elem_size, htab->map.max_entries);
+		pcpu_freelist_populate(&htab->freelist,
+				       htab->elems + offsetof(struct htab_elem, fnode),
+				       htab->elem_size, num_entries);
 
 	return 0;
 
@@ -184,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
 
 static int alloc_extra_elems(struct bpf_htab *htab)
 {
-	void __percpu *pptr;
+	struct htab_elem *__percpu *pptr, *l_new;
+	struct pcpu_freelist_node *l;
 	int cpu;
 
-	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+				  GFP_USER | __GFP_NOWARN);
 	if (!pptr)
 		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
-		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
-			HTAB_EXTRA_ELEM_FREE;
+		l = pcpu_freelist_pop(&htab->freelist);
+		/* pop will succeed, since prealloc_init()
+		 * preallocated extra num_possible_cpus elements
+		 */
+		l_new = container_of(l, struct htab_elem, fnode);
+		*per_cpu_ptr(pptr, cpu) = l_new;
 	}
 	htab->extra_elems = pptr;
 	return 0;
@@ -217,6 +231,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	int err, i;
 	u64 cost;
 
+	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
+		     offsetof(struct htab_elem, hash_node.pprev));
+	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+		     offsetof(struct htab_elem, hash_node.pprev));
+
 	if (lru && !capable(CAP_SYS_ADMIN))
 		/* LRU implementation is much complicated than other
 		 * maps. Hence, limit to CAP_SYS_ADMIN for now.
@@ -326,29 +345,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	if (!percpu && !lru) {
-		/* lru itself can remove the least used element, so
-		 * there is no need for an extra elem during map_update.
-		 */
-		err = alloc_extra_elems(htab);
-		if (err)
-			goto free_buckets;
-	}
-
 	if (prealloc) {
 		err = prealloc_init(htab);
 		if (err)
-			goto free_extra_elems;
+			goto free_buckets;
+
+		if (!percpu && !lru) {
+			/* lru itself can remove the least used element, so
+			 * there is no need for an extra elem during map_update.
+			 */
+			err = alloc_extra_elems(htab);
+			if (err)
+				goto free_prealloc;
+		}
 	}
 
 	return &htab->map;
 
-free_extra_elems:
-	free_percpu(htab->extra_elems);
+free_prealloc:
+	prealloc_destroy(htab);
 free_buckets:
 	bpf_map_area_free(htab->buckets);
 free_htab:
@@ -366,20 +385,44 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 	return &htab->buckets[hash & (htab->n_buckets - 1)];
 }
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 {
 	return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 					 void *key, u32 key_size)
 {
+	struct hlist_nulls_node *n;
+	struct htab_elem *l;
+
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+					       u32 hash, void *key,
+					       u32 key_size, u32 n_buckets)
+{
+	struct hlist_nulls_node *n;
 	struct htab_elem *l;
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+again:
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
 			return l;
 
+	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+		goto again;
+
 	return NULL;
 }
 
@@ -387,7 +430,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l;
 	u32 hash, key_size;
 
@@ -400,7 +443,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	head = select_bucket(htab, hash);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	return l;
 }
@@ -433,8 +476,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 {
 	struct bpf_htab *htab = (struct bpf_htab *)arg;
-	struct htab_elem *l, *tgt_l;
-	struct hlist_head *head;
+	struct htab_elem *l = NULL, *tgt_l;
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
 	unsigned long flags;
 	struct bucket *b;
 
@@ -444,9 +488,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
-			hlist_del_rcu(&l->hash_node);
+			hlist_nulls_del_rcu(&l->hash_node);
 			break;
 		}
 
@@ -459,7 +503,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) | |||
459 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | 503 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) |
460 | { | 504 | { |
461 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | 505 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
462 | struct hlist_head *head; | 506 | struct hlist_nulls_head *head; |
463 | struct htab_elem *l, *next_l; | 507 | struct htab_elem *l, *next_l; |
464 | u32 hash, key_size; | 508 | u32 hash, key_size; |
465 | int i; | 509 | int i; |
@@ -473,7 +517,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | |||
473 | head = select_bucket(htab, hash); | 517 | head = select_bucket(htab, hash); |
474 | 518 | ||
475 | /* lookup the key */ | 519 | /* lookup the key */ |
476 | l = lookup_elem_raw(head, hash, key, key_size); | 520 | l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); |
477 | 521 | ||
478 | if (!l) { | 522 | if (!l) { |
479 | i = 0; | 523 | i = 0; |
@@ -481,7 +525,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | |||
481 | } | 525 | } |
482 | 526 | ||
483 | /* key was found, get next key in the same bucket */ | 527 | /* key was found, get next key in the same bucket */ |
484 | next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), | 528 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), |
485 | struct htab_elem, hash_node); | 529 | struct htab_elem, hash_node); |
486 | 530 | ||
487 | if (next_l) { | 531 | if (next_l) { |
@@ -500,7 +544,7 @@ find_first_elem: | |||
500 | head = select_bucket(htab, i); | 544 | head = select_bucket(htab, i); |
501 | 545 | ||
502 | /* pick first element in the bucket */ | 546 | /* pick first element in the bucket */ |
503 | next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), | 547 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), |
504 | struct htab_elem, hash_node); | 548 | struct htab_elem, hash_node); |
505 | if (next_l) { | 549 | if (next_l) { |
506 | /* if it's not empty, just return it */ | 550 | /* if it's not empty, just return it */ |
@@ -538,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (l->state == HTAB_EXTRA_ELEM_USED) {
-		l->state = HTAB_EXTRA_ELEM_FREE;
-		return;
-	}
-
-	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+	if (htab_is_prealloc(htab)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -573,43 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
-					 bool old_elem_exists)
+					 struct htab_elem *old_elem)
 {
 	u32 size = htab->map.value_size;
-	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-	struct htab_elem *l_new;
+	bool prealloc = htab_is_prealloc(htab);
+	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
-	int err = 0;
 
 	if (prealloc) {
-		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
-		if (!l_new)
-			err = -E2BIG;
-	} else {
-		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-			atomic_dec(&htab->count);
-			err = -E2BIG;
+		if (old_elem) {
+			/* if we're updating the existing element,
+			 * use per-cpu extra elems to avoid freelist_pop/push
+			 */
+			pl_new = this_cpu_ptr(htab->extra_elems);
+			l_new = *pl_new;
+			*pl_new = old_elem;
 		} else {
-			l_new = kmalloc(htab->elem_size,
-					GFP_ATOMIC | __GFP_NOWARN);
-			if (!l_new)
-				return ERR_PTR(-ENOMEM);
-		}
-	}
+			struct pcpu_freelist_node *l;
 
-	if (err) {
-		if (!old_elem_exists)
-			return ERR_PTR(err);
-
-		/* if we're updating the existing element and the hash table
-		 * is full, use per-cpu extra elems
-		 */
-		l_new = this_cpu_ptr(htab->extra_elems);
-		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
-			return ERR_PTR(-E2BIG);
-		l_new->state = HTAB_EXTRA_ELEM_USED;
+			l = pcpu_freelist_pop(&htab->freelist);
+			if (!l)
+				return ERR_PTR(-E2BIG);
+			l_new = container_of(l, struct htab_elem, fnode);
+		}
 	} else {
-		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+			if (!old_elem) {
+				/* when map is full and update() is replacing
+				 * old element, it's ok to allocate, since
+				 * old element will be freed immediately.
+				 * Otherwise return an error
+				 */
+				atomic_dec(&htab->count);
+				return ERR_PTR(-E2BIG);
+			}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -661,7 +700,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -690,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto err;
 
 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
-				!!l_old);
+				l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -700,10 +739,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
-		hlist_del_rcu(&l_old->hash_node);
-		free_htab_elem(htab, l_old);
+		hlist_nulls_del_rcu(&l_old->hash_node);
+		if (!htab_is_prealloc(htab))
+			free_htab_elem(htab, l_old);
 	}
 	ret = 0;
 err:
@@ -716,7 +756,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old = NULL;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -757,10 +797,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
-		hlist_del_rcu(&l_old->hash_node);
+		hlist_nulls_del_rcu(&l_old->hash_node);
 	}
 	ret = 0;
 
@@ -781,7 +821,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -815,12 +855,12 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 				value, onallcpus);
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus, false);
+					hash, true, onallcpus, NULL);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
 		}
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	}
 	ret = 0;
 err:
@@ -834,7 +874,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -882,7 +922,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	} else {
 		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -910,7 +950,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -930,7 +970,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		free_htab_elem(htab, l);
 		ret = 0;
 	}
@@ -942,7 +982,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -962,7 +1002,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		ret = 0;
 	}
 
@@ -977,14 +1017,13 @@ static void delete_all_elements(struct bpf_htab *htab)
 	int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
-		struct hlist_node *n;
+		struct hlist_nulls_head *head = select_bucket(htab, i);
+		struct hlist_nulls_node *n;
 		struct htab_elem *l;
 
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
-			hlist_del_rcu(&l->hash_node);
-			if (l->state != HTAB_EXTRA_ELEM_USED)
-				htab_elem_free(htab, l);
+		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_nulls_del_rcu(&l->hash_node);
+			htab_elem_free(htab, l);
 		}
 	}
 }
@@ -1004,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
 	 * not have executed. Wait for them.
 	 */
 	rcu_barrier();
-	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+	if (!htab_is_prealloc(htab))
 		delete_all_elements(htab);
 	else
 		prealloc_destroy(htab);
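[Editorial note, not part of the commit] The hlist_nulls conversion above lets __htab_map_lookup_elem() walk a bucket without the bucket lock even though preallocated elements can be freed and immediately reinserted into a different bucket. lookup_nulls_elem_raw() handles that with the standard nulls-list retry idiom (see Documentation/RCU/rculist_nulls.txt): the list terminator encodes the bucket index, so a walk that ends on the wrong bucket's terminator is restarted. A stripped-down sketch of the same pattern; struct obj and struct table are made-up names for illustration:

/* Editorial sketch of the nulls-list retry pattern used by
 * lookup_nulls_elem_raw() above; not part of the kernel diff.
 */
#include <linux/types.h>
#include <linux/rculist_nulls.h>

struct obj {
	struct hlist_nulls_node node;
	u32 key;
};

struct table {
	struct hlist_nulls_head *buckets;
	u32 n_buckets;			/* power of two */
};

static struct obj *obj_lookup(struct table *t, u32 key, u32 hash)
{
	struct hlist_nulls_head *head = &t->buckets[hash & (t->n_buckets - 1)];
	struct hlist_nulls_node *n;
	struct obj *o;

again:
	hlist_nulls_for_each_entry_rcu(o, n, head, node)
		if (o->key == key)
			return o;

	/* The terminating "nulls" value encodes the bucket index. If it is
	 * not the bucket we started in, an object we were following was
	 * freed and reused in another chain mid-walk: restart the walk.
	 */
	if (unlikely(get_nulls_value(n) != (hash & (t->n_buckets - 1))))
		goto again;

	return NULL;
}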
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 8bfe0afaee10..b37bd9ab7f57 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -500,9 +500,15 @@ unlock:
 	raw_spin_unlock(&trie->lock);
 }
 
+static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -ENOTSUPP;
+}
+
 static const struct bpf_map_ops trie_ops = {
 	.map_alloc = trie_alloc,
 	.map_free = trie_free,
+	.map_get_next_key = trie_get_next_key,
 	.map_lookup_elem = trie_lookup_elem,
 	.map_update_elem = trie_update_elem,
 	.map_delete_elem = trie_delete_elem,
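[Editorial note, not part of the commit] Before this hunk the LPM trie left .map_get_next_key NULL, so a BPF_MAP_GET_NEXT_KEY syscall on a trie map dereferenced a NULL function pointer; wiring in trie_get_next_key() turns that into a clean error (the kernel-internal ENOTSUPP value). A sketch of how user space reaches this path through the bpf(2) syscall; the wrapper name is illustrative:

/* Editorial sketch: the syscall path that ends up in trie_get_next_key(). */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_map_get_next_key(int map_fd, const void *key, void *next_key)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uint64_t)(unsigned long)key;
	attr.next_key = (uint64_t)(unsigned long)next_key;

	/* returns -1 with errno set on failure; for an LPM trie map fd this
	 * now fails instead of crashing the kernel */
	return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}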
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7af0dcc5d755..821f9e807de5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -617,6 +617,14 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 		if (insn->imm == BPF_FUNC_xdp_adjust_head)
 			prog->xdp_adjust_head = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
+			/* If we tail call into other programs, we
+			 * cannot make any assumptions since they
+			 * can be replaced dynamically during runtime
+			 * in the program array.
+			 */
+			prog->cb_access = 1;
+			prog->xdp_adjust_head = 1;
+
 			/* mark bpf_tail_call as different opcode
 			 * to avoid conditional branch in
 			 * interpeter for every normal call
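[Editorial note, not part of the commit] The hunk above makes fixup_bpf_calls() conservative in the presence of bpf_tail_call(): the callee lives in a BPF_MAP_TYPE_PROG_ARRAY slot that user space can repopulate at any time, so at load time the kernel cannot prove whether the eventual callee reads skb->cb[] or calls bpf_xdp_adjust_head(), and must assume both. A samples/bpf-style sketch of a program that dispatches through such a prog array (section and map names are illustrative):

/* Editorial sketch: a dispatcher whose tail-call target is unknown at load
 * time, which is why cb_access and xdp_adjust_head are set unconditionally
 * above when a tail call is present.
 */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* struct bpf_map_def, bpf_tail_call, SEC() */

struct bpf_map_def SEC("maps") jmp_table = {
	.type = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 8,
};

SEC("socket")
int dispatch(struct __sk_buff *skb)
{
	/* jump to whatever program user space installed in slot 0;
	 * falls through and returns 0 if the slot is empty */
	bpf_tail_call(skb, &jmp_table, 0);
	return 0;
}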
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 796b68d00119..a834068a400e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
-static int check_ptr_alignment(struct bpf_verifier_env *env,
-			       struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+				   int off, int size)
 {
-	if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
-		if (off % size != 0) {
-			verbose("misaligned access off %d size %d\n",
-				off, size);
-			return -EACCES;
-		} else {
-			return 0;
-		}
-	}
-
-	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		/* misaligned access to packet is ok on x86,arm,arm64 */
-		return 0;
-
 	if (reg->id && size != 1) {
-		verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+		verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
 		return -EACCES;
 	}
 
 	/* skb->data is NET_IP_ALIGN-ed */
-	if (reg->type == PTR_TO_PACKET &&
-	    (NET_IP_ALIGN + reg->off + off) % size != 0) {
+	if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
 		verbose("misaligned packet access off %d+%d+%d size %d\n",
 			NET_IP_ALIGN, reg->off, off, size);
 		return -EACCES;
 	}
+
 	return 0;
 }
 
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+				   int size)
+{
+	if (size != 1) {
+		verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+static int check_ptr_alignment(const struct bpf_reg_state *reg,
+			       int off, int size)
+{
+	switch (reg->type) {
+	case PTR_TO_PACKET:
+		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+		       check_pkt_ptr_alignment(reg, off, size);
+	case PTR_TO_MAP_VALUE_ADJ:
+		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+		       check_val_ptr_alignment(reg, size);
+	default:
+		if (off % size != 0) {
+			verbose("misaligned access off %d size %d\n",
+				off, size);
+			return -EACCES;
+		}
+
+		return 0;
+	}
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 	if (size < 0)
 		return size;
 
-	err = check_ptr_alignment(env, reg, off, size);
+	err = check_ptr_alignment(reg, off, size);
 	if (err)
 		return err;
 
@@ -1925,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * register as unknown.
 	 */
 	if (env->allow_ptr_leaks &&
+	    BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
 	    (dst_reg->type == PTR_TO_MAP_VALUE ||
 	     dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
 		dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1973,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
-			regs[i].range = dst_reg->off;
+			/* keep the maximum range already checked */
+			regs[i].range = max(regs[i].range, dst_reg->off);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = dst_reg->off;
+			reg->range = max(reg->range, dst_reg->off);
 	}
 }
 
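[Editorial note, not part of the commit] The max() changes in find_good_pkt_pointers() keep the largest packet range already proven for a given pointer id, so a later, weaker bounds check emitted by the compiler no longer shrinks what the verifier knows is safe. A restricted-C sketch of the access pattern this preserves (section name and offsets are illustrative):

/* Editorial sketch: two packet bounds checks of different sizes. With the
 * old code the second, smaller check reset the recorded range to 4 and the
 * access at offset 13 was rejected; keeping the maximum range accepts it.
 */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

SEC("xdp")
int parse(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	if (data + 14 > data_end)	/* verifier: range of at least 14 bytes */
		return XDP_DROP;

	if (data + 4 > data_end)	/* weaker check; range must stay 14, not drop to 4 */
		return XDP_DROP;

	/* access within the 14 bytes proved by the first check */
	return ((unsigned char *)data)[13] ? XDP_PASS : XDP_DROP;
}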