aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/bpf/hashtab.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-03-23 14:29:49 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-03-23 14:29:49 -0400
commitf341d9f08ae01d90d8d0c135ae2edf4423e724c9 (patch)
treec9e9db67b586df67157a52096d787d67c4440a55 /kernel/bpf/hashtab.c
parent093b995e3b55a0ae0670226ddfcb05bfbf0099ae (diff)
parent68c386590375b2aea5a3154f17882a30170707bf (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Pull networking fixes from David Miller: 1) Several netfilter fixes from Pablo and the crew: - Handle fragmented packets properly in netfilter conntrack, from Florian Westphal. - Fix SCTP ICMP packet handling, from Ying Xue. - Fix big-endian bug in nftables, from Liping Zhang. - Fix alignment of fake conntrack entry, from Steven Rostedt. 2) Fix feature flags setting in fjes driver, from Taku Izumi. 3) Openvswitch ipv6 tunnel source address not set properly, from Or Gerlitz. 4) Fix jumbo MTU handling in amd-xgbe driver, from Thomas Lendacky. 5) sk->sk_frag.page not released properly in some cases, from Eric Dumazet. 6) Fix RTNL deadlocks in nl80211, from Johannes Berg. 7) Fix erroneous RTNL lockdep splat in crypto, from Herbert Xu. 8) Cure improper inflight handling during AF_UNIX GC, from Andrey Ulanov. 9) sch_dsmark doesn't write to packet headers properly, from Eric Dumazet. 10) Fix SCM_TIMESTAMPING_OPT_STATS handling in TCP, from Soheil Hassas Yeganeh. 11) Add some IDs for Motorola qmi_wwan chips, from Tony Lindgren. 12) Fix nametbl deadlock in tipc, from Ying Xue. 13) GRO and LRO packets not counted correctly in mlx5 driver, from Gal Pressman. 14) Fix reset of internal PHYs in bcmgenet, from Doug Berger. 15) Fix hashmap allocation handling, from Alexei Starovoitov. 16) nl_fib_input() needs stronger netlink message length checking, from Eric Dumazet. 17) Fix double-free of sk->sk_filter during sock clone, from Daniel Borkmann. 18) Fix RX checksum offloading in aquantia driver, from Pavel Belous. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (85 commits) net:ethernet:aquantia: Fix for RX checksum offload. amd-xgbe: Fix the ECC-related bit position definitions sfc: cleanup a condition in efx_udp_tunnel_del() Bluetooth: btqcomsmd: fix compile-test dependency inet: frag: release spinlock before calling icmp_send() tcp: initialize icsk_ack.lrcvtime at session start time genetlink: fix counting regression on ctrl_dumpfamily() socket, bpf: fix sk_filter use after free in sk_clone_lock ipv4: provide stronger user input validation in nl_fib_input() bpf: fix hashmap extra_elems logic enic: update enic maintainers net: bcmgenet: remove bcmgenet_internal_phy_setup() ipv6: make sure to initialize sockc.tsflags before first use fjes: Do not load fjes driver if extended socket device is not power on. fjes: Do not load fjes driver if system does not have extended socket device. net/mlx5e: Count LRO packets correctly net/mlx5e: Count GSO packets correctly net/mlx5: Increase number of max QPs in default profile net/mlx5e: Avoid supporting udp tunnel port ndo for VF reps net/mlx5e: Use the proper UAPI values when offloading TC vlan actions ...
Diffstat (limited to 'kernel/bpf/hashtab.c')
-rw-r--r--kernel/bpf/hashtab.c144
1 files changed, 71 insertions, 73 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index afe5bab376c9..361a69dfe543 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -30,18 +30,12 @@ struct bpf_htab {
30 struct pcpu_freelist freelist; 30 struct pcpu_freelist freelist;
31 struct bpf_lru lru; 31 struct bpf_lru lru;
32 }; 32 };
33 void __percpu *extra_elems; 33 struct htab_elem *__percpu *extra_elems;
34 atomic_t count; /* number of elements in this hashtable */ 34 atomic_t count; /* number of elements in this hashtable */
35 u32 n_buckets; /* number of hash buckets */ 35 u32 n_buckets; /* number of hash buckets */
36 u32 elem_size; /* size of each element in bytes */ 36 u32 elem_size; /* size of each element in bytes */
37}; 37};
38 38
39enum extra_elem_state {
40 HTAB_NOT_AN_EXTRA_ELEM = 0,
41 HTAB_EXTRA_ELEM_FREE,
42 HTAB_EXTRA_ELEM_USED
43};
44
45/* each htab element is struct htab_elem + key + value */ 39/* each htab element is struct htab_elem + key + value */
46struct htab_elem { 40struct htab_elem {
47 union { 41 union {
@@ -56,7 +50,6 @@ struct htab_elem {
56 }; 50 };
57 union { 51 union {
58 struct rcu_head rcu; 52 struct rcu_head rcu;
59 enum extra_elem_state state;
60 struct bpf_lru_node lru_node; 53 struct bpf_lru_node lru_node;
61 }; 54 };
62 u32 hash; 55 u32 hash;
@@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
77 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; 70 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
78} 71}
79 72
73static bool htab_is_prealloc(const struct bpf_htab *htab)
74{
75 return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
76}
77
80static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, 78static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
81 void __percpu *pptr) 79 void __percpu *pptr)
82{ 80{
@@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
128 126
129static int prealloc_init(struct bpf_htab *htab) 127static int prealloc_init(struct bpf_htab *htab)
130{ 128{
129 u32 num_entries = htab->map.max_entries;
131 int err = -ENOMEM, i; 130 int err = -ENOMEM, i;
132 131
133 htab->elems = bpf_map_area_alloc(htab->elem_size * 132 if (!htab_is_percpu(htab) && !htab_is_lru(htab))
134 htab->map.max_entries); 133 num_entries += num_possible_cpus();
134
135 htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
135 if (!htab->elems) 136 if (!htab->elems)
136 return -ENOMEM; 137 return -ENOMEM;
137 138
138 if (!htab_is_percpu(htab)) 139 if (!htab_is_percpu(htab))
139 goto skip_percpu_elems; 140 goto skip_percpu_elems;
140 141
141 for (i = 0; i < htab->map.max_entries; i++) { 142 for (i = 0; i < num_entries; i++) {
142 u32 size = round_up(htab->map.value_size, 8); 143 u32 size = round_up(htab->map.value_size, 8);
143 void __percpu *pptr; 144 void __percpu *pptr;
144 145
@@ -166,11 +167,11 @@ skip_percpu_elems:
166 if (htab_is_lru(htab)) 167 if (htab_is_lru(htab))
167 bpf_lru_populate(&htab->lru, htab->elems, 168 bpf_lru_populate(&htab->lru, htab->elems,
168 offsetof(struct htab_elem, lru_node), 169 offsetof(struct htab_elem, lru_node),
169 htab->elem_size, htab->map.max_entries); 170 htab->elem_size, num_entries);
170 else 171 else
171 pcpu_freelist_populate(&htab->freelist, 172 pcpu_freelist_populate(&htab->freelist,
172 htab->elems + offsetof(struct htab_elem, fnode), 173 htab->elems + offsetof(struct htab_elem, fnode),
173 htab->elem_size, htab->map.max_entries); 174 htab->elem_size, num_entries);
174 175
175 return 0; 176 return 0;
176 177
@@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
191 192
192static int alloc_extra_elems(struct bpf_htab *htab) 193static int alloc_extra_elems(struct bpf_htab *htab)
193{ 194{
194 void __percpu *pptr; 195 struct htab_elem *__percpu *pptr, *l_new;
196 struct pcpu_freelist_node *l;
195 int cpu; 197 int cpu;
196 198
197 pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); 199 pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
200 GFP_USER | __GFP_NOWARN);
198 if (!pptr) 201 if (!pptr)
199 return -ENOMEM; 202 return -ENOMEM;
200 203
201 for_each_possible_cpu(cpu) { 204 for_each_possible_cpu(cpu) {
202 ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = 205 l = pcpu_freelist_pop(&htab->freelist);
203 HTAB_EXTRA_ELEM_FREE; 206 /* pop will succeed, since prealloc_init()
207 * preallocated extra num_possible_cpus elements
208 */
209 l_new = container_of(l, struct htab_elem, fnode);
210 *per_cpu_ptr(pptr, cpu) = l_new;
204 } 211 }
205 htab->extra_elems = pptr; 212 htab->extra_elems = pptr;
206 return 0; 213 return 0;
@@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
342 raw_spin_lock_init(&htab->buckets[i].lock); 349 raw_spin_lock_init(&htab->buckets[i].lock);
343 } 350 }
344 351
345 if (!percpu && !lru) {
346 /* lru itself can remove the least used element, so
347 * there is no need for an extra elem during map_update.
348 */
349 err = alloc_extra_elems(htab);
350 if (err)
351 goto free_buckets;
352 }
353
354 if (prealloc) { 352 if (prealloc) {
355 err = prealloc_init(htab); 353 err = prealloc_init(htab);
356 if (err) 354 if (err)
357 goto free_extra_elems; 355 goto free_buckets;
356
357 if (!percpu && !lru) {
358 /* lru itself can remove the least used element, so
359 * there is no need for an extra elem during map_update.
360 */
361 err = alloc_extra_elems(htab);
362 if (err)
363 goto free_prealloc;
364 }
358 } 365 }
359 366
360 return &htab->map; 367 return &htab->map;
361 368
362free_extra_elems: 369free_prealloc:
363 free_percpu(htab->extra_elems); 370 prealloc_destroy(htab);
364free_buckets: 371free_buckets:
365 bpf_map_area_free(htab->buckets); 372 bpf_map_area_free(htab->buckets);
366free_htab: 373free_htab:
@@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
575 582
576static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 583static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
577{ 584{
578 if (l->state == HTAB_EXTRA_ELEM_USED) { 585 if (htab_is_prealloc(htab)) {
579 l->state = HTAB_EXTRA_ELEM_FREE;
580 return;
581 }
582
583 if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
584 pcpu_freelist_push(&htab->freelist, &l->fnode); 586 pcpu_freelist_push(&htab->freelist, &l->fnode);
585 } else { 587 } else {
586 atomic_dec(&htab->count); 588 atomic_dec(&htab->count);
@@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
610static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, 612static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
611 void *value, u32 key_size, u32 hash, 613 void *value, u32 key_size, u32 hash,
612 bool percpu, bool onallcpus, 614 bool percpu, bool onallcpus,
613 bool old_elem_exists) 615 struct htab_elem *old_elem)
614{ 616{
615 u32 size = htab->map.value_size; 617 u32 size = htab->map.value_size;
616 bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); 618 bool prealloc = htab_is_prealloc(htab);
617 struct htab_elem *l_new; 619 struct htab_elem *l_new, **pl_new;
618 void __percpu *pptr; 620 void __percpu *pptr;
619 int err = 0;
620 621
621 if (prealloc) { 622 if (prealloc) {
622 struct pcpu_freelist_node *l; 623 if (old_elem) {
624 /* if we're updating the existing element,
625 * use per-cpu extra elems to avoid freelist_pop/push
626 */
627 pl_new = this_cpu_ptr(htab->extra_elems);
628 l_new = *pl_new;
629 *pl_new = old_elem;
630 } else {
631 struct pcpu_freelist_node *l;
623 632
624 l = pcpu_freelist_pop(&htab->freelist); 633 l = pcpu_freelist_pop(&htab->freelist);
625 if (!l) 634 if (!l)
626 err = -E2BIG; 635 return ERR_PTR(-E2BIG);
627 else
628 l_new = container_of(l, struct htab_elem, fnode); 636 l_new = container_of(l, struct htab_elem, fnode);
629 } else {
630 if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
631 atomic_dec(&htab->count);
632 err = -E2BIG;
633 } else {
634 l_new = kmalloc(htab->elem_size,
635 GFP_ATOMIC | __GFP_NOWARN);
636 if (!l_new)
637 return ERR_PTR(-ENOMEM);
638 } 637 }
639 }
640
641 if (err) {
642 if (!old_elem_exists)
643 return ERR_PTR(err);
644
645 /* if we're updating the existing element and the hash table
646 * is full, use per-cpu extra elems
647 */
648 l_new = this_cpu_ptr(htab->extra_elems);
649 if (l_new->state != HTAB_EXTRA_ELEM_FREE)
650 return ERR_PTR(-E2BIG);
651 l_new->state = HTAB_EXTRA_ELEM_USED;
652 } else { 638 } else {
653 l_new->state = HTAB_NOT_AN_EXTRA_ELEM; 639 if (atomic_inc_return(&htab->count) > htab->map.max_entries)
640 if (!old_elem) {
641 /* when map is full and update() is replacing
642 * old element, it's ok to allocate, since
643 * old element will be freed immediately.
644 * Otherwise return an error
645 */
646 atomic_dec(&htab->count);
647 return ERR_PTR(-E2BIG);
648 }
649 l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
650 if (!l_new)
651 return ERR_PTR(-ENOMEM);
654 } 652 }
655 653
656 memcpy(l_new->key, key, key_size); 654 memcpy(l_new->key, key, key_size);
@@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
731 goto err; 729 goto err;
732 730
733 l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, 731 l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
734 !!l_old); 732 l_old);
735 if (IS_ERR(l_new)) { 733 if (IS_ERR(l_new)) {
736 /* all pre-allocated elements are in use or memory exhausted */ 734 /* all pre-allocated elements are in use or memory exhausted */
737 ret = PTR_ERR(l_new); 735 ret = PTR_ERR(l_new);
@@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
744 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 742 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
745 if (l_old) { 743 if (l_old) {
746 hlist_nulls_del_rcu(&l_old->hash_node); 744 hlist_nulls_del_rcu(&l_old->hash_node);
747 free_htab_elem(htab, l_old); 745 if (!htab_is_prealloc(htab))
746 free_htab_elem(htab, l_old);
748 } 747 }
749 ret = 0; 748 ret = 0;
750err: 749err:
@@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
856 value, onallcpus); 855 value, onallcpus);
857 } else { 856 } else {
858 l_new = alloc_htab_elem(htab, key, value, key_size, 857 l_new = alloc_htab_elem(htab, key, value, key_size,
859 hash, true, onallcpus, false); 858 hash, true, onallcpus, NULL);
860 if (IS_ERR(l_new)) { 859 if (IS_ERR(l_new)) {
861 ret = PTR_ERR(l_new); 860 ret = PTR_ERR(l_new);
862 goto err; 861 goto err;
@@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab)
1024 1023
1025 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { 1024 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1026 hlist_nulls_del_rcu(&l->hash_node); 1025 hlist_nulls_del_rcu(&l->hash_node);
1027 if (l->state != HTAB_EXTRA_ELEM_USED) 1026 htab_elem_free(htab, l);
1028 htab_elem_free(htab, l);
1029 } 1027 }
1030 } 1028 }
1031} 1029}
@@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
1045 * not have executed. Wait for them. 1043 * not have executed. Wait for them.
1046 */ 1044 */
1047 rcu_barrier(); 1045 rcu_barrier();
1048 if (htab->map.map_flags & BPF_F_NO_PREALLOC) 1046 if (!htab_is_prealloc(htab))
1049 delete_all_elements(htab); 1047 delete_all_elements(htab);
1050 else 1048 else
1051 prealloc_destroy(htab); 1049 prealloc_destroy(htab);