aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/bpf/devmap.c
diff options
context:
space:
mode:
authorJohn Fastabend <john.fastabend@gmail.com>2017-08-05 01:02:19 -0400
committerDavid S. Miller <davem@davemloft.net>2017-08-07 17:13:04 -0400
commit4cc7b9544b9a904add353406ed1bacbf56f75c52 (patch)
treec7b34d6bc31ad0161a24a076303721988d72adc0 /kernel/bpf/devmap.c
parent98909f9545b7ba18e77029426e703a39925b9642 (diff)
bpf: devmap fix mutex in rcu critical section
Originally we used a mutex to protect concurrent devmap update and delete operations from racing with netdev unregister notifier callbacks. The notifier hook is needed because we increment the netdev ref count when a dev is added to the devmap. This ensures the netdev reference is valid in the datapath. However, we don't want to block unregister events, hence the initial mutex and notifier handler. The concern was in the notifier hook we search the map for dev entries that hold a refcnt on the net device being torn down. But, in order to do this we require two steps, (i) dereference the netdev: dev = rcu_dereference(map[i]) (ii) test ifindex: dev->ifindex == removing_ifindex and then finally we can swap in the NULL dev in the map via an xchg operation, xchg(map[i], NULL) The danger here is a concurrent update could run a different xchg op concurrently leading us to replace the new dev with a NULL dev incorrectly. CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) dev = rcu_dereference(map[i]) xchg(map[i], new_dev); call_rcu(dev,...) xchg(map[i], NULL) The above flow would create the incorrect state with the dev reference in the update path being lost. To resolve this the original code used a mutex around the above block. However, updates, deletes, and lookups occur inside rcu critical sections so we can't use a mutex in this context safely. Fortunately, by writing slightly better code we can avoid the mutex altogether. If CPU 1 in the above example uses a cmpxchg and _only_ replaces the dev reference in the map when it is in fact the expected dev the race is removed completely. The two cases being illustrated here, first the race condition, CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) dev = rcu_dereference(map[i]) xchg(map[i], new_dev); call_rcu(dev,...) odev = cmpxchg(map[i], dev, NULL) Now we can test the cmpxchg return value, detect odev != dev and abort. 
Or in the good case, CPU 1 CPU 2 notifier hook bpf devmap update dev = rcu_dereference(map[i]) odev = cmpxchg(map[i], dev, NULL) [...] Now 'odev == dev' and we can do proper cleanup. And voila the original race we tried to solve with a mutex is corrected and the trace noted by Sasha below is resolved due to removal of the mutex. Note: When walking the devmap and removing dev references as needed we depend on the core to fail any calls to dev_get_by_index() using the ifindex of the device being removed. This way we do not race with the user while searching the devmap. Additionally, the mutex was also protecting list add/del/read on the list of maps in-use. This patch converts this to an RCU list and spinlock implementation. This protects the list from concurrent alloc/free operations. The notifier hook walks this list so it uses RCU read semantics. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 in_atomic(): 1, irqs_disabled(): 0, pid: 16315, name: syz-executor1 1 lock held by syz-executor1/16315: #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] map_delete_elem kernel/bpf/syscall.c:577 [inline] #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SYSC_bpf kernel/bpf/syscall.c:1427 [inline] #0: (rcu_read_lock){......}, at: [<ffffffff8c363bc2>] SyS_bpf+0x1d32/0x4ba0 kernel/bpf/syscall.c:1388 Fixes: 2ddf71e23cc2 ("net: add notifier hooks for devmap bpf map") Reported-by: Sasha Levin <alexander.levin@verizon.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf/devmap.c')
-rw-r--r--kernel/bpf/devmap.c48
1 files changed, 25 insertions, 23 deletions
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d439ee0eadb1..7192fb67d4de 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -40,11 +40,12 @@
40 * contain a reference to the net device and remove them. This is a two step 40 * contain a reference to the net device and remove them. This is a two step
41 * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) 41 * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
42 * check to see if the ifindex is the same as the net_device being removed. 42 * check to see if the ifindex is the same as the net_device being removed.
43 * Unfortunately, the xchg() operations do not protect against this. To avoid 43 * When removing the dev a cmpxchg() is used to ensure the correct dev is
44 * potentially removing incorrect objects the dev_map_list_mutex protects 44 * removed, in the case of a concurrent update or delete operation it is
45 * conflicting netdev unregister and BPF syscall operations. Updates and 45 * possible that the initially referenced dev is no longer in the map. As the
46 * deletes from a BPF program (done in rcu critical section) are blocked 46 * notifier hook walks the map we know that new dev references can not be
47 * because of this mutex. 47 * added by the user because core infrastructure ensures dev_get_by_index()
48 * calls will fail at this point.
48 */ 49 */
49#include <linux/bpf.h> 50#include <linux/bpf.h>
50#include <linux/jhash.h> 51#include <linux/jhash.h>
@@ -68,7 +69,7 @@ struct bpf_dtab {
68 struct list_head list; 69 struct list_head list;
69}; 70};
70 71
71static DEFINE_MUTEX(dev_map_list_mutex); 72static DEFINE_SPINLOCK(dev_map_lock);
72static LIST_HEAD(dev_map_list); 73static LIST_HEAD(dev_map_list);
73 74
74static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 75static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -128,9 +129,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
128 if (!dtab->netdev_map) 129 if (!dtab->netdev_map)
129 goto free_dtab; 130 goto free_dtab;
130 131
131 mutex_lock(&dev_map_list_mutex); 132 spin_lock(&dev_map_lock);
132 list_add_tail(&dtab->list, &dev_map_list); 133 list_add_tail_rcu(&dtab->list, &dev_map_list);
133 mutex_unlock(&dev_map_list_mutex); 134 spin_unlock(&dev_map_lock);
134 return &dtab->map; 135 return &dtab->map;
135 136
136free_dtab: 137free_dtab:
@@ -169,7 +170,6 @@ static void dev_map_free(struct bpf_map *map)
169 * at this point we we can still race with netdev notifier, hence the 170 * at this point we we can still race with netdev notifier, hence the
170 * lock. 171 * lock.
171 */ 172 */
172 mutex_lock(&dev_map_list_mutex);
173 for (i = 0; i < dtab->map.max_entries; i++) { 173 for (i = 0; i < dtab->map.max_entries; i++) {
174 struct bpf_dtab_netdev *dev; 174 struct bpf_dtab_netdev *dev;
175 175
@@ -184,8 +184,9 @@ static void dev_map_free(struct bpf_map *map)
184 /* At this point bpf program is detached and all pending operations 184 /* At this point bpf program is detached and all pending operations
185 * _must_ be complete 185 * _must_ be complete
186 */ 186 */
187 list_del(&dtab->list); 187 spin_lock(&dev_map_lock);
188 mutex_unlock(&dev_map_list_mutex); 188 list_del_rcu(&dtab->list);
189 spin_unlock(&dev_map_lock);
189 free_percpu(dtab->flush_needed); 190 free_percpu(dtab->flush_needed);
190 bpf_map_area_free(dtab->netdev_map); 191 bpf_map_area_free(dtab->netdev_map);
191 kfree(dtab); 192 kfree(dtab);
@@ -322,11 +323,9 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
322 * the driver tear down ensures all soft irqs are complete before 323 * the driver tear down ensures all soft irqs are complete before
323 * removing the net device in the case of dev_put equals zero. 324 * removing the net device in the case of dev_put equals zero.
324 */ 325 */
325 mutex_lock(&dev_map_list_mutex);
326 old_dev = xchg(&dtab->netdev_map[k], NULL); 326 old_dev = xchg(&dtab->netdev_map[k], NULL);
327 if (old_dev) 327 if (old_dev)
328 call_rcu(&old_dev->rcu, __dev_map_entry_free); 328 call_rcu(&old_dev->rcu, __dev_map_entry_free);
329 mutex_unlock(&dev_map_list_mutex);
330 return 0; 329 return 0;
331} 330}
332 331
@@ -369,11 +368,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
369 * Remembering the driver side flush operation will happen before the 368 * Remembering the driver side flush operation will happen before the
370 * net device is removed. 369 * net device is removed.
371 */ 370 */
372 mutex_lock(&dev_map_list_mutex);
373 old_dev = xchg(&dtab->netdev_map[i], dev); 371 old_dev = xchg(&dtab->netdev_map[i], dev);
374 if (old_dev) 372 if (old_dev)
375 call_rcu(&old_dev->rcu, __dev_map_entry_free); 373 call_rcu(&old_dev->rcu, __dev_map_entry_free);
376 mutex_unlock(&dev_map_list_mutex);
377 374
378 return 0; 375 return 0;
379} 376}
@@ -396,22 +393,27 @@ static int dev_map_notification(struct notifier_block *notifier,
396 393
397 switch (event) { 394 switch (event) {
398 case NETDEV_UNREGISTER: 395 case NETDEV_UNREGISTER:
399 mutex_lock(&dev_map_list_mutex); 396 /* This rcu_read_lock/unlock pair is needed because
400 list_for_each_entry(dtab, &dev_map_list, list) { 397 * dev_map_list is an RCU list AND to ensure a delete
398 * operation does not free a netdev_map entry while we
399 * are comparing it against the netdev being unregistered.
400 */
401 rcu_read_lock();
402 list_for_each_entry_rcu(dtab, &dev_map_list, list) {
401 for (i = 0; i < dtab->map.max_entries; i++) { 403 for (i = 0; i < dtab->map.max_entries; i++) {
402 struct bpf_dtab_netdev *dev; 404 struct bpf_dtab_netdev *dev, *odev;
403 405
404 dev = dtab->netdev_map[i]; 406 dev = READ_ONCE(dtab->netdev_map[i]);
405 if (!dev || 407 if (!dev ||
406 dev->dev->ifindex != netdev->ifindex) 408 dev->dev->ifindex != netdev->ifindex)
407 continue; 409 continue;
408 dev = xchg(&dtab->netdev_map[i], NULL); 410 odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
409 if (dev) 411 if (dev == odev)
410 call_rcu(&dev->rcu, 412 call_rcu(&dev->rcu,
411 __dev_map_entry_free); 413 __dev_map_entry_free);
412 } 414 }
413 } 415 }
414 mutex_unlock(&dev_map_list_mutex); 416 rcu_read_unlock();
415 break; 417 break;
416 default: 418 default:
417 break; 419 break;