diff options
author | Arnd Bergmann <arnd@arndb.de> | 2010-02-18 00:45:36 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-02-18 17:08:37 -0500 |
commit | 02df55d28c6001a3cdb7a997a34a0b01f01d015e (patch) | |
tree | ea1bf32f7464294ce90e288bf5440323e9ca6e7f /drivers | |
parent | 37ee3d5b3e979a168536e7e2f15bd1e769cb4122 (diff) |
macvtap: rework object lifetime rules
This reworks the change done by the previous patch
in a more complete way.
The original macvtap code has a number of problems
resulting from the use of RCU for protecting the
access to struct macvtap_queue from open files.
This includes
- need for GFP_ATOMIC allocations for skbs
- potential deadlocks when copy_*_user sleeps
- inability to work with vhost-net
Changing the lifetime of macvtap_queue to always
depend on the open file solves all these. The
RCU reference simply moves one step down to
the reference on the macvlan_dev, which we
only need for nonblocking operations.
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/net/macvtap.c | 183 |
1 files changed, 91 insertions, 92 deletions
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index fe7656bf68c6..705099749766 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c | |||
@@ -60,30 +60,19 @@ static struct cdev macvtap_cdev; | |||
60 | 60 | ||
61 | /* | 61 | /* |
62 | * RCU usage: | 62 | * RCU usage: |
63 | * The macvtap_queue is referenced both from the chardev struct file | 63 | * The macvtap_queue and the macvlan_dev are loosely coupled, the |
64 | * and from the struct macvlan_dev using rcu_read_lock. | 64 | * pointers from one to the other can only be read while rcu_read_lock |
65 | * or macvtap_lock is held. | ||
65 | * | 66 | * |
66 | * We never actually update the contents of a macvtap_queue atomically | 67 | * Both the file and the macvlan_dev hold a reference on the macvtap_queue |
67 | * with RCU but it is used for race-free destruction of a queue when | 68 | * through sock_hold(&q->sk). When the macvlan_dev goes away first, |
68 | * either the file or the macvlan_dev goes away. Pointers back to | 69 | * q->vlan becomes inaccessible. When the files gets closed, |
69 | * the dev and the file are implicitly valid as long as the queue | 70 | * macvtap_get_queue() fails. |
70 | * exists. | ||
71 | * | 71 | * |
72 | * The callbacks from macvlan are always done with rcu_read_lock held | 72 | * There may still be references to the struct sock inside of the |
73 | * already. For calls from file_operations, we use the rcu_read_lock_bh | 73 | * queue from outbound SKBs, but these never reference back to the |
74 | * to get a reference count on the socket and the device. | 74 | * file or the dev. The data structure is freed through __sk_free |
75 | * | 75 | * when both our references and any pending SKBs are gone. |
76 | * When destroying a queue, we remove the pointers from the file and | ||
77 | * from the dev and then synchronize_rcu to make sure no thread is | ||
78 | * still using the queue. There may still be references to the struct | ||
79 | * sock inside of the queue from outbound SKBs, but these never | ||
80 | * reference back to the file or the dev. The data structure is freed | ||
81 | * through __sk_free when both our references and any pending SKBs | ||
82 | * are gone. | ||
83 | * | ||
84 | * macvtap_lock is only used to prevent multiple concurrent open() | ||
85 | * calls to assign a new vlan->tap pointer. It could be moved into | ||
86 | * the macvlan_dev itself but is extremely rarely used. | ||
87 | */ | 76 | */ |
88 | static DEFINE_SPINLOCK(macvtap_lock); | 77 | static DEFINE_SPINLOCK(macvtap_lock); |
89 | 78 | ||
@@ -101,11 +90,12 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file, | |||
101 | goto out; | 90 | goto out; |
102 | 91 | ||
103 | err = 0; | 92 | err = 0; |
104 | q->vlan = vlan; | 93 | rcu_assign_pointer(q->vlan, vlan); |
105 | rcu_assign_pointer(vlan->tap, q); | 94 | rcu_assign_pointer(vlan->tap, q); |
95 | sock_hold(&q->sk); | ||
106 | 96 | ||
107 | q->file = file; | 97 | q->file = file; |
108 | rcu_assign_pointer(file->private_data, q); | 98 | file->private_data = q; |
109 | 99 | ||
110 | out: | 100 | out: |
111 | spin_unlock(&macvtap_lock); | 101 | spin_unlock(&macvtap_lock); |
@@ -113,28 +103,25 @@ out: | |||
113 | } | 103 | } |
114 | 104 | ||
115 | /* | 105 | /* |
116 | * We must destroy each queue exactly once, when either | 106 | * The file owning the queue got closed, give up both |
117 | * the netdev or the file go away. | 107 | * the reference that the files holds as well as the |
108 | * one from the macvlan_dev if that still exists. | ||
118 | * | 109 | * |
119 | * Using the spinlock makes sure that we don't get | 110 | * Using the spinlock makes sure that we don't get |
120 | * to the queue again after destroying it. | 111 | * to the queue again after destroying it. |
121 | * | ||
122 | * synchronize_rcu serializes with the packet flow | ||
123 | * that uses rcu_read_lock. | ||
124 | */ | 112 | */ |
125 | static void macvtap_del_queue(struct macvtap_queue **qp) | 113 | static void macvtap_put_queue(struct macvtap_queue *q) |
126 | { | 114 | { |
127 | struct macvtap_queue *q; | 115 | struct macvlan_dev *vlan; |
128 | 116 | ||
129 | spin_lock(&macvtap_lock); | 117 | spin_lock(&macvtap_lock); |
130 | q = rcu_dereference(*qp); | 118 | vlan = rcu_dereference(q->vlan); |
131 | if (!q) { | 119 | if (vlan) { |
132 | spin_unlock(&macvtap_lock); | 120 | rcu_assign_pointer(vlan->tap, NULL); |
133 | return; | 121 | rcu_assign_pointer(q->vlan, NULL); |
122 | sock_put(&q->sk); | ||
134 | } | 123 | } |
135 | 124 | ||
136 | rcu_assign_pointer(q->vlan->tap, NULL); | ||
137 | rcu_assign_pointer(q->file->private_data, NULL); | ||
138 | spin_unlock(&macvtap_lock); | 125 | spin_unlock(&macvtap_lock); |
139 | 126 | ||
140 | synchronize_rcu(); | 127 | synchronize_rcu(); |
@@ -152,29 +139,29 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, | |||
152 | return rcu_dereference(vlan->tap); | 139 | return rcu_dereference(vlan->tap); |
153 | } | 140 | } |
154 | 141 | ||
142 | /* | ||
143 | * The net_device is going away, give up the reference | ||
144 | * that it holds on the queue (all the queues one day) | ||
145 | * and safely set the pointer from the queues to NULL. | ||
146 | */ | ||
155 | static void macvtap_del_queues(struct net_device *dev) | 147 | static void macvtap_del_queues(struct net_device *dev) |
156 | { | 148 | { |
157 | struct macvlan_dev *vlan = netdev_priv(dev); | 149 | struct macvlan_dev *vlan = netdev_priv(dev); |
158 | macvtap_del_queue(&vlan->tap); | ||
159 | } | ||
160 | |||
161 | static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) | ||
162 | { | ||
163 | struct macvtap_queue *q; | 150 | struct macvtap_queue *q; |
164 | rcu_read_lock_bh(); | 151 | |
165 | q = rcu_dereference(file->private_data); | 152 | spin_lock(&macvtap_lock); |
166 | if (q) { | 153 | q = rcu_dereference(vlan->tap); |
167 | sock_hold(&q->sk); | 154 | if (!q) { |
168 | dev_hold(q->vlan->dev); | 155 | spin_unlock(&macvtap_lock); |
156 | return; | ||
169 | } | 157 | } |
170 | rcu_read_unlock_bh(); | ||
171 | return q; | ||
172 | } | ||
173 | 158 | ||
174 | static inline void macvtap_file_put_queue(struct macvtap_queue *q) | 159 | rcu_assign_pointer(vlan->tap, NULL); |
175 | { | 160 | rcu_assign_pointer(q->vlan, NULL); |
161 | spin_unlock(&macvtap_lock); | ||
162 | |||
163 | synchronize_rcu(); | ||
176 | sock_put(&q->sk); | 164 | sock_put(&q->sk); |
177 | dev_put(q->vlan->dev); | ||
178 | } | 165 | } |
179 | 166 | ||
180 | /* | 167 | /* |
@@ -284,7 +271,6 @@ static int macvtap_open(struct inode *inode, struct file *file) | |||
284 | q->sock.type = SOCK_RAW; | 271 | q->sock.type = SOCK_RAW; |
285 | q->sock.state = SS_CONNECTED; | 272 | q->sock.state = SS_CONNECTED; |
286 | sock_init_data(&q->sock, &q->sk); | 273 | sock_init_data(&q->sock, &q->sk); |
287 | q->sk.sk_allocation = GFP_ATOMIC; /* for now */ | ||
288 | q->sk.sk_write_space = macvtap_sock_write_space; | 274 | q->sk.sk_write_space = macvtap_sock_write_space; |
289 | 275 | ||
290 | err = macvtap_set_queue(dev, file, q); | 276 | err = macvtap_set_queue(dev, file, q); |
@@ -300,13 +286,14 @@ out: | |||
300 | 286 | ||
301 | static int macvtap_release(struct inode *inode, struct file *file) | 287 | static int macvtap_release(struct inode *inode, struct file *file) |
302 | { | 288 | { |
303 | macvtap_del_queue((struct macvtap_queue **)&file->private_data); | 289 | struct macvtap_queue *q = file->private_data; |
290 | macvtap_put_queue(q); | ||
304 | return 0; | 291 | return 0; |
305 | } | 292 | } |
306 | 293 | ||
307 | static unsigned int macvtap_poll(struct file *file, poll_table * wait) | 294 | static unsigned int macvtap_poll(struct file *file, poll_table * wait) |
308 | { | 295 | { |
309 | struct macvtap_queue *q = macvtap_file_get_queue(file); | 296 | struct macvtap_queue *q = file->private_data; |
310 | unsigned int mask = POLLERR; | 297 | unsigned int mask = POLLERR; |
311 | 298 | ||
312 | if (!q) | 299 | if (!q) |
@@ -323,7 +310,6 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait) | |||
323 | sock_writeable(&q->sk))) | 310 | sock_writeable(&q->sk))) |
324 | mask |= POLLOUT | POLLWRNORM; | 311 | mask |= POLLOUT | POLLWRNORM; |
325 | 312 | ||
326 | macvtap_file_put_queue(q); | ||
327 | out: | 313 | out: |
328 | return mask; | 314 | return mask; |
329 | } | 315 | } |
@@ -334,6 +320,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, | |||
334 | int noblock) | 320 | int noblock) |
335 | { | 321 | { |
336 | struct sk_buff *skb; | 322 | struct sk_buff *skb; |
323 | struct macvlan_dev *vlan; | ||
337 | size_t len = count; | 324 | size_t len = count; |
338 | int err; | 325 | int err; |
339 | 326 | ||
@@ -341,26 +328,37 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, | |||
341 | return -EINVAL; | 328 | return -EINVAL; |
342 | 329 | ||
343 | skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); | 330 | skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); |
344 | 331 | if (!skb) | |
345 | if (!skb) { | 332 | goto err; |
346 | macvlan_count_rx(q->vlan, 0, false, false); | ||
347 | return err; | ||
348 | } | ||
349 | 333 | ||
350 | skb_reserve(skb, NET_IP_ALIGN); | 334 | skb_reserve(skb, NET_IP_ALIGN); |
351 | skb_put(skb, count); | 335 | skb_put(skb, count); |
352 | 336 | ||
353 | if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { | 337 | err = skb_copy_datagram_from_iovec(skb, 0, iv, 0, len); |
354 | macvlan_count_rx(q->vlan, 0, false, false); | 338 | if (err) |
355 | kfree_skb(skb); | 339 | goto err; |
356 | return -EFAULT; | ||
357 | } | ||
358 | 340 | ||
359 | skb_set_network_header(skb, ETH_HLEN); | 341 | skb_set_network_header(skb, ETH_HLEN); |
360 | 342 | rcu_read_lock_bh(); | |
361 | macvlan_start_xmit(skb, q->vlan->dev); | 343 | vlan = rcu_dereference(q->vlan); |
344 | if (vlan) | ||
345 | macvlan_start_xmit(skb, vlan->dev); | ||
346 | else | ||
347 | kfree_skb(skb); | ||
348 | rcu_read_unlock_bh(); | ||
362 | 349 | ||
363 | return count; | 350 | return count; |
351 | |||
352 | err: | ||
353 | rcu_read_lock_bh(); | ||
354 | vlan = rcu_dereference(q->vlan); | ||
355 | if (vlan) | ||
356 | macvlan_count_rx(q->vlan, 0, false, false); | ||
357 | rcu_read_unlock_bh(); | ||
358 | |||
359 | kfree_skb(skb); | ||
360 | |||
361 | return err; | ||
364 | } | 362 | } |
365 | 363 | ||
366 | static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, | 364 | static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, |
@@ -368,15 +366,10 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, | |||
368 | { | 366 | { |
369 | struct file *file = iocb->ki_filp; | 367 | struct file *file = iocb->ki_filp; |
370 | ssize_t result = -ENOLINK; | 368 | ssize_t result = -ENOLINK; |
371 | struct macvtap_queue *q = macvtap_file_get_queue(file); | 369 | struct macvtap_queue *q = file->private_data; |
372 | |||
373 | if (!q) | ||
374 | goto out; | ||
375 | 370 | ||
376 | result = macvtap_get_user(q, iv, iov_length(iv, count), | 371 | result = macvtap_get_user(q, iv, iov_length(iv, count), |
377 | file->f_flags & O_NONBLOCK); | 372 | file->f_flags & O_NONBLOCK); |
378 | macvtap_file_put_queue(q); | ||
379 | out: | ||
380 | return result; | 373 | return result; |
381 | } | 374 | } |
382 | 375 | ||
@@ -385,14 +378,17 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, | |||
385 | const struct sk_buff *skb, | 378 | const struct sk_buff *skb, |
386 | const struct iovec *iv, int len) | 379 | const struct iovec *iv, int len) |
387 | { | 380 | { |
388 | struct macvlan_dev *vlan = q->vlan; | 381 | struct macvlan_dev *vlan; |
389 | int ret; | 382 | int ret; |
390 | 383 | ||
391 | len = min_t(int, skb->len, len); | 384 | len = min_t(int, skb->len, len); |
392 | 385 | ||
393 | ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); | 386 | ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); |
394 | 387 | ||
388 | rcu_read_lock_bh(); | ||
389 | vlan = rcu_dereference(q->vlan); | ||
395 | macvlan_count_rx(vlan, len, ret == 0, 0); | 390 | macvlan_count_rx(vlan, len, ret == 0, 0); |
391 | rcu_read_unlock_bh(); | ||
396 | 392 | ||
397 | return ret ? ret : len; | 393 | return ret ? ret : len; |
398 | } | 394 | } |
@@ -401,14 +397,16 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, | |||
401 | unsigned long count, loff_t pos) | 397 | unsigned long count, loff_t pos) |
402 | { | 398 | { |
403 | struct file *file = iocb->ki_filp; | 399 | struct file *file = iocb->ki_filp; |
404 | struct macvtap_queue *q = macvtap_file_get_queue(file); | 400 | struct macvtap_queue *q = file->private_data; |
405 | 401 | ||
406 | DECLARE_WAITQUEUE(wait, current); | 402 | DECLARE_WAITQUEUE(wait, current); |
407 | struct sk_buff *skb; | 403 | struct sk_buff *skb; |
408 | ssize_t len, ret = 0; | 404 | ssize_t len, ret = 0; |
409 | 405 | ||
410 | if (!q) | 406 | if (!q) { |
411 | return -ENOLINK; | 407 | ret = -ENOLINK; |
408 | goto out; | ||
409 | } | ||
412 | 410 | ||
413 | len = iov_length(iv, count); | 411 | len = iov_length(iv, count); |
414 | if (len < 0) { | 412 | if (len < 0) { |
@@ -444,7 +442,6 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, | |||
444 | remove_wait_queue(q->sk.sk_sleep, &wait); | 442 | remove_wait_queue(q->sk.sk_sleep, &wait); |
445 | 443 | ||
446 | out: | 444 | out: |
447 | macvtap_file_put_queue(q); | ||
448 | return ret; | 445 | return ret; |
449 | } | 446 | } |
450 | 447 | ||
@@ -454,12 +451,13 @@ out: | |||
454 | static long macvtap_ioctl(struct file *file, unsigned int cmd, | 451 | static long macvtap_ioctl(struct file *file, unsigned int cmd, |
455 | unsigned long arg) | 452 | unsigned long arg) |
456 | { | 453 | { |
457 | struct macvtap_queue *q; | 454 | struct macvtap_queue *q = file->private_data; |
455 | struct macvlan_dev *vlan; | ||
458 | void __user *argp = (void __user *)arg; | 456 | void __user *argp = (void __user *)arg; |
459 | struct ifreq __user *ifr = argp; | 457 | struct ifreq __user *ifr = argp; |
460 | unsigned int __user *up = argp; | 458 | unsigned int __user *up = argp; |
461 | unsigned int u; | 459 | unsigned int u; |
462 | char devname[IFNAMSIZ]; | 460 | int ret; |
463 | 461 | ||
464 | switch (cmd) { | 462 | switch (cmd) { |
465 | case TUNSETIFF: | 463 | case TUNSETIFF: |
@@ -471,16 +469,21 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, | |||
471 | return 0; | 469 | return 0; |
472 | 470 | ||
473 | case TUNGETIFF: | 471 | case TUNGETIFF: |
474 | q = macvtap_file_get_queue(file); | 472 | rcu_read_lock_bh(); |
475 | if (!q) | 473 | vlan = rcu_dereference(q->vlan); |
474 | if (vlan) | ||
475 | dev_hold(vlan->dev); | ||
476 | rcu_read_unlock_bh(); | ||
477 | |||
478 | if (!vlan) | ||
476 | return -ENOLINK; | 479 | return -ENOLINK; |
477 | memcpy(devname, q->vlan->dev->name, sizeof(devname)); | ||
478 | macvtap_file_put_queue(q); | ||
479 | 480 | ||
481 | ret = 0; | ||
480 | if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || | 482 | if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || |
481 | put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) | 483 | put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) |
482 | return -EFAULT; | 484 | ret = -EFAULT; |
483 | return 0; | 485 | dev_put(vlan->dev); |
486 | return ret; | ||
484 | 487 | ||
485 | case TUNGETFEATURES: | 488 | case TUNGETFEATURES: |
486 | if (put_user((IFF_TAP | IFF_NO_PI), up)) | 489 | if (put_user((IFF_TAP | IFF_NO_PI), up)) |
@@ -491,11 +494,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, | |||
491 | if (get_user(u, up)) | 494 | if (get_user(u, up)) |
492 | return -EFAULT; | 495 | return -EFAULT; |
493 | 496 | ||
494 | q = macvtap_file_get_queue(file); | ||
495 | if (!q) | ||
496 | return -ENOLINK; | ||
497 | q->sk.sk_sndbuf = u; | 497 | q->sk.sk_sndbuf = u; |
498 | macvtap_file_put_queue(q); | ||
499 | return 0; | 498 | return 0; |
500 | 499 | ||
501 | case TUNSETOFFLOAD: | 500 | case TUNSETOFFLOAD: |