diff options
author | Herbert Xu <herbert@gondor.apana.org.au> | 2006-01-06 03:09:47 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-06 11:33:20 -0500 |
commit | 4b2f0260c74324abca76ccaa42d426af163125e7 (patch) | |
tree | 881f76200dc3489b11497528feb72d6eae93bddb | |
parent | bd6a59b22fd3bd044bb14978b885bcd042a10e8e (diff) |
[PATCH] nbd: fix TX/RX race condition
Janos Haar of First NetCenter Bt. reported numerous crashes involving the
NBD driver. With his help, this was tracked down to bogus bio vectors
which in turn was the result of a race condition between the
receive/transmit routines in the NBD driver.
The bug manifests itself like this:
CPU0 CPU1
do_nbd_request
add req to queuelist
nbd_send_request
send req head
for each bio
kmap
send
nbd_read_stat
nbd_find_request
nbd_end_request
kunmap
When CPU1 finishes nbd_end_request, the request and all its associated
bio's are freed. So when CPU0 calls kunmap whose argument is derived from
the last bio, it may crash.
Under normal circumstances, the race occurs only on the last bio. However,
if an error is encountered on the remote NBD server (such as an incorrect
magic number in the request), or if there were a bug in the server, it is
possible for the nbd_end_request to occur any time after the request's
addition to the queuelist.
The following patch fixes this problem by making sure that requests are not
added to the queuelist until after they have been completed transmission.
In order for the receiving side to be ready for responses involving
requests still being transmitted, the patch introduces the concept of the
active request.
When a response matches the current active request, its processing is
delayed until after the tranmission has come to a stop.
This has been tested by Janos and it has been successful in curing this
race condition.
From: Herbert Xu <herbert@gondor.apana.org.au>
Here is an updated patch which removes the active_req wait in
nbd_clear_queue and the associated memory barrier.
I've also clarified this in the comment.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: <djani22@dynamicweb.hu>
Cc: Paul Clements <Paul.Clements@SteelEye.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | drivers/block/nbd.c | 122 | ||||
-rw-r--r-- | include/linux/nbd.h | 8 |
2 files changed, 68 insertions, 62 deletions
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 9e268ddedfbd..d5c8ee7d9815 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c | |||
@@ -54,11 +54,15 @@ | |||
54 | #include <linux/errno.h> | 54 | #include <linux/errno.h> |
55 | #include <linux/file.h> | 55 | #include <linux/file.h> |
56 | #include <linux/ioctl.h> | 56 | #include <linux/ioctl.h> |
57 | #include <linux/compiler.h> | ||
58 | #include <linux/err.h> | ||
59 | #include <linux/kernel.h> | ||
57 | #include <net/sock.h> | 60 | #include <net/sock.h> |
58 | 61 | ||
59 | #include <linux/devfs_fs_kernel.h> | 62 | #include <linux/devfs_fs_kernel.h> |
60 | 63 | ||
61 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
65 | #include <asm/system.h> | ||
62 | #include <asm/types.h> | 66 | #include <asm/types.h> |
63 | 67 | ||
64 | #include <linux/nbd.h> | 68 | #include <linux/nbd.h> |
@@ -230,14 +234,6 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) | |||
230 | request.len = htonl(size); | 234 | request.len = htonl(size); |
231 | memcpy(request.handle, &req, sizeof(req)); | 235 | memcpy(request.handle, &req, sizeof(req)); |
232 | 236 | ||
233 | down(&lo->tx_lock); | ||
234 | |||
235 | if (!sock || !lo->sock) { | ||
236 | printk(KERN_ERR "%s: Attempted send on closed socket\n", | ||
237 | lo->disk->disk_name); | ||
238 | goto error_out; | ||
239 | } | ||
240 | |||
241 | dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n", | 237 | dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n", |
242 | lo->disk->disk_name, req, | 238 | lo->disk->disk_name, req, |
243 | nbdcmd_to_ascii(nbd_cmd(req)), | 239 | nbdcmd_to_ascii(nbd_cmd(req)), |
@@ -276,11 +272,9 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) | |||
276 | } | 272 | } |
277 | } | 273 | } |
278 | } | 274 | } |
279 | up(&lo->tx_lock); | ||
280 | return 0; | 275 | return 0; |
281 | 276 | ||
282 | error_out: | 277 | error_out: |
283 | up(&lo->tx_lock); | ||
284 | return 1; | 278 | return 1; |
285 | } | 279 | } |
286 | 280 | ||
@@ -289,9 +283,14 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle) | |||
289 | struct request *req; | 283 | struct request *req; |
290 | struct list_head *tmp; | 284 | struct list_head *tmp; |
291 | struct request *xreq; | 285 | struct request *xreq; |
286 | int err; | ||
292 | 287 | ||
293 | memcpy(&xreq, handle, sizeof(xreq)); | 288 | memcpy(&xreq, handle, sizeof(xreq)); |
294 | 289 | ||
290 | err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq); | ||
291 | if (unlikely(err)) | ||
292 | goto out; | ||
293 | |||
295 | spin_lock(&lo->queue_lock); | 294 | spin_lock(&lo->queue_lock); |
296 | list_for_each(tmp, &lo->queue_head) { | 295 | list_for_each(tmp, &lo->queue_head) { |
297 | req = list_entry(tmp, struct request, queuelist); | 296 | req = list_entry(tmp, struct request, queuelist); |
@@ -302,7 +301,11 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle) | |||
302 | return req; | 301 | return req; |
303 | } | 302 | } |
304 | spin_unlock(&lo->queue_lock); | 303 | spin_unlock(&lo->queue_lock); |
305 | return NULL; | 304 | |
305 | err = -ENOENT; | ||
306 | |||
307 | out: | ||
308 | return ERR_PTR(err); | ||
306 | } | 309 | } |
307 | 310 | ||
308 | static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec) | 311 | static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec) |
@@ -331,7 +334,11 @@ static struct request *nbd_read_stat(struct nbd_device *lo) | |||
331 | goto harderror; | 334 | goto harderror; |
332 | } | 335 | } |
333 | req = nbd_find_request(lo, reply.handle); | 336 | req = nbd_find_request(lo, reply.handle); |
334 | if (req == NULL) { | 337 | if (unlikely(IS_ERR(req))) { |
338 | result = PTR_ERR(req); | ||
339 | if (result != -ENOENT) | ||
340 | goto harderror; | ||
341 | |||
335 | printk(KERN_ERR "%s: Unexpected reply (%p)\n", | 342 | printk(KERN_ERR "%s: Unexpected reply (%p)\n", |
336 | lo->disk->disk_name, reply.handle); | 343 | lo->disk->disk_name, reply.handle); |
337 | result = -EBADR; | 344 | result = -EBADR; |
@@ -395,19 +402,24 @@ static void nbd_clear_que(struct nbd_device *lo) | |||
395 | 402 | ||
396 | BUG_ON(lo->magic != LO_MAGIC); | 403 | BUG_ON(lo->magic != LO_MAGIC); |
397 | 404 | ||
398 | do { | 405 | /* |
399 | req = NULL; | 406 | * Because we have set lo->sock to NULL under the tx_lock, all |
400 | spin_lock(&lo->queue_lock); | 407 | * modifications to the list must have completed by now. For |
401 | if (!list_empty(&lo->queue_head)) { | 408 | * the same reason, the active_req must be NULL. |
402 | req = list_entry(lo->queue_head.next, struct request, queuelist); | 409 | * |
403 | list_del_init(&req->queuelist); | 410 | * As a consequence, we don't need to take the spin lock while |
404 | } | 411 | * purging the list here. |
405 | spin_unlock(&lo->queue_lock); | 412 | */ |
406 | if (req) { | 413 | BUG_ON(lo->sock); |
407 | req->errors++; | 414 | BUG_ON(lo->active_req); |
408 | nbd_end_request(req); | 415 | |
409 | } | 416 | while (!list_empty(&lo->queue_head)) { |
410 | } while (req); | 417 | req = list_entry(lo->queue_head.next, struct request, |
418 | queuelist); | ||
419 | list_del_init(&req->queuelist); | ||
420 | req->errors++; | ||
421 | nbd_end_request(req); | ||
422 | } | ||
411 | } | 423 | } |
412 | 424 | ||
413 | /* | 425 | /* |
@@ -435,11 +447,6 @@ static void do_nbd_request(request_queue_t * q) | |||
435 | 447 | ||
436 | BUG_ON(lo->magic != LO_MAGIC); | 448 | BUG_ON(lo->magic != LO_MAGIC); |
437 | 449 | ||
438 | if (!lo->file) { | ||
439 | printk(KERN_ERR "%s: Request when not-ready\n", | ||
440 | lo->disk->disk_name); | ||
441 | goto error_out; | ||
442 | } | ||
443 | nbd_cmd(req) = NBD_CMD_READ; | 450 | nbd_cmd(req) = NBD_CMD_READ; |
444 | if (rq_data_dir(req) == WRITE) { | 451 | if (rq_data_dir(req) == WRITE) { |
445 | nbd_cmd(req) = NBD_CMD_WRITE; | 452 | nbd_cmd(req) = NBD_CMD_WRITE; |
@@ -453,32 +460,34 @@ static void do_nbd_request(request_queue_t * q) | |||
453 | req->errors = 0; | 460 | req->errors = 0; |
454 | spin_unlock_irq(q->queue_lock); | 461 | spin_unlock_irq(q->queue_lock); |
455 | 462 | ||
456 | spin_lock(&lo->queue_lock); | 463 | down(&lo->tx_lock); |
457 | 464 | if (unlikely(!lo->sock)) { | |
458 | if (!lo->file) { | 465 | up(&lo->tx_lock); |
459 | spin_unlock(&lo->queue_lock); | 466 | printk(KERN_ERR "%s: Attempted send on closed socket\n", |
460 | printk(KERN_ERR "%s: failed between accept and semaphore, file lost\n", | 467 | lo->disk->disk_name); |
461 | lo->disk->disk_name); | ||
462 | req->errors++; | 468 | req->errors++; |
463 | nbd_end_request(req); | 469 | nbd_end_request(req); |
464 | spin_lock_irq(q->queue_lock); | 470 | spin_lock_irq(q->queue_lock); |
465 | continue; | 471 | continue; |
466 | } | 472 | } |
467 | 473 | ||
468 | list_add(&req->queuelist, &lo->queue_head); | 474 | lo->active_req = req; |
469 | spin_unlock(&lo->queue_lock); | ||
470 | 475 | ||
471 | if (nbd_send_req(lo, req) != 0) { | 476 | if (nbd_send_req(lo, req) != 0) { |
472 | printk(KERN_ERR "%s: Request send failed\n", | 477 | printk(KERN_ERR "%s: Request send failed\n", |
473 | lo->disk->disk_name); | 478 | lo->disk->disk_name); |
474 | if (nbd_find_request(lo, (char *)&req) != NULL) { | 479 | req->errors++; |
475 | /* we still own req */ | 480 | nbd_end_request(req); |
476 | req->errors++; | 481 | } else { |
477 | nbd_end_request(req); | 482 | spin_lock(&lo->queue_lock); |
478 | } else /* we're racing with nbd_clear_que */ | 483 | list_add(&req->queuelist, &lo->queue_head); |
479 | printk(KERN_DEBUG "nbd: can't find req\n"); | 484 | spin_unlock(&lo->queue_lock); |
480 | } | 485 | } |
481 | 486 | ||
487 | lo->active_req = NULL; | ||
488 | up(&lo->tx_lock); | ||
489 | wake_up_all(&lo->active_wq); | ||
490 | |||
482 | spin_lock_irq(q->queue_lock); | 491 | spin_lock_irq(q->queue_lock); |
483 | continue; | 492 | continue; |
484 | 493 | ||
@@ -529,17 +538,10 @@ static int nbd_ioctl(struct inode *inode, struct file *file, | |||
529 | down(&lo->tx_lock); | 538 | down(&lo->tx_lock); |
530 | lo->sock = NULL; | 539 | lo->sock = NULL; |
531 | up(&lo->tx_lock); | 540 | up(&lo->tx_lock); |
532 | spin_lock(&lo->queue_lock); | ||
533 | file = lo->file; | 541 | file = lo->file; |
534 | lo->file = NULL; | 542 | lo->file = NULL; |
535 | spin_unlock(&lo->queue_lock); | ||
536 | nbd_clear_que(lo); | 543 | nbd_clear_que(lo); |
537 | spin_lock(&lo->queue_lock); | 544 | BUG_ON(!list_empty(&lo->queue_head)); |
538 | if (!list_empty(&lo->queue_head)) { | ||
539 | printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n"); | ||
540 | error = -EBUSY; | ||
541 | } | ||
542 | spin_unlock(&lo->queue_lock); | ||
543 | if (file) | 545 | if (file) |
544 | fput(file); | 546 | fput(file); |
545 | return error; | 547 | return error; |
@@ -598,24 +600,19 @@ static int nbd_ioctl(struct inode *inode, struct file *file, | |||
598 | lo->sock = NULL; | 600 | lo->sock = NULL; |
599 | } | 601 | } |
600 | up(&lo->tx_lock); | 602 | up(&lo->tx_lock); |
601 | spin_lock(&lo->queue_lock); | ||
602 | file = lo->file; | 603 | file = lo->file; |
603 | lo->file = NULL; | 604 | lo->file = NULL; |
604 | spin_unlock(&lo->queue_lock); | ||
605 | nbd_clear_que(lo); | 605 | nbd_clear_que(lo); |
606 | printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); | 606 | printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); |
607 | if (file) | 607 | if (file) |
608 | fput(file); | 608 | fput(file); |
609 | return lo->harderror; | 609 | return lo->harderror; |
610 | case NBD_CLEAR_QUE: | 610 | case NBD_CLEAR_QUE: |
611 | down(&lo->tx_lock); | 611 | /* |
612 | if (lo->sock) { | 612 | * This is for compatibility only. The queue is always cleared |
613 | up(&lo->tx_lock); | 613 | * by NBD_DO_IT or NBD_CLEAR_SOCK. |
614 | return 0; /* probably should be error, but that would | 614 | */ |
615 | * break "nbd-client -d", so just return 0 */ | 615 | BUG_ON(!lo->sock && !list_empty(&lo->queue_head)); |
616 | } | ||
617 | up(&lo->tx_lock); | ||
618 | nbd_clear_que(lo); | ||
619 | return 0; | 616 | return 0; |
620 | case NBD_PRINT_DEBUG: | 617 | case NBD_PRINT_DEBUG: |
621 | printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", | 618 | printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", |
@@ -688,6 +685,7 @@ static int __init nbd_init(void) | |||
688 | spin_lock_init(&nbd_dev[i].queue_lock); | 685 | spin_lock_init(&nbd_dev[i].queue_lock); |
689 | INIT_LIST_HEAD(&nbd_dev[i].queue_head); | 686 | INIT_LIST_HEAD(&nbd_dev[i].queue_head); |
690 | init_MUTEX(&nbd_dev[i].tx_lock); | 687 | init_MUTEX(&nbd_dev[i].tx_lock); |
688 | init_waitqueue_head(&nbd_dev[i].active_wq); | ||
691 | nbd_dev[i].blksize = 1024; | 689 | nbd_dev[i].blksize = 1024; |
692 | nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */ | 690 | nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */ |
693 | disk->major = NBD_MAJOR; | 691 | disk->major = NBD_MAJOR; |
diff --git a/include/linux/nbd.h b/include/linux/nbd.h index 090e210e98f0..f95d51fae733 100644 --- a/include/linux/nbd.h +++ b/include/linux/nbd.h | |||
@@ -37,18 +37,26 @@ enum { | |||
37 | /* userspace doesn't need the nbd_device structure */ | 37 | /* userspace doesn't need the nbd_device structure */ |
38 | #ifdef __KERNEL__ | 38 | #ifdef __KERNEL__ |
39 | 39 | ||
40 | #include <linux/wait.h> | ||
41 | |||
40 | /* values for flags field */ | 42 | /* values for flags field */ |
41 | #define NBD_READ_ONLY 0x0001 | 43 | #define NBD_READ_ONLY 0x0001 |
42 | #define NBD_WRITE_NOCHK 0x0002 | 44 | #define NBD_WRITE_NOCHK 0x0002 |
43 | 45 | ||
46 | struct request; | ||
47 | |||
44 | struct nbd_device { | 48 | struct nbd_device { |
45 | int flags; | 49 | int flags; |
46 | int harderror; /* Code of hard error */ | 50 | int harderror; /* Code of hard error */ |
47 | struct socket * sock; | 51 | struct socket * sock; |
48 | struct file * file; /* If == NULL, device is not ready, yet */ | 52 | struct file * file; /* If == NULL, device is not ready, yet */ |
49 | int magic; | 53 | int magic; |
54 | |||
50 | spinlock_t queue_lock; | 55 | spinlock_t queue_lock; |
51 | struct list_head queue_head;/* Requests are added here... */ | 56 | struct list_head queue_head;/* Requests are added here... */ |
57 | struct request *active_req; | ||
58 | wait_queue_head_t active_wq; | ||
59 | |||
52 | struct semaphore tx_lock; | 60 | struct semaphore tx_lock; |
53 | struct gendisk *disk; | 61 | struct gendisk *disk; |
54 | int blksize; | 62 | int blksize; |