aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHerbert Xu <herbert@gondor.apana.org.au>2006-01-06 03:09:47 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-06 11:33:20 -0500
commit4b2f0260c74324abca76ccaa42d426af163125e7 (patch)
tree881f76200dc3489b11497528feb72d6eae93bddb
parentbd6a59b22fd3bd044bb14978b885bcd042a10e8e (diff)
[PATCH] nbd: fix TX/RX race condition
Janos Haar of First NetCenter Bt. reported numerous crashes involving the NBD driver. With his help, this was tracked down to bogus bio vectors which in turn was the result of a race condition between the receive/transmit routines in the NBD driver. The bug manifests itself like this: CPU0 CPU1 do_nbd_request add req to queuelist nbd_send_request send req head for each bio kmap send nbd_read_stat nbd_find_request nbd_end_request kunmap When CPU1 finishes nbd_end_request, the request and all its associated bio's are freed. So when CPU0 calls kunmap whose argument is derived from the last bio, it may crash. Under normal circumstances, the race occurs only on the last bio. However, if an error is encountered on the remote NBD server (such as an incorrect magic number in the request), or if there were a bug in the server, it is possible for the nbd_end_request to occur any time after the request's addition to the queuelist. The following patch fixes this problem by making sure that requests are not added to the queuelist until after they have been completed transmission. In order for the receiving side to be ready for responses involving requests still being transmitted, the patch introduces the concept of the active request. When a response matches the current active request, its processing is delayed until after the tranmission has come to a stop. This has been tested by Janos and it has been successful in curing this race condition. From: Herbert Xu <herbert@gondor.apana.org.au> Here is an updated patch which removes the active_req wait in nbd_clear_queue and the associated memory barrier. I've also clarified this in the comment. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cc: <djani22@dynamicweb.hu> Cc: Paul Clements <Paul.Clements@SteelEye.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/block/nbd.c122
-rw-r--r--include/linux/nbd.h8
2 files changed, 68 insertions, 62 deletions
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268ddedfbd..d5c8ee7d9815 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -54,11 +54,15 @@
54#include <linux/errno.h> 54#include <linux/errno.h>
55#include <linux/file.h> 55#include <linux/file.h>
56#include <linux/ioctl.h> 56#include <linux/ioctl.h>
57#include <linux/compiler.h>
58#include <linux/err.h>
59#include <linux/kernel.h>
57#include <net/sock.h> 60#include <net/sock.h>
58 61
59#include <linux/devfs_fs_kernel.h> 62#include <linux/devfs_fs_kernel.h>
60 63
61#include <asm/uaccess.h> 64#include <asm/uaccess.h>
65#include <asm/system.h>
62#include <asm/types.h> 66#include <asm/types.h>
63 67
64#include <linux/nbd.h> 68#include <linux/nbd.h>
@@ -230,14 +234,6 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
230 request.len = htonl(size); 234 request.len = htonl(size);
231 memcpy(request.handle, &req, sizeof(req)); 235 memcpy(request.handle, &req, sizeof(req));
232 236
233 down(&lo->tx_lock);
234
235 if (!sock || !lo->sock) {
236 printk(KERN_ERR "%s: Attempted send on closed socket\n",
237 lo->disk->disk_name);
238 goto error_out;
239 }
240
241 dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n", 237 dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
242 lo->disk->disk_name, req, 238 lo->disk->disk_name, req,
243 nbdcmd_to_ascii(nbd_cmd(req)), 239 nbdcmd_to_ascii(nbd_cmd(req)),
@@ -276,11 +272,9 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
276 } 272 }
277 } 273 }
278 } 274 }
279 up(&lo->tx_lock);
280 return 0; 275 return 0;
281 276
282error_out: 277error_out:
283 up(&lo->tx_lock);
284 return 1; 278 return 1;
285} 279}
286 280
@@ -289,9 +283,14 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
289 struct request *req; 283 struct request *req;
290 struct list_head *tmp; 284 struct list_head *tmp;
291 struct request *xreq; 285 struct request *xreq;
286 int err;
292 287
293 memcpy(&xreq, handle, sizeof(xreq)); 288 memcpy(&xreq, handle, sizeof(xreq));
294 289
290 err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
291 if (unlikely(err))
292 goto out;
293
295 spin_lock(&lo->queue_lock); 294 spin_lock(&lo->queue_lock);
296 list_for_each(tmp, &lo->queue_head) { 295 list_for_each(tmp, &lo->queue_head) {
297 req = list_entry(tmp, struct request, queuelist); 296 req = list_entry(tmp, struct request, queuelist);
@@ -302,7 +301,11 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
302 return req; 301 return req;
303 } 302 }
304 spin_unlock(&lo->queue_lock); 303 spin_unlock(&lo->queue_lock);
305 return NULL; 304
305 err = -ENOENT;
306
307out:
308 return ERR_PTR(err);
306} 309}
307 310
308static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec) 311static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec)
@@ -331,7 +334,11 @@ static struct request *nbd_read_stat(struct nbd_device *lo)
331 goto harderror; 334 goto harderror;
332 } 335 }
333 req = nbd_find_request(lo, reply.handle); 336 req = nbd_find_request(lo, reply.handle);
334 if (req == NULL) { 337 if (unlikely(IS_ERR(req))) {
338 result = PTR_ERR(req);
339 if (result != -ENOENT)
340 goto harderror;
341
335 printk(KERN_ERR "%s: Unexpected reply (%p)\n", 342 printk(KERN_ERR "%s: Unexpected reply (%p)\n",
336 lo->disk->disk_name, reply.handle); 343 lo->disk->disk_name, reply.handle);
337 result = -EBADR; 344 result = -EBADR;
@@ -395,19 +402,24 @@ static void nbd_clear_que(struct nbd_device *lo)
395 402
396 BUG_ON(lo->magic != LO_MAGIC); 403 BUG_ON(lo->magic != LO_MAGIC);
397 404
398 do { 405 /*
399 req = NULL; 406 * Because we have set lo->sock to NULL under the tx_lock, all
400 spin_lock(&lo->queue_lock); 407 * modifications to the list must have completed by now. For
401 if (!list_empty(&lo->queue_head)) { 408 * the same reason, the active_req must be NULL.
402 req = list_entry(lo->queue_head.next, struct request, queuelist); 409 *
403 list_del_init(&req->queuelist); 410 * As a consequence, we don't need to take the spin lock while
404 } 411 * purging the list here.
405 spin_unlock(&lo->queue_lock); 412 */
406 if (req) { 413 BUG_ON(lo->sock);
407 req->errors++; 414 BUG_ON(lo->active_req);
408 nbd_end_request(req); 415
409 } 416 while (!list_empty(&lo->queue_head)) {
410 } while (req); 417 req = list_entry(lo->queue_head.next, struct request,
418 queuelist);
419 list_del_init(&req->queuelist);
420 req->errors++;
421 nbd_end_request(req);
422 }
411} 423}
412 424
413/* 425/*
@@ -435,11 +447,6 @@ static void do_nbd_request(request_queue_t * q)
435 447
436 BUG_ON(lo->magic != LO_MAGIC); 448 BUG_ON(lo->magic != LO_MAGIC);
437 449
438 if (!lo->file) {
439 printk(KERN_ERR "%s: Request when not-ready\n",
440 lo->disk->disk_name);
441 goto error_out;
442 }
443 nbd_cmd(req) = NBD_CMD_READ; 450 nbd_cmd(req) = NBD_CMD_READ;
444 if (rq_data_dir(req) == WRITE) { 451 if (rq_data_dir(req) == WRITE) {
445 nbd_cmd(req) = NBD_CMD_WRITE; 452 nbd_cmd(req) = NBD_CMD_WRITE;
@@ -453,32 +460,34 @@ static void do_nbd_request(request_queue_t * q)
453 req->errors = 0; 460 req->errors = 0;
454 spin_unlock_irq(q->queue_lock); 461 spin_unlock_irq(q->queue_lock);
455 462
456 spin_lock(&lo->queue_lock); 463 down(&lo->tx_lock);
457 464 if (unlikely(!lo->sock)) {
458 if (!lo->file) { 465 up(&lo->tx_lock);
459 spin_unlock(&lo->queue_lock); 466 printk(KERN_ERR "%s: Attempted send on closed socket\n",
460 printk(KERN_ERR "%s: failed between accept and semaphore, file lost\n", 467 lo->disk->disk_name);
461 lo->disk->disk_name);
462 req->errors++; 468 req->errors++;
463 nbd_end_request(req); 469 nbd_end_request(req);
464 spin_lock_irq(q->queue_lock); 470 spin_lock_irq(q->queue_lock);
465 continue; 471 continue;
466 } 472 }
467 473
468 list_add(&req->queuelist, &lo->queue_head); 474 lo->active_req = req;
469 spin_unlock(&lo->queue_lock);
470 475
471 if (nbd_send_req(lo, req) != 0) { 476 if (nbd_send_req(lo, req) != 0) {
472 printk(KERN_ERR "%s: Request send failed\n", 477 printk(KERN_ERR "%s: Request send failed\n",
473 lo->disk->disk_name); 478 lo->disk->disk_name);
474 if (nbd_find_request(lo, (char *)&req) != NULL) { 479 req->errors++;
475 /* we still own req */ 480 nbd_end_request(req);
476 req->errors++; 481 } else {
477 nbd_end_request(req); 482 spin_lock(&lo->queue_lock);
478 } else /* we're racing with nbd_clear_que */ 483 list_add(&req->queuelist, &lo->queue_head);
479 printk(KERN_DEBUG "nbd: can't find req\n"); 484 spin_unlock(&lo->queue_lock);
480 } 485 }
481 486
487 lo->active_req = NULL;
488 up(&lo->tx_lock);
489 wake_up_all(&lo->active_wq);
490
482 spin_lock_irq(q->queue_lock); 491 spin_lock_irq(q->queue_lock);
483 continue; 492 continue;
484 493
@@ -529,17 +538,10 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
529 down(&lo->tx_lock); 538 down(&lo->tx_lock);
530 lo->sock = NULL; 539 lo->sock = NULL;
531 up(&lo->tx_lock); 540 up(&lo->tx_lock);
532 spin_lock(&lo->queue_lock);
533 file = lo->file; 541 file = lo->file;
534 lo->file = NULL; 542 lo->file = NULL;
535 spin_unlock(&lo->queue_lock);
536 nbd_clear_que(lo); 543 nbd_clear_que(lo);
537 spin_lock(&lo->queue_lock); 544 BUG_ON(!list_empty(&lo->queue_head));
538 if (!list_empty(&lo->queue_head)) {
539 printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n");
540 error = -EBUSY;
541 }
542 spin_unlock(&lo->queue_lock);
543 if (file) 545 if (file)
544 fput(file); 546 fput(file);
545 return error; 547 return error;
@@ -598,24 +600,19 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
598 lo->sock = NULL; 600 lo->sock = NULL;
599 } 601 }
600 up(&lo->tx_lock); 602 up(&lo->tx_lock);
601 spin_lock(&lo->queue_lock);
602 file = lo->file; 603 file = lo->file;
603 lo->file = NULL; 604 lo->file = NULL;
604 spin_unlock(&lo->queue_lock);
605 nbd_clear_que(lo); 605 nbd_clear_que(lo);
606 printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); 606 printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
607 if (file) 607 if (file)
608 fput(file); 608 fput(file);
609 return lo->harderror; 609 return lo->harderror;
610 case NBD_CLEAR_QUE: 610 case NBD_CLEAR_QUE:
611 down(&lo->tx_lock); 611 /*
612 if (lo->sock) { 612 * This is for compatibility only. The queue is always cleared
613 up(&lo->tx_lock); 613 * by NBD_DO_IT or NBD_CLEAR_SOCK.
614 return 0; /* probably should be error, but that would 614 */
615 * break "nbd-client -d", so just return 0 */ 615 BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
616 }
617 up(&lo->tx_lock);
618 nbd_clear_que(lo);
619 return 0; 616 return 0;
620 case NBD_PRINT_DEBUG: 617 case NBD_PRINT_DEBUG:
621 printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", 618 printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
@@ -688,6 +685,7 @@ static int __init nbd_init(void)
688 spin_lock_init(&nbd_dev[i].queue_lock); 685 spin_lock_init(&nbd_dev[i].queue_lock);
689 INIT_LIST_HEAD(&nbd_dev[i].queue_head); 686 INIT_LIST_HEAD(&nbd_dev[i].queue_head);
690 init_MUTEX(&nbd_dev[i].tx_lock); 687 init_MUTEX(&nbd_dev[i].tx_lock);
688 init_waitqueue_head(&nbd_dev[i].active_wq);
691 nbd_dev[i].blksize = 1024; 689 nbd_dev[i].blksize = 1024;
692 nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */ 690 nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */
693 disk->major = NBD_MAJOR; 691 disk->major = NBD_MAJOR;
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 090e210e98f0..f95d51fae733 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -37,18 +37,26 @@ enum {
37/* userspace doesn't need the nbd_device structure */ 37/* userspace doesn't need the nbd_device structure */
38#ifdef __KERNEL__ 38#ifdef __KERNEL__
39 39
40#include <linux/wait.h>
41
40/* values for flags field */ 42/* values for flags field */
41#define NBD_READ_ONLY 0x0001 43#define NBD_READ_ONLY 0x0001
42#define NBD_WRITE_NOCHK 0x0002 44#define NBD_WRITE_NOCHK 0x0002
43 45
46struct request;
47
44struct nbd_device { 48struct nbd_device {
45 int flags; 49 int flags;
46 int harderror; /* Code of hard error */ 50 int harderror; /* Code of hard error */
47 struct socket * sock; 51 struct socket * sock;
48 struct file * file; /* If == NULL, device is not ready, yet */ 52 struct file * file; /* If == NULL, device is not ready, yet */
49 int magic; 53 int magic;
54
50 spinlock_t queue_lock; 55 spinlock_t queue_lock;
51 struct list_head queue_head;/* Requests are added here... */ 56 struct list_head queue_head;/* Requests are added here... */
57 struct request *active_req;
58 wait_queue_head_t active_wq;
59
52 struct semaphore tx_lock; 60 struct semaphore tx_lock;
53 struct gendisk *disk; 61 struct gendisk *disk;
54 int blksize; 62 int blksize;