aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTrond Myklebust <Trond.Myklebust@netapp.com>2007-06-14 18:00:42 -0400
committerTrond Myklebust <Trond.Myklebust@netapp.com>2007-07-10 23:40:31 -0400
commitc1384c9c4c184543375b52a0997d06cd98145164 (patch)
tree26bb2ee42dc2fddfe1817d3066844669912adef2
parent6e5b70e9d1e712d8dad5514e0ab5240ac4b5fb57 (diff)
SUNRPC: fix hang due to eventd deadlock...
Brian Behlendorf writes: The root cause of the NFS hang we were observing appears to be a rare deadlock between the kernel provided usermodehelper API and the linux NFS client. The deadlock can arise because both of these services use the generic linux work queues. The usermodehelper API run the specified user application in the context of the work queue. And NFS submits both cleanup and reconnect work to the generic work queue for handling. Normally this is fine but a deadlock can result in the following situation. - NFS client is in a disconnected state - [events/0] runs a usermodehelper app with an NFS dependent operation, this triggers an NFS reconnect. - NFS reconnect happens to be submitted to [events/0] work queue. - Deadlock, the [events/0] work queue will never process the reconnect because it is blocked on the previous NFS dependent operation which will not complete.` The solution is simply to run reconnect requests on rpciod. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
-rw-r--r--net/sunrpc/xprt.c4
-rw-r--r--net/sunrpc/xprtsock.c17
2 files changed, 9 insertions, 12 deletions
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 5b05b73e4c1d..518acb74a5bb 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -127,7 +127,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
127 clear_bit(XPRT_LOCKED, &xprt->state); 127 clear_bit(XPRT_LOCKED, &xprt->state);
128 smp_mb__after_clear_bit(); 128 smp_mb__after_clear_bit();
129 } else 129 } else
130 schedule_work(&xprt->task_cleanup); 130 queue_work(rpciod_workqueue, &xprt->task_cleanup);
131} 131}
132 132
133/* 133/*
@@ -515,7 +515,7 @@ xprt_init_autodisconnect(unsigned long data)
515 if (xprt_connecting(xprt)) 515 if (xprt_connecting(xprt))
516 xprt_release_write(xprt, NULL); 516 xprt_release_write(xprt, NULL);
517 else 517 else
518 schedule_work(&xprt->task_cleanup); 518 queue_work(rpciod_workqueue, &xprt->task_cleanup);
519 return; 519 return;
520out_abort: 520out_abort:
521 spin_unlock(&xprt->transport_lock); 521 spin_unlock(&xprt->transport_lock);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index cc33c5880abb..ee6ad3baf680 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -653,8 +653,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
653 653
654 dprintk("RPC: xs_destroy xprt %p\n", xprt); 654 dprintk("RPC: xs_destroy xprt %p\n", xprt);
655 655
656 cancel_delayed_work(&transport->connect_worker); 656 cancel_rearming_delayed_work(&transport->connect_worker);
657 flush_scheduled_work();
658 657
659 xprt_disconnect(xprt); 658 xprt_disconnect(xprt);
660 xs_close(xprt); 659 xs_close(xprt);
@@ -1001,7 +1000,7 @@ static void xs_tcp_state_change(struct sock *sk)
1001 /* Try to schedule an autoclose RPC calls */ 1000 /* Try to schedule an autoclose RPC calls */
1002 set_bit(XPRT_CLOSE_WAIT, &xprt->state); 1001 set_bit(XPRT_CLOSE_WAIT, &xprt->state);
1003 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 1002 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
1004 schedule_work(&xprt->task_cleanup); 1003 queue_work(rpciod_workqueue, &xprt->task_cleanup);
1005 default: 1004 default:
1006 xprt_disconnect(xprt); 1005 xprt_disconnect(xprt);
1007 } 1006 }
@@ -1410,18 +1409,16 @@ static void xs_connect(struct rpc_task *task)
1410 dprintk("RPC: xs_connect delayed xprt %p for %lu " 1409 dprintk("RPC: xs_connect delayed xprt %p for %lu "
1411 "seconds\n", 1410 "seconds\n",
1412 xprt, xprt->reestablish_timeout / HZ); 1411 xprt, xprt->reestablish_timeout / HZ);
1413 schedule_delayed_work(&transport->connect_worker, 1412 queue_delayed_work(rpciod_workqueue,
1414 xprt->reestablish_timeout); 1413 &transport->connect_worker,
1414 xprt->reestablish_timeout);
1415 xprt->reestablish_timeout <<= 1; 1415 xprt->reestablish_timeout <<= 1;
1416 if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) 1416 if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
1417 xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; 1417 xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
1418 } else { 1418 } else {
1419 dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); 1419 dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
1420 schedule_delayed_work(&transport->connect_worker, 0); 1420 queue_delayed_work(rpciod_workqueue,
1421 1421 &transport->connect_worker, 0);
1422 /* flush_scheduled_work can sleep... */
1423 if (!RPC_IS_ASYNC(task))
1424 flush_scheduled_work();
1425 } 1422 }
1426} 1423}
1427 1424