author     Greg Banks <gnb@sgi.com>                    2009-01-13 05:26:35 -0500
committer  J. Bruce Fields <bfields@citi.umich.edu>    2009-03-18 17:38:41 -0400
commit     59a252ff8c0f2fa32c896f69d56ae33e641ce7ad (patch)
tree       f79089d44737e5f050cc4869b42829650096747f
parent     8bbfa9f3889b643fc7de82c0c761ef17097f8faf (diff)
knfsd: avoid overloading the CPU scheduler with enormous load averages
Avoid overloading the CPU scheduler with enormous load averages when
handling high call-rate NFS loads.  When the knfsd bottom half is made
aware of an incoming call by the socket layer, it tries to choose an
nfsd thread and wake it up.  As long as there are idle threads, one
will be woken up.

If there are a lot of nfsd threads (a sensible configuration when the
server is disk-bound or is running an HSM), there will be many more
nfsd threads than CPUs to run them.  Under a high call-rate,
low service-time workload, the result is that almost every nfsd is
runnable, but only a handful are actually able to run.  This situation
causes two significant problems:

1. The CPU scheduler takes over 10% of each CPU, which is robbing the
   nfsd threads of valuable CPU time.

2. At a high enough load, the nfsd threads starve userspace threads of
   CPU time, to the point where daemons like portmap and rpc.mountd do
   not schedule for tens of seconds at a time.  Clients attempting to
   mount an NFS filesystem time out at the very first step (opening a
   TCP connection to portmap) because portmap cannot wake up from
   select() and call accept() in time.

Disclaimer: these effects were observed on a SLES9 kernel; modern
kernels' schedulers may behave more gracefully.

The solution is simple: keep in each svc_pool a counter of the number
of threads which have been woken but have not yet run, and do not wake
any more if that count reaches an arbitrary small threshold.

Testing was on a 4 CPU 4 NIC Altix using 4 IRIX clients, each with 16
synthetic client threads simulating an rsync (i.e. recursive directory
listing) workload reading from an i386 RH9 install image (161480
regular files in 10841 directories) on the server.  That tree is small
enough to fit in the server's RAM, so no disk traffic was involved.
This setup gives a sustained call rate in excess of 60000 calls/sec
before being CPU-bound on the server.  The server was running 128 nfsds.

Profiling showed schedule() taking 6.7% of every CPU, and __wake_up()
taking 5.2%.  This patch drops those contributions to 3.0% and 2.2%.
Load average was over 120 before the patch, and 20.9 after.

This patch is a forward-ported version of knfsd-avoid-nfsd-overload,
which has been shipping in the SGI "Enhanced NFS" product since 2006.
It has been posted before:

http://article.gmane.org/gmane.linux.nfs/10374

Signed-off-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
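For readers who do not want to parse the diff below, the wake-throttling
idea can be distilled into a minimal userspace sketch.  This is an
illustration only, not the patch itself: there is no locking and no real
wait queue, try_wake_thread()/thread_started_running() are invented names,
sp_idle_threads stands in for the pool's sp_threads list of sleeping
threads, and the counts in main() are made up.

/*
 * Distilled sketch of the wake-throttling logic this patch adds
 * (simplified userspace model; names loosely mirror the patch).
 */
#include <stdio.h>

#define SVC_MAX_WAKING 5                  /* arbitrary small threshold */

struct svc_pool {
	int sp_nwaking;                   /* threads woken but not yet running */
	int sp_idle_threads;              /* stand-in for the sp_threads list */
};

/*
 * Enqueue side (cf. svc_xprt_enqueue): only wake a thread if an idle one
 * exists AND not too many already-woken threads are still waiting for CPU.
 */
static int try_wake_thread(struct svc_pool *pool)
{
	int thread_avail = pool->sp_idle_threads > 0;

	if (pool->sp_nwaking >= SVC_MAX_WAKING)
		thread_avail = 0;         /* too many woken threads haven't run yet */

	if (!thread_avail)
		return 0;                 /* leave the work queued instead */

	pool->sp_idle_threads--;
	pool->sp_nwaking++;               /* rqstp->rq_waking = 1 in the real code */
	return 1;
}

/*
 * Receive side (cf. svc_recv): the first thing a woken thread does once it
 * actually runs is clear its waking state, allowing further wake-ups.
 */
static void thread_started_running(struct svc_pool *pool)
{
	pool->sp_nwaking--;
}

int main(void)
{
	struct svc_pool pool = { .sp_nwaking = 0, .sp_idle_threads = 128 };
	int woken = 0, i;

	/* A burst of 20 incoming calls wakes only SVC_MAX_WAKING threads. */
	for (i = 0; i < 20; i++)
		woken += try_wake_thread(&pool);
	printf("woken after burst: %d\n", woken);    /* 5 */

	thread_started_running(&pool);               /* one woken thread gets the CPU */
	woken += try_wake_thread(&pool);
	printf("woken after one ran: %d\n", woken);  /* 6 */
	return 0;
}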
-rw-r--r--   include/linux/sunrpc/svc.h    2
-rw-r--r--   net/sunrpc/svc_xprt.c        25
2 files changed, 20 insertions, 7 deletions
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3435d24bfe55..39ec186a492d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -41,6 +41,7 @@ struct svc_pool {
 	struct list_head	sp_sockets;	/* pending sockets */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
+	int			sp_nwaking;	/* number of threads woken but not yet active */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -264,6 +265,7 @@ struct svc_rqst {
 						 * cache pages */
 	wait_queue_head_t	rq_wait;	/* synchronization */
 	struct task_struct	*rq_task;	/* service thread */
+	int			rq_waking;	/* 1 if thread is being woken */
 };
 
 /*
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e588df5d6b34..0551b6b6cf8c 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -14,6 +14,8 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
+#define SVC_MAX_WAKING 5
+
 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
 static int svc_deferred_recv(struct svc_rqst *rqstp);
 static struct cache_deferred_req *svc_defer(struct cache_req *req);
@@ -298,6 +300,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	struct svc_pool *pool;
 	struct svc_rqst	*rqstp;
 	int cpu;
+	int thread_avail;
 
 	if (!(xprt->xpt_flags &
 	      ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
@@ -309,12 +312,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 
 	spin_lock_bh(&pool->sp_lock);
 
-	if (!list_empty(&pool->sp_threads) &&
-	    !list_empty(&pool->sp_sockets))
-		printk(KERN_ERR
-		       "svc_xprt_enqueue: "
-		       "threads and transports both waiting??\n");
-
 	if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
 		/* Don't enqueue dead transports */
 		dprintk("svc: transport %p is dead, not enqueued\n", xprt);
@@ -353,7 +350,14 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	}
 
 process:
-	if (!list_empty(&pool->sp_threads)) {
+	/* Work out whether threads are available */
+	thread_avail = !list_empty(&pool->sp_threads);	/* threads are asleep */
+	if (pool->sp_nwaking >= SVC_MAX_WAKING) {
+		/* too many threads are runnable and trying to wake up */
+		thread_avail = 0;
+	}
+
+	if (thread_avail) {
 		rqstp = list_entry(pool->sp_threads.next,
 				   struct svc_rqst,
 				   rq_list);
@@ -368,6 +372,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 		svc_xprt_get(xprt);
 		rqstp->rq_reserved = serv->sv_max_mesg;
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+		rqstp->rq_waking = 1;
+		pool->sp_nwaking++;
 		BUG_ON(xprt->xpt_pool != pool);
 		wake_up(&rqstp->rq_wait);
 	} else {
@@ -633,6 +639,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 		return -EINTR;
 
 	spin_lock_bh(&pool->sp_lock);
+	if (rqstp->rq_waking) {
+		rqstp->rq_waking = 0;
+		pool->sp_nwaking--;
+		BUG_ON(pool->sp_nwaking < 0);
+	}
 	xprt = svc_xprt_dequeue(pool);
 	if (xprt) {
 		rqstp->rq_xprt = xprt;