aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGreg Banks <gnb@melbourne.sgi.com>2006-10-02 05:18:01 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-10-02 10:57:20 -0400
commitbfd241600a3b0db4fe43c859f1460d0a958d924a (patch)
tree7f04604adee7249e686d1db0cac93f1fee8bc5b6
parenteec09661dc82e90a31051d045a94026a91aceb82 (diff)
[PATCH] knfsd: make rpc threads pools numa aware
Actually implement multiple pools. On NUMA machines, allocate a svc_pool per NUMA node; on SMP a svc_pool per CPU; otherwise a single global pool. Enqueue sockets on the svc_pool corresponding to the CPU on which the socket bh is run (i.e. the NIC interrupt CPU). Threads have their cpu mask set to limit them to the CPUs in the svc_pool that owns them. This is the patch that allows an Altix to scale NFS traffic linearly beyond 4 CPUs and 4 NICs. Incorporates changes and feedback from Neil Brown, Trond Myklebust, and Christoph Hellwig. Signed-off-by: Greg Banks <gnb@melbourne.sgi.com> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/sunrpc/svc.h1
-rw-r--r--net/sunrpc/svc.c255
-rw-r--r--net/sunrpc/svcsock.c7
3 files changed, 261 insertions, 2 deletions
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index f2eeb833e7d8..4ebcdf91f3b3 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -386,5 +386,6 @@ int svc_process(struct svc_rqst *);
386int svc_register(struct svc_serv *, int, unsigned short); 386int svc_register(struct svc_serv *, int, unsigned short);
387void svc_wake_up(struct svc_serv *); 387void svc_wake_up(struct svc_serv *);
388void svc_reserve(struct svc_rqst *rqstp, int space); 388void svc_reserve(struct svc_rqst *rqstp, int space);
389struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu);
389 390
390#endif /* SUNRPC_SVC_H */ 391#endif /* SUNRPC_SVC_H */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 8c75eec4fd6a..a99e67b164c1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -4,6 +4,10 @@
4 * High-level RPC service routines 4 * High-level RPC service routines
5 * 5 *
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
7 *
8 * Multiple threads pools and NUMAisation
9 * Copyright (c) 2006 Silicon Graphics, Inc.
10 * by Greg Banks <gnb@melbourne.sgi.com>
7 */ 11 */
8 12
9#include <linux/linkage.h> 13#include <linux/linkage.h>
@@ -25,6 +29,242 @@
25#define RPC_PARANOIA 1 29#define RPC_PARANOIA 1
26 30
27/* 31/*
32 * Mode for mapping cpus to pools.
33 */
enum {
	SVC_POOL_NONE = -1,	/* map not built yet; a real mode gets chosen later */
	SVC_POOL_GLOBAL = 0,	/* one global pool, no cpu/node mapping
				 * (legacy & UP mode) */
	SVC_POOL_PERCPU = 1,	/* a pool for every cpu */
	SVC_POOL_PERNODE = 2	/* a pool for every numa node */
};
41
/*
 * Two-way mapping between pool ids and cpus (or numa nodes).
 * The single static instance starts out in SVC_POOL_NONE mode,
 * i.e. not yet built.
 */
static struct svc_pool_map {
	int mode;			/* one of the SVC_POOL_* values; kept
					 * as a plain int rather than the enum
					 * so switches on it do not trigger
					 * "enumeration value not handled in
					 * switch" warnings */
	unsigned int npools;		/* number of pools in the map */
	unsigned int *pool_to;		/* indexed by pool id, yields cpu or node */
	unsigned int *to_pool;		/* indexed by cpu or node, yields pool id */
} svc_pool_map = {
	.mode = SVC_POOL_NONE
};
56
57
58/*
59 * Detect best pool mapping mode heuristically,
60 * according to the machine's topology.
61 */
62static int
63svc_pool_map_choose_mode(void)
64{
65 unsigned int node;
66
67 if (num_online_nodes() > 1) {
68 /*
69 * Actually have multiple NUMA nodes,
70 * so split pools on NUMA node boundaries
71 */
72 return SVC_POOL_PERNODE;
73 }
74
75 node = any_online_node(node_online_map);
76 if (nr_cpus_node(node) > 2) {
77 /*
78 * Non-trivial SMP, or CONFIG_NUMA on
79 * non-NUMA hardware, e.g. with a generic
80 * x86_64 kernel on Xeons. In this case we
81 * want to divide the pools on cpu boundaries.
82 */
83 return SVC_POOL_PERCPU;
84 }
85
86 /* default: one global pool */
87 return SVC_POOL_GLOBAL;
88}
89
90/*
91 * Allocate the to_pool[] and pool_to[] arrays.
92 * Returns 0 on success or an errno.
93 */
94static int
95svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
96{
97 m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
98 if (!m->to_pool)
99 goto fail;
100 m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
101 if (!m->pool_to)
102 goto fail_free;
103
104 return 0;
105
106fail_free:
107 kfree(m->to_pool);
108fail:
109 return -ENOMEM;
110}
111
112/*
113 * Initialise the pool map for SVC_POOL_PERCPU mode.
114 * Returns number of pools or <0 on error.
115 */
116static int
117svc_pool_map_init_percpu(struct svc_pool_map *m)
118{
119 unsigned int maxpools = highest_possible_processor_id()+1;
120 unsigned int pidx = 0;
121 unsigned int cpu;
122 int err;
123
124 err = svc_pool_map_alloc_arrays(m, maxpools);
125 if (err)
126 return err;
127
128 for_each_online_cpu(cpu) {
129 BUG_ON(pidx > maxpools);
130 m->to_pool[cpu] = pidx;
131 m->pool_to[pidx] = cpu;
132 pidx++;
133 }
134 /* cpus brought online later all get mapped to pool0, sorry */
135
136 return pidx;
137};
138
139
140/*
141 * Initialise the pool map for SVC_POOL_PERNODE mode.
142 * Returns number of pools or <0 on error.
143 */
144static int
145svc_pool_map_init_pernode(struct svc_pool_map *m)
146{
147 unsigned int maxpools = highest_possible_node_id()+1;
148 unsigned int pidx = 0;
149 unsigned int node;
150 int err;
151
152 err = svc_pool_map_alloc_arrays(m, maxpools);
153 if (err)
154 return err;
155
156 for_each_node_with_cpus(node) {
157 /* some architectures (e.g. SN2) have cpuless nodes */
158 BUG_ON(pidx > maxpools);
159 m->to_pool[node] = pidx;
160 m->pool_to[pidx] = node;
161 pidx++;
162 }
163 /* nodes brought online later all get mapped to pool0, sorry */
164
165 return pidx;
166}
167
168
169/*
170 * Build the global map of cpus to pools and vice versa.
171 */
172static unsigned int
173svc_pool_map_init(void)
174{
175 struct svc_pool_map *m = &svc_pool_map;
176 int npools = -1;
177
178 if (m->mode != SVC_POOL_NONE)
179 return m->npools;
180
181 m->mode = svc_pool_map_choose_mode();
182
183 switch (m->mode) {
184 case SVC_POOL_PERCPU:
185 npools = svc_pool_map_init_percpu(m);
186 break;
187 case SVC_POOL_PERNODE:
188 npools = svc_pool_map_init_pernode(m);
189 break;
190 }
191
192 if (npools < 0) {
193 /* default, or memory allocation failure */
194 npools = 1;
195 m->mode = SVC_POOL_GLOBAL;
196 }
197 m->npools = npools;
198
199 return m->npools;
200}
201
202/*
203 * Set the current thread's cpus_allowed mask so that it
204 * will only run on cpus in the given pool.
205 *
206 * Returns 1 and fills in oldmask iff a cpumask was applied.
207 */
208static inline int
209svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
210{
211 struct svc_pool_map *m = &svc_pool_map;
212 unsigned int node; /* or cpu */
213
214 /*
215 * The caller checks for sv_nrpools > 1, which
216 * implies that we've been initialized and the
217 * map mode is not NONE.
218 */
219 BUG_ON(m->mode == SVC_POOL_NONE);
220
221 switch (m->mode)
222 {
223 default:
224 return 0;
225 case SVC_POOL_PERCPU:
226 node = m->pool_to[pidx];
227 *oldmask = current->cpus_allowed;
228 set_cpus_allowed(current, cpumask_of_cpu(node));
229 return 1;
230 case SVC_POOL_PERNODE:
231 node = m->pool_to[pidx];
232 *oldmask = current->cpus_allowed;
233 set_cpus_allowed(current, node_to_cpumask(node));
234 return 1;
235 }
236}
237
238/*
239 * Use the mapping mode to choose a pool for a given CPU.
240 * Used when enqueueing an incoming RPC. Always returns
241 * a non-NULL pool pointer.
242 */
243struct svc_pool *
244svc_pool_for_cpu(struct svc_serv *serv, int cpu)
245{
246 struct svc_pool_map *m = &svc_pool_map;
247 unsigned int pidx = 0;
248
249 /*
250 * SVC_POOL_NONE happens in a pure client when
251 * lockd is brought up, so silently treat it the
252 * same as SVC_POOL_GLOBAL.
253 */
254
255 switch (m->mode) {
256 case SVC_POOL_PERCPU:
257 pidx = m->to_pool[cpu];
258 break;
259 case SVC_POOL_PERNODE:
260 pidx = m->to_pool[cpu_to_node(cpu)];
261 break;
262 }
263 return &serv->sv_pools[pidx % serv->sv_nrpools];
264}
265
266
267/*
28 * Create an RPC service 268 * Create an RPC service
29 */ 269 */
30static struct svc_serv * 270static struct svc_serv *
@@ -105,8 +345,9 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
105 svc_thread_fn func, int sig, struct module *mod) 345 svc_thread_fn func, int sig, struct module *mod)
106{ 346{
107 struct svc_serv *serv; 347 struct svc_serv *serv;
348 unsigned int npools = svc_pool_map_init();
108 349
109 serv = __svc_create(prog, bufsize, /*npools*/1, shutdown); 350 serv = __svc_create(prog, bufsize, npools, shutdown);
110 351
111 if (serv != NULL) { 352 if (serv != NULL) {
112 serv->sv_function = func; 353 serv->sv_function = func;
@@ -209,6 +450,8 @@ svc_release_buffer(struct svc_rqst *rqstp)
209 450
210/* 451/*
211 * Create a thread in the given pool. Caller must hold BKL. 452 * Create a thread in the given pool. Caller must hold BKL.
453 * On a NUMA or SMP machine, with a multi-pool serv, the thread
454 * will be restricted to run on the cpus belonging to the pool.
212 */ 455 */
213static int 456static int
214__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, 457__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
@@ -216,6 +459,8 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
216{ 459{
217 struct svc_rqst *rqstp; 460 struct svc_rqst *rqstp;
218 int error = -ENOMEM; 461 int error = -ENOMEM;
462 int have_oldmask = 0;
463 cpumask_t oldmask;
219 464
220 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 465 rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
221 if (!rqstp) 466 if (!rqstp)
@@ -235,7 +480,15 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
235 spin_unlock_bh(&pool->sp_lock); 480 spin_unlock_bh(&pool->sp_lock);
236 rqstp->rq_server = serv; 481 rqstp->rq_server = serv;
237 rqstp->rq_pool = pool; 482 rqstp->rq_pool = pool;
483
484 if (serv->sv_nrpools > 1)
485 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
486
238 error = kernel_thread((int (*)(void *)) func, rqstp, 0); 487 error = kernel_thread((int (*)(void *)) func, rqstp, 0);
488
489 if (have_oldmask)
490 set_cpus_allowed(current, oldmask);
491
239 if (error < 0) 492 if (error < 0)
240 goto out_thread; 493 goto out_thread;
241 svc_sock_update_bufs(serv); 494 svc_sock_update_bufs(serv);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b78659adeff3..cba85d195222 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -151,8 +151,9 @@ static void
151svc_sock_enqueue(struct svc_sock *svsk) 151svc_sock_enqueue(struct svc_sock *svsk)
152{ 152{
153 struct svc_serv *serv = svsk->sk_server; 153 struct svc_serv *serv = svsk->sk_server;
154 struct svc_pool *pool = &serv->sv_pools[0]; 154 struct svc_pool *pool;
155 struct svc_rqst *rqstp; 155 struct svc_rqst *rqstp;
156 int cpu;
156 157
157 if (!(svsk->sk_flags & 158 if (!(svsk->sk_flags &
158 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) 159 ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
@@ -160,6 +161,10 @@ svc_sock_enqueue(struct svc_sock *svsk)
160 if (test_bit(SK_DEAD, &svsk->sk_flags)) 161 if (test_bit(SK_DEAD, &svsk->sk_flags))
161 return; 162 return;
162 163
164 cpu = get_cpu();
165 pool = svc_pool_for_cpu(svsk->sk_server, cpu);
166 put_cpu();
167
163 spin_lock_bh(&pool->sp_lock); 168 spin_lock_bh(&pool->sp_lock);
164 169
165 if (!list_empty(&pool->sp_threads) && 170 if (!list_empty(&pool->sp_threads) &&