Diffstat (limited to 'net/sunrpc')
-rw-r--r--	net/sunrpc/svc.c	255
-rw-r--r--	net/sunrpc/svcsock.c	7
2 files changed, 260 insertions, 2 deletions
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 8c75eec4fd6a..a99e67b164c1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -4,6 +4,10 @@
  * High-level RPC service routines
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * Multiple threads pools and NUMAisation
+ * Copyright (c) 2006 Silicon Graphics, Inc.
+ * by Greg Banks <gnb@melbourne.sgi.com>
  */
 
 #include <linux/linkage.h>
@@ -25,6 +29,242 @@
 #define RPC_PARANOIA 1
 
 /*
+ * Mode for mapping cpus to pools.
+ */
+enum {
+	SVC_POOL_NONE = -1,	/* uninitialised, choose one of the others */
+	SVC_POOL_GLOBAL,	/* no mapping, just a single global pool
+				 * (legacy & UP mode) */
+	SVC_POOL_PERCPU,	/* one pool per cpu */
+	SVC_POOL_PERNODE	/* one pool per numa node */
+};
+
+/*
+ * Structure for mapping cpus to pools and vice versa.
+ * Setup once during sunrpc initialisation.
+ */
+static struct svc_pool_map {
+	int mode;			/* Note: int not enum to avoid
+					 * warnings about "enumeration value
+					 * not handled in switch" */
+	unsigned int npools;
+	unsigned int *pool_to;		/* maps pool id to cpu or node */
+	unsigned int *to_pool;		/* maps cpu or node to pool id */
+} svc_pool_map = {
+	.mode = SVC_POOL_NONE
+};
+
+
+/*
+ * Detect best pool mapping mode heuristically,
+ * according to the machine's topology.
+ */
+static int
+svc_pool_map_choose_mode(void)
+{
+	unsigned int node;
+
+	if (num_online_nodes() > 1) {
+		/*
+		 * Actually have multiple NUMA nodes,
+		 * so split pools on NUMA node boundaries
+		 */
+		return SVC_POOL_PERNODE;
+	}
+
+	node = any_online_node(node_online_map);
+	if (nr_cpus_node(node) > 2) {
+		/*
+		 * Non-trivial SMP, or CONFIG_NUMA on
+		 * non-NUMA hardware, e.g. with a generic
+		 * x86_64 kernel on Xeons.  In this case we
+		 * want to divide the pools on cpu boundaries.
+		 */
+		return SVC_POOL_PERCPU;
+	}
+
+	/* default: one global pool */
+	return SVC_POOL_GLOBAL;
+}
+
+/*
+ * Allocate the to_pool[] and pool_to[] arrays.
+ * Returns 0 on success or an errno.
+ */
+static int
+svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
+{
+	m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
+	if (!m->to_pool)
+		goto fail;
+	m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
+	if (!m->pool_to)
+		goto fail_free;
+
+	return 0;
+
+fail_free:
+	kfree(m->to_pool);
+fail:
+	return -ENOMEM;
+}
+
+/*
+ * Initialise the pool map for SVC_POOL_PERCPU mode.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_percpu(struct svc_pool_map *m)
+{
+	unsigned int maxpools = highest_possible_processor_id()+1;
+	unsigned int pidx = 0;
+	unsigned int cpu;
+	int err;
+
+	err = svc_pool_map_alloc_arrays(m, maxpools);
+	if (err)
+		return err;
+
+	for_each_online_cpu(cpu) {
+		BUG_ON(pidx > maxpools);
+		m->to_pool[cpu] = pidx;
+		m->pool_to[pidx] = cpu;
+		pidx++;
+	}
+	/* cpus brought online later all get mapped to pool0, sorry */
+
+	return pidx;
+};
+
+
+/*
+ * Initialise the pool map for SVC_POOL_PERNODE mode.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_pernode(struct svc_pool_map *m)
+{
+	unsigned int maxpools = highest_possible_node_id()+1;
+	unsigned int pidx = 0;
+	unsigned int node;
+	int err;
+
+	err = svc_pool_map_alloc_arrays(m, maxpools);
+	if (err)
+		return err;
+
+	for_each_node_with_cpus(node) {
+		/* some architectures (e.g. SN2) have cpuless nodes */
+		BUG_ON(pidx > maxpools);
+		m->to_pool[node] = pidx;
+		m->pool_to[pidx] = node;
+		pidx++;
+	}
+	/* nodes brought online later all get mapped to pool0, sorry */
+
+	return pidx;
+}
+
+
+/*
+ * Build the global map of cpus to pools and vice versa.
+ */
+static unsigned int
+svc_pool_map_init(void)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	int npools = -1;
+
+	if (m->mode != SVC_POOL_NONE)
+		return m->npools;
+
+	m->mode = svc_pool_map_choose_mode();
+
+	switch (m->mode) {
+	case SVC_POOL_PERCPU:
+		npools = svc_pool_map_init_percpu(m);
+		break;
+	case SVC_POOL_PERNODE:
+		npools = svc_pool_map_init_pernode(m);
+		break;
+	}
+
+	if (npools < 0) {
+		/* default, or memory allocation failure */
+		npools = 1;
+		m->mode = SVC_POOL_GLOBAL;
+	}
+	m->npools = npools;
+
+	return m->npools;
+}
+
+/*
+ * Set the current thread's cpus_allowed mask so that it
+ * will only run on cpus in the given pool.
+ *
+ * Returns 1 and fills in oldmask iff a cpumask was applied.
+ */
+static inline int
+svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int node;		/* or cpu */
+
+	/*
+	 * The caller checks for sv_nrpools > 1, which
+	 * implies that we've been initialized and the
+	 * map mode is not NONE.
+	 */
+	BUG_ON(m->mode == SVC_POOL_NONE);
+
+	switch (m->mode)
+	{
+	default:
+		return 0;
+	case SVC_POOL_PERCPU:
+		node = m->pool_to[pidx];
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, cpumask_of_cpu(node));
+		return 1;
+	case SVC_POOL_PERNODE:
+		node = m->pool_to[pidx];
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, node_to_cpumask(node));
+		return 1;
+	}
+}
+
+/*
+ * Use the mapping mode to choose a pool for a given CPU.
+ * Used when enqueueing an incoming RPC.  Always returns
+ * a non-NULL pool pointer.
+ */
+struct svc_pool *
+svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int pidx = 0;
+
+	/*
+	 * SVC_POOL_NONE happens in a pure client when
+	 * lockd is brought up, so silently treat it the
+	 * same as SVC_POOL_GLOBAL.
+	 */
+
+	switch (m->mode) {
+	case SVC_POOL_PERCPU:
+		pidx = m->to_pool[cpu];
+		break;
+	case SVC_POOL_PERNODE:
+		pidx = m->to_pool[cpu_to_node(cpu)];
+		break;
+	}
+	return &serv->sv_pools[pidx % serv->sv_nrpools];
+}
+
+
+/*
  * Create an RPC service
  */
 static struct svc_serv *
@@ -105,8 +345,9 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 		  svc_thread_fn func, int sig, struct module *mod)
 {
 	struct svc_serv *serv;
+	unsigned int npools = svc_pool_map_init();
 
-	serv = __svc_create(prog, bufsize, /*npools*/1, shutdown);
+	serv = __svc_create(prog, bufsize, npools, shutdown);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
@@ -209,6 +450,8 @@ svc_release_buffer(struct svc_rqst *rqstp)
 
 /*
  * Create a thread in the given pool.  Caller must hold BKL.
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread
+ * will be restricted to run on the cpus belonging to the pool.
  */
 static int
 __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
@@ -216,6 +459,8 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
 {
 	struct svc_rqst	*rqstp;
 	int		error = -ENOMEM;
+	int		have_oldmask = 0;
+	cpumask_t	oldmask;
 
 	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
 	if (!rqstp)
@@ -235,7 +480,15 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
 	spin_unlock_bh(&pool->sp_lock);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
+
+	if (serv->sv_nrpools > 1)
+		have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
+
 	error = kernel_thread((int (*)(void *)) func, rqstp, 0);
+
+	if (have_oldmask)
+		set_cpus_allowed(current, oldmask);
+
 	if (error < 0)
 		goto out_thread;
 	svc_sock_update_bufs(serv);
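
The change to __svc_create_thread() above follows a save/pin/spawn/restore pattern: remember the parent's cpumask, narrow it to the pool's cpus, create the worker thread (which inherits the narrowed mask), then restore the parent's mask. Purely as an illustration of that pattern, and not part of the patch or of kernel code, a self-contained user-space analogue using sched_setaffinity() could look like the sketch below; the choice of cpu 0 is an arbitrary stand-in for a pool's cpus.

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	cpu_set_t oldmask, poolmask;

	/* save the current affinity, as the patch saves current->cpus_allowed */
	if (sched_getaffinity(0, sizeof(oldmask), &oldmask) < 0)
		return 1;

	/* narrow the mask to the "pool" cpus before spawning; here just cpu 0 */
	CPU_ZERO(&poolmask);
	CPU_SET(0, &poolmask);
	if (sched_setaffinity(0, sizeof(poolmask), &poolmask) < 0)
		return 1;

	/* the child inherits the narrowed mask, like the kernel_thread() child */
	if (fork() == 0) {
		sleep(1);	/* a real worker would run its service loop here */
		_exit(0);
	}

	/* restore the parent's mask, matching set_cpus_allowed(current, oldmask) */
	sched_setaffinity(0, sizeof(oldmask), &oldmask);
	wait(NULL);
	return 0;
}
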
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b78659adeff3..cba85d195222 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -151,8 +151,9 @@ static void
 svc_sock_enqueue(struct svc_sock *svsk)
 {
 	struct svc_serv	*serv = svsk->sk_server;
-	struct svc_pool *pool = &serv->sv_pools[0];
+	struct svc_pool *pool;
 	struct svc_rqst	*rqstp;
+	int cpu;
 
 	if (!(svsk->sk_flags &
 	      ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
@@ -160,6 +161,10 @@ svc_sock_enqueue(struct svc_sock *svsk)
 	if (test_bit(SK_DEAD, &svsk->sk_flags))
 		return;
 
+	cpu = get_cpu();
+	pool = svc_pool_for_cpu(svsk->sk_server, cpu);
+	put_cpu();
+
 	spin_lock_bh(&pool->sp_lock);
 
 	if (!list_empty(&pool->sp_threads) &&
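
For orientation only (not part of the patch): the enqueue path above now asks svc_pool_for_cpu() which pool owns the cpu that accepted the work. The standalone sketch below, with a stubbed cpu_to_node() and a hypothetical two-node topology, works through the same to_pool[] arithmetic that SVC_POOL_PERNODE mode would use; all names and numbers are illustrative assumptions.

#include <stdio.h>

/* hypothetical topology: node 0 holds cpus 0-1, node 1 holds cpus 2-3 */
static int cpu_to_node_stub(int cpu)
{
	return cpu < 2 ? 0 : 1;
}

int main(void)
{
	/* what svc_pool_map_init_pernode() would build for this topology:
	 * one pool per node, to_pool[] indexed by node id */
	const unsigned int to_pool[] = { 0, 1 };
	const unsigned int nrpools = 2;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		unsigned int pidx = to_pool[cpu_to_node_stub(cpu)];

		/* mirrors &serv->sv_pools[pidx % serv->sv_nrpools] */
		printf("request on cpu %d -> pool %u\n", cpu, pidx % nrpools);
	}
	return 0;
}
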