author	Greg Banks <gnb@melbourne.sgi.com>	2006-10-02 05:18:01 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-02 10:57:20 -0400
commit	bfd241600a3b0db4fe43c859f1460d0a958d924a (patch)
tree	7f04604adee7249e686d1db0cac93f1fee8bc5b6 /net/sunrpc/svc.c
parent	eec09661dc82e90a31051d045a94026a91aceb82 (diff)
[PATCH] knfsd: make rpc threads pools numa aware
Actually implement multiple pools.  On NUMA machines, allocate a svc_pool per NUMA node; on SMP a svc_pool per CPU; otherwise a single global pool.  Enqueue sockets on the svc_pool corresponding to the CPU on which the socket bh is run (i.e. the NIC interrupt CPU).  Threads have their cpu mask set to limit them to the CPUs in the svc_pool that owns them.

This is the patch that allows an Altix to scale NFS traffic linearly beyond 4 CPUs and 4 NICs.

Incorporates changes and feedback from Neil Brown, Trond Myklebust, and Christoph Hellwig.

Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
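The enqueue half of this design lives in net/sunrpc/svcsock.c and is not part of this diff. A minimal sketch of what such a caller could look like, assuming it picks the pool for the CPU running the socket bottom half via the new svc_pool_for_cpu(); the helper name example_enqueue and the queuing comment are illustrative, not taken from this patch:

#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>

/* Illustrative only: keep a request on the pool owned by the
 * NIC interrupt CPU's node (or CPU, in per-cpu mode). */
static void example_enqueue(struct svc_serv *serv, struct svc_sock *svsk)
{
	struct svc_pool *pool;
	int cpu;

	cpu = get_cpu();		/* stay on this CPU while choosing a pool */
	pool = svc_pool_for_cpu(serv, cpu);
	put_cpu();

	spin_lock_bh(&pool->sp_lock);
	/* ... queue svsk on this pool and wake one of its threads ... */
	spin_unlock_bh(&pool->sp_lock);
}

The get_cpu()/put_cpu() pair only makes sure the CPU used for the lookup is the one actually running the bottom half; the real caller in svcsock.c may differ in detail.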
Diffstat (limited to 'net/sunrpc/svc.c')
-rw-r--r--	net/sunrpc/svc.c	255
1 file changed, 254 insertions(+), 1 deletion(-)
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 8c75eec4fd6a..a99e67b164c1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -4,6 +4,10 @@
  * High-level RPC service routines
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * Multiple threads pools and NUMAisation
+ * Copyright (c) 2006 Silicon Graphics, Inc.
+ * by Greg Banks <gnb@melbourne.sgi.com>
  */
 
 #include <linux/linkage.h>
@@ -25,6 +29,242 @@
 #define RPC_PARANOIA 1
 
 /*
+ * Mode for mapping cpus to pools.
+ */
+enum {
+	SVC_POOL_NONE = -1,	/* uninitialised, choose one of the others */
+	SVC_POOL_GLOBAL,	/* no mapping, just a single global pool
+				 * (legacy & UP mode) */
+	SVC_POOL_PERCPU,	/* one pool per cpu */
+	SVC_POOL_PERNODE	/* one pool per numa node */
+};
+
+/*
+ * Structure for mapping cpus to pools and vice versa.
+ * Setup once during sunrpc initialisation.
+ */
+static struct svc_pool_map {
+	int mode;			/* Note: int not enum to avoid
+					 * warnings about "enumeration value
+					 * not handled in switch" */
+	unsigned int npools;
+	unsigned int *pool_to;		/* maps pool id to cpu or node */
+	unsigned int *to_pool;		/* maps cpu or node to pool id */
+} svc_pool_map = {
+	.mode = SVC_POOL_NONE
+};
+
+
+/*
+ * Detect best pool mapping mode heuristically,
+ * according to the machine's topology.
+ */
+static int
+svc_pool_map_choose_mode(void)
+{
+	unsigned int node;
+
+	if (num_online_nodes() > 1) {
+		/*
+		 * Actually have multiple NUMA nodes,
+		 * so split pools on NUMA node boundaries
+		 */
+		return SVC_POOL_PERNODE;
+	}
+
+	node = any_online_node(node_online_map);
+	if (nr_cpus_node(node) > 2) {
+		/*
+		 * Non-trivial SMP, or CONFIG_NUMA on
+		 * non-NUMA hardware, e.g. with a generic
+		 * x86_64 kernel on Xeons. In this case we
+		 * want to divide the pools on cpu boundaries.
+		 */
+		return SVC_POOL_PERCPU;
+	}
+
+	/* default: one global pool */
+	return SVC_POOL_GLOBAL;
+}
+
+/*
+ * Allocate the to_pool[] and pool_to[] arrays.
+ * Returns 0 on success or an errno.
+ */
+static int
+svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
+{
+	m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
+	if (!m->to_pool)
+		goto fail;
+	m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
+	if (!m->pool_to)
+		goto fail_free;
+
+	return 0;
+
+fail_free:
+	kfree(m->to_pool);
+fail:
+	return -ENOMEM;
+}
+
+/*
+ * Initialise the pool map for SVC_POOL_PERCPU mode.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_percpu(struct svc_pool_map *m)
+{
+	unsigned int maxpools = highest_possible_processor_id()+1;
+	unsigned int pidx = 0;
+	unsigned int cpu;
+	int err;
+
+	err = svc_pool_map_alloc_arrays(m, maxpools);
+	if (err)
+		return err;
+
+	for_each_online_cpu(cpu) {
+		BUG_ON(pidx > maxpools);
+		m->to_pool[cpu] = pidx;
+		m->pool_to[pidx] = cpu;
+		pidx++;
+	}
+	/* cpus brought online later all get mapped to pool0, sorry */
+
+	return pidx;
+};
+
+
+/*
+ * Initialise the pool map for SVC_POOL_PERNODE mode.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_pernode(struct svc_pool_map *m)
+{
+	unsigned int maxpools = highest_possible_node_id()+1;
+	unsigned int pidx = 0;
+	unsigned int node;
+	int err;
+
+	err = svc_pool_map_alloc_arrays(m, maxpools);
+	if (err)
+		return err;
+
+	for_each_node_with_cpus(node) {
+		/* some architectures (e.g. SN2) have cpuless nodes */
+		BUG_ON(pidx > maxpools);
+		m->to_pool[node] = pidx;
+		m->pool_to[pidx] = node;
+		pidx++;
+	}
+	/* nodes brought online later all get mapped to pool0, sorry */
+
+	return pidx;
+}
+
+
+/*
+ * Build the global map of cpus to pools and vice versa.
+ */
+static unsigned int
+svc_pool_map_init(void)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	int npools = -1;
+
+	if (m->mode != SVC_POOL_NONE)
+		return m->npools;
+
+	m->mode = svc_pool_map_choose_mode();
+
+	switch (m->mode) {
+	case SVC_POOL_PERCPU:
+		npools = svc_pool_map_init_percpu(m);
+		break;
+	case SVC_POOL_PERNODE:
+		npools = svc_pool_map_init_pernode(m);
+		break;
+	}
+
+	if (npools < 0) {
+		/* default, or memory allocation failure */
+		npools = 1;
+		m->mode = SVC_POOL_GLOBAL;
+	}
+	m->npools = npools;
+
+	return m->npools;
+}
+
+/*
+ * Set the current thread's cpus_allowed mask so that it
+ * will only run on cpus in the given pool.
+ *
+ * Returns 1 and fills in oldmask iff a cpumask was applied.
+ */
+static inline int
+svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int node;	/* or cpu */
+
+	/*
+	 * The caller checks for sv_nrpools > 1, which
+	 * implies that we've been initialized and the
+	 * map mode is not NONE.
+	 */
+	BUG_ON(m->mode == SVC_POOL_NONE);
+
+	switch (m->mode)
+	{
+	default:
+		return 0;
+	case SVC_POOL_PERCPU:
+		node = m->pool_to[pidx];
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, cpumask_of_cpu(node));
+		return 1;
+	case SVC_POOL_PERNODE:
+		node = m->pool_to[pidx];
+		*oldmask = current->cpus_allowed;
+		set_cpus_allowed(current, node_to_cpumask(node));
+		return 1;
+	}
+}
+
+/*
+ * Use the mapping mode to choose a pool for a given CPU.
+ * Used when enqueueing an incoming RPC. Always returns
+ * a non-NULL pool pointer.
+ */
+struct svc_pool *
+svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	unsigned int pidx = 0;
+
+	/*
+	 * SVC_POOL_NONE happens in a pure client when
+	 * lockd is brought up, so silently treat it the
+	 * same as SVC_POOL_GLOBAL.
+	 */
+
+	switch (m->mode) {
+	case SVC_POOL_PERCPU:
+		pidx = m->to_pool[cpu];
+		break;
+	case SVC_POOL_PERNODE:
+		pidx = m->to_pool[cpu_to_node(cpu)];
+		break;
+	}
+	return &serv->sv_pools[pidx % serv->sv_nrpools];
+}
+
+
+/*
  * Create an RPC service
  */
 static struct svc_serv *
@@ -105,8 +345,9 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 		  svc_thread_fn func, int sig, struct module *mod)
 {
 	struct svc_serv *serv;
+	unsigned int npools = svc_pool_map_init();
 
-	serv = __svc_create(prog, bufsize, /*npools*/1, shutdown);
+	serv = __svc_create(prog, bufsize, npools, shutdown);
 
 	if (serv != NULL) {
 		serv->sv_function = func;
@@ -209,6 +450,8 @@ svc_release_buffer(struct svc_rqst *rqstp)
 
 /*
  * Create a thread in the given pool. Caller must hold BKL.
+ * On a NUMA or SMP machine, with a multi-pool serv, the thread
+ * will be restricted to run on the cpus belonging to the pool.
  */
 static int
 __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
@@ -216,6 +459,8 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
 {
 	struct svc_rqst *rqstp;
 	int error = -ENOMEM;
+	int have_oldmask = 0;
+	cpumask_t oldmask;
 
 	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
 	if (!rqstp)
@@ -235,7 +480,15 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
 	spin_unlock_bh(&pool->sp_lock);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
+
+	if (serv->sv_nrpools > 1)
+		have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
+
 	error = kernel_thread((int (*)(void *)) func, rqstp, 0);
+
+	if (have_oldmask)
+		set_cpus_allowed(current, oldmask);
+
 	if (error < 0)
 		goto out_thread;
 	svc_sock_update_bufs(serv);