[PATCH] slab: Node rotor for freeing alien caches and remote per cpu pages.

The cache reaper currently tries to free all alien caches and all remote per cpu pages in each pass of cache_reap. For machines with a large number of nodes (such as Altix) this may lead to sporadic delays of around 10ms. Interrupts are disabled while reclaiming, creating unacceptable delays. This patch changes that behavior by adding a per cpu reap_node variable. Instead of attempting to free all caches, we free only one alien cache and the per cpu pages from one remote node. That reduces the time spent in cache_reap. However, doing so will lengthen the time it takes to completely drain all remote per cpu pagesets and all alien caches. The time needed will grow with the number of nodes in the system. All caches are drained when they overflow their respective capacity. So the drawback here is only that a bit of memory may be wasted for a while longer. Details: 1. Rename drain_remote_pages to drain_node_pages to allow the specification of the node to drain of pcp pages. 2. Add additional functions init_reap_node, next_reap_node for NUMA that manage a per cpu reap_node counter. 3. Add a reap_alien function that reaps only from the current reap_node. For us this seems to be a critical issue. Holdoffs of an average of ~7ms cause some HPC benchmarks to slow down significantly. For example, NAS parallel slows down dramatically. NAS parallel has a 12-16 seconds runtime w/o rotor compared to 5.8 secs with the rotor patches. It gets down to 5.05 secs with the additional interrupt holdoff reductions. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Christoph Lameter <clameter@engr.sgi.com> 2006-03-09 20:33:54 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-03-09 22:47:38 -0500
commit: 8fce4d8e3b9e3cf47cc8afeb6077e22ab795d989 (patch)
tree: 4930be5756f7a3893717d38f443f6261f11a1f60 /mm
parent: 7b61fcda8a640bb87be23f9f09c1f24357b5c6e1 (diff)
2 files changed, 70 insertions, 12 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d7d3fa..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
+/*
-void drain_remote_pages(void)
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-        struct zone *zone;
+        int i, z;
-        int i;
        unsigned long flags;
        local_irq_save(flags);
-        for_each_zone(zone) {
+        for (z = 0; z < MAX_NR_ZONES; z++) {
+                struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
                struct per_cpu_pageset *pset;
-                /* Do not drain local pagesets */
-                if (zone->zone_pgdat->node_id == numa_node_id())
-                        continue;
                pset = zone_pcp(zone, smp_processor_id());
                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                        struct per_cpu_pages *pcp;
diff --git a/mm/slab.c b/mm/slab.c
index 61800b88e241..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
        dump_stack();
 }
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+static void init_reap_node(int cpu)
+{
+        int node;
+        node = next_node(cpu_to_node(cpu), node_online_map);
+        if (node == MAX_NUMNODES)
+                node = 0;
+        __get_cpu_var(reap_node) = node;
+}
+static void next_reap_node(void)
+{
+        int node = __get_cpu_var(reap_node);
+        /*
+         * Also drain per cpu pages on remote zones
+         */
+        if (node != numa_node_id())
+                drain_node_pages(node);
+        node = next_node(node, node_online_map);
+        if (unlikely(node >= MAX_NUMNODES))
+                node = first_node(node_online_map);
+        __get_cpu_var(reap_node) = node;
+}
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
 /*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
         * at that time.
         */
        if (keventd_up() && reap_work->func == NULL) {
+                init_reap_node(cpu);
                INIT_WORK(reap_work, cache_reap, NULL);
                schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
        }
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
        }
 }
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+        int node = __get_cpu_var(reap_node);
+        if (l3->alien) {
+                struct array_cache *ac = l3->alien[node];
+                if (ac && ac->avail) {
+                        spin_lock_irq(&ac->lock);
+                        __drain_alien_cache(cachep, ac, node);
+                        spin_unlock_irq(&ac->lock);
+                }
+        }
+}
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
        int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 #else
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -3497,8 +3557,7 @@ static void cache_reap(void *unused)
                check_irq_on();
                l3 = searchp->nodelists[numa_node_id()];
-                if (l3->alien)
+                reap_alien(searchp, l3);
-                        drain_alien_cache(searchp, l3->alien);
                spin_lock_irq(&l3->list_lock);
                drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3548,7 +3607,7 @@ static void cache_reap(void *unused)
        }
        check_irq_on();
        mutex_unlock(&cache_chain_mutex);
-        drain_remote_pages();
+        next_reap_node();
        /* Setup the next iteration */
        schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
author	Christoph Lameter <clameter@engr.sgi.com>	2006-03-09 20:33:54 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-03-09 22:47:38 -0500
commit	8fce4d8e3b9e3cf47cc8afeb6077e22ab795d989 (patch)
tree	4930be5756f7a3893717d38f443f6261f11a1f60 /mm
parent	7b61fcda8a640bb87be23f9f09c1f24357b5c6e1 (diff)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 791690d7d3fa..234bd4895d14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
590	}	590	}
591		591
592	#ifdef CONFIG_NUMA	592	#ifdef CONFIG_NUMA
593	/* Called from the slab reaper to drain remote pagesets */	593	/*
594	void drain_remote_pages(void)	594	* Called from the slab reaper to drain pagesets on a particular node that
		595	* belong to the currently executing processor.
		596	*/
		597	void drain_node_pages(int nodeid)
595	{	598	{
596	struct zone *zone;	599	int i, z;
597	int i;
598	unsigned long flags;	600	unsigned long flags;
599		601
600	local_irq_save(flags);	602	local_irq_save(flags);
601	for_each_zone(zone) {	603	for (z = 0; z < MAX_NR_ZONES; z++) {
		604	struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
602	struct per_cpu_pageset *pset;	605	struct per_cpu_pageset *pset;
603		606
604	/* Do not drain local pagesets */
605	if (zone->zone_pgdat->node_id == numa_node_id())
606	continue;
607
608	pset = zone_pcp(zone, smp_processor_id());	607	pset = zone_pcp(zone, smp_processor_id());
609	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {	608	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
610	struct per_cpu_pages *pcp;	609	struct per_cpu_pages *pcp;


diff --git a/mm/slab.c b/mm/slab.c index 61800b88e241..d0bd7f07ab04 100644 --- a/mm/slab.c +++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
789	dump_stack();	789	dump_stack();
790	}	790	}
791		791
		792	#ifdef CONFIG_NUMA
		793	/*
		794	* Special reaping functions for NUMA systems called from cache_reap().
		795	* These take care of doing round robin flushing of alien caches (containing
		796	* objects freed on different nodes from which they were allocated) and the
		797	* flushing of remote pcps by calling drain_node_pages.
		798	*/
		799	static DEFINE_PER_CPU(unsigned long, reap_node);
		800
		801	static void init_reap_node(int cpu)
		802	{
		803	int node;
		804
		805	node = next_node(cpu_to_node(cpu), node_online_map);
		806	if (node == MAX_NUMNODES)
		807	node = 0;
		808
		809	__get_cpu_var(reap_node) = node;
		810	}
		811
		812	static void next_reap_node(void)
		813	{
		814	int node = __get_cpu_var(reap_node);
		815
		816	/*
		817	* Also drain per cpu pages on remote zones
		818	*/
		819	if (node != numa_node_id())
		820	drain_node_pages(node);
		821
		822	node = next_node(node, node_online_map);
		823	if (unlikely(node >= MAX_NUMNODES))
		824	node = first_node(node_online_map);
		825	__get_cpu_var(reap_node) = node;
		826	}
		827
		828	#else
		829	#define init_reap_node(cpu) do { } while (0)
		830	#define next_reap_node(void) do { } while (0)
		831	#endif
		832
792	/*	833	/*
793	* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz	834	* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
794	* via the workqueue/eventd.	835	* via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
806	* at that time.	847	* at that time.
807	*/	848	*/
808	if (keventd_up() && reap_work->func == NULL) {	849	if (keventd_up() && reap_work->func == NULL) {
		850	init_reap_node(cpu);
809	INIT_WORK(reap_work, cache_reap, NULL);	851	INIT_WORK(reap_work, cache_reap, NULL);
810	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);	852	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
811	}	853	}
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
884	}	926	}
885	}	927	}
886		928
		929	/*
		930	* Called from cache_reap() to regularly drain alien caches round robin.
		931	*/
		932	static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
		933	{
		934	int node = __get_cpu_var(reap_node);
		935
		936	if (l3->alien) {
		937	struct array_cache *ac = l3->alien[node];
		938	if (ac && ac->avail) {
		939	spin_lock_irq(&ac->lock);
		940	__drain_alien_cache(cachep, ac, node);
		941	spin_unlock_irq(&ac->lock);
		942	}
		943	}
		944	}
		945
887	static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)	946	static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
888	{	947	{
889	int i = 0;	948	int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
902	#else	961	#else
903		962
904	#define drain_alien_cache(cachep, alien) do { } while (0)	963	#define drain_alien_cache(cachep, alien) do { } while (0)
		964	#define reap_alien(cachep, l3) do { } while (0)
905		965
906	static inline struct array_cache **alloc_alien_cache(int node, int limit)	966	static inline struct array_cache **alloc_alien_cache(int node, int limit)
907	{	967	{
@@ -3497,8 +3557,7 @@ static void cache_reap(void *unused)
3497	check_irq_on();	3557	check_irq_on();
3498		3558
3499	l3 = searchp->nodelists[numa_node_id()];	3559	l3 = searchp->nodelists[numa_node_id()];
3500	if (l3->alien)	3560	reap_alien(searchp, l3);
3501	drain_alien_cache(searchp, l3->alien);
3502	spin_lock_irq(&l3->list_lock);	3561	spin_lock_irq(&l3->list_lock);
3503		3562
3504	drain_array_locked(searchp, cpu_cache_get(searchp), 0,	3563	drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3548,7 +3607,7 @@ static void cache_reap(void *unused)
3548	}	3607	}
3549	check_irq_on();	3608	check_irq_on();
3550	mutex_unlock(&cache_chain_mutex);	3609	mutex_unlock(&cache_chain_mutex);
3551	drain_remote_pages();	3610	next_reap_node();
3552	/* Setup the next iteration */	3611	/* Setup the next iteration */
3553	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);	3612	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3554	}	3613	}