aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@engr.sgi.com>2006-03-09 20:33:54 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-09 22:47:38 -0500
commit8fce4d8e3b9e3cf47cc8afeb6077e22ab795d989 (patch)
tree4930be5756f7a3893717d38f443f6261f11a1f60
parent7b61fcda8a640bb87be23f9f09c1f24357b5c6e1 (diff)
[PATCH] slab: Node rotor for freeing alien caches and remote per cpu pages.
The cache reaper currently tries to free all alien caches and all remote per cpu pages in each pass of cache_reap. For a machines with large number of nodes (such as Altix) this may lead to sporadic delays of around ~10ms. Interrupts are disabled while reclaiming creating unacceptable delays. This patch changes that behavior by adding a per cpu reap_node variable. Instead of attempting to free all caches, we free only one alien cache and the per cpu pages from one remote node. That reduces the time spend in cache_reap. However, doing so will lengthen the time it takes to completely drain all remote per cpu pagesets and all alien caches. The time needed will grow with the number of nodes in the system. All caches are drained when they overflow their respective capacity. So the drawback here is only that a bit of memory may be wasted for awhile longer. Details: 1. Rename drain_remote_pages to drain_node_pages to allow the specification of the node to drain of pcp pages. 2. Add additional functions init_reap_node, next_reap_node for NUMA that manage a per cpu reap_node counter. 3. Add a reap_alien function that reaps only from the current reap_node. For us this seems to be a critical issue. Holdoffs of an average of ~7ms cause some HPC benchmarks to slow down significantly. F.e. NAS parallel slows down dramatically. NAS parallel has a 12-16 seconds runtime w/o rotor compared to 5.8 secs with the rotor patches. It gets down to 5.05 secs with the additional interrupt holdoff reductions. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/gfp.h4
-rw-r--r--mm/page_alloc.c17
-rw-r--r--mm/slab.c65
3 files changed, 72 insertions, 14 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 20f9148e38d9..7851e6b520cf 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -157,9 +157,9 @@ extern void FASTCALL(free_cold_page(struct page *page));
157 157
158void page_alloc_init(void); 158void page_alloc_init(void);
159#ifdef CONFIG_NUMA 159#ifdef CONFIG_NUMA
160void drain_remote_pages(void); 160void drain_node_pages(int node);
161#else 161#else
162static inline void drain_remote_pages(void) { }; 162static inline void drain_node_pages(int node) { };
163#endif 163#endif
164 164
165#endif /* __LINUX_GFP_H */ 165#endif /* __LINUX_GFP_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d7d3fa..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
590} 590}
591 591
592#ifdef CONFIG_NUMA 592#ifdef CONFIG_NUMA
593/* Called from the slab reaper to drain remote pagesets */ 593/*
594void drain_remote_pages(void) 594 * Called from the slab reaper to drain pagesets on a particular node that
595 * belong to the currently executing processor.
596 */
597void drain_node_pages(int nodeid)
595{ 598{
596 struct zone *zone; 599 int i, z;
597 int i;
598 unsigned long flags; 600 unsigned long flags;
599 601
600 local_irq_save(flags); 602 local_irq_save(flags);
601 for_each_zone(zone) { 603 for (z = 0; z < MAX_NR_ZONES; z++) {
604 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
602 struct per_cpu_pageset *pset; 605 struct per_cpu_pageset *pset;
603 606
604 /* Do not drain local pagesets */
605 if (zone->zone_pgdat->node_id == numa_node_id())
606 continue;
607
608 pset = zone_pcp(zone, smp_processor_id()); 607 pset = zone_pcp(zone, smp_processor_id());
609 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 608 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
610 struct per_cpu_pages *pcp; 609 struct per_cpu_pages *pcp;
diff --git a/mm/slab.c b/mm/slab.c
index 61800b88e241..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
789 dump_stack(); 789 dump_stack();
790} 790}
791 791
792#ifdef CONFIG_NUMA
793/*
794 * Special reaping functions for NUMA systems called from cache_reap().
795 * These take care of doing round robin flushing of alien caches (containing
796 * objects freed on different nodes from which they were allocated) and the
797 * flushing of remote pcps by calling drain_node_pages.
798 */
799static DEFINE_PER_CPU(unsigned long, reap_node);
800
801static void init_reap_node(int cpu)
802{
803 int node;
804
805 node = next_node(cpu_to_node(cpu), node_online_map);
806 if (node == MAX_NUMNODES)
807 node = 0;
808
809 __get_cpu_var(reap_node) = node;
810}
811
812static void next_reap_node(void)
813{
814 int node = __get_cpu_var(reap_node);
815
816 /*
817 * Also drain per cpu pages on remote zones
818 */
819 if (node != numa_node_id())
820 drain_node_pages(node);
821
822 node = next_node(node, node_online_map);
823 if (unlikely(node >= MAX_NUMNODES))
824 node = first_node(node_online_map);
825 __get_cpu_var(reap_node) = node;
826}
827
828#else
829#define init_reap_node(cpu) do { } while (0)
830#define next_reap_node(void) do { } while (0)
831#endif
832
792/* 833/*
793 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 834 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
794 * via the workqueue/eventd. 835 * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
806 * at that time. 847 * at that time.
807 */ 848 */
808 if (keventd_up() && reap_work->func == NULL) { 849 if (keventd_up() && reap_work->func == NULL) {
850 init_reap_node(cpu);
809 INIT_WORK(reap_work, cache_reap, NULL); 851 INIT_WORK(reap_work, cache_reap, NULL);
810 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 852 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
811 } 853 }
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
884 } 926 }
885} 927}
886 928
929/*
930 * Called from cache_reap() to regularly drain alien caches round robin.
931 */
932static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
933{
934 int node = __get_cpu_var(reap_node);
935
936 if (l3->alien) {
937 struct array_cache *ac = l3->alien[node];
938 if (ac && ac->avail) {
939 spin_lock_irq(&ac->lock);
940 __drain_alien_cache(cachep, ac, node);
941 spin_unlock_irq(&ac->lock);
942 }
943 }
944}
945
887static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) 946static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
888{ 947{
889 int i = 0; 948 int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
902#else 961#else
903 962
904#define drain_alien_cache(cachep, alien) do { } while (0) 963#define drain_alien_cache(cachep, alien) do { } while (0)
964#define reap_alien(cachep, l3) do { } while (0)
905 965
906static inline struct array_cache **alloc_alien_cache(int node, int limit) 966static inline struct array_cache **alloc_alien_cache(int node, int limit)
907{ 967{
@@ -3497,8 +3557,7 @@ static void cache_reap(void *unused)
3497 check_irq_on(); 3557 check_irq_on();
3498 3558
3499 l3 = searchp->nodelists[numa_node_id()]; 3559 l3 = searchp->nodelists[numa_node_id()];
3500 if (l3->alien) 3560 reap_alien(searchp, l3);
3501 drain_alien_cache(searchp, l3->alien);
3502 spin_lock_irq(&l3->list_lock); 3561 spin_lock_irq(&l3->list_lock);
3503 3562
3504 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3563 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3548,7 +3607,7 @@ static void cache_reap(void *unused)
3548 } 3607 }
3549 check_irq_on(); 3608 check_irq_on();
3550 mutex_unlock(&cache_chain_mutex); 3609 mutex_unlock(&cache_chain_mutex);
3551 drain_remote_pages(); 3610 next_reap_node();
3552 /* Setup the next iteration */ 3611 /* Setup the next iteration */
3553 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3612 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3554} 3613}