aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2006-09-01 00:27:35 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-09-01 14:39:08 -0400
commitdf9ecaba3f152d1ea79f2a5e0b87505e03f47590 (patch)
treeb25f855923ef437a0513559425d6c875dbd3e617
parenta302eb4e4602d6444ae75a0e516fb2f2c62d6642 (diff)
[PATCH] ZVC: Scale thresholds depending on the size of the system
The ZVC counter update threshold is currently set to a fixed value of 32. This patch sets up the threshold depending on the number of processors and the sizes of the zones in the system. With the current threshold of 32, I was able to observe slight contention when more than 130-140 processors concurrently updated the counters. The contention vanished when I either increased the threshold to 64 or used Andrew's idea of overstepping the interval (see ZVC overstep patch). However, we saw contention again at 220-230 processors. So we need higher values for larger systems. But the current default is already a bit of an overkill for smaller systems. Some systems have tiny zones where precision matters. For example i386 and x86_64 have 16M DMA zones and either 900M ZONE_NORMAL or ZONE_DMA32. These are even present on SMP and NUMA systems. The patch here sets up a threshold based on the number of processors in the system and the size of the zone that these counters are used for. The threshold should grow logarithmically, so we use fls() as an easy approximation. Results of tests on a system with 1024 processors (4TB RAM) The following output is from a test allocating 1GB of memory concurrently on each processor (Forking the process. So contention on mmap_sem and the pte locks is not a factor): X MIN TYPE: CPUS WALL WALL SYS USER TOTCPU fork 1 0.552 0.552 0.540 0.012 0.552 fork 4 0.552 0.548 2.164 0.036 2.200 fork 16 0.564 0.548 8.812 0.164 8.976 fork 128 0.580 0.572 72.204 1.208 73.412 fork 256 1.300 0.660 310.400 2.160 312.560 fork 512 3.512 0.696 1526.836 4.816 1531.652 fork 1020 20.024 0.700 17243.176 6.688 17249.863 So a threshold of 32 is fine up to 128 processors. At 256 processors contention becomes a factor. Overstepping the counter (earlier patch) improves the numbers a bit: fork 4 0.552 0.548 2.164 0.040 2.204 fork 16 0.552 0.548 8.640 0.148 8.788 fork 128 0.556 0.548 69.676 0.956 70.632 fork 256 0.876 0.636 212.468 2.108 214.576 fork 512 2.276 0.672 997.324 4.260 1001.584 fork 1020 13.564 0.680 11586.436 6.088 11592.523 Still contention at 512 and 1020. Contention at 1020 is down by a third. 256 still has a slight bit of contention. After this patch the counter threshold will be set to 125 which reduces contention significantly: fork 128 0.560 0.548 69.776 0.932 70.708 fork 256 0.636 0.556 143.460 2.036 145.496 fork 512 0.640 0.548 284.244 4.236 288.480 fork 1020 1.500 0.588 1326.152 8.892 1335.044 [akpm@osdl.org: !SMP build fix] Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/mmzone.h1
-rw-r--r--mm/vmstat.c139
2 files changed, 120 insertions, 20 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 656b588a9f96..f45163c528e8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -77,6 +77,7 @@ struct per_cpu_pages {
77struct per_cpu_pageset { 77struct per_cpu_pageset {
78 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 78 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
79#ifdef CONFIG_SMP 79#ifdef CONFIG_SMP
80 s8 stat_threshold;
80 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; 81 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
81#endif 82#endif
82} ____cacheline_aligned_in_smp; 83} ____cacheline_aligned_in_smp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3799a0f7543a..c1b5f4106b38 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h>
15 16
16void __get_zone_counts(unsigned long *active, unsigned long *inactive, 17void __get_zone_counts(unsigned long *active, unsigned long *inactive,
17 unsigned long *free, struct pglist_data *pgdat) 18 unsigned long *free, struct pglist_data *pgdat)
@@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat);
114 115
115#ifdef CONFIG_SMP 116#ifdef CONFIG_SMP
116 117
117#define STAT_THRESHOLD 32 118static int calculate_threshold(struct zone *zone)
119{
120 int threshold;
121 int mem; /* memory in 128 MB units */
122
123 /*
124 * The threshold scales with the number of processors and the amount
125 * of memory per zone. More memory means that we can defer updates for
126 * longer, more processors could lead to more contention.
127 * fls() is used to have a cheap way of logarithmic scaling.
128 *
129 * Some sample thresholds:
130 *
131 * Threshold Processors (fls) Zonesize fls(mem+1)
132 * ------------------------------------------------------------------
133 * 8 1 1 0.9-1 GB 4
134 * 16 2 2 0.9-1 GB 4
135 * 20 2 2 1-2 GB 5
136 * 24 2 2 2-4 GB 6
137 * 28 2 2 4-8 GB 7
138 * 32 2 2 8-16 GB 8
139 * 4 2 2 <128M 1
140 * 30 4 3 2-4 GB 5
141 * 48 4 3 8-16 GB 8
142 * 32 8 4 1-2 GB 4
143 * 32 8 4 0.9-1GB 4
144 * 10 16 5 <128M 1
145 * 40 16 5 900M 4
146 * 70 64 7 2-4 GB 5
147 * 84 64 7 4-8 GB 6
148 * 108 512 9 4-8 GB 6
149 * 125 1024 10 8-16 GB 8
150 * 125 1024 10 16-32 GB 9
151 */
152
153 mem = zone->present_pages >> (27 - PAGE_SHIFT);
154
155 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
156
157 /*
158 * Maximum threshold is 125
159 */
160 threshold = min(125, threshold);
161
162 return threshold;
163}
118 164
119/* 165/*
120 * Determine pointer to currently valid differential byte given a zone and 166 * Refresh the thresholds for each zone.
121 * the item number.
122 *
123 * Preemption must be off
124 */ 167 */
125static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) 168static void refresh_zone_stat_thresholds(void)
126{ 169{
127 return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; 170 struct zone *zone;
171 int cpu;
172 int threshold;
173
174 for_each_zone(zone) {
175
176 if (!zone->present_pages)
177 continue;
178
179 threshold = calculate_threshold(zone);
180
181 for_each_online_cpu(cpu)
182 zone_pcp(zone, cpu)->stat_threshold = threshold;
183 }
128} 184}
129 185
130/* 186/*
@@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
133void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 189void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
134 int delta) 190 int delta)
135{ 191{
136 s8 *p; 192 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
193 s8 *p = pcp->vm_stat_diff + item;
137 long x; 194 long x;
138 195
139 p = diff_pointer(zone, item);
140 x = delta + *p; 196 x = delta + *p;
141 197
142 if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { 198 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
143 zone_page_state_add(x, zone, item); 199 zone_page_state_add(x, zone, item);
144 x = 0; 200 x = 0;
145 } 201 }
146
147 *p = x; 202 *p = x;
148} 203}
149EXPORT_SYMBOL(__mod_zone_page_state); 204EXPORT_SYMBOL(__mod_zone_page_state);
@@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state);
172 * No overflow check is necessary and therefore the differential can be 227 * No overflow check is necessary and therefore the differential can be
173 * incremented or decremented in place which may allow the compilers to 228 * incremented or decremented in place which may allow the compilers to
174 * generate better code. 229 * generate better code.
175 *
176 * The increment or decrement is known and therefore one boundary check can 230 * The increment or decrement is known and therefore one boundary check can
177 * be omitted. 231 * be omitted.
178 * 232 *
233 * NOTE: These functions are very performance sensitive. Change only
234 * with care.
235 *
179 * Some processors have inc/dec instructions that are atomic vs an interrupt. 236 * Some processors have inc/dec instructions that are atomic vs an interrupt.
180 * However, the code must first determine the differential location in a zone 237 * However, the code must first determine the differential location in a zone
181 * based on the processor number and then inc/dec the counter. There is no 238 * based on the processor number and then inc/dec the counter. There is no
@@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state);
185 */ 242 */
186static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 243static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
187{ 244{
188 s8 *p = diff_pointer(zone, item); 245 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
246 s8 *p = pcp->vm_stat_diff + item;
189 247
190 (*p)++; 248 (*p)++;
191 249
192 if (unlikely(*p > STAT_THRESHOLD)) { 250 if (unlikely(*p > pcp->stat_threshold)) {
193 zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); 251 int overstep = pcp->stat_threshold / 2;
194 *p = -STAT_THRESHOLD / 2; 252
253 zone_page_state_add(*p + overstep, zone, item);
254 *p = -overstep;
195 } 255 }
196} 256}
197 257
@@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state);
204void __dec_zone_page_state(struct page *page, enum zone_stat_item item) 264void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
205{ 265{
206 struct zone *zone = page_zone(page); 266 struct zone *zone = page_zone(page);
207 s8 *p = diff_pointer(zone, item); 267 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
268 s8 *p = pcp->vm_stat_diff + item;
208 269
209 (*p)--; 270 (*p)--;
210 271
211 if (unlikely(*p < -STAT_THRESHOLD)) { 272 if (unlikely(*p < - pcp->stat_threshold)) {
212 zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); 273 int overstep = pcp->stat_threshold / 2;
213 *p = STAT_THRESHOLD /2; 274
275 zone_page_state_add(*p - overstep, zone, item);
276 *p = overstep;
214 } 277 }
215} 278}
216EXPORT_SYMBOL(__dec_zone_page_state); 279EXPORT_SYMBOL(__dec_zone_page_state);
@@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
515 pageset->pcp[j].high, 578 pageset->pcp[j].high,
516 pageset->pcp[j].batch); 579 pageset->pcp[j].batch);
517 } 580 }
581#ifdef CONFIG_SMP
582 seq_printf(m, "\n vm stats threshold: %d",
583 pageset->stat_threshold);
584#endif
518 } 585 }
519 seq_printf(m, 586 seq_printf(m,
520 "\n all_unreclaimable: %u" 587 "\n all_unreclaimable: %u"
@@ -603,3 +670,35 @@ struct seq_operations vmstat_op = {
603 670
604#endif /* CONFIG_PROC_FS */ 671#endif /* CONFIG_PROC_FS */
605 672
673#ifdef CONFIG_SMP
674/*
675 * Use the cpu notifier to insure that the thresholds are recalculated
676 * when necessary.
677 */
678static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
679 unsigned long action,
680 void *hcpu)
681{
682 switch (action) {
683 case CPU_UP_PREPARE:
684 case CPU_UP_CANCELED:
685 case CPU_DEAD:
686 refresh_zone_stat_thresholds();
687 break;
688 default:
689 break;
690 }
691 return NOTIFY_OK;
692}
693
694static struct notifier_block __cpuinitdata vmstat_notifier =
695 { &vmstat_cpuup_callback, NULL, 0 };
696
697int __init setup_vmstat(void)
698{
699 refresh_zone_stat_thresholds();
700 register_cpu_notifier(&vmstat_notifier);
701 return 0;
702}
703module_init(setup_vmstat)
704#endif