diff options

 include/linux/mmzone.h |   1 +
 mm/vmstat.c            | 139 ++++++++++++++++++++++++++++++++----------
 2 files changed, 120 insertions(+), 20 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 656b588a9f96..f45163c528e8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
| @@ -77,6 +77,7 @@ struct per_cpu_pages { | |||
| 77 | struct per_cpu_pageset { | 77 | struct per_cpu_pageset { |
| 78 | struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ | 78 | struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ |
| 79 | #ifdef CONFIG_SMP | 79 | #ifdef CONFIG_SMP |
| 80 | s8 stat_threshold; | ||
| 80 | s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; | 81 | s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; |
| 81 | #endif | 82 | #endif |
| 82 | } ____cacheline_aligned_in_smp; | 83 | } ____cacheline_aligned_in_smp; |
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3799a0f7543a..c1b5f4106b38 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
| 15 | 16 | ||
| 16 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 17 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
| 17 | unsigned long *free, struct pglist_data *pgdat) | 18 | unsigned long *free, struct pglist_data *pgdat) |
| @@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat); | |||
| 114 | 115 | ||
| 115 | #ifdef CONFIG_SMP | 116 | #ifdef CONFIG_SMP |
| 116 | 117 | ||
| 117 | #define STAT_THRESHOLD 32 | 118 | static int calculate_threshold(struct zone *zone) |
| 119 | { | ||
| 120 | int threshold; | ||
| 121 | int mem; /* memory in 128 MB units */ | ||
| 122 | |||
| 123 | /* | ||
| 124 | * The threshold scales with the number of processors and the amount | ||
| 125 | * of memory per zone. More memory means that we can defer updates for | ||
| 126 | * longer, more processors could lead to more contention. | ||
| 127 | * fls() is used to have a cheap way of logarithmic scaling. | ||
| 128 | * | ||
| 129 | * Some sample thresholds: | ||
| 130 | * | ||
| 131 | * Threshold Processors (fls) Zonesize fls(mem+1) | ||
| 132 | * ------------------------------------------------------------------ | ||
| 133 | * 8 1 1 0.9-1 GB 4 | ||
| 134 | * 16 2 2 0.9-1 GB 4 | ||
| 135 | * 20 2 2 1-2 GB 5 | ||
| 136 | * 24 2 2 2-4 GB 6 | ||
| 137 | * 28 2 2 4-8 GB 7 | ||
| 138 | * 32 2 2 8-16 GB 8 | ||
| 139 | * 4 2 2 <128M 1 | ||
| 140 | * 30 4 3 2-4 GB 5 | ||
| 141 | * 48 4 3 8-16 GB 8 | ||
| 142 | * 32 8 4 1-2 GB 4 | ||
| 143 | * 32 8 4 0.9-1GB 4 | ||
| 144 | * 10 16 5 <128M 1 | ||
| 145 | * 40 16 5 900M 4 | ||
| 146 | * 70 64 7 2-4 GB 5 | ||
| 147 | * 84 64 7 4-8 GB 6 | ||
| 148 | * 108 512 9 4-8 GB 6 | ||
| 149 | * 125 1024 10 8-16 GB 8 | ||
| 150 | * 125 1024 10 16-32 GB 9 | ||
| 151 | */ | ||
| 152 | |||
| 153 | mem = zone->present_pages >> (27 - PAGE_SHIFT); | ||
| 154 | |||
| 155 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); | ||
| 156 | |||
| 157 | /* | ||
| 158 | * Maximum threshold is 125 | ||
| 159 | */ | ||
| 160 | threshold = min(125, threshold); | ||
| 161 | |||
| 162 | return threshold; | ||
| 163 | } | ||
| 118 | 164 | ||
| 119 | /* | 165 | /* |
| 120 | * Determine pointer to currently valid differential byte given a zone and | 166 | * Refresh the thresholds for each zone. |
| 121 | * the item number. | ||
| 122 | * | ||
| 123 | * Preemption must be off | ||
| 124 | */ | 167 | */ |
| 125 | static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) | 168 | static void refresh_zone_stat_thresholds(void) |
| 126 | { | 169 | { |
| 127 | return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; | 170 | struct zone *zone; |
| 171 | int cpu; | ||
| 172 | int threshold; | ||
| 173 | |||
| 174 | for_each_zone(zone) { | ||
| 175 | |||
| 176 | if (!zone->present_pages) | ||
| 177 | continue; | ||
| 178 | |||
| 179 | threshold = calculate_threshold(zone); | ||
| 180 | |||
| 181 | for_each_online_cpu(cpu) | ||
| 182 | zone_pcp(zone, cpu)->stat_threshold = threshold; | ||
| 183 | } | ||
| 128 | } | 184 | } |
| 129 | 185 | ||
| 130 | /* | 186 | /* |
| @@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) | |||
| 133 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 189 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| 134 | int delta) | 190 | int delta) |
| 135 | { | 191 | { |
| 136 | s8 *p; | 192 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); |
| 193 | s8 *p = pcp->vm_stat_diff + item; | ||
| 137 | long x; | 194 | long x; |
| 138 | 195 | ||
| 139 | p = diff_pointer(zone, item); | ||
| 140 | x = delta + *p; | 196 | x = delta + *p; |
| 141 | 197 | ||
| 142 | if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { | 198 | if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { |
| 143 | zone_page_state_add(x, zone, item); | 199 | zone_page_state_add(x, zone, item); |
| 144 | x = 0; | 200 | x = 0; |
| 145 | } | 201 | } |
| 146 | |||
| 147 | *p = x; | 202 | *p = x; |
| 148 | } | 203 | } |
| 149 | EXPORT_SYMBOL(__mod_zone_page_state); | 204 | EXPORT_SYMBOL(__mod_zone_page_state); |
| @@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
| 172 | * No overflow check is necessary and therefore the differential can be | 227 | * No overflow check is necessary and therefore the differential can be |
| 173 | * incremented or decremented in place which may allow the compilers to | 228 | * incremented or decremented in place which may allow the compilers to |
| 174 | * generate better code. | 229 | * generate better code. |
| 175 | * | ||
| 176 | * The increment or decrement is known and therefore one boundary check can | 230 | * The increment or decrement is known and therefore one boundary check can |
| 177 | * be omitted. | 231 | * be omitted. |
| 178 | * | 232 | * |
| 233 | * NOTE: These functions are very performance sensitive. Change only | ||
| 234 | * with care. | ||
| 235 | * | ||
| 179 | * Some processors have inc/dec instructions that are atomic vs an interrupt. | 236 | * Some processors have inc/dec instructions that are atomic vs an interrupt. |
| 180 | * However, the code must first determine the differential location in a zone | 237 | * However, the code must first determine the differential location in a zone |
| 181 | * based on the processor number and then inc/dec the counter. There is no | 238 | * based on the processor number and then inc/dec the counter. There is no |
| @@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
| 185 | */ | 242 | */ |
| 186 | static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 243 | static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
| 187 | { | 244 | { |
| 188 | s8 *p = diff_pointer(zone, item); | 245 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); |
| 246 | s8 *p = pcp->vm_stat_diff + item; | ||
| 189 | 247 | ||
| 190 | (*p)++; | 248 | (*p)++; |
| 191 | 249 | ||
| 192 | if (unlikely(*p > STAT_THRESHOLD)) { | 250 | if (unlikely(*p > pcp->stat_threshold)) { |
| 193 | zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); | 251 | int overstep = pcp->stat_threshold / 2; |
| 194 | *p = -STAT_THRESHOLD / 2; | 252 | |
| 253 | zone_page_state_add(*p + overstep, zone, item); | ||
| 254 | *p = -overstep; | ||
| 195 | } | 255 | } |
| 196 | } | 256 | } |
| 197 | 257 | ||
| @@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
| 204 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | 264 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) |
| 205 | { | 265 | { |
| 206 | struct zone *zone = page_zone(page); | 266 | struct zone *zone = page_zone(page); |
| 207 | s8 *p = diff_pointer(zone, item); | 267 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); |
| 268 | s8 *p = pcp->vm_stat_diff + item; | ||
| 208 | 269 | ||
| 209 | (*p)--; | 270 | (*p)--; |
| 210 | 271 | ||
| 211 | if (unlikely(*p < -STAT_THRESHOLD)) { | 272 | if (unlikely(*p < - pcp->stat_threshold)) { |
| 212 | zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); | 273 | int overstep = pcp->stat_threshold / 2; |
| 213 | *p = STAT_THRESHOLD /2; | 274 | |
| 275 | zone_page_state_add(*p - overstep, zone, item); | ||
| 276 | *p = overstep; | ||
| 214 | } | 277 | } |
| 215 | } | 278 | } |
| 216 | EXPORT_SYMBOL(__dec_zone_page_state); | 279 | EXPORT_SYMBOL(__dec_zone_page_state); |
| @@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 515 | pageset->pcp[j].high, | 578 | pageset->pcp[j].high, |
| 516 | pageset->pcp[j].batch); | 579 | pageset->pcp[j].batch); |
| 517 | } | 580 | } |
| 581 | #ifdef CONFIG_SMP | ||
| 582 | seq_printf(m, "\n vm stats threshold: %d", | ||
| 583 | pageset->stat_threshold); | ||
| 584 | #endif | ||
| 518 | } | 585 | } |
| 519 | seq_printf(m, | 586 | seq_printf(m, |
| 520 | "\n all_unreclaimable: %u" | 587 | "\n all_unreclaimable: %u" |
| @@ -603,3 +670,35 @@ struct seq_operations vmstat_op = { | |||
| 603 | 670 | ||
| 604 | #endif /* CONFIG_PROC_FS */ | 671 | #endif /* CONFIG_PROC_FS */ |
| 605 | 672 | ||
#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to insure that the thresholds are recalculated
 * when necessary: calculate_threshold() depends on the number of
 * online cpus, so any hotplug event may change its result.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };

/* Establish the initial thresholds and hook cpu hotplug at boot. */
int __init setup_vmstat(void)
{
	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);
	return 0;
}
module_init(setup_vmstat)
#endif
