author    Dave Jones <davej@redhat.com>    2006-09-05 17:20:21 -0400
committer Dave Jones <davej@redhat.com>    2006-09-05 17:20:21 -0400
commit    115b384cf87249d76adb0b21aca11ee22128927d (patch)
tree      f39a2a54863e9d82d1196906f92c82ab5991c6af /mm
parent    8eb7925f93af75e66a240d148efdec212f95bcb7 (diff)
parent    c336923b668fdcf0312efbec3b44895d713f4d81 (diff)
Merge ../linus
Diffstat (limited to 'mm')
-rw-r--r--  mm/fadvise.c           3
-rw-r--r--  mm/filemap.c           2
-rw-r--r--  mm/memory_hotplug.c   44
-rw-r--r--  mm/mempolicy.c        10
-rw-r--r--  mm/mempool.c           9
-rw-r--r--  mm/slab.c              4
-rw-r--r--  mm/swap.c             20
-rw-r--r--  mm/swapfile.c          3
-rw-r--r--  mm/vmstat.c          151
9 files changed, 196 insertions, 50 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 60a5d55e51d9..168c78a121bb 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -73,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		file->f_ra.ra_pages = bdi->ra_pages * 2;
 		break;
 	case POSIX_FADV_WILLNEED:
-	case POSIX_FADV_NOREUSE:
 		if (!mapping->a_ops->readpage) {
 			ret = -EINVAL;
 			break;
@@ -94,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		if (ret > 0)
 			ret = 0;
 		break;
+	case POSIX_FADV_NOREUSE:
+		break;
 	case POSIX_FADV_DONTNEED:
 		if (!bdi_write_congested(mapping->backing_dev_info))
 			filemap_flush(mapping);
diff --git a/mm/filemap.c b/mm/filemap.c
index d087fc3d3281..b9a60c43b61a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp,
 		return;
 
 	ra->ra_pages /= 4;
-	printk(KERN_WARNING "Reducing readahead size to %luK\n",
-			ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
 }
 
 /**
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 01c9fb97c619..c37319542b70 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
+	if (pfn_valid(phys_start_pfn))
+		return -EEXIST;
+
 	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
 
 	if (ret < 0)
@@ -76,15 +79,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 {
 	unsigned long i;
 	int err = 0;
+	int start_sec, end_sec;
+	/* during initialize mem_map, align hot-added range to section */
+	start_sec = pfn_to_section_nr(phys_start_pfn);
+	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
 
-	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
-		err = __add_section(zone, phys_start_pfn + i);
+	for (i = start_sec; i <= end_sec; i++) {
+		err = __add_section(zone, i << PFN_SECTION_SHIFT);
 
-		/* We want to keep adding the rest of the
-		 * sections if the first ones already exist
+		/*
+		 * EEXIST is finally dealed with by ioresource collision
+		 * check. see add_memory() => register_memory_resource()
+		 * Warning will be printed if there is collision.
 		 */
 		if (err && (err != -EEXIST))
 			break;
+		err = 0;
 	}
 
 	return err;
@@ -156,7 +166,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	res.flags = IORESOURCE_MEM; /* we just need system ram */
 	section_end = res.end;
 
-	while (find_next_system_ram(&res) >= 0) {
+	while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
 		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
 		nr_pages = (unsigned long)
 			((res.end + 1 - res.start) >> PAGE_SHIFT);
@@ -213,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 /* add this memory to iomem resource */
-static void register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size)
 {
 	struct resource *res;
-
 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 	BUG_ON(!res);
 
@@ -228,7 +237,18 @@ static void register_memory_resource(u64 start, u64 size)
 		printk("System RAM resource %llx - %llx cannot be added\n",
 		(unsigned long long)res->start, (unsigned long long)res->end);
 		kfree(res);
+		res = NULL;
 	}
+	return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+	if (!res)
+		return;
+	release_resource(res);
+	kfree(res);
+	return;
 }
 
 
@@ -237,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
 	int new_pgdat = 0;
+	struct resource *res;
 	int ret;
 
+	res = register_memory_resource(start, size);
+	if (!res)
+		return -EEXIST;
+
 	if (!node_online(nid)) {
 		pgdat = hotadd_new_pgdat(nid, start);
 		if (!pgdat)
@@ -268,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size)
 		BUG_ON(ret);
 	}
 
-	/* register this memory as resource */
-	register_memory_resource(start, size);
-
 	return ret;
 error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
 		rollback_node_hotadd(nid, pgdat);
+	if (res)
+		release_memory_resource(res);
 
 	return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e07e27e846a2..a9963ceddd65 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 	if (vma) {
 		unsigned long off;
 
-		off = vma->vm_pgoff;
+		/*
+		 * for small pages, there is no difference between
+		 * shift and PAGE_SHIFT, so the bit-shift is safe.
+		 * for huge pages, since vm_pgoff is in units of small
+		 * pages, we need to shift off the always 0 bits to get
+		 * a useful offset.
+		 */
+		BUG_ON(shift < PAGE_SHIFT);
+		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
diff --git a/mm/mempool.c b/mm/mempool.c
index fe6e05289cc5..ccd8cb8cd41f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -238,8 +238,13 @@ repeat_alloc:
 	init_wait(&wait);
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	smp_mb();
-	if (!pool->curr_nr)
-		io_schedule();
+	if (!pool->curr_nr) {
+		/*
+		 * FIXME: this should be io_schedule(). The timeout is there
+		 * as a workaround for some DM problems in 2.6.18.
+		 */
+		io_schedule_timeout(5*HZ);
+	}
 	finish_wait(&pool->wait, &wait);
 
 	goto repeat_alloc;
diff --git a/mm/slab.c b/mm/slab.c
index 0f20843beffd..21ba06035700 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1106,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-static int __devinit cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -3224,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 EXPORT_SYMBOL(kmem_cache_alloc);
 
 /**
- * kmem_cache_alloc - Allocate an object. The memory is set to zero.
+ * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
  * @cache: The cache to allocate from.
  * @flags: See kmalloc().
  *
diff --git a/mm/swap.c b/mm/swap.c
index 8fd095c4ae51..687686a61f7c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -54,6 +54,26 @@ void put_page(struct page *page)
 }
 EXPORT_SYMBOL(put_page);
 
+/**
+ * put_pages_list(): release a list of pages
+ *
+ * Release a list of pages which are strung together on page.lru. Currently
+ * used by read_cache_pages() and related error recovery code.
+ *
+ * @pages: list of pages threaded on page->lru
+ */
+void put_pages_list(struct list_head *pages)
+{
+	while (!list_empty(pages)) {
+		struct page *victim;
+
+		victim = list_entry(pages->prev, struct page, lru);
+		list_del(&victim->lru);
+		page_cache_release(victim);
+	}
+}
+EXPORT_SYMBOL(put_pages_list);
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim. If it still appears to be reclaimable, move it to the tail of the
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e70d6c6d6fee..f1f5ec783781 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -442,11 +442,12 @@ int swap_type_of(dev_t device)
 
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
+
 		if (!device) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info->swap_file->f_dentry->d_inode;
+		inode = swap_info[i].swap_file->f_dentry->d_inode;
 		if (S_ISBLK(inode->i_mode) &&
 		    device == MKDEV(imajor(inode), iminor(inode))) {
 			spin_unlock(&swap_lock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index dfdf24133901..c1b5f4106b38 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
@@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-#define STAT_THRESHOLD 32
+static int calculate_threshold(struct zone *zone)
+{
+	int threshold;
+	int mem;	/* memory in 128 MB units */
+
+	/*
+	 * The threshold scales with the number of processors and the amount
+	 * of memory per zone. More memory means that we can defer updates for
+	 * longer, more processors could lead to more contention.
+	 * fls() is used to have a cheap way of logarithmic scaling.
+	 *
+	 * Some sample thresholds:
+	 *
+	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
+	 * ------------------------------------------------------------------
+	 * 8		1		1	0.9-1 GB	4
+	 * 16		2		2	0.9-1 GB	4
+	 * 20		2		2	1-2 GB		5
+	 * 24		2		2	2-4 GB		6
+	 * 28		2		2	4-8 GB		7
+	 * 32		2		2	8-16 GB		8
+	 * 4		2		2	<128M		1
+	 * 30		4		3	2-4 GB		5
+	 * 48		4		3	8-16 GB		8
+	 * 32		8		4	1-2 GB		4
+	 * 32		8		4	0.9-1GB		4
+	 * 10		16		5	<128M		1
+	 * 40		16		5	900M		4
+	 * 70		64		7	2-4 GB		5
+	 * 84		64		7	4-8 GB		6
+	 * 108		512		9	4-8 GB		6
+	 * 125		1024		10	8-16 GB		8
+	 * 125		1024		10	16-32 GB	9
+	 */
+
+	mem = zone->present_pages >> (27 - PAGE_SHIFT);
+
+	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
 
 /*
- * Determine pointer to currently valid differential byte given a zone and
- * the item number.
- *
- * Preemption must be off
+ * Refresh the thresholds for each zone.
  */
-static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
+static void refresh_zone_stat_thresholds(void)
 {
-	return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
+	struct zone *zone;
+	int cpu;
+	int threshold;
+
+	for_each_zone(zone) {
+
+		if (!zone->present_pages)
+			continue;
+
+		threshold = calculate_threshold(zone);
+
+		for_each_online_cpu(cpu)
+			zone_pcp(zone, cpu)->stat_threshold = threshold;
+	}
 }
 
 /*
@@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	s8 *p;
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 	long x;
 
-	p = diff_pointer(zone, item);
 	x = delta + *p;
 
-	if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
+	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
-
 	*p = x;
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
@@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state);
  * No overflow check is necessary and therefore the differential can be
  * incremented or decremented in place which may allow the compilers to
  * generate better code.
- *
  * The increment or decrement is known and therefore one boundary check can
  * be omitted.
  *
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
  * Some processors have inc/dec instructions that are atomic vs an interrupt.
  * However, the code must first determine the differential location in a zone
  * based on the processor number and then inc/dec the counter. There is no
@@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 
 	(*p)++;
 
-	if (unlikely(*p > STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
+	if (unlikely(*p > pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*p + overstep, zone, item);
+		*p = -overstep;
 	}
 }
 
@@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	struct zone *zone = page_zone(page);
-	s8 *p = diff_pointer(zone, item);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
 
 	(*p)--;
 
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
+	if (unlikely(*p < - pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
+
+		zone_page_state_add(*p - overstep, zone, item);
+		*p = overstep;
 	}
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
@@ -239,19 +302,9 @@ EXPORT_SYMBOL(inc_zone_page_state);
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	unsigned long flags;
-	struct zone *zone;
-	s8 *p;
 
-	zone = page_zone(page);
 	local_irq_save(flags);
-	p = diff_pointer(zone, item);
-
-	(*p)--;
-
-	if (unlikely(*p < -STAT_THRESHOLD)) {
-		zone_page_state_add(*p, zone, item);
-		*p = 0;
-	}
+	__dec_zone_page_state(page, item);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
@@ -525,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 			   pageset->pcp[j].high,
 			   pageset->pcp[j].batch);
 		}
+#ifdef CONFIG_SMP
+		seq_printf(m, "\n vm stats threshold: %d",
+				pageset->stat_threshold);
+#endif
 	}
 	seq_printf(m,
 		   "\n all_unreclaimable: %u"
@@ -613,3 +670,35 @@ struct seq_operations vmstat_op = {
 
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to insure that the thresholds are recalculated
+ * when necessary.
+ */
+static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata vmstat_notifier =
+	{ &vmstat_cpuup_callback, NULL, 0 };
+
+int __init setup_vmstat(void)
+{
+	refresh_zone_stat_thresholds();
+	register_cpu_notifier(&vmstat_notifier);
+	return 0;
+}
+module_init(setup_vmstat)
+#endif