Diffstat (limited to 'mm')
 -rw-r--r--  mm/Makefile            2
 -rw-r--r--  mm/allocpercpu.c     129
 -rw-r--r--  mm/bootmem.c         202
 -rw-r--r--  mm/filemap.c          25
 -rw-r--r--  mm/fremap.c            4
 -rw-r--r--  mm/highmem.c          13
 -rw-r--r--  mm/hugetlb.c          10
 -rw-r--r--  mm/internal.h          4
 -rw-r--r--  mm/memory.c           77
 -rw-r--r--  mm/mempolicy.c        19
 -rw-r--r--  mm/migrate.c           2
 -rw-r--r--  mm/mmap.c             12
 -rw-r--r--  mm/mprotect.c         51
 -rw-r--r--  mm/msync.c           196
 -rw-r--r--  mm/nommu.c             2
 -rw-r--r--  mm/oom_kill.c         97
 -rw-r--r--  mm/page-writeback.c   29
 -rw-r--r--  mm/page_alloc.c      233
 -rw-r--r--  mm/page_io.c          48
 -rw-r--r--  mm/rmap.c             65
 -rw-r--r--  mm/shmem.c             1
 -rw-r--r--  mm/slab.c            310
 -rw-r--r--  mm/slob.c             49
 -rw-r--r--  mm/swap.c             49
 -rw-r--r--  mm/vmalloc.c           8
 -rw-r--r--  mm/vmscan.c          110
 -rw-r--r--  mm/vmstat.c           49
 27 files changed, 1049 insertions(+), 747 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 9dd824c11eeb..60c56c0b5e10 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,4 +23,4 @@ obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-
+obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 000000000000..eaa9abeea536
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,129 @@
+/*
+ * linux/mm/allocpercpu.c
+ *
+ * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+
+/**
+ * percpu_depopulate - depopulate per-cpu data for given cpu
+ * @__pdata: per-cpu data to depopulate
+ * @cpu: depopulate per-cpu data for this cpu
+ *
+ * Depopulating per-cpu data for a cpu going offline would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ */
+void percpu_depopulate(void *__pdata, int cpu)
+{
+	struct percpu_data *pdata = __percpu_disguise(__pdata);
+	if (pdata->ptrs[cpu]) {
+		kfree(pdata->ptrs[cpu]);
+		pdata->ptrs[cpu] = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(percpu_depopulate);
+
+/**
+ * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
+ * @__pdata: per-cpu data to depopulate
+ * @mask: depopulate per-cpu data for cpu's selected through mask bits
+ */
+void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+{
+	int cpu;
+	for_each_cpu_mask(cpu, *mask)
+		percpu_depopulate(__pdata, cpu);
+}
+EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+/**
+ * percpu_populate - populate per-cpu data for given cpu
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @cpu: populate per-cpu data for this cpu
+ *
+ * Populating per-cpu data for a cpu coming online would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ * Per-cpu object is populated with zeroed buffer.
+ */
+void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+{
+	struct percpu_data *pdata = __percpu_disguise(__pdata);
+	int node = cpu_to_node(cpu);
+
+	BUG_ON(pdata->ptrs[cpu]);
+	if (node_online(node)) {
+		/* FIXME: kzalloc_node(size, gfp, node) */
+		pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
+		if (pdata->ptrs[cpu])
+			memset(pdata->ptrs[cpu], 0, size);
+	} else
+		pdata->ptrs[cpu] = kzalloc(size, gfp);
+	return pdata->ptrs[cpu];
+}
+EXPORT_SYMBOL_GPL(percpu_populate);
+
+/**
+ * percpu_populate_mask - populate per-cpu data for more cpu's
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+			   cpumask_t *mask)
+{
+	cpumask_t populated = CPU_MASK_NONE;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *mask)
+		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
+			__percpu_depopulate_mask(__pdata, &populated);
+			return -ENOMEM;
+		} else
+			cpu_set(cpu, populated);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+/**
+ * percpu_alloc_mask - initial setup of per-cpu data
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Populating per-cpu data for all online cpu's would be a typical use case,
+ * which is simplified by the percpu_alloc() wrapper.
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+{
+	void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+	void *__pdata = __percpu_disguise(pdata);
+
+	if (unlikely(!pdata))
+		return NULL;
+	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+		return __pdata;
+	kfree(pdata);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+
+/**
+ * percpu_free - final cleanup of per-cpu data
+ * @__pdata: object to clean up
+ *
+ * We simply clean up any per-cpu object left. No need for the client to
+ * track and specify through a bitmask which per-cpu objects are to be freed.
+ */
+void percpu_free(void *__pdata)
+{
+	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
+	kfree(__percpu_disguise(__pdata));
+}
+EXPORT_SYMBOL_GPL(percpu_free);
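
For orientation (illustrative only, not part of the patch): a client of the interface above would typically allocate through the percpu_alloc() wrapper mentioned in the __percpu_alloc_mask() kerneldoc, dereference with per_cpu_ptr(), and release with percpu_free(). The structure and function names below are hypothetical.

	/* Hypothetical client sketch for the API added above. */
	struct my_stats {
		unsigned long events;
	};
	static struct my_stats *stats;	/* disguised per-cpu pointer */

	static int my_init(void)
	{
		/* allocates a zeroed object for every online cpu */
		stats = percpu_alloc(sizeof(struct my_stats), GFP_KERNEL);
		if (!stats)
			return -ENOMEM;
		per_cpu_ptr(stats, raw_smp_processor_id())->events++;
		return 0;
	}

	static void my_exit(void)
	{
		percpu_free(stats);	/* depopulates every possible cpu, then frees */
	}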
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 50353e0dac12..d53112fcb404 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -8,17 +8,15 @@
  * free memory collector. It's used to deal with reserved
  * system memory and memory holes as well.
  */
-
-#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/swap.h>
-#include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/pfn.h>
 #include <linux/bootmem.h>
-#include <linux/mmzone.h>
 #include <linux/module.h>
-#include <asm/dma.h>
+
+#include <asm/bug.h>
 #include <asm/io.h>
+#include <asm/processor.h>
+
 #include "internal.h"
 
 /*
@@ -41,7 +39,7 @@ unsigned long saved_max_pfn;
 #endif
 
 /* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages (unsigned long pages)
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
 {
 	unsigned long mapsize;
 
@@ -51,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 
 	return mapsize;
 }
+
 /*
  * link bdata in order
  */
-static void link_bootmem(bootmem_data_t *bdata)
+static void __init link_bootmem(bootmem_data_t *bdata)
 {
 	bootmem_data_t *ent;
+
 	if (list_empty(&bdata_list)) {
 		list_add(&bdata->list, &bdata_list);
 		return;
@@ -69,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata)
 		}
 	}
 	list_add_tail(&bdata->list, &bdata_list);
-	return;
 }
 
+/*
+ * Given an initialised bdata, it returns the size of the boot bitmap
+ */
+static unsigned long __init get_mapsize(bootmem_data_t *bdata)
+{
+	unsigned long mapsize;
+	unsigned long start = PFN_DOWN(bdata->node_boot_start);
+	unsigned long end = bdata->node_low_pfn;
+
+	mapsize = ((end - start) + 7) / 8;
+	return ALIGN(mapsize, sizeof(long));
+}
 
 /*
  * Called once to set up the allocator itself.
  */
-static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
 	unsigned long mapstart, unsigned long start, unsigned long end)
 {
 	bootmem_data_t *bdata = pgdat->bdata;
-	unsigned long mapsize = ((end - start)+7)/8;
+	unsigned long mapsize;
 
-	mapsize = ALIGN(mapsize, sizeof(long));
-	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
-	bdata->node_boot_start = (start << PAGE_SHIFT);
+	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
+	bdata->node_boot_start = PFN_PHYS(start);
 	bdata->node_low_pfn = end;
 	link_bootmem(bdata);
 
@@ -92,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	 * Initially all pages are reserved - setup_arch() has to
 	 * register free RAM areas explicitly.
 	 */
+	mapsize = get_mapsize(bdata);
 	memset(bdata->node_bootmem_map, 0xff, mapsize);
 
 	return mapsize;
@@ -102,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
  * might be used for boot-time allocations - or it might get added
  * to the free page pool later on.
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
+					unsigned long size)
 {
+	unsigned long sidx, eidx;
 	unsigned long i;
+
 	/*
 	 * round up, partially reserved pages are considered
 	 * fully reserved.
 	 */
-	unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
-	unsigned long eidx = (addr + size - bdata->node_boot_start +
-							PAGE_SIZE-1)/PAGE_SIZE;
-	unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
-
 	BUG_ON(!size);
-	BUG_ON(sidx >= eidx);
-	BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn);
-	BUG_ON(end > bdata->node_low_pfn);
+	BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
+	BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
+
+	sidx = PFN_DOWN(addr - bdata->node_boot_start);
+	eidx = PFN_UP(addr + size - bdata->node_boot_start);
 
 	for (i = sidx; i < eidx; i++)
 		if (test_and_set_bit(i, bdata->node_bootmem_map)) {
@@ -127,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
 		}
 }
 
-static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
+				     unsigned long size)
 {
+	unsigned long sidx, eidx;
 	unsigned long i;
-	unsigned long start;
+
 	/*
 	 * round down end of usable mem, partially free pages are
 	 * considered reserved.
 	 */
-	unsigned long sidx;
-	unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
-	unsigned long end = (addr + size)/PAGE_SIZE;
-
 	BUG_ON(!size);
-	BUG_ON(end > bdata->node_low_pfn);
+	BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
 
 	if (addr < bdata->last_success)
 		bdata->last_success = addr;
@@ -148,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
 	/*
 	 * Round up the beginning of the address.
 	 */
-	start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
-	sidx = start - (bdata->node_boot_start/PAGE_SIZE);
+	sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
+	eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
 
 	for (i = sidx; i < eidx; i++) {
 		if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
@@ -175,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	unsigned long align, unsigned long goal, unsigned long limit)
 {
 	unsigned long offset, remaining_size, areasize, preferred;
-	unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn;
+	unsigned long i, start = 0, incr, eidx, end_pfn;
 	void *ret;
 
-	if(!size) {
+	if (!size) {
 		printk("__alloc_bootmem_core(): zero-sized request\n");
 		BUG();
 	}
@@ -187,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	if (limit && bdata->node_boot_start >= limit)
 		return NULL;
 
-	limit >>=PAGE_SHIFT;
+	end_pfn = bdata->node_low_pfn;
+	limit = PFN_DOWN(limit);
 	if (limit && end_pfn > limit)
 		end_pfn = limit;
 
-	eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
 	offset = 0;
-	if (align &&
-	    (bdata->node_boot_start & (align - 1UL)) != 0)
-		offset = (align - (bdata->node_boot_start & (align - 1UL)));
-	offset >>= PAGE_SHIFT;
+	if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
+		offset = align - (bdata->node_boot_start & (align - 1UL));
+	offset = PFN_DOWN(offset);
 
 	/*
 	 * We try to allocate bootmem pages above 'goal'
 	 * first, then we try to allocate lower pages.
 	 */
-	if (goal && (goal >= bdata->node_boot_start) &&
-	    ((goal >> PAGE_SHIFT) < end_pfn)) {
+	if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
 		preferred = goal - bdata->node_boot_start;
 
 		if (bdata->last_success >= preferred)
@@ -212,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	} else
 		preferred = 0;
 
-	preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
-	preferred += offset;
-	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
+	preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
+	areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
 	incr = align >> PAGE_SHIFT ? : 1;
 
 restart_scan:
@@ -229,7 +236,7 @@ restart_scan:
 		for (j = i + 1; j < i + areasize; ++j) {
 			if (j >= eidx)
 				goto fail_block;
-			if (test_bit (j, bdata->node_bootmem_map))
+			if (test_bit(j, bdata->node_bootmem_map))
 				goto fail_block;
 		}
 		start = i;
@@ -245,7 +252,7 @@
 	return NULL;
 
 found:
-	bdata->last_success = start << PAGE_SHIFT;
+	bdata->last_success = PFN_PHYS(start);
 	BUG_ON(start >= eidx);
 
 	/*
@@ -257,19 +264,21 @@ found:
 	    bdata->last_offset && bdata->last_pos+1 == start) {
 		offset = ALIGN(bdata->last_offset, align);
 		BUG_ON(offset > PAGE_SIZE);
-		remaining_size = PAGE_SIZE-offset;
+		remaining_size = PAGE_SIZE - offset;
 		if (size < remaining_size) {
 			areasize = 0;
 			/* last_pos unchanged */
-			bdata->last_offset = offset+size;
-			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
-					bdata->node_boot_start);
+			bdata->last_offset = offset + size;
+			ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
+					   offset +
+					   bdata->node_boot_start);
 		} else {
 			remaining_size = size - remaining_size;
-			areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
-			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
-					bdata->node_boot_start);
-			bdata->last_pos = start+areasize-1;
+			areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
+			ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
+					   offset +
+					   bdata->node_boot_start);
+			bdata->last_pos = start + areasize - 1;
 			bdata->last_offset = remaining_size;
 		}
 		bdata->last_offset &= ~PAGE_MASK;
@@ -282,7 +291,7 @@ found:
 	/*
 	 * Reserve the area now:
 	 */
-	for (i = start; i < start+areasize; i++)
+	for (i = start; i < start + areasize; i++)
 		if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
 			BUG();
 	memset(ret, 0, size);
@@ -303,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 
 	count = 0;
 	/* first extant page of the node */
-	pfn = bdata->node_boot_start >> PAGE_SHIFT;
-	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	pfn = PFN_DOWN(bdata->node_boot_start);
+	idx = bdata->node_low_pfn - pfn;
 	map = bdata->node_bootmem_map;
 	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
 	if (bdata->node_boot_start == 0 ||
@@ -333,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 				}
 			}
 		} else {
-			i+=BITS_PER_LONG;
+			i += BITS_PER_LONG;
 		}
 		pfn += BITS_PER_LONG;
 	}
@@ -345,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	 */
 	page = virt_to_page(bdata->node_bootmem_map);
 	count = 0;
-	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
-		count++;
+	idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
+	for (i = 0; i < idx; i++, page++) {
 		__free_pages_bootmem(page, 0);
+		count++;
 	}
 	total += count;
 	bdata->node_bootmem_map = NULL;
@@ -355,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	return total;
 }
 
-unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
+				unsigned long startpfn, unsigned long endpfn)
 {
-	return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
+	return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
}
 
-void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+				 unsigned long size)
 {
 	reserve_bootmem_core(pgdat->bdata, physaddr, size);
 }
 
-void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+			      unsigned long size)
 {
 	free_bootmem_core(pgdat->bdata, physaddr, size);
 }
 
-unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
-	return(free_all_bootmem_core(pgdat));
+	return free_all_bootmem_core(pgdat);
 }
 
-unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 {
 	max_low_pfn = pages;
 	min_low_pfn = start;
-	return(init_bootmem_core(NODE_DATA(0), start, 0, pages));
+	return init_bootmem_core(NODE_DATA(0), start, 0, pages);
 }
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem (unsigned long addr, unsigned long size)
+void __init reserve_bootmem(unsigned long addr, unsigned long size)
 {
 	reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
-void __init free_bootmem (unsigned long addr, unsigned long size)
+void __init free_bootmem(unsigned long addr, unsigned long size)
 {
 	free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
 }
 
-unsigned long __init free_all_bootmem (void)
+unsigned long __init free_all_bootmem(void)
 {
-	return(free_all_bootmem_core(NODE_DATA(0)));
+	return free_all_bootmem_core(NODE_DATA(0));
 }
 
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+				      unsigned long goal)
 {
 	bootmem_data_t *bdata;
 	void *ptr;
 
-	list_for_each_entry(bdata, &bdata_list, list)
-		if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
-			return(ptr);
+	list_for_each_entry(bdata, &bdata_list, list) {
+		ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+		if (ptr)
+			return ptr;
+	}
 	return NULL;
 }
 
-void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+			      unsigned long goal)
 {
 	void *mem = __alloc_bootmem_nopanic(size,align,goal);
+
 	if (mem)
 		return mem;
 	/*
@@ -424,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned
 }
 
 
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
-				   unsigned long goal)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
 {
 	void *ptr;
 
 	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
-		return (ptr);
+		return ptr;
 
 	return __alloc_bootmem(size, align, goal);
 }
 
-#define LOW32LIMIT 0xffffffff
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
+#endif
 
-void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
				  unsigned long goal)
 {
 	bootmem_data_t *bdata;
 	void *ptr;
 
-	list_for_each_entry(bdata, &bdata_list, list)
-		if ((ptr = __alloc_bootmem_core(bdata, size,
-						align, goal, LOW32LIMIT)))
-			return(ptr);
+	list_for_each_entry(bdata, &bdata_list, list) {
+		ptr = __alloc_bootmem_core(bdata, size, align, goal,
+					   ARCH_LOW_ADDRESS_LIMIT);
+		if (ptr)
+			return ptr;
+	}
 
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
@@ -459,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
-	return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
+				    ARCH_LOW_ADDRESS_LIMIT);
 }
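
Context for the conversions above (not part of the patch): they replace open-coded PAGE_SHIFT arithmetic with the helpers from the newly included <linux/pfn.h>. As best recalled from kernels of this era, those helpers are one-line macros:

	#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
	#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
	#define PFN_PHYS(x)	((x) << PAGE_SHIFT)

So PFN_UP rounds a byte address up to the next page frame number, PFN_DOWN truncates to the containing frame, and PFN_PHYS converts a frame number back to a physical address.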
diff --git a/mm/filemap.c b/mm/filemap.c
index b9a60c43b61a..afcdc72b5e90 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -488,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x)
 EXPORT_SYMBOL(page_cache_alloc_cold);
 #endif
 
+static int __sleep_on_page_lock(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -577,13 +583,24 @@ void fastcall __lock_page(struct page *page)
 }
 EXPORT_SYMBOL(__lock_page);
 
+/*
+ * Variant of lock_page that does not require the caller to hold a reference
+ * on the page's mapping.
+ */
+void fastcall __lock_page_nosync(struct page *page)
+{
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
+							TASK_UNINTERRUPTIBLE);
+}
+
 /**
  * find_get_page - find and get a page reference
  * @mapping: the address_space to search
  * @offset: the page index
  *
- * A rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * Is there a pagecache struct page at the given (mapping, offset) tuple?
+ * If yes, increment its refcount and return it; if no, return NULL.
  */
 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 {
@@ -970,7 +987,7 @@ page_not_up_to_date:
 	/* Get exclusive access to the page ... */
 	lock_page(page);
 
-	/* Did it get unhashed before we got the lock? */
+	/* Did it get truncated before we got the lock? */
 	if (!page->mapping) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1610,7 +1627,7 @@ no_cached_page:
 page_not_uptodate:
 	lock_page(page);
 
-	/* Did it get unhashed while we waited for it? */
+	/* Did it get truncated while we waited for it? */
 	if (!page->mapping) {
 		unlock_page(page);
 		goto err;
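
The public entry point for the helper added above lives in include/linux/pagemap.h (its hunk is not part of this diff); presumably a TestSetPageLocked() fast path that falls back to __lock_page_nosync(), along these lines:

	/* Sketch of the companion inline, reconstructed, not quoted from the patch. */
	static inline void lock_page_nosync(struct page *page)
	{
		might_sleep();
		if (TestSetPageLocked(page))
			__lock_page_nosync(page);
	}

The point of the variant is that its sleeping path calls io_schedule() directly rather than kicking ->sync_page(), so it never dereferences page->mapping and is safe for callers that hold no reference on the mapping.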
diff --git a/mm/fremap.c b/mm/fremap.c
index 21b7d0cbc98c..aa30618ec6b2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	lazy_mmu_prot_update(pte_val);
 	err = 0;
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b2a5403c447..ee5519b176ee 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,6 +46,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
  */
 #ifdef CONFIG_HIGHMEM
 
+unsigned long totalhigh_pages __read_mostly;
+
+unsigned int nr_free_highpages (void)
+{
+	pg_data_t *pgdat;
+	unsigned int pages = 0;
+
+	for_each_online_pgdat(pgdat)
+		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+	return pages;
+}
+
 static int pkmap_count[LAST_PKMAP];
 static unsigned int last_pkmap_nr;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df499973255f..7c7d03dbf73d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
-		nid = (*z)->zone_pgdat->node_id;
+		nid = zone_to_nid(*z);
 		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
@@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page)
 {
 	int i;
 	nr_huge_pages--;
-	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
+	nr_huge_pages_node[page_to_nid(page)]--;
 	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page)
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(unsigned long count)
 {
-	int i, nid;
+	int i;
+
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
 		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
@@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count)
 				continue;
 			list_del(&page->lru);
 			update_and_free_page(page);
-			nid = page_zone(page)->zone_pgdat->node_id;
 			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			free_huge_pages_node[page_to_nid(page)]--;
 			if (count >= nr_huge_pages)
 				return;
 		}
diff --git a/mm/internal.h b/mm/internal.h
index d20e3cc4aef0..d527b80b292f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-	BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
-	BUG_ON(atomic_read(&page->_count));
+	VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+	VM_BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
 
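
VM_BUG_ON() is introduced elsewhere in this series (in include/linux/mm.h, not shown here); it is a BUG_ON() that compiles away unless CONFIG_DEBUG_VM is set, so sanity checks like the two above cost nothing in production builds. Its definition is, approximately:

	#ifdef CONFIG_DEBUG_VM
	#define VM_BUG_ON(cond)	BUG_ON(cond)
	#else
	#define VM_BUG_ON(cond)	do { } while (0)
	#endif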
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237e..92a3ebd8d795 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1226,7 +1227,12 @@ out:
 	return retval;
 }
 
-/*
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
  * This allows drivers to insert individual pages they've allocated
  * into a user vma.
  *
@@ -1318,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
-/* Note: this is only safe if the mm semaphore is held when called. */
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		    unsigned long pfn, unsigned long size, pgprot_t prot)
 {
@@ -1458,14 +1473,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = VM_FAULT_MINOR;
+	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
-				(VM_SHARED|VM_WRITE))) {
+	/*
+	 * Take out anonymous pages first, anonymous shared vmas are
+	 * not dirty accountable.
+	 */
+	if (PageAnon(old_page)) {
+		if (!TestSetPageLocked(old_page)) {
+			reuse = can_share_swap_page(old_page);
+			unlock_page(old_page);
+		}
+	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+					(VM_WRITE|VM_SHARED))) {
+		/*
+		 * Only catch write-faults on shared writable pages,
+		 * read-only shared pages can get COWed by
+		 * get_user_pages(.write=1, .force=1).
+		 */
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to
@@ -1494,13 +1524,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
 		}
-
+		dirty_page = old_page;
+		get_page(dirty_page);
 		reuse = 1;
-	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-	} else {
-		reuse = 0;
 	}
 
 	if (reuse) {
@@ -1566,6 +1592,10 @@ gotten:
 		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty_balance(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	if (old_page)
@@ -1785,9 +1815,10 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
  *
  * NOTE! We have to be ready to update the memory sharing
  * between the file and the memory map for a potential last
@@ -1856,11 +1887,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 }
 EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /* June 2006 */
 
-/*
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @addr: address to start
+ * @vma: user vma these addresses belong to
+ *
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
  * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
@@ -2098,6 +2134,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2229,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2204,6 +2245,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty_balance(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
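
The new unlock paths above throttle writers through set_page_dirty_balance(), declared in the newly included <linux/writeback.h> and defined in mm/page-writeback.c by the same series (those hunks are not shown here). From its call sites it must mark the page dirty and, when that transitions the page's state, feed into the dirty-page balancing logic, roughly:

	/* Sketch reconstructed from the call sites above, not quoted from this diff. */
	void set_page_dirty_balance(struct page *page)
	{
		if (set_page_dirty(page)) {
			struct address_space *mapping = page_mapping(page);

			if (mapping)
				balance_dirty_pages_ratelimited(mapping);
		}
	}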
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a9963ceddd65..38f89650bc84 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache;
 
 /* Highest zone. A specific allocation for a zone below that is not
    policied. */
-int policy_zone = ZONE_DMA;
+enum zone_type policy_zone = ZONE_DMA;
 
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd, k;
+	int num, max, nd;
+	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
@@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	   lower zones etc. Avoid empty zones because the memory allocator
 	   doesn't like them. If you implement node hot removal you
 	   have to fix that. */
-	for (k = policy_zone; k >= 0; k--) {
+	k = policy_zone;
+	while (1) {
 		for_each_node_mask(nd, *nodes) {
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
 			if (z->present_pages > 0)
 				zl->zones[num++] = z;
 		}
+		if (k == 0)
+			break;
+		k--;
 	}
 	zl->zones[num] = NULL;
 	return zl;
@@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+			node_set(zone_to_nid(p->v.zonelist->zones[i]),
 				*nodes);
 		break;
 	case MPOL_DEFAULT:
@@ -1140,7 +1145,7 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+		return zone_to_nid(policy->v.zonelist->zones[0]);
 
 	case MPOL_PREFERRED:
 		if (policy->v.preferred_node >= 0)
@@ -1285,7 +1290,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
 		cpuset_update_task_memory_state();
-	if (!pol || in_interrupt())
+	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
@@ -1644,7 +1649,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
-			node_set((*z)->zone_pgdat->node_id, nodes);
+			node_set(zone_to_nid(*z), nodes);
 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
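
The loop restructuring in bind_zonelist() above is forced by the type change: once k is enum zone_type, which gcc represents as an unsigned type when every enumerator is non-negative, the old descending loop "for (k = policy_zone; k >= 0; k--)" can never terminate, because "k >= 0" is always true and "k--" at zero wraps around. A minimal user-space illustration of the safe count-down-to-zero pattern the patch adopts:

	unsigned int k = 2;	/* stand-in for enum zone_type */

	while (1) {
		/* ... visit zone k ... */
		if (k == 0)
			break;	/* exit before the decrement can wrap */
		k--;
	}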
diff --git a/mm/migrate.c b/mm/migrate.c
index 3f1e0c2c942c..20a8c2687b1e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -741,7 +741,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
 
 	*result = &pm->status;
 
-	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+	return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
 }
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index d799d896d74a..eea8eefd51a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -116,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 		 * which are reclaimable, under pressure. The dentry
 		 * cache and most inode caches should fall into this
 		 */
-		free += atomic_read(&slab_reclaim_pages);
+		free += global_page_state(NR_SLAB_RECLAIMABLE);
 
 		/*
 		 * Leave the last 3% for root
@@ -1105,12 +1105,6 @@ munmap_back:
 			goto free_vma;
 	}
 
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		vma->vm_page_prot =
-			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1128,6 +1122,10 @@ munmap_back:
 	pgoff = vma->vm_pgoff;
 	vm_flags = vma->vm_flags;
 
+	if (vma_wants_writenotify(vma))
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 		file = vma->vm_file;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff71..955f9d0e38aa 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,7 +27,8 @@
 #include <asm/tlbflush.h>
 
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
@@ -42,7 +43,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			 * bits by wiping the pte and then setting the new pte
 			 * into place.
 			 */
-			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
+			ptent = ptep_get_and_clear(mm, addr, pte);
+			ptent = pte_modify(ptent, newprot);
+			/*
+			 * Avoid taking write faults for pages we know to be
+			 * dirty.
+			 */
+			if (dirty_accountable && pte_dirty(ptent))
+				ptent = pte_mkwrite(ptent);
 			set_pte_at(mm, addr, pte, ptent);
 			lazy_mmu_prot_update(ptent);
 #ifdef CONFIG_MIGRATION
@@ -66,7 +74,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 }
 
 static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -76,12 +85,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		change_pte_range(mm, pmd, addr, next, newprot);
+		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
 	} while (pmd++, addr = next, addr != end);
 }
 
 static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -91,12 +101,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		change_pmd_range(mm, pud, addr, next, newprot);
+		change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
 	} while (pud++, addr = next, addr != end);
 }
 
 static void change_protection(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -110,7 +121,7 @@ static void change_protection(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		change_pud_range(mm, pgd, addr, next, newprot);
+		change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
 	} while (pgd++, addr = next, addr != end);
 	flush_tlb_range(vma, start, end);
 }
@@ -123,10 +134,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
-	unsigned int mask;
-	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
+	int dirty_accountable = 0;
 
 	if (newflags == oldflags) {
 		*pprev = vma;
@@ -176,24 +186,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		mask &= ~VM_SHARED;
-
-	newprot = protection_map[newflags & mask];
-
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
-	vma->vm_page_prot = newprot;
+	vma->vm_page_prot = protection_map[newflags &
+		(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+	if (vma_wants_writenotify(vma)) {
+		vma->vm_page_prot = protection_map[newflags &
+			(VM_READ|VM_WRITE|VM_EXEC)];
+		dirty_accountable = 1;
+	}
+
 	if (is_vm_hugetlb_page(vma))
-		hugetlb_change_protection(vma, start, end, newprot);
+		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
-		change_protection(vma, start, end, newprot);
+		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
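
Both the mmap.c and mprotect.c hunks above defer to vma_wants_writenotify(), which this series adds to mm/mmap.c in a hunk that is not part of this diff. Judging from the two call sites, it answers "should this shared, writable mapping be write-protected so the first write to each page faults?"; a plausible reconstruction of its core logic:

	/* Reconstruction from the call sites; not copied from the patch. */
	int vma_wants_writenotify(struct vm_area_struct *vma)
	{
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED))
			return 0;	/* private or read-only: nothing to notify */

		/* The backer wishes to know when pages are first written to? */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite)
			return 1;

		/* Otherwise, only if the backing store can account dirty pages. */
		return vma->vm_file && vma->vm_file->f_mapping &&
			mapping_cap_account_dirty(vma->vm_file->f_mapping);
	}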
diff --git a/mm/msync.c b/mm/msync.c
index d083544df21b..358d73cf7b78 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -7,149 +7,33 @@
 /*
  * The msync() system call.
  */
-#include <linux/slab.h>
-#include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
-#include <linux/hugetlb.h>
-#include <linux/writeback.h>
 #include <linux/file.h>
 #include <linux/syscalls.h>
 
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				unsigned long addr, unsigned long end)
-{
-	pte_t *pte;
-	spinlock_t *ptl;
-	int progress = 0;
-	unsigned long ret = 0;
-
-again:
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		struct page *page;
-
-		if (progress >= 64) {
-			progress = 0;
-			if (need_resched() || need_lockbreak(ptl))
-				break;
-		}
-		progress++;
-		if (!pte_present(*pte))
-			continue;
-		if (!pte_maybe_dirty(*pte))
-			continue;
-		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
-			continue;
-		if (ptep_clear_flush_dirty(vma, addr, pte) ||
-		    page_test_and_clear_dirty(page))
-			ret += set_page_dirty(page);
-		progress += 3;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-	if (addr != end)
-		goto again;
-	return ret;
-}
-
-static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
-			pud_t *pud, unsigned long addr, unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long next;
-	unsigned long ret = 0;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		ret += msync_pte_range(vma, pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-	return ret;
-}
-
-static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
-			pgd_t *pgd, unsigned long addr, unsigned long end)
-{
-	pud_t *pud;
-	unsigned long next;
-	unsigned long ret = 0;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		ret += msync_pmd_range(vma, pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-	return ret;
-}
-
-static unsigned long msync_page_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end)
-{
-	pgd_t *pgd;
-	unsigned long next;
-	unsigned long ret = 0;
-
-	/* For hugepages we can't go walking the page table normally,
-	 * but that's ok, hugetlbfs is memory based, so we don't need
-	 * to do anything more on an msync().
-	 */
-	if (vma->vm_flags & VM_HUGETLB)
-		return 0;
-
-	BUG_ON(addr >= end);
-	pgd = pgd_offset(vma->vm_mm, addr);
-	flush_cache_range(vma, addr, end);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		ret += msync_pud_range(vma, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-	return ret;
-}
-
 /*
  * MS_SYNC syncs the entire file - including mappings.
  *
- * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
- * marks the relevant pages dirty. The application may now run fsync() to
+ * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
+ * Nor does it mark the relevant pages dirty (it used to up to 2.6.17).
+ * Now it doesn't do anything, since dirty pages are properly tracked.
+ *
+ * The application may now run fsync() to
  * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 */
-static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
-			unsigned long end, int flags,
-			unsigned long *nr_pages_dirtied)
-{
-	struct file *file = vma->vm_file;
-
-	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
-		return -EBUSY;
-
-	if (file && (vma->vm_flags & VM_SHARED))
-		*nr_pages_dirtied = msync_page_range(vma, addr, end);
-	return 0;
-}
-
 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 {
 	unsigned long end;
+	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int unmapped_error = 0;
 	int error = -EINVAL;
-	int done = 0;
 
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 		goto out;
@@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * If the interval [start,end) covers some unmapped address ranges,
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, start);
-	if (!vma) {
-		error = -ENOMEM;
-		goto out_unlock;
-	}
-	do {
-		unsigned long nr_pages_dirtied = 0;
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+	for (;;) {
 		struct file *file;
 
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out_unlock;
 		/* Here start < vma->vm_end. */
 		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
 			start = vma->vm_start;
+			if (start >= end)
+				goto out_unlock;
+			unmapped_error = -ENOMEM;
 		}
 		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = msync_interval(vma, start, end, flags,
-							&nr_pages_dirtied);
-				if (error)
-					goto out_unlock;
-			}
-			error = unmapped_error;
-			done = 1;
-		} else {
-			/* Here vma->vm_start <= start < vma->vm_end < end. */
-			error = msync_interval(vma, start, vma->vm_end, flags,
-						&nr_pages_dirtied);
-			if (error)
-				goto out_unlock;
+		if ((flags & MS_INVALIDATE) &&
+				(vma->vm_flags & VM_LOCKED)) {
+			error = -EBUSY;
+			goto out_unlock;
 		}
 		file = vma->vm_file;
 		start = vma->vm_end;
-		if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
-			get_file(file);
-			up_read(&current->mm->mmap_sem);
-			balance_dirty_pages_ratelimited_nr(file->f_mapping,
-					nr_pages_dirtied);
-			fput(file);
-			down_read(&current->mm->mmap_sem);
-			vma = find_vma(current->mm, start);
-		} else if ((flags & MS_SYNC) && file &&
+		if ((flags & MS_SYNC) && file &&
 				(vma->vm_flags & VM_SHARED)) {
 			get_file(file);
-			up_read(&current->mm->mmap_sem);
+			up_read(&mm->mmap_sem);
218 error = do_fsync(file, 0); 84 error = do_fsync(file, 0);
219 fput(file); 85 fput(file);
220 down_read(&current->mm->mmap_sem); 86 if (error || start >= end)
221 if (error) 87 goto out;
222 goto out_unlock; 88 down_read(&mm->mmap_sem);
223 vma = find_vma(current->mm, start); 89 vma = find_vma(mm, start);
224 } else { 90 } else {
91 if (start >= end) {
92 error = 0;
93 goto out_unlock;
94 }
225 vma = vma->vm_next; 95 vma = vma->vm_next;
226 } 96 }
227 } while (vma && !done); 97 }
228out_unlock: 98out_unlock:
229 up_read(&current->mm->mmap_sem); 99 up_read(&mm->mmap_sem);
230out: 100out:
231 return error; 101 return error ? : unmapped_error;
232} 102}
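The comment block rewritten above changes the userspace contract: MS_ASYNC is now a no-op, so a program that needs its mapped writes on disk must pair the mapping with an explicit fsync(). A minimal userspace sketch of the resulting pattern (file name, size and the missing error handling are illustrative assumptions, not part of the patch):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int flush_mapped_update(void)
{
	int fd = open("/tmp/example.dat", O_RDWR);	/* hypothetical file */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);

	memcpy(p, "update", 6);		/* dirty the shared mapping */
	msync(p, 4096, MS_ASYNC);	/* after this patch: pure no-op */
	fsync(fd);			/* this is what writes and waits */
	munmap(p, 4096);
	return close(fd);
}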
diff --git a/mm/nommu.c b/mm/nommu.c
index c576df71e3bb..d99dea31e443 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1133,7 +1133,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
1133 * which are reclaimable, under pressure. The dentry 1133 * which are reclaimable, under pressure. The dentry
1134 * cache and most inode caches should fall into this 1134 * cache and most inode caches should fall into this
1135 */ 1135 */
1136 free += atomic_read(&slab_reclaim_pages); 1136 free += global_page_state(NR_SLAB_RECLAIMABLE);
1137 1137
1138 /* 1138 /*
1139 * Leave the last 3% for root 1139 * Leave the last 3% for root
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b9af136e5cfa..bada3d03119f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -21,6 +21,8 @@
21#include <linux/timex.h> 21#include <linux/timex.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/module.h>
25#include <linux/notifier.h>
24 26
25int sysctl_panic_on_oom; 27int sysctl_panic_on_oom;
26/* #define DEBUG */ 28/* #define DEBUG */
@@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 } 60 }
59 61
60 /* 62 /*
63 * swapoff can easily use up all memory, so kill those first.
64 */
65 if (p->flags & PF_SWAPOFF)
66 return ULONG_MAX;
67
68 /*
61 * The memory size of the process is the basis for the badness. 69 * The memory size of the process is the basis for the badness.
62 */ 70 */
63 points = mm->total_vm; 71 points = mm->total_vm;
@@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
127 points /= 4; 135 points /= 4;
128 136
129 /* 137 /*
138 * If p's nodes don't overlap ours, it may still help to kill p
139 * because p may have allocated or otherwise mapped memory on
 140 * this node before. However, it is less likely to help.
141 */
142 if (!cpuset_excl_nodes_overlap(p))
143 points /= 8;
144
145 /*
130 * Adjust the score by oomkilladj. 146 * Adjust the score by oomkilladj.
131 */ 147 */
132 if (p->oomkilladj) { 148 if (p->oomkilladj) {
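Taken together, the badness() hunks above reshape the victim score roughly as follows; this is a simplified model for illustration only, and the oomkilladj shift at the end is an assumption based on the surrounding, unchanged kernel code:

#include <limits.h>

unsigned long badness_sketch(unsigned long total_vm, int swapoff,
			     int nodes_overlap, int oomkilladj)
{
	unsigned long points = total_vm;	/* memory size is the basis */

	if (swapoff)			/* PF_SWAPOFF tasks die first */
		return ULONG_MAX;
	if (!nodes_overlap)		/* disjoint cpuset nodes: less useful kill */
		points /= 8;
	if (oomkilladj > 0)		/* /proc/<pid>/oom_adj bias */
		points <<= oomkilladj;
	else if (oomkilladj < 0)
		points >>= -oomkilladj;
	return points;
}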
@@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
161 177
162 for (z = zonelist->zones; *z; z++) 178 for (z = zonelist->zones; *z; z++)
163 if (cpuset_zone_allowed(*z, gfp_mask)) 179 if (cpuset_zone_allowed(*z, gfp_mask))
164 node_clear((*z)->zone_pgdat->node_id, 180 node_clear(zone_to_nid(*z), nodes);
165 nodes);
166 else 181 else
167 return CONSTRAINT_CPUSET; 182 return CONSTRAINT_CPUSET;
168 183
@@ -191,25 +206,38 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
191 unsigned long points; 206 unsigned long points;
192 int releasing; 207 int releasing;
193 208
209 /* skip kernel threads */
210 if (!p->mm)
211 continue;
194 /* skip the init task with pid == 1 */ 212 /* skip the init task with pid == 1 */
195 if (p->pid == 1) 213 if (p->pid == 1)
196 continue; 214 continue;
197 if (p->oomkilladj == OOM_DISABLE)
198 continue;
199 /* If p's nodes don't overlap ours, it won't help to kill p. */
200 if (!cpuset_excl_nodes_overlap(p))
201 continue;
202 215
203 /* 216 /*
204 * This is in the process of releasing memory so wait for it 217 * This is in the process of releasing memory so wait for it
205 * to finish before killing some other task by mistake. 218 * to finish before killing some other task by mistake.
219 *
220 * However, if p is the current task, we allow the 'kill' to
221 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
222 * which will allow it to gain access to memory reserves in
223 * the process of exiting and releasing its resources.
224 * Otherwise we could get an OOM deadlock.
206 */ 225 */
207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 226 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
208 p->flags & PF_EXITING; 227 p->flags & PF_EXITING;
209 if (releasing && !(p->flags & PF_DEAD)) 228 if (releasing) {
229 /* PF_DEAD tasks have already released their mm */
230 if (p->flags & PF_DEAD)
231 continue;
232 if (p->flags & PF_EXITING && p == current) {
233 chosen = p;
234 *ppoints = ULONG_MAX;
235 break;
236 }
210 return ERR_PTR(-1UL); 237 return ERR_PTR(-1UL);
211 if (p->flags & PF_SWAPOFF) 238 }
212 return p; 239 if (p->oomkilladj == OOM_DISABLE)
240 continue;
213 241
214 points = badness(p, uptime.tv_sec); 242 points = badness(p, uptime.tv_sec);
215 if (points > *ppoints || !chosen) { 243 if (points > *ppoints || !chosen) {
@@ -221,9 +249,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
221} 249}
222 250
223/** 251/**
224 * We must be careful though to never send SIGKILL a process with 252 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
225 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that 253 * flag, though it's unlikely that we select a process with CAP_SYS_RAW_IO
226 * we select a process with CAP_SYS_RAW_IO set). 254 * set.
227 */ 255 */
228static void __oom_kill_task(struct task_struct *p, const char *message) 256static void __oom_kill_task(struct task_struct *p, const char *message)
229{ 257{
@@ -241,8 +269,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
241 return; 269 return;
242 } 270 }
243 task_unlock(p); 271 task_unlock(p);
244 printk(KERN_ERR "%s: Killed process %d (%s).\n", 272
273 if (message) {
274 printk(KERN_ERR "%s: Killed process %d (%s).\n",
245 message, p->pid, p->comm); 275 message, p->pid, p->comm);
276 }
246 277
247 /* 278 /*
248 * We give our sacrificial lamb high priority and access to 279 * We give our sacrificial lamb high priority and access to
@@ -293,8 +324,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
293 struct task_struct *c; 324 struct task_struct *c;
294 struct list_head *tsk; 325 struct list_head *tsk;
295 326
296 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " 327 /*
297 "children.\n", p->pid, p->comm, points); 328 * If the task is already exiting, don't alarm the sysadmin or kill
329 * its children or threads, just set TIF_MEMDIE so it can die quickly
330 */
331 if (p->flags & PF_EXITING) {
332 __oom_kill_task(p, NULL);
333 return 0;
334 }
335
336 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
337 " and children.\n", p->pid, p->comm, points);
298 /* Try to kill a child first */ 338 /* Try to kill a child first */
299 list_for_each(tsk, &p->children) { 339 list_for_each(tsk, &p->children) {
300 c = list_entry(tsk, struct task_struct, sibling); 340 c = list_entry(tsk, struct task_struct, sibling);
@@ -306,6 +346,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306 return oom_kill_task(p, message); 346 return oom_kill_task(p, message);
307} 347}
308 348
349static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
350
351int register_oom_notifier(struct notifier_block *nb)
352{
353 return blocking_notifier_chain_register(&oom_notify_list, nb);
354}
355EXPORT_SYMBOL_GPL(register_oom_notifier);
356
357int unregister_oom_notifier(struct notifier_block *nb)
358{
359 return blocking_notifier_chain_unregister(&oom_notify_list, nb);
360}
361EXPORT_SYMBOL_GPL(unregister_oom_notifier);
362
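The notifier chain gives subsystems a chance to surrender cache memory before a victim is chosen: each callback receives a pointer to the freed-pages counter, and out_of_memory() below bails out if anything was reclaimed. A minimal sketch of a client (shrink_my_cache() is a hypothetical helper):

#include <linux/notifier.h>

static int my_oom_notify(struct notifier_block *nb,
			 unsigned long unused, void *parm)
{
	unsigned long *freed = parm;

	/* drop a private cache and report how many pages that freed */
	*freed += shrink_my_cache();
	return NOTIFY_OK;
}

static struct notifier_block my_oom_nb = {
	.notifier_call = my_oom_notify,
};

/* at init:	register_oom_notifier(&my_oom_nb);
 * at exit:	unregister_oom_notifier(&my_oom_nb); */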
309/** 363/**
310 * out_of_memory - kill the "best" process when we run out of memory 364 * out_of_memory - kill the "best" process when we run out of memory
311 * 365 *
@@ -318,10 +372,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
318{ 372{
319 struct task_struct *p; 373 struct task_struct *p;
320 unsigned long points = 0; 374 unsigned long points = 0;
375 unsigned long freed = 0;
376
377 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
378 if (freed > 0)
379 /* Got some memory back in the last second. */
380 return;
321 381
322 if (printk_ratelimit()) { 382 if (printk_ratelimit()) {
323 printk("oom-killer: gfp_mask=0x%x, order=%d\n", 383 printk(KERN_WARNING "%s invoked oom-killer: "
324 gfp_mask, order); 384 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
385 current->comm, gfp_mask, order, current->oomkilladj);
325 dump_stack(); 386 dump_stack();
326 show_mem(); 387 show_mem();
327 } 388 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 77a0bc4e261a..555752907dc3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/mpage.h> 25#include <linux/mpage.h>
26#include <linux/rmap.h>
26#include <linux/percpu.h> 27#include <linux/percpu.h>
27#include <linux/notifier.h> 28#include <linux/notifier.h>
28#include <linux/smp.h> 29#include <linux/smp.h>
@@ -243,6 +244,16 @@ static void balance_dirty_pages(struct address_space *mapping)
243 pdflush_operation(background_writeout, 0); 244 pdflush_operation(background_writeout, 0);
244} 245}
245 246
247void set_page_dirty_balance(struct page *page)
248{
249 if (set_page_dirty(page)) {
250 struct address_space *mapping = page_mapping(page);
251
252 if (mapping)
253 balance_dirty_pages_ratelimited(mapping);
254 }
255}
256
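set_page_dirty_balance() exists so the write-fault path can throttle processes that dirty memory through shared mappings, just as write() callers are throttled. A sketch of the intended call site (the do_wp_page() plumbing lives in mm/memory.c, elsewhere in this series, so treat this as an assumption about the caller):

	/* in the fault handler, after making the pte writable: */
	if (dirty_page) {
		set_page_dirty_balance(dirty_page);
		put_page(dirty_page);
	}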
246/** 257/**
247 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 258 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
248 * @mapping: address_space which was dirtied 259 * @mapping: address_space which was dirtied
@@ -550,7 +561,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
550 return 0; 561 return 0;
551 wbc->for_writepages = 1; 562 wbc->for_writepages = 1;
552 if (mapping->a_ops->writepages) 563 if (mapping->a_ops->writepages)
553 ret = mapping->a_ops->writepages(mapping, wbc); 564 ret = mapping->a_ops->writepages(mapping, wbc);
554 else 565 else
555 ret = generic_writepages(mapping, wbc); 566 ret = generic_writepages(mapping, wbc);
556 wbc->for_writepages = 0; 567 wbc->for_writepages = 0;
@@ -690,7 +701,7 @@ int set_page_dirty_lock(struct page *page)
690{ 701{
691 int ret; 702 int ret;
692 703
693 lock_page(page); 704 lock_page_nosync(page);
694 ret = set_page_dirty(page); 705 ret = set_page_dirty(page);
695 unlock_page(page); 706 unlock_page(page);
696 return ret; 707 return ret;
@@ -712,9 +723,15 @@ int test_clear_page_dirty(struct page *page)
712 radix_tree_tag_clear(&mapping->page_tree, 723 radix_tree_tag_clear(&mapping->page_tree,
713 page_index(page), 724 page_index(page),
714 PAGECACHE_TAG_DIRTY); 725 PAGECACHE_TAG_DIRTY);
715 if (mapping_cap_account_dirty(mapping))
716 __dec_zone_page_state(page, NR_FILE_DIRTY);
717 write_unlock_irqrestore(&mapping->tree_lock, flags); 726 write_unlock_irqrestore(&mapping->tree_lock, flags);
727 /*
728 * We can continue to use `mapping' here because the
729 * page is locked, which pins the address_space
730 */
731 if (mapping_cap_account_dirty(mapping)) {
732 page_mkclean(page);
733 dec_zone_page_state(page, NR_FILE_DIRTY);
734 }
718 return 1; 735 return 1;
719 } 736 }
720 write_unlock_irqrestore(&mapping->tree_lock, flags); 737 write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -744,8 +761,10 @@ int clear_page_dirty_for_io(struct page *page)
744 761
745 if (mapping) { 762 if (mapping) {
746 if (TestClearPageDirty(page)) { 763 if (TestClearPageDirty(page)) {
747 if (mapping_cap_account_dirty(mapping)) 764 if (mapping_cap_account_dirty(mapping)) {
765 page_mkclean(page);
748 dec_zone_page_state(page, NR_FILE_DIRTY); 766 dec_zone_page_state(page, NR_FILE_DIRTY);
767 }
749 return 1; 768 return 1;
750 } 769 }
751 return 0; 770 return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b5358a0561f..9810f0a60db7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(node_online_map);
51nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 51nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
52EXPORT_SYMBOL(node_possible_map); 52EXPORT_SYMBOL(node_possible_map);
53unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
54unsigned long totalhigh_pages __read_mostly;
55unsigned long totalreserve_pages __read_mostly; 54unsigned long totalreserve_pages __read_mostly;
56long nr_swap_pages; 55long nr_swap_pages;
57int percpu_pagelist_fraction; 56int percpu_pagelist_fraction;
@@ -69,7 +68,15 @@ static void __free_pages_ok(struct page *page, unsigned int order);
69 * TBD: should special case ZONE_DMA32 machines here - in those we normally 68 * TBD: should special case ZONE_DMA32 machines here - in those we normally
70 * don't need any ZONE_NORMAL reservation 69 * don't need any ZONE_NORMAL reservation
71 */ 70 */
72int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; 71int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
72 256,
73#ifdef CONFIG_ZONE_DMA32
74 256,
75#endif
76#ifdef CONFIG_HIGHMEM
77 32
78#endif
79};
73 80
74EXPORT_SYMBOL(totalram_pages); 81EXPORT_SYMBOL(totalram_pages);
75 82
@@ -80,7 +87,17 @@ EXPORT_SYMBOL(totalram_pages);
80struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 87struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
81EXPORT_SYMBOL(zone_table); 88EXPORT_SYMBOL(zone_table);
82 89
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 90static char *zone_names[MAX_NR_ZONES] = {
91 "DMA",
92#ifdef CONFIG_ZONE_DMA32
93 "DMA32",
94#endif
95 "Normal",
96#ifdef CONFIG_HIGHMEM
97 "HighMem"
98#endif
99};
100
84int min_free_kbytes = 1024; 101int min_free_kbytes = 1024;
85 102
86unsigned long __meminitdata nr_kernel_pages; 103unsigned long __meminitdata nr_kernel_pages;
@@ -127,7 +144,6 @@ static int bad_range(struct zone *zone, struct page *page)
127 144
128 return 0; 145 return 0;
129} 146}
130
131#else 147#else
132static inline int bad_range(struct zone *zone, struct page *page) 148static inline int bad_range(struct zone *zone, struct page *page)
133{ 149{
@@ -218,12 +234,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
218{ 234{
219 int i; 235 int i;
220 236
221 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 237 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
222 /* 238 /*
223 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 239 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
224 * and __GFP_HIGHMEM from hard or soft interrupt context. 240 * and __GFP_HIGHMEM from hard or soft interrupt context.
225 */ 241 */
226 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 242 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
227 for (i = 0; i < (1 << order); i++) 243 for (i = 0; i < (1 << order); i++)
228 clear_highpage(page + i); 244 clear_highpage(page + i);
229} 245}
@@ -347,8 +363,8 @@ static inline void __free_one_page(struct page *page,
347 363
348 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 364 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
349 365
350 BUG_ON(page_idx & (order_size - 1)); 366 VM_BUG_ON(page_idx & (order_size - 1));
351 BUG_ON(bad_range(zone, page)); 367 VM_BUG_ON(bad_range(zone, page));
352 368
353 zone->free_pages += order_size; 369 zone->free_pages += order_size;
354 while (order < MAX_ORDER-1) { 370 while (order < MAX_ORDER-1) {
@@ -421,7 +437,7 @@ static void free_pages_bulk(struct zone *zone, int count,
421 while (count--) { 437 while (count--) {
422 struct page *page; 438 struct page *page;
423 439
424 BUG_ON(list_empty(list)); 440 VM_BUG_ON(list_empty(list));
425 page = list_entry(list->prev, struct page, lru); 441 page = list_entry(list->prev, struct page, lru);
426 /* have to delete it as __free_one_page list manipulates */ 442 /* have to delete it as __free_one_page list manipulates */
427 list_del(&page->lru); 443 list_del(&page->lru);
@@ -432,9 +448,11 @@ static void free_pages_bulk(struct zone *zone, int count,
432 448
433static void free_one_page(struct zone *zone, struct page *page, int order) 449static void free_one_page(struct zone *zone, struct page *page, int order)
434{ 450{
435 LIST_HEAD(list); 451 spin_lock(&zone->lock);
436 list_add(&page->lru, &list); 452 zone->all_unreclaimable = 0;
437 free_pages_bulk(zone, 1, &list, order); 453 zone->pages_scanned = 0;
454 __free_one_page(page, zone ,order);
455 spin_unlock(&zone->lock);
438} 456}
439 457
440static void __free_pages_ok(struct page *page, unsigned int order) 458static void __free_pages_ok(struct page *page, unsigned int order)
@@ -512,7 +530,7 @@ static inline void expand(struct zone *zone, struct page *page,
512 area--; 530 area--;
513 high--; 531 high--;
514 size >>= 1; 532 size >>= 1;
515 BUG_ON(bad_range(zone, &page[size])); 533 VM_BUG_ON(bad_range(zone, &page[size]));
516 list_add(&page[size].lru, &area->free_list); 534 list_add(&page[size].lru, &area->free_list);
517 area->nr_free++; 535 area->nr_free++;
518 set_page_order(&page[size], high); 536 set_page_order(&page[size], high);
@@ -615,19 +633,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
615#ifdef CONFIG_NUMA 633#ifdef CONFIG_NUMA
616/* 634/*
617 * Called from the slab reaper to drain pagesets on a particular node that 635 * Called from the slab reaper to drain pagesets on a particular node that
618 * belong to the currently executing processor. 636 * belongs to the currently executing processor.
619 * Note that this function must be called with the thread pinned to 637 * Note that this function must be called with the thread pinned to
620 * a single processor. 638 * a single processor.
621 */ 639 */
622void drain_node_pages(int nodeid) 640void drain_node_pages(int nodeid)
623{ 641{
624 int i, z; 642 int i;
643 enum zone_type z;
625 unsigned long flags; 644 unsigned long flags;
626 645
627 for (z = 0; z < MAX_NR_ZONES; z++) { 646 for (z = 0; z < MAX_NR_ZONES; z++) {
628 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 647 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
629 struct per_cpu_pageset *pset; 648 struct per_cpu_pageset *pset;
630 649
650 if (!populated_zone(zone))
651 continue;
652
631 pset = zone_pcp(zone, smp_processor_id()); 653 pset = zone_pcp(zone, smp_processor_id());
632 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 654 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
633 struct per_cpu_pages *pcp; 655 struct per_cpu_pages *pcp;
@@ -672,7 +694,8 @@ static void __drain_pages(unsigned int cpu)
672 694
673void mark_free_pages(struct zone *zone) 695void mark_free_pages(struct zone *zone)
674{ 696{
675 unsigned long zone_pfn, flags; 697 unsigned long pfn, max_zone_pfn;
698 unsigned long flags;
676 int order; 699 int order;
677 struct list_head *curr; 700 struct list_head *curr;
678 701
@@ -680,18 +703,25 @@ void mark_free_pages(struct zone *zone)
680 return; 703 return;
681 704
682 spin_lock_irqsave(&zone->lock, flags); 705 spin_lock_irqsave(&zone->lock, flags);
683 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 706
684 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 707 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
708 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
709 if (pfn_valid(pfn)) {
710 struct page *page = pfn_to_page(pfn);
711
712 if (!PageNosave(page))
713 ClearPageNosaveFree(page);
714 }
685 715
686 for (order = MAX_ORDER - 1; order >= 0; --order) 716 for (order = MAX_ORDER - 1; order >= 0; --order)
687 list_for_each(curr, &zone->free_area[order].free_list) { 717 list_for_each(curr, &zone->free_area[order].free_list) {
688 unsigned long start_pfn, i; 718 unsigned long i;
689 719
690 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 720 pfn = page_to_pfn(list_entry(curr, struct page, lru));
721 for (i = 0; i < (1UL << order); i++)
722 SetPageNosaveFree(pfn_to_page(pfn + i));
723 }
691 724
692 for (i=0; i < (1<<order); i++)
693 SetPageNosaveFree(pfn_to_page(start_pfn+i));
694 }
695 spin_unlock_irqrestore(&zone->lock, flags); 725 spin_unlock_irqrestore(&zone->lock, flags);
696} 726}
697 727
@@ -761,8 +791,8 @@ void split_page(struct page *page, unsigned int order)
761{ 791{
762 int i; 792 int i;
763 793
764 BUG_ON(PageCompound(page)); 794 VM_BUG_ON(PageCompound(page));
765 BUG_ON(!page_count(page)); 795 VM_BUG_ON(!page_count(page));
766 for (i = 1; i < (1 << order); i++) 796 for (i = 1; i < (1 << order); i++)
767 set_page_refcounted(page + i); 797 set_page_refcounted(page + i);
768} 798}
@@ -809,7 +839,7 @@ again:
809 local_irq_restore(flags); 839 local_irq_restore(flags);
810 put_cpu(); 840 put_cpu();
811 841
812 BUG_ON(bad_range(zone, page)); 842 VM_BUG_ON(bad_range(zone, page));
813 if (prep_new_page(page, order, gfp_flags)) 843 if (prep_new_page(page, order, gfp_flags))
814 goto again; 844 goto again;
815 return page; 845 return page;
@@ -870,32 +900,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
870 struct zone **z = zonelist->zones; 900 struct zone **z = zonelist->zones;
871 struct page *page = NULL; 901 struct page *page = NULL;
872 int classzone_idx = zone_idx(*z); 902 int classzone_idx = zone_idx(*z);
903 struct zone *zone;
873 904
874 /* 905 /*
875 * Go through the zonelist once, looking for a zone with enough free. 906 * Go through the zonelist once, looking for a zone with enough free.
876 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 907 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
877 */ 908 */
878 do { 909 do {
910 zone = *z;
911 if (unlikely((gfp_mask & __GFP_THISNODE) &&
912 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
913 break;
879 if ((alloc_flags & ALLOC_CPUSET) && 914 if ((alloc_flags & ALLOC_CPUSET) &&
880 !cpuset_zone_allowed(*z, gfp_mask)) 915 !cpuset_zone_allowed(zone, gfp_mask))
881 continue; 916 continue;
882 917
883 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 918 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
884 unsigned long mark; 919 unsigned long mark;
885 if (alloc_flags & ALLOC_WMARK_MIN) 920 if (alloc_flags & ALLOC_WMARK_MIN)
886 mark = (*z)->pages_min; 921 mark = zone->pages_min;
887 else if (alloc_flags & ALLOC_WMARK_LOW) 922 else if (alloc_flags & ALLOC_WMARK_LOW)
888 mark = (*z)->pages_low; 923 mark = zone->pages_low;
889 else 924 else
890 mark = (*z)->pages_high; 925 mark = zone->pages_high;
 891 if (!zone_watermark_ok(*z, order, mark, 926 if (!zone_watermark_ok(zone, order, mark,
892 classzone_idx, alloc_flags)) 927 classzone_idx, alloc_flags))
893 if (!zone_reclaim_mode || 928 if (!zone_reclaim_mode ||
894 !zone_reclaim(*z, gfp_mask, order)) 929 !zone_reclaim(zone, gfp_mask, order))
895 continue; 930 continue;
896 } 931 }
897 932
898 page = buffered_rmqueue(zonelist, *z, order, gfp_mask); 933 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
899 if (page) { 934 if (page) {
900 break; 935 break;
901 } 936 }
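The new bail-out at the top of the loop is what gives __GFP_THISNODE meaning: once the walk reaches zones from another node, the allocation fails rather than falling back. A hedged usage sketch:

	/* ask for memory strictly on node nid; with the check above,
	 * zones belonging to other nodes are never tried */
	struct page *page = alloc_pages_node(nid,
					GFP_KERNEL | __GFP_THISNODE, 0);

	if (!page)
		return -ENOMEM;	/* caller copes: no remote fallback happened */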
@@ -1083,7 +1118,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1083 * get_zeroed_page() returns a 32-bit address, which cannot represent 1118 * get_zeroed_page() returns a 32-bit address, which cannot represent
1084 * a highmem page 1119 * a highmem page
1085 */ 1120 */
1086 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1121 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1087 1122
1088 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1123 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1089 if (page) 1124 if (page)
@@ -1116,7 +1151,7 @@ EXPORT_SYMBOL(__free_pages);
1116fastcall void free_pages(unsigned long addr, unsigned int order) 1151fastcall void free_pages(unsigned long addr, unsigned int order)
1117{ 1152{
1118 if (addr != 0) { 1153 if (addr != 0) {
1119 BUG_ON(!virt_addr_valid((void *)addr)); 1154 VM_BUG_ON(!virt_addr_valid((void *)addr));
1120 __free_pages(virt_to_page((void *)addr), order); 1155 __free_pages(virt_to_page((void *)addr), order);
1121 } 1156 }
1122} 1157}
@@ -1142,7 +1177,8 @@ EXPORT_SYMBOL(nr_free_pages);
1142#ifdef CONFIG_NUMA 1177#ifdef CONFIG_NUMA
1143unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1178unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1144{ 1179{
1145 unsigned int i, sum = 0; 1180 unsigned int sum = 0;
1181 enum zone_type i;
1146 1182
1147 for (i = 0; i < MAX_NR_ZONES; i++) 1183 for (i = 0; i < MAX_NR_ZONES; i++)
1148 sum += pgdat->node_zones[i].free_pages; 1184 sum += pgdat->node_zones[i].free_pages;
@@ -1186,24 +1222,10 @@ unsigned int nr_free_pagecache_pages(void)
1186{ 1222{
1187 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1223 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1188} 1224}
1189
1190#ifdef CONFIG_HIGHMEM
1191unsigned int nr_free_highpages (void)
1192{
1193 pg_data_t *pgdat;
1194 unsigned int pages = 0;
1195
1196 for_each_online_pgdat(pgdat)
1197 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1198
1199 return pages;
1200}
1201#endif
1202
1203#ifdef CONFIG_NUMA 1225#ifdef CONFIG_NUMA
1204static void show_node(struct zone *zone) 1226static void show_node(struct zone *zone)
1205{ 1227{
1206 printk("Node %d ", zone->zone_pgdat->node_id); 1228 printk("Node %ld ", zone_to_nid(zone));
1207} 1229}
1208#else 1230#else
1209#define show_node(zone) do { } while (0) 1231#define show_node(zone) do { } while (0)
@@ -1215,13 +1237,8 @@ void si_meminfo(struct sysinfo *val)
1215 val->sharedram = 0; 1237 val->sharedram = 0;
1216 val->freeram = nr_free_pages(); 1238 val->freeram = nr_free_pages();
1217 val->bufferram = nr_blockdev_pages(); 1239 val->bufferram = nr_blockdev_pages();
1218#ifdef CONFIG_HIGHMEM
1219 val->totalhigh = totalhigh_pages; 1240 val->totalhigh = totalhigh_pages;
1220 val->freehigh = nr_free_highpages(); 1241 val->freehigh = nr_free_highpages();
1221#else
1222 val->totalhigh = 0;
1223 val->freehigh = 0;
1224#endif
1225 val->mem_unit = PAGE_SIZE; 1242 val->mem_unit = PAGE_SIZE;
1226} 1243}
1227 1244
@@ -1234,8 +1251,13 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1234 1251
1235 val->totalram = pgdat->node_present_pages; 1252 val->totalram = pgdat->node_present_pages;
1236 val->freeram = nr_free_pages_pgdat(pgdat); 1253 val->freeram = nr_free_pages_pgdat(pgdat);
1254#ifdef CONFIG_HIGHMEM
1237 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1255 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1238 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1256 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1257#else
1258 val->totalhigh = 0;
1259 val->freehigh = 0;
1260#endif
1239 val->mem_unit = PAGE_SIZE; 1261 val->mem_unit = PAGE_SIZE;
1240} 1262}
1241#endif 1263#endif
@@ -1282,10 +1304,6 @@ void show_free_areas(void)
1282 1304
1283 get_zone_counts(&active, &inactive, &free); 1305 get_zone_counts(&active, &inactive, &free);
1284 1306
1285 printk("Free pages: %11ukB (%ukB HighMem)\n",
1286 K(nr_free_pages()),
1287 K(nr_free_highpages()));
1288
1289 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1307 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1290 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1308 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1291 active, 1309 active,
@@ -1294,7 +1312,8 @@ void show_free_areas(void)
1294 global_page_state(NR_WRITEBACK), 1312 global_page_state(NR_WRITEBACK),
1295 global_page_state(NR_UNSTABLE_NFS), 1313 global_page_state(NR_UNSTABLE_NFS),
1296 nr_free_pages(), 1314 nr_free_pages(),
1297 global_page_state(NR_SLAB), 1315 global_page_state(NR_SLAB_RECLAIMABLE) +
1316 global_page_state(NR_SLAB_UNRECLAIMABLE),
1298 global_page_state(NR_FILE_MAPPED), 1317 global_page_state(NR_FILE_MAPPED),
1299 global_page_state(NR_PAGETABLE)); 1318 global_page_state(NR_PAGETABLE));
1300 1319
@@ -1360,39 +1379,25 @@ void show_free_areas(void)
1360 * Add all populated zones of a node to the zonelist. 1379 * Add all populated zones of a node to the zonelist.
1361 */ 1380 */
1362static int __meminit build_zonelists_node(pg_data_t *pgdat, 1381static int __meminit build_zonelists_node(pg_data_t *pgdat,
1363 struct zonelist *zonelist, int nr_zones, int zone_type) 1382 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1364{ 1383{
1365 struct zone *zone; 1384 struct zone *zone;
1366 1385
1367 BUG_ON(zone_type > ZONE_HIGHMEM); 1386 BUG_ON(zone_type >= MAX_NR_ZONES);
1387 zone_type++;
1368 1388
1369 do { 1389 do {
1390 zone_type--;
1370 zone = pgdat->node_zones + zone_type; 1391 zone = pgdat->node_zones + zone_type;
1371 if (populated_zone(zone)) { 1392 if (populated_zone(zone)) {
1372#ifndef CONFIG_HIGHMEM
1373 BUG_ON(zone_type > ZONE_NORMAL);
1374#endif
1375 zonelist->zones[nr_zones++] = zone; 1393 zonelist->zones[nr_zones++] = zone;
1376 check_highest_zone(zone_type); 1394 check_highest_zone(zone_type);
1377 } 1395 }
1378 zone_type--;
1379 1396
1380 } while (zone_type >= 0); 1397 } while (zone_type);
1381 return nr_zones; 1398 return nr_zones;
1382} 1399}
1383 1400
1384static inline int highest_zone(int zone_bits)
1385{
1386 int res = ZONE_NORMAL;
1387 if (zone_bits & (__force int)__GFP_HIGHMEM)
1388 res = ZONE_HIGHMEM;
1389 if (zone_bits & (__force int)__GFP_DMA32)
1390 res = ZONE_DMA32;
1391 if (zone_bits & (__force int)__GFP_DMA)
1392 res = ZONE_DMA;
1393 return res;
1394}
1395
1396#ifdef CONFIG_NUMA 1401#ifdef CONFIG_NUMA
1397#define MAX_NODE_LOAD (num_online_nodes()) 1402#define MAX_NODE_LOAD (num_online_nodes())
1398static int __meminitdata node_load[MAX_NUMNODES]; 1403static int __meminitdata node_load[MAX_NUMNODES];
@@ -1458,13 +1463,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1458 1463
1459static void __meminit build_zonelists(pg_data_t *pgdat) 1464static void __meminit build_zonelists(pg_data_t *pgdat)
1460{ 1465{
1461 int i, j, k, node, local_node; 1466 int j, node, local_node;
1467 enum zone_type i;
1462 int prev_node, load; 1468 int prev_node, load;
1463 struct zonelist *zonelist; 1469 struct zonelist *zonelist;
1464 nodemask_t used_mask; 1470 nodemask_t used_mask;
1465 1471
1466 /* initialize zonelists */ 1472 /* initialize zonelists */
1467 for (i = 0; i < GFP_ZONETYPES; i++) { 1473 for (i = 0; i < MAX_NR_ZONES; i++) {
1468 zonelist = pgdat->node_zonelists + i; 1474 zonelist = pgdat->node_zonelists + i;
1469 zonelist->zones[0] = NULL; 1475 zonelist->zones[0] = NULL;
1470 } 1476 }
@@ -1494,13 +1500,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1494 node_load[node] += load; 1500 node_load[node] += load;
1495 prev_node = node; 1501 prev_node = node;
1496 load--; 1502 load--;
1497 for (i = 0; i < GFP_ZONETYPES; i++) { 1503 for (i = 0; i < MAX_NR_ZONES; i++) {
1498 zonelist = pgdat->node_zonelists + i; 1504 zonelist = pgdat->node_zonelists + i;
1499 for (j = 0; zonelist->zones[j] != NULL; j++); 1505 for (j = 0; zonelist->zones[j] != NULL; j++);
1500 1506
1501 k = highest_zone(i); 1507 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1502
1503 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1504 zonelist->zones[j] = NULL; 1508 zonelist->zones[j] = NULL;
1505 } 1509 }
1506 } 1510 }
@@ -1510,17 +1514,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1510 1514
1511static void __meminit build_zonelists(pg_data_t *pgdat) 1515static void __meminit build_zonelists(pg_data_t *pgdat)
1512{ 1516{
1513 int i, j, k, node, local_node; 1517 int node, local_node;
1518 enum zone_type i,j;
1514 1519
1515 local_node = pgdat->node_id; 1520 local_node = pgdat->node_id;
1516 for (i = 0; i < GFP_ZONETYPES; i++) { 1521 for (i = 0; i < MAX_NR_ZONES; i++) {
1517 struct zonelist *zonelist; 1522 struct zonelist *zonelist;
1518 1523
1519 zonelist = pgdat->node_zonelists + i; 1524 zonelist = pgdat->node_zonelists + i;
1520 1525
1521 j = 0; 1526 j = build_zonelists_node(pgdat, zonelist, 0, i);
1522 k = highest_zone(i);
1523 j = build_zonelists_node(pgdat, zonelist, j, k);
1524 /* 1527 /*
1525 * Now we build the zonelist so that it contains the zones 1528 * Now we build the zonelist so that it contains the zones
1526 * of all the other nodes. 1529 * of all the other nodes.
@@ -1532,12 +1535,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1532 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1535 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1533 if (!node_online(node)) 1536 if (!node_online(node))
1534 continue; 1537 continue;
1535 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1538 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1536 } 1539 }
1537 for (node = 0; node < local_node; node++) { 1540 for (node = 0; node < local_node; node++) {
1538 if (!node_online(node)) 1541 if (!node_online(node))
1539 continue; 1542 continue;
1540 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1543 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1541 } 1544 }
1542 1545
1543 zonelist->zones[j] = NULL; 1546 zonelist->zones[j] = NULL;
@@ -1643,7 +1646,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1643 unsigned long *zones_size, unsigned long *zholes_size) 1646 unsigned long *zones_size, unsigned long *zholes_size)
1644{ 1647{
1645 unsigned long realtotalpages, totalpages = 0; 1648 unsigned long realtotalpages, totalpages = 0;
1646 int i; 1649 enum zone_type i;
1647 1650
1648 for (i = 0; i < MAX_NR_ZONES; i++) 1651 for (i = 0; i < MAX_NR_ZONES; i++)
1649 totalpages += zones_size[i]; 1652 totalpages += zones_size[i];
@@ -1698,8 +1701,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1698} 1701}
1699 1702
1700#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1703#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1701void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1704void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1702 unsigned long size) 1705 unsigned long pfn, unsigned long size)
1703{ 1706{
1704 unsigned long snum = pfn_to_section_nr(pfn); 1707 unsigned long snum = pfn_to_section_nr(pfn);
1705 unsigned long end = pfn_to_section_nr(pfn + size); 1708 unsigned long end = pfn_to_section_nr(pfn + size);
@@ -1845,8 +1848,10 @@ static inline void free_zone_pagesets(int cpu)
1845 for_each_zone(zone) { 1848 for_each_zone(zone) {
1846 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1849 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1847 1850
1851 /* Free per_cpu_pageset if it is slab allocated */
1852 if (pset != &boot_pageset[cpu])
1853 kfree(pset);
1848 zone_pcp(zone, cpu) = NULL; 1854 zone_pcp(zone, cpu) = NULL;
1849 kfree(pset);
1850 } 1855 }
1851} 1856}
1852 1857
@@ -1981,7 +1986,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1981static void __meminit free_area_init_core(struct pglist_data *pgdat, 1986static void __meminit free_area_init_core(struct pglist_data *pgdat,
1982 unsigned long *zones_size, unsigned long *zholes_size) 1987 unsigned long *zones_size, unsigned long *zholes_size)
1983{ 1988{
1984 unsigned long j; 1989 enum zone_type j;
1985 int nid = pgdat->node_id; 1990 int nid = pgdat->node_id;
1986 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1991 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1987 int ret; 1992 int ret;
@@ -1999,15 +2004,16 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
1999 if (zholes_size) 2004 if (zholes_size)
2000 realsize -= zholes_size[j]; 2005 realsize -= zholes_size[j];
2001 2006
2002 if (j < ZONE_HIGHMEM) 2007 if (!is_highmem_idx(j))
2003 nr_kernel_pages += realsize; 2008 nr_kernel_pages += realsize;
2004 nr_all_pages += realsize; 2009 nr_all_pages += realsize;
2005 2010
2006 zone->spanned_pages = size; 2011 zone->spanned_pages = size;
2007 zone->present_pages = realsize; 2012 zone->present_pages = realsize;
2008#ifdef CONFIG_NUMA 2013#ifdef CONFIG_NUMA
2009 zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) 2014 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2010 / 100; 2015 / 100;
2016 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2011#endif 2017#endif
2012 zone->name = zone_names[j]; 2018 zone->name = zone_names[j];
2013 spin_lock_init(&zone->lock); 2019 spin_lock_init(&zone->lock);
@@ -2129,7 +2135,7 @@ static void calculate_totalreserve_pages(void)
2129{ 2135{
2130 struct pglist_data *pgdat; 2136 struct pglist_data *pgdat;
2131 unsigned long reserve_pages = 0; 2137 unsigned long reserve_pages = 0;
2132 int i, j; 2138 enum zone_type i, j;
2133 2139
2134 for_each_online_pgdat(pgdat) { 2140 for_each_online_pgdat(pgdat) {
2135 for (i = 0; i < MAX_NR_ZONES; i++) { 2141 for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -2162,7 +2168,7 @@ static void calculate_totalreserve_pages(void)
2162static void setup_per_zone_lowmem_reserve(void) 2168static void setup_per_zone_lowmem_reserve(void)
2163{ 2169{
2164 struct pglist_data *pgdat; 2170 struct pglist_data *pgdat;
2165 int j, idx; 2171 enum zone_type j, idx;
2166 2172
2167 for_each_online_pgdat(pgdat) { 2173 for_each_online_pgdat(pgdat) {
2168 for (j = 0; j < MAX_NR_ZONES; j++) { 2174 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -2171,9 +2177,12 @@ static void setup_per_zone_lowmem_reserve(void)
2171 2177
2172 zone->lowmem_reserve[j] = 0; 2178 zone->lowmem_reserve[j] = 0;
2173 2179
2174 for (idx = j-1; idx >= 0; idx--) { 2180 idx = j;
2181 while (idx) {
2175 struct zone *lower_zone; 2182 struct zone *lower_zone;
2176 2183
2184 idx--;
2185
2177 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2186 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2178 sysctl_lowmem_reserve_ratio[idx] = 1; 2187 sysctl_lowmem_reserve_ratio[idx] = 1;
2179 2188
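The rewritten loop computes the same lowmem_reserve[] values as before, only without the signed index that enum zone_type can no longer provide. A worked example of what the ratio buys (hypothetical 1 GiB highmem zone, 4 KiB pages):

	/* pages a GFP_HIGHUSER allocation may take from ZONE_NORMAL
	 * before its watermark check starts failing: */
	unsigned long highmem_pages = (1UL << 30) / 4096;  /* 262144 */
	int ratio = 32;			/* sysctl_lowmem_reserve_ratio[] entry */
	unsigned long reserve = highmem_pages / ratio;	/* 8192 pages = 32 MiB */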
@@ -2314,10 +2323,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
2314 return rc; 2323 return rc;
2315 2324
2316 for_each_zone(zone) 2325 for_each_zone(zone)
2317 zone->min_unmapped_ratio = (zone->present_pages * 2326 zone->min_unmapped_pages = (zone->present_pages *
2318 sysctl_min_unmapped_ratio) / 100; 2327 sysctl_min_unmapped_ratio) / 100;
2319 return 0; 2328 return 0;
2320} 2329}
2330
2331int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
2332 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2333{
2334 struct zone *zone;
2335 int rc;
2336
2337 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2338 if (rc)
2339 return rc;
2340
2341 for_each_zone(zone)
2342 zone->min_slab_pages = (zone->present_pages *
2343 sysctl_min_slab_ratio) / 100;
2344 return 0;
2345}
2321#endif 2346#endif
2322 2347
2323/* 2348/*
diff --git a/mm/page_io.c b/mm/page_io.c
index 88029948d00a..d4840ecbf8f9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
52 if (bio->bi_size) 52 if (bio->bi_size)
53 return 1; 53 return 1;
54 54
55 if (!uptodate) 55 if (!uptodate) {
56 SetPageError(page); 56 SetPageError(page);
57 /*
58 * We failed to write the page out to swap-space.
59 * Re-dirty the page in order to avoid it being reclaimed.
60 * Also print a dire warning that things will go BAD (tm)
61 * very quickly.
62 *
63 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
64 */
65 set_page_dirty(page);
66 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
67 imajor(bio->bi_bdev->bd_inode),
68 iminor(bio->bi_bdev->bd_inode),
69 (unsigned long long)bio->bi_sector);
70 ClearPageReclaim(page);
71 }
57 end_page_writeback(page); 72 end_page_writeback(page);
58 bio_put(bio); 73 bio_put(bio);
59 return 0; 74 return 0;
60} 75}
61 76
62static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) 77int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
63{ 78{
64 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 79 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
65 struct page *page = bio->bi_io_vec[0].bv_page; 80 struct page *page = bio->bi_io_vec[0].bv_page;
@@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
70 if (!uptodate) { 85 if (!uptodate) {
71 SetPageError(page); 86 SetPageError(page);
72 ClearPageUptodate(page); 87 ClearPageUptodate(page);
88 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
89 imajor(bio->bi_bdev->bd_inode),
90 iminor(bio->bi_bdev->bd_inode),
91 (unsigned long long)bio->bi_sector);
73 } else { 92 } else {
74 SetPageUptodate(page); 93 SetPageUptodate(page);
75 } 94 }
@@ -137,10 +156,12 @@ out:
137 * We use end_swap_bio_read() even for writes, because it happens to do what 156 * We use end_swap_bio_read() even for writes, because it happens to do what
138 * we want. 157 * we want.
139 */ 158 */
140int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) 159int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
160 struct bio **bio_chain)
141{ 161{
142 struct bio *bio; 162 struct bio *bio;
143 int ret = 0; 163 int ret = 0;
164 int bio_rw;
144 165
145 lock_page(page); 166 lock_page(page);
146 167
@@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
151 goto out; 172 goto out;
152 } 173 }
153 174
154 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 175 bio_rw = rw;
155 wait_on_page_locked(page); 176 if (!bio_chain)
156 177 bio_rw |= (1 << BIO_RW_SYNC);
157 if (!PageUptodate(page) || PageError(page)) 178 if (bio_chain)
158 ret = -EIO; 179 bio_get(bio);
180 submit_bio(bio_rw, bio);
181 if (bio_chain == NULL) {
182 wait_on_page_locked(page);
183
184 if (!PageUptodate(page) || PageError(page))
185 ret = -EIO;
186 }
187 if (bio_chain) {
188 bio->bi_private = *bio_chain;
189 *bio_chain = bio;
190 }
159out: 191out:
160 return ret; 192 return ret;
161} 193}
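The bio_chain argument threads submitted bios through bi_private so a caller (swsusp is the obvious consumer) can fire many requests and wait once. A sketch of the companion drain helper this interface implies; the real one lives outside mm/, so this is illustrative:

static int wait_on_bio_chain_sketch(struct bio **bio_chain)
{
	struct bio *bio, *next_bio;
	int ret = 0;

	for (bio = *bio_chain; bio; bio = next_bio) {
		struct page *page = bio->bi_io_vec[0].bv_page;

		next_bio = bio->bi_private;	/* link set in rw_swap_page_sync() */
		wait_on_page_locked(page);
		if (!PageUptodate(page) || PageError(page))
			ret = -EIO;
		put_page(page);
		bio_put(bio);
	}
	*bio_chain = NULL;
	return ret;
}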
diff --git a/mm/rmap.c b/mm/rmap.c
index 40158b59729e..e2155d791d99 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
434 return referenced; 434 return referenced;
435} 435}
436 436
437static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
438{
439 struct mm_struct *mm = vma->vm_mm;
440 unsigned long address;
441 pte_t *pte, entry;
442 spinlock_t *ptl;
443 int ret = 0;
444
445 address = vma_address(page, vma);
446 if (address == -EFAULT)
447 goto out;
448
449 pte = page_check_address(page, mm, address, &ptl);
450 if (!pte)
451 goto out;
452
453 if (!pte_dirty(*pte) && !pte_write(*pte))
454 goto unlock;
455
456 entry = ptep_get_and_clear(mm, address, pte);
457 entry = pte_mkclean(entry);
458 entry = pte_wrprotect(entry);
459 ptep_establish(vma, address, pte, entry);
460 lazy_mmu_prot_update(entry);
461 ret = 1;
462
463unlock:
464 pte_unmap_unlock(pte, ptl);
465out:
466 return ret;
467}
468
469static int page_mkclean_file(struct address_space *mapping, struct page *page)
470{
471 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
472 struct vm_area_struct *vma;
473 struct prio_tree_iter iter;
474 int ret = 0;
475
476 BUG_ON(PageAnon(page));
477
478 spin_lock(&mapping->i_mmap_lock);
479 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
480 if (vma->vm_flags & VM_SHARED)
481 ret += page_mkclean_one(page, vma);
482 }
483 spin_unlock(&mapping->i_mmap_lock);
484 return ret;
485}
486
487int page_mkclean(struct page *page)
488{
489 int ret = 0;
490
491 BUG_ON(!PageLocked(page));
492
493 if (page_mapped(page)) {
494 struct address_space *mapping = page_mapping(page);
495 if (mapping)
496 ret = page_mkclean_file(mapping, page);
497 }
498
499 return ret;
500}
501
437/** 502/**
438 * page_set_anon_rmap - setup new anonymous rmap 503 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to 504 * @page: the page to add the mapping to
diff --git a/mm/shmem.c b/mm/shmem.c
index db21c51531ca..8631be45b40d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
45#include <linux/namei.h> 45#include <linux/namei.h>
46#include <linux/ctype.h> 46#include <linux/ctype.h>
47#include <linux/migrate.h> 47#include <linux/migrate.h>
48#include <linux/highmem.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50#include <asm/div64.h> 51#include <asm/div64.h>
diff --git a/mm/slab.c b/mm/slab.c
index 21ba06035700..7a48eb1a60c8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree); 313 struct kmem_list3 *l3, int tofree);
314static void free_block(struct kmem_cache *cachep, void **objpp, int len, 314static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node); 315 int node);
316static void enable_cpucache(struct kmem_cache *cachep); 316static int enable_cpucache(struct kmem_cache *cachep);
317static void cache_reap(void *unused); 317static void cache_reap(void *unused);
318 318
319/* 319/*
@@ -674,6 +674,8 @@ static struct kmem_cache cache_cache = {
674#endif 674#endif
675}; 675};
676 676
677#define BAD_ALIEN_MAGIC 0x01020304ul
678
677#ifdef CONFIG_LOCKDEP 679#ifdef CONFIG_LOCKDEP
678 680
679/* 681/*
@@ -682,42 +684,58 @@ static struct kmem_cache cache_cache = {
682 * The locking for this is tricky in that it nests within the locks 684 * The locking for this is tricky in that it nests within the locks
683 * of all other slabs in a few places; to deal with this special 685 * of all other slabs in a few places; to deal with this special
684 * locking we put on-slab caches into a separate lock-class. 686 * locking we put on-slab caches into a separate lock-class.
687 *
688 * We set lock class for alien array caches which are up during init.
 689 * The lock annotation will be lost if all cpus of a node go down and
 690 * then come back up during hotplug.
685 */ 691 */
686static struct lock_class_key on_slab_key; 692static struct lock_class_key on_slab_l3_key;
693static struct lock_class_key on_slab_alc_key;
694
695static inline void init_lock_keys(void)
687 696
688static inline void init_lock_keys(struct cache_sizes *s)
689{ 697{
690 int q; 698 int q;
691 699 struct cache_sizes *s = malloc_sizes;
692 for (q = 0; q < MAX_NUMNODES; q++) { 700
693 if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) 701 while (s->cs_size != ULONG_MAX) {
694 continue; 702 for_each_node(q) {
695 lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, 703 struct array_cache **alc;
696 &on_slab_key); 704 int r;
705 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
706 if (!l3 || OFF_SLAB(s->cs_cachep))
707 continue;
708 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
709 alc = l3->alien;
710 /*
711 * FIXME: This check for BAD_ALIEN_MAGIC
712 * should go away when common slab code is taught to
713 * work even without alien caches.
 714 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
 715 * for alloc_alien_cache.
716 */
717 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
718 continue;
719 for_each_node(r) {
720 if (alc[r])
721 lockdep_set_class(&alc[r]->lock,
722 &on_slab_alc_key);
723 }
724 }
725 s++;
697 } 726 }
698} 727}
699
700#else 728#else
701static inline void init_lock_keys(struct cache_sizes *s) 729static inline void init_lock_keys(void)
702{ 730{
703} 731}
704#endif 732#endif
705 733
706
707
708/* Guard access to the cache-chain. */ 734/* Guard access to the cache-chain. */
709static DEFINE_MUTEX(cache_chain_mutex); 735static DEFINE_MUTEX(cache_chain_mutex);
710static struct list_head cache_chain; 736static struct list_head cache_chain;
711 737
712/* 738/*
713 * vm_enough_memory() looks at this to determine how many slab-allocated pages
714 * are possibly freeable under pressure
715 *
716 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
717 */
718atomic_t slab_reclaim_pages;
719
720/*
721 * chicken and egg problem: delay the per-cpu array allocation 739 * chicken and egg problem: delay the per-cpu array allocation
722 * until the general caches are up. 740 * until the general caches are up.
723 */ 741 */
@@ -768,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
768 return csizep->cs_cachep; 786 return csizep->cs_cachep;
769} 787}
770 788
771struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 789static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
772{ 790{
773 return __find_general_cachep(size, gfpflags); 791 return __find_general_cachep(size, gfpflags);
774} 792}
775EXPORT_SYMBOL(kmem_find_general_cachep);
776 793
777static size_t slab_mgmt_size(size_t nr_objs, size_t align) 794static size_t slab_mgmt_size(size_t nr_objs, size_t align)
778{ 795{
@@ -1092,7 +1109,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1092 1109
1093static inline struct array_cache **alloc_alien_cache(int node, int limit) 1110static inline struct array_cache **alloc_alien_cache(int node, int limit)
1094{ 1111{
1095 return (struct array_cache **) 0x01020304ul; 1112 return (struct array_cache **)BAD_ALIEN_MAGIC;
1096} 1113}
1097 1114
1098static inline void free_alien_cache(struct array_cache **ac_ptr) 1115static inline void free_alien_cache(struct array_cache **ac_ptr)
@@ -1422,7 +1439,6 @@ void __init kmem_cache_init(void)
1422 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1439 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1423 NULL, NULL); 1440 NULL, NULL);
1424 } 1441 }
1425 init_lock_keys(sizes);
1426 1442
1427 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1443 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1428 sizes->cs_size, 1444 sizes->cs_size,
@@ -1491,10 +1507,15 @@ void __init kmem_cache_init(void)
1491 struct kmem_cache *cachep; 1507 struct kmem_cache *cachep;
1492 mutex_lock(&cache_chain_mutex); 1508 mutex_lock(&cache_chain_mutex);
1493 list_for_each_entry(cachep, &cache_chain, next) 1509 list_for_each_entry(cachep, &cache_chain, next)
1494 enable_cpucache(cachep); 1510 if (enable_cpucache(cachep))
1511 BUG();
1495 mutex_unlock(&cache_chain_mutex); 1512 mutex_unlock(&cache_chain_mutex);
1496 } 1513 }
1497 1514
1515 /* Annotate slab for lockdep -- annotate the malloc caches */
1516 init_lock_keys();
1517
1518
1498 /* Done! */ 1519 /* Done! */
1499 g_cpucache_up = FULL; 1520 g_cpucache_up = FULL;
1500 1521
@@ -1551,8 +1572,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1551 1572
1552 nr_pages = (1 << cachep->gfporder); 1573 nr_pages = (1 << cachep->gfporder);
1553 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1574 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1554 atomic_add(nr_pages, &slab_reclaim_pages); 1575 add_zone_page_state(page_zone(page),
1555 add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); 1576 NR_SLAB_RECLAIMABLE, nr_pages);
1577 else
1578 add_zone_page_state(page_zone(page),
1579 NR_SLAB_UNRECLAIMABLE, nr_pages);
1556 for (i = 0; i < nr_pages; i++) 1580 for (i = 0; i < nr_pages; i++)
1557 __SetPageSlab(page + i); 1581 __SetPageSlab(page + i);
1558 return page_address(page); 1582 return page_address(page);
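With the counter split, a cache opts into the reclaimable bucket at creation time via SLAB_RECLAIM_ACCOUNT and everything else is charged to NR_SLAB_UNRECLAIMABLE. A sketch (struct my_obj and the shrinker that would justify the flag are hypothetical):

	struct kmem_cache *my_cache;

	/* objects sit on a shrinker-managed LRU, so their backing pages
	 * count as NR_SLAB_RECLAIMABLE in the per-zone statistics */
	my_cache = kmem_cache_create("my_objs", sizeof(struct my_obj),
				     0, SLAB_RECLAIM_ACCOUNT, NULL, NULL);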
@@ -1567,7 +1591,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1567 struct page *page = virt_to_page(addr); 1591 struct page *page = virt_to_page(addr);
1568 const unsigned long nr_freed = i; 1592 const unsigned long nr_freed = i;
1569 1593
1570 sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); 1594 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1595 sub_zone_page_state(page_zone(page),
1596 NR_SLAB_RECLAIMABLE, nr_freed);
1597 else
1598 sub_zone_page_state(page_zone(page),
1599 NR_SLAB_UNRECLAIMABLE, nr_freed);
1571 while (i--) { 1600 while (i--) {
1572 BUG_ON(!PageSlab(page)); 1601 BUG_ON(!PageSlab(page));
1573 __ClearPageSlab(page); 1602 __ClearPageSlab(page);
@@ -1576,8 +1605,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1576 if (current->reclaim_state) 1605 if (current->reclaim_state)
1577 current->reclaim_state->reclaimed_slab += nr_freed; 1606 current->reclaim_state->reclaimed_slab += nr_freed;
1578 free_pages((unsigned long)addr, cachep->gfporder); 1607 free_pages((unsigned long)addr, cachep->gfporder);
1579 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1580 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1581} 1608}
1582 1609
1583static void kmem_rcu_free(struct rcu_head *head) 1610static void kmem_rcu_free(struct rcu_head *head)
@@ -1834,6 +1861,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1834 } 1861 }
1835} 1862}
1836 1863
1864static void __kmem_cache_destroy(struct kmem_cache *cachep)
1865{
1866 int i;
1867 struct kmem_list3 *l3;
1868
1869 for_each_online_cpu(i)
1870 kfree(cachep->array[i]);
1871
1872 /* NUMA: free the list3 structures */
1873 for_each_online_node(i) {
1874 l3 = cachep->nodelists[i];
1875 if (l3) {
1876 kfree(l3->shared);
1877 free_alien_cache(l3->alien);
1878 kfree(l3);
1879 }
1880 }
1881 kmem_cache_free(&cache_cache, cachep);
1882}
1883
1884
1837/** 1885/**
1838 * calculate_slab_order - calculate size (page order) of slabs 1886 * calculate_slab_order - calculate size (page order) of slabs
1839 * @cachep: pointer to the cache that is being created 1887 * @cachep: pointer to the cache that is being created
@@ -1904,12 +1952,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1904 return left_over; 1952 return left_over;
1905} 1953}
1906 1954
1907static void setup_cpu_cache(struct kmem_cache *cachep) 1955static int setup_cpu_cache(struct kmem_cache *cachep)
1908{ 1956{
1909 if (g_cpucache_up == FULL) { 1957 if (g_cpucache_up == FULL)
1910 enable_cpucache(cachep); 1958 return enable_cpucache(cachep);
1911 return; 1959
1912 }
1913 if (g_cpucache_up == NONE) { 1960 if (g_cpucache_up == NONE) {
1914 /* 1961 /*
1915 * Note: the first kmem_cache_create must create the cache 1962 * Note: the first kmem_cache_create must create the cache
@@ -1956,6 +2003,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
1956 cpu_cache_get(cachep)->touched = 0; 2003 cpu_cache_get(cachep)->touched = 0;
1957 cachep->batchcount = 1; 2004 cachep->batchcount = 1;
1958 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2005 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2006 return 0;
1959} 2007}
1960 2008
1961/** 2009/**
@@ -2097,6 +2145,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2097 } else { 2145 } else {
2098 ralign = BYTES_PER_WORD; 2146 ralign = BYTES_PER_WORD;
2099 } 2147 }
2148
2149 /*
2150 * Redzoning and user store require word alignment. Note this will be
2151 * overridden by architecture or caller mandated alignment if either
2152 * is greater than BYTES_PER_WORD.
2153 */
2154 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2155 ralign = BYTES_PER_WORD;
2156
2100 /* 2) arch mandated alignment: disables debug if necessary */ 2157 /* 2) arch mandated alignment: disables debug if necessary */
2101 if (ralign < ARCH_SLAB_MINALIGN) { 2158 if (ralign < ARCH_SLAB_MINALIGN) {
2102 ralign = ARCH_SLAB_MINALIGN; 2159 ralign = ARCH_SLAB_MINALIGN;
@@ -2110,8 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2110 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2167 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2111 } 2168 }
2112 /* 2169 /*
2113 * 4) Store it. Note that the debug code below can reduce 2170 * 4) Store it.
2114 * the alignment to BYTES_PER_WORD.
2115 */ 2171 */
2116 align = ralign; 2172 align = ralign;
2117 2173
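The net effect of steps 1) through 4) is a max() chain: debug options force word alignment, but an architecture- or caller-mandated alignment may still raise it (a larger alignment in turn disables the debug features, which the sketch omits). A compact model of the computation, with placeholder constants:

#include <stdio.h>

#define BYTES_PER_WORD     sizeof(void *)
#define ARCH_SLAB_MINALIGN 8               /* placeholder; arch-dependent */
#define SLAB_RED_ZONE      0x1
#define SLAB_STORE_USER    0x2

static size_t max_align(size_t a, size_t b) { return a > b ? a : b; }

static size_t effective_align(size_t natural, size_t caller, unsigned flags)
{
        size_t ralign = natural;

        /* redzoning/user store need only word alignment... */
        if (flags & (SLAB_RED_ZONE | SLAB_STORE_USER))
                ralign = BYTES_PER_WORD;
        /* ...but arch and caller requirements can still raise it */
        ralign = max_align(ralign, ARCH_SLAB_MINALIGN);
        ralign = max_align(ralign, caller);
        return ralign;
}

int main(void)
{
        /* caller-mandated 16 overrides the word alignment picked for redzoning */
        printf("%zu\n", effective_align(32, 16, SLAB_RED_ZONE));
        return 0;
}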
@@ -2123,20 +2179,19 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2123#if DEBUG 2179#if DEBUG
2124 cachep->obj_size = size; 2180 cachep->obj_size = size;
2125 2181
2182 /*
2183 * Both debugging options require word-alignment which is calculated
2184 * into align above.
2185 */
2126 if (flags & SLAB_RED_ZONE) { 2186 if (flags & SLAB_RED_ZONE) {
2127 /* redzoning only works with word aligned caches */
2128 align = BYTES_PER_WORD;
2129
2130 /* add space for red zone words */ 2187 /* add space for red zone words */
2131 cachep->obj_offset += BYTES_PER_WORD; 2188 cachep->obj_offset += BYTES_PER_WORD;
2132 size += 2 * BYTES_PER_WORD; 2189 size += 2 * BYTES_PER_WORD;
2133 } 2190 }
2134 if (flags & SLAB_STORE_USER) { 2191 if (flags & SLAB_STORE_USER) {
2135 /* user store requires word alignment and 2192 /* user store requires one word storage behind the end of
2136 * one word storage behind the end of the real 2193 * the real object.
2137 * object.
2138 */ 2194 */
2139 align = BYTES_PER_WORD;
2140 size += BYTES_PER_WORD; 2195 size += BYTES_PER_WORD;
2141 } 2196 }
2142#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2197#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
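With alignment settled up front, the DEBUG block only has to grow the buffer: obj_offset moves the object one word forward for the leading red-zone word, size grows by two words for the red-zone pair, and SLAB_STORE_USER appends one more trailing word for the caller address. The layout arithmetic, as a simplified model rather than the kernel code:

#include <stdio.h>

#define BYTES_PER_WORD  sizeof(unsigned long)
#define SLAB_RED_ZONE   0x1
#define SLAB_STORE_USER 0x2

struct layout { size_t obj_offset; size_t buffer_size; };

static struct layout debug_layout(size_t obj_size, unsigned flags)
{
        struct layout l = { 0, obj_size };

        if (flags & SLAB_RED_ZONE) {
                l.obj_offset += BYTES_PER_WORD;      /* red zone word in front */
                l.buffer_size += 2 * BYTES_PER_WORD; /* ...and one behind */
        }
        if (flags & SLAB_STORE_USER)
                l.buffer_size += BYTES_PER_WORD;     /* caller address at the end */
        return l;
}

int main(void)
{
        struct layout l = debug_layout(24, SLAB_RED_ZONE | SLAB_STORE_USER);

        printf("object at +%zu, buffer %zu bytes\n", l.obj_offset, l.buffer_size);
        return 0;
}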
@@ -2200,14 +2255,26 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2200 cachep->gfpflags |= GFP_DMA; 2255 cachep->gfpflags |= GFP_DMA;
2201 cachep->buffer_size = size; 2256 cachep->buffer_size = size;
2202 2257
2203 if (flags & CFLGS_OFF_SLAB) 2258 if (flags & CFLGS_OFF_SLAB) {
2204 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2259 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2260 /*
2261 * This is a possibility for one of the malloc_sizes caches.
2262 * But since we go off-slab only for object sizes greater than
2263 * PAGE_SIZE/8, and the malloc_sizes caches are created in ascending
2264 * order, this should never happen.
2265 * Leave a BUG_ON here just in case.
2266 */
2267 BUG_ON(!cachep->slabp_cache);
2268 }
2205 cachep->ctor = ctor; 2269 cachep->ctor = ctor;
2206 cachep->dtor = dtor; 2270 cachep->dtor = dtor;
2207 cachep->name = name; 2271 cachep->name = name;
2208 2272
2209 2273 if (setup_cpu_cache(cachep)) {
2210 setup_cpu_cache(cachep); 2274 __kmem_cache_destroy(cachep);
2275 cachep = NULL;
2276 goto oops;
2277 }
2211 2278
2212 /* cache setup completed, link it into the list */ 2279 /* cache setup completed, link it into the list */
2213 list_add(&cachep->next, &cache_chain); 2280 list_add(&cachep->next, &cache_chain);
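Because setup_cpu_cache can now fail, kmem_cache_create unwinds through the same __kmem_cache_destroy used by the destroy path and hands the caller NULL instead of a half-initialized cache. The control flow, reduced to its shape (all names here are stand-ins):

#include <stdlib.h>

struct cache { int ready; };

static struct cache *alloc_cache(void) { return calloc(1, sizeof(struct cache)); }
static void destroy_cache(struct cache *c) { free(c); }

/* Stand-in for setup_cpu_cache(): 0 on success, nonzero on allocation failure. */
static int setup_cpu_cache_stub(struct cache *c) { c->ready = 1; return 0; }

static struct cache *cache_create(void)
{
        struct cache *c = alloc_cache();

        if (!c)
                goto oops;
        if (setup_cpu_cache_stub(c)) {
                destroy_cache(c);       /* same helper as the destroy path */
                c = NULL;
                goto oops;
        }
        /* link into the cache chain here */
oops:
        return c;
}

int main(void)
{
        struct cache *c = cache_create();

        destroy_cache(c);
        return 0;
}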
@@ -2389,9 +2456,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2389 */ 2456 */
2390int kmem_cache_destroy(struct kmem_cache *cachep) 2457int kmem_cache_destroy(struct kmem_cache *cachep)
2391{ 2458{
2392 int i;
2393 struct kmem_list3 *l3;
2394
2395 BUG_ON(!cachep || in_interrupt()); 2459 BUG_ON(!cachep || in_interrupt());
2396 2460
2397 /* Don't let CPUs come and go */ 2461
@@ -2417,25 +2481,23 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2417 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2481 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2418 synchronize_rcu(); 2482 synchronize_rcu();
2419 2483
2420 for_each_online_cpu(i) 2484 __kmem_cache_destroy(cachep);
2421 kfree(cachep->array[i]);
2422
2423 /* NUMA: free the list3 structures */
2424 for_each_online_node(i) {
2425 l3 = cachep->nodelists[i];
2426 if (l3) {
2427 kfree(l3->shared);
2428 free_alien_cache(l3->alien);
2429 kfree(l3);
2430 }
2431 }
2432 kmem_cache_free(&cache_cache, cachep);
2433 unlock_cpu_hotplug(); 2485 unlock_cpu_hotplug();
2434 return 0; 2486 return 0;
2435} 2487}
2436EXPORT_SYMBOL(kmem_cache_destroy); 2488EXPORT_SYMBOL(kmem_cache_destroy);
2437 2489
2438/* Get the memory for a slab management obj. */ 2490/*
2491 * Get the memory for a slab management obj.
2492 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2493 * always come from malloc_sizes caches. The slab descriptor cannot
2494 * come from the same cache that is being created because,
2495 * when we search for an appropriate cache for these
2496 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2497 * If we are creating a malloc_sizes cache here, it would not be visible to
2498 * kmem_find_general_cachep until the initialization is complete.
2499 * Hence slabp_cache cannot be the same as the cache being created.
2500 */
2439static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2501static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2440 int colour_off, gfp_t local_flags, 2502 int colour_off, gfp_t local_flags,
2441 int nodeid) 2503 int nodeid)
@@ -3119,6 +3181,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3119 if (slabp->inuse == 0) { 3181 if (slabp->inuse == 0) {
3120 if (l3->free_objects > l3->free_limit) { 3182 if (l3->free_objects > l3->free_limit) {
3121 l3->free_objects -= cachep->num; 3183 l3->free_objects -= cachep->num;
3184 /* No need to drop any previously held
3185 * lock here; even if we have an off-slab slab
3186 * descriptor, it is guaranteed to come from
3187 * a different cache; refer to the comments before
3188 * alloc_slabmgmt.
3189 */
3122 slab_destroy(cachep, slabp); 3190 slab_destroy(cachep, slabp);
3123 } else { 3191 } else {
3124 list_add(&slabp->list, &l3->slabs_free); 3192 list_add(&slabp->list, &l3->slabs_free);
@@ -3317,7 +3385,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3317} 3385}
3318EXPORT_SYMBOL(kmem_cache_alloc_node); 3386EXPORT_SYMBOL(kmem_cache_alloc_node);
3319 3387
3320void *kmalloc_node(size_t size, gfp_t flags, int node) 3388void *__kmalloc_node(size_t size, gfp_t flags, int node)
3321{ 3389{
3322 struct kmem_cache *cachep; 3390 struct kmem_cache *cachep;
3323 3391
@@ -3326,7 +3394,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node)
3326 return NULL; 3394 return NULL;
3327 return kmem_cache_alloc_node(cachep, flags, node); 3395 return kmem_cache_alloc_node(cachep, flags, node);
3328} 3396}
3329EXPORT_SYMBOL(kmalloc_node); 3397EXPORT_SYMBOL(__kmalloc_node);
3330#endif 3398#endif
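The rename to __kmalloc_node follows the usual kernel convention of hiding the out-of-line allocator behind a double-underscore name so the plain name can become a header inline — presumably, as with kmalloc, so constant-size callers can have their size class resolved at compile time. The pattern, sketched generically (the real wrapper, if any, lives in the slab headers, not here):

#include <stdlib.h>

void *__kmalloc_node(size_t size, int node);   /* out-of-line, general case */

/*
 * Header-style inline wrapper: when the compiler can prove `size' is a
 * constant, a real implementation could pick the size cache at compile
 * time; otherwise it falls through to the out-of-line function.
 */
static inline void *kmalloc_node(size_t size, int node)
{
        if (__builtin_constant_p(size)) {
                /* compile-time size-class selection would go here */
        }
        return __kmalloc_node(size, node);
}

void *__kmalloc_node(size_t size, int node)
{
        (void)node;                     /* no NUMA in this userspace model */
        return malloc(size);
}

int main(void)
{
        free(kmalloc_node(64, 0));
        return 0;
}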
3331 3399
3332/** 3400/**
@@ -3370,55 +3438,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3370EXPORT_SYMBOL(__kmalloc_track_caller); 3438EXPORT_SYMBOL(__kmalloc_track_caller);
3371#endif 3439#endif
3372 3440
3373#ifdef CONFIG_SMP
3374/**
3375 * __alloc_percpu - allocate one copy of the object for every present
3376 * cpu in the system, zeroing them.
3377 * Objects should be dereferenced using the per_cpu_ptr macro only.
3378 *
3379 * @size: how many bytes of memory are required.
3380 */
3381void *__alloc_percpu(size_t size)
3382{
3383 int i;
3384 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3385
3386 if (!pdata)
3387 return NULL;
3388
3389 /*
3390 * Cannot use for_each_online_cpu since a cpu may come online
3391 * and we have no way of figuring out how to fix the array
3392 * that we have allocated then....
3393 */
3394 for_each_possible_cpu(i) {
3395 int node = cpu_to_node(i);
3396
3397 if (node_online(node))
3398 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3399 else
3400 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3401
3402 if (!pdata->ptrs[i])
3403 goto unwind_oom;
3404 memset(pdata->ptrs[i], 0, size);
3405 }
3406
3407 /* Catch derefs w/o wrappers */
3408 return (void *)(~(unsigned long)pdata);
3409
3410unwind_oom:
3411 while (--i >= 0) {
3412 if (!cpu_possible(i))
3413 continue;
3414 kfree(pdata->ptrs[i]);
3415 }
3416 kfree(pdata);
3417 return NULL;
3418}
3419EXPORT_SYMBOL(__alloc_percpu);
3420#endif
3421
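Note the trick in the code being removed: the allocator hands back the complement of the real pointer, ~(unsigned long)pdata, so any dereference that bypasses the per_cpu_ptr wrapper faults immediately. A self-contained demonstration of the disguise round-trip, with userspace stand-ins rather than the kernel API:

#include <assert.h>
#include <stdlib.h>

struct percpu_data { void *ptrs[4]; };

static void *disguise(struct percpu_data *p)
{
        return (void *)(~(unsigned long)p);     /* raw deref now faults */
}

static struct percpu_data *undisguise(const void *objp)
{
        return (struct percpu_data *)(~(unsigned long)objp);
}

int main(void)
{
        struct percpu_data *p = calloc(1, sizeof(*p));
        void *handle = disguise(p);

        assert(undisguise(handle) == p);   /* the wrapper recovers the pointer */
        free(p);
        return 0;
}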
3422/** 3441/**
3423 * kmem_cache_free - Deallocate an object 3442 * kmem_cache_free - Deallocate an object
3424 * @cachep: The cache the allocation was from. 3443 * @cachep: The cache the allocation was from.
@@ -3464,29 +3483,6 @@ void kfree(const void *objp)
3464} 3483}
3465EXPORT_SYMBOL(kfree); 3484EXPORT_SYMBOL(kfree);
3466 3485
3467#ifdef CONFIG_SMP
3468/**
3469 * free_percpu - free previously allocated percpu memory
3470 * @objp: pointer returned by alloc_percpu.
3471 *
3472 * Don't free memory not originally allocated by alloc_percpu()
3473 * The complemented objp is to check for that.
3474 */
3475void free_percpu(const void *objp)
3476{
3477 int i;
3478 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3479
3480 /*
3481 * We allocate for all cpus so we cannot use for online cpu here.
3482 */
3483 for_each_possible_cpu(i)
3484 kfree(p->ptrs[i]);
3485 kfree(p);
3486}
3487EXPORT_SYMBOL(free_percpu);
3488#endif
3489
3490unsigned int kmem_cache_size(struct kmem_cache *cachep) 3486unsigned int kmem_cache_size(struct kmem_cache *cachep)
3491{ 3487{
3492 return obj_size(cachep); 3488 return obj_size(cachep);
@@ -3603,22 +3599,26 @@ static void do_ccupdate_local(void *info)
3603static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3599static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3604 int batchcount, int shared) 3600 int batchcount, int shared)
3605{ 3601{
3606 struct ccupdate_struct new; 3602 struct ccupdate_struct *new;
3607 int i, err; 3603 int i;
3604
3605 new = kzalloc(sizeof(*new), GFP_KERNEL);
3606 if (!new)
3607 return -ENOMEM;
3608 3608
3609 memset(&new.new, 0, sizeof(new.new));
3610 for_each_online_cpu(i) { 3609 for_each_online_cpu(i) {
3611 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, 3610 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3612 batchcount); 3611 batchcount);
3613 if (!new.new[i]) { 3612 if (!new->new[i]) {
3614 for (i--; i >= 0; i--) 3613 for (i--; i >= 0; i--)
3615 kfree(new.new[i]); 3614 kfree(new->new[i]);
3615 kfree(new);
3616 return -ENOMEM; 3616 return -ENOMEM;
3617 } 3617 }
3618 } 3618 }
3619 new.cachep = cachep; 3619 new->cachep = cachep;
3620 3620
3621 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); 3621 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3622 3622
3623 check_irq_on(); 3623 check_irq_on();
3624 cachep->batchcount = batchcount; 3624 cachep->batchcount = batchcount;
@@ -3626,7 +3626,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3626 cachep->shared = shared; 3626 cachep->shared = shared;
3627 3627
3628 for_each_online_cpu(i) { 3628 for_each_online_cpu(i) {
3629 struct array_cache *ccold = new.new[i]; 3629 struct array_cache *ccold = new->new[i];
3630 if (!ccold) 3630 if (!ccold)
3631 continue; 3631 continue;
3632 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3632 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
@@ -3634,18 +3634,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3634 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3634 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3635 kfree(ccold); 3635 kfree(ccold);
3636 } 3636 }
3637 3637 kfree(new);
3638 err = alloc_kmemlist(cachep); 3638 return alloc_kmemlist(cachep);
3639 if (err) {
3640 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3641 cachep->name, -err);
3642 BUG();
3643 }
3644 return 0;
3645} 3639}
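do_tune_cpucache now kzallocs its ccupdate_struct instead of keeping it on the stack — the embedded NR_CPUS array of pointers can be sizable on large configurations, which is presumably the motivation — and the failure path must unwind both the partially filled array and the struct itself. The allocate-and-unwind shape, modelled:

#include <stdlib.h>

#define NR_CPUS 8

struct ccupdate { void *new[NR_CPUS]; };

static int tune(void)
{
        struct ccupdate *new = calloc(1, sizeof(*new));  /* heap, not stack */
        int i;

        if (!new)
                return -1;
        for (i = 0; i < NR_CPUS; i++) {
                new->new[i] = malloc(64);
                if (!new->new[i]) {
                        for (i--; i >= 0; i--)   /* unwind what we got */
                                free(new->new[i]);
                        free(new);               /* and the struct itself */
                        return -1;
                }
        }
        /* ... hand `new' to each cpu, then free the displaced old entries ... */
        for (i = 0; i < NR_CPUS; i++)
                free(new->new[i]);
        free(new);
        return 0;
}

int main(void) { return tune() ? 1 : 0; }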
3646 3640
3647/* Called with cache_chain_mutex held always */ 3641/* Called with cache_chain_mutex held always */
3648static void enable_cpucache(struct kmem_cache *cachep) 3642static int enable_cpucache(struct kmem_cache *cachep)
3649{ 3643{
3650 int err; 3644 int err;
3651 int limit, shared; 3645 int limit, shared;
@@ -3697,6 +3691,7 @@ static void enable_cpucache(struct kmem_cache *cachep)
3697 if (err) 3691 if (err)
3698 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3692 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3699 cachep->name, -err); 3693 cachep->name, -err);
3694 return err;
3700} 3695}
3701 3696
3702/* 3697/*
@@ -4157,6 +4152,7 @@ static int leaks_show(struct seq_file *m, void *p)
4157 show_symbol(m, n[2*i+2]); 4152 show_symbol(m, n[2*i+2]);
4158 seq_putc(m, '\n'); 4153 seq_putc(m, '\n');
4159 } 4154 }
4155
4160 return 0; 4156 return 0;
4161} 4157}
4162 4158
diff --git a/mm/slob.c b/mm/slob.c
index 7b52b20b9607..20188627347c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -339,52 +339,3 @@ void kmem_cache_init(void)
339 339
340 mod_timer(&slob_timer, jiffies + HZ); 340 mod_timer(&slob_timer, jiffies + HZ);
341} 341}
342
343atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
344EXPORT_SYMBOL(slab_reclaim_pages);
345
346#ifdef CONFIG_SMP
347
348void *__alloc_percpu(size_t size)
349{
350 int i;
351 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
352
353 if (!pdata)
354 return NULL;
355
356 for_each_possible_cpu(i) {
357 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
358 if (!pdata->ptrs[i])
359 goto unwind_oom;
360 memset(pdata->ptrs[i], 0, size);
361 }
362
363 /* Catch derefs w/o wrappers */
364 return (void *) (~(unsigned long) pdata);
365
366unwind_oom:
367 while (--i >= 0) {
368 if (!cpu_possible(i))
369 continue;
370 kfree(pdata->ptrs[i]);
371 }
372 kfree(pdata);
373 return NULL;
374}
375EXPORT_SYMBOL(__alloc_percpu);
376
377void
378free_percpu(const void *objp)
379{
380 int i;
381 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
382
383 for_each_possible_cpu(i)
384 kfree(p->ptrs[i]);
385
386 kfree(p);
387}
388EXPORT_SYMBOL(free_percpu);
389
390#endif
diff --git a/mm/swap.c b/mm/swap.c
index 687686a61f7c..2e0e871f542f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,25 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37/*
38 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking.
40 */
41static void fastcall __page_cache_release(struct page *page)
42{
43 if (PageLRU(page)) {
44 unsigned long flags;
45 struct zone *zone = page_zone(page);
46
47 spin_lock_irqsave(&zone->lru_lock, flags);
48 VM_BUG_ON(!PageLRU(page));
49 __ClearPageLRU(page);
50 del_page_from_lru(zone, page);
51 spin_unlock_irqrestore(&zone->lru_lock, flags);
52 }
53 free_hot_page(page);
54}
55
37static void put_compound_page(struct page *page) 56static void put_compound_page(struct page *page)
38{ 57{
39 page = (struct page *)page_private(page); 58 page = (struct page *)page_private(page);
@@ -223,26 +242,6 @@ int lru_add_drain_all(void)
223#endif 242#endif
224 243
225/* 244/*
226 * This path almost never happens for VM activity - pages are normally
227 * freed via pagevecs. But it gets used by networking.
228 */
229void fastcall __page_cache_release(struct page *page)
230{
231 if (PageLRU(page)) {
232 unsigned long flags;
233 struct zone *zone = page_zone(page);
234
235 spin_lock_irqsave(&zone->lru_lock, flags);
236 BUG_ON(!PageLRU(page));
237 __ClearPageLRU(page);
238 del_page_from_lru(zone, page);
239 spin_unlock_irqrestore(&zone->lru_lock, flags);
240 }
241 free_hot_page(page);
242}
243EXPORT_SYMBOL(__page_cache_release);
244
245/*
246 * Batched page_cache_release(). Decrement the reference count on all the 245 * Batched page_cache_release(). Decrement the reference count on all the
247 * passed pages. If it fell to zero then remove the page from the LRU and 246 * passed pages. If it fell to zero then remove the page from the LRU and
248 * free it. 247 * free it.
@@ -284,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold)
284 zone = pagezone; 283 zone = pagezone;
285 spin_lock_irq(&zone->lru_lock); 284 spin_lock_irq(&zone->lru_lock);
286 } 285 }
287 BUG_ON(!PageLRU(page)); 286 VM_BUG_ON(!PageLRU(page));
288 __ClearPageLRU(page); 287 __ClearPageLRU(page);
289 del_page_from_lru(zone, page); 288 del_page_from_lru(zone, page);
290 } 289 }
@@ -337,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
337 for (i = 0; i < pagevec_count(pvec); i++) { 336 for (i = 0; i < pagevec_count(pvec); i++) {
338 struct page *page = pvec->pages[i]; 337 struct page *page = pvec->pages[i];
339 338
340 BUG_ON(PageLRU(page)); 339 VM_BUG_ON(PageLRU(page));
341 if (put_page_testzero(page)) 340 if (put_page_testzero(page))
342 pagevec_add(&pages_to_free, page); 341 pagevec_add(&pages_to_free, page);
343 } 342 }
@@ -364,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
364 zone = pagezone; 363 zone = pagezone;
365 spin_lock_irq(&zone->lru_lock); 364 spin_lock_irq(&zone->lru_lock);
366 } 365 }
367 BUG_ON(PageLRU(page)); 366 VM_BUG_ON(PageLRU(page));
368 SetPageLRU(page); 367 SetPageLRU(page);
369 add_page_to_inactive_list(zone, page); 368 add_page_to_inactive_list(zone, page);
370 } 369 }
@@ -391,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
391 zone = pagezone; 390 zone = pagezone;
392 spin_lock_irq(&zone->lru_lock); 391 spin_lock_irq(&zone->lru_lock);
393 } 392 }
394 BUG_ON(PageLRU(page)); 393 VM_BUG_ON(PageLRU(page));
395 SetPageLRU(page); 394 SetPageLRU(page);
396 BUG_ON(PageActive(page)); 395 VM_BUG_ON(PageActive(page));
397 SetPageActive(page); 396 SetPageActive(page);
398 add_page_to_active_list(zone, page); 397 add_page_to_active_list(zone, page);
399 } 398 }
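The swap.c conversions from BUG_ON to VM_BUG_ON make these LRU sanity checks free on production builds: VM_BUG_ON expands to a real check only when CONFIG_DEBUG_VM is configured. Roughly the following, modelled here with assert where the kernel's version panics via BUG_ON:

#include <assert.h>

/* #define CONFIG_DEBUG_VM 1 -- flip on for debug builds */

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) assert(!(cond))   /* kernel: BUG_ON(cond) */
#else
#define VM_BUG_ON(cond) do { } while (0)  /* compiles to nothing */
#endif

int main(void)
{
        int page_on_lru = 1;

        VM_BUG_ON(!page_on_lru);   /* checked only under CONFIG_DEBUG_VM */
        return 0;
}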
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 266162d2ba28..9aad8b0cc6ee 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,9 @@
24DEFINE_RWLOCK(vmlist_lock); 24DEFINE_RWLOCK(vmlist_lock);
25struct vm_struct *vmlist; 25struct vm_struct *vmlist;
26 26
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28 int node);
29
27static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
28{ 31{
29 pte_t *pte; 32 pte_t *pte;
@@ -478,8 +481,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
478 * allocator with @gfp_mask flags. Map them into contiguous 481 * allocator with @gfp_mask flags. Map them into contiguous
479 * kernel virtual space, using a pagetable protection of @prot. 482 * kernel virtual space, using a pagetable protection of @prot.
480 */ 483 */
481void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 484static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
482 int node) 485 int node)
483{ 486{
484 struct vm_struct *area; 487 struct vm_struct *area;
485 488
@@ -493,7 +496,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
493 496
494 return __vmalloc_area_node(area, gfp_mask, prot, node); 497 return __vmalloc_area_node(area, gfp_mask, prot, node);
495} 498}
496EXPORT_SYMBOL(__vmalloc_node);
497 499
498void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 500void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
499{ 501{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d02254d..87779dda4ec6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -62,6 +62,8 @@ struct scan_control {
62 int swap_cluster_max; 62 int swap_cluster_max;
63 63
64 int swappiness; 64 int swappiness;
65
66 int all_unreclaimable;
65}; 67};
66 68
67/* 69/*
@@ -377,8 +379,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
377 379
378int remove_mapping(struct address_space *mapping, struct page *page) 380int remove_mapping(struct address_space *mapping, struct page *page)
379{ 381{
380 if (!mapping) 382 BUG_ON(!PageLocked(page));
381 return 0; /* truncate got there first */ 383 BUG_ON(mapping != page_mapping(page));
382 384
383 write_lock_irq(&mapping->tree_lock); 385 write_lock_irq(&mapping->tree_lock);
384 386
@@ -440,7 +442,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
440 if (TestSetPageLocked(page)) 442 if (TestSetPageLocked(page))
441 goto keep; 443 goto keep;
442 444
443 BUG_ON(PageActive(page)); 445 VM_BUG_ON(PageActive(page));
444 446
445 sc->nr_scanned++; 447 sc->nr_scanned++;
446 448
@@ -547,7 +549,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
547 goto free_it; 549 goto free_it;
548 } 550 }
549 551
550 if (!remove_mapping(mapping, page)) 552 if (!mapping || !remove_mapping(mapping, page))
551 goto keep_locked; 553 goto keep_locked;
552 554
553free_it: 555free_it:
@@ -564,7 +566,7 @@ keep_locked:
564 unlock_page(page); 566 unlock_page(page);
565keep: 567keep:
566 list_add(&page->lru, &ret_pages); 568 list_add(&page->lru, &ret_pages);
567 BUG_ON(PageLRU(page)); 569 VM_BUG_ON(PageLRU(page));
568 } 570 }
569 list_splice(&ret_pages, page_list); 571 list_splice(&ret_pages, page_list);
570 if (pagevec_count(&freed_pvec)) 572 if (pagevec_count(&freed_pvec))
@@ -603,7 +605,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
603 page = lru_to_page(src); 605 page = lru_to_page(src);
604 prefetchw_prev_lru_page(page, src, flags); 606 prefetchw_prev_lru_page(page, src, flags);
605 607
606 BUG_ON(!PageLRU(page)); 608 VM_BUG_ON(!PageLRU(page));
607 609
608 list_del(&page->lru); 610 list_del(&page->lru);
609 target = src; 611 target = src;
@@ -674,7 +676,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
674 */ 676 */
675 while (!list_empty(&page_list)) { 677 while (!list_empty(&page_list)) {
676 page = lru_to_page(&page_list); 678 page = lru_to_page(&page_list);
677 BUG_ON(PageLRU(page)); 679 VM_BUG_ON(PageLRU(page));
678 SetPageLRU(page); 680 SetPageLRU(page);
679 list_del(&page->lru); 681 list_del(&page->lru);
680 if (PageActive(page)) 682 if (PageActive(page))
@@ -695,6 +697,11 @@ done:
695 return nr_reclaimed; 697 return nr_reclaimed;
696} 698}
697 699
700static inline int zone_is_near_oom(struct zone *zone)
701{
702 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
703}
704
698/* 705/*
699 * This moves pages from the active list to the inactive list. 706 * This moves pages from the active list to the inactive list.
700 * 707 *
@@ -730,6 +737,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
730 long distress; 737 long distress;
731 long swap_tendency; 738 long swap_tendency;
732 739
740 if (zone_is_near_oom(zone))
741 goto force_reclaim_mapped;
742
733 /* 743 /*
734 * `distress' is a measure of how much trouble we're having 744 * `distress' is a measure of how much trouble we're having
735 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 745 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
@@ -765,6 +775,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
765 * memory onto the inactive list. 775 * memory onto the inactive list.
766 */ 776 */
767 if (swap_tendency >= 100) 777 if (swap_tendency >= 100)
778force_reclaim_mapped:
768 reclaim_mapped = 1; 779 reclaim_mapped = 1;
769 } 780 }
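zone_is_near_oom short-circuits the whole swap_tendency calculation: once a zone has been scanned the equivalent of three times its LRU size, reclaim stops protecting mapped pages and jumps straight to force_reclaim_mapped. The heuristic in isolation, as a small model:

#include <stdio.h>

struct zone { unsigned long pages_scanned, nr_active, nr_inactive; };

static int zone_is_near_oom(const struct zone *z)
{
        /* scanned the whole LRU three times over without enough progress */
        return z->pages_scanned >= (z->nr_active + z->nr_inactive) * 3;
}

static int should_reclaim_mapped(const struct zone *z, long swap_tendency)
{
        if (zone_is_near_oom(z))
                return 1;               /* skip the distress arithmetic */
        return swap_tendency >= 100;
}

int main(void)
{
        struct zone z = { .pages_scanned = 7000,
                          .nr_active = 1000, .nr_inactive = 1000 };

        printf("%d\n", should_reclaim_mapped(&z, 0));   /* prints 1 */
        return 0;
}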
770 781
@@ -797,9 +808,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
797 while (!list_empty(&l_inactive)) { 808 while (!list_empty(&l_inactive)) {
798 page = lru_to_page(&l_inactive); 809 page = lru_to_page(&l_inactive);
799 prefetchw_prev_lru_page(page, &l_inactive, flags); 810 prefetchw_prev_lru_page(page, &l_inactive, flags);
800 BUG_ON(PageLRU(page)); 811 VM_BUG_ON(PageLRU(page));
801 SetPageLRU(page); 812 SetPageLRU(page);
802 BUG_ON(!PageActive(page)); 813 VM_BUG_ON(!PageActive(page));
803 ClearPageActive(page); 814 ClearPageActive(page);
804 815
805 list_move(&page->lru, &zone->inactive_list); 816 list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +838,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
827 while (!list_empty(&l_active)) { 838 while (!list_empty(&l_active)) {
828 page = lru_to_page(&l_active); 839 page = lru_to_page(&l_active);
829 prefetchw_prev_lru_page(page, &l_active, flags); 840 prefetchw_prev_lru_page(page, &l_active, flags);
830 BUG_ON(PageLRU(page)); 841 VM_BUG_ON(PageLRU(page));
831 SetPageLRU(page); 842 SetPageLRU(page);
832 BUG_ON(!PageActive(page)); 843 VM_BUG_ON(!PageActive(page));
833 list_move(&page->lru, &zone->active_list); 844 list_move(&page->lru, &zone->active_list);
834 pgmoved++; 845 pgmoved++;
835 if (!pagevec_add(&pvec, page)) { 846 if (!pagevec_add(&pvec, page)) {
@@ -925,6 +936,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
925 unsigned long nr_reclaimed = 0; 936 unsigned long nr_reclaimed = 0;
926 int i; 937 int i;
927 938
939 sc->all_unreclaimable = 1;
928 for (i = 0; zones[i] != NULL; i++) { 940 for (i = 0; zones[i] != NULL; i++) {
929 struct zone *zone = zones[i]; 941 struct zone *zone = zones[i];
930 942
@@ -941,6 +953,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
941 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 953 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
942 continue; /* Let kswapd poll it */ 954 continue; /* Let kswapd poll it */
943 955
956 sc->all_unreclaimable = 0;
957
944 nr_reclaimed += shrink_zone(priority, zone, sc); 958 nr_reclaimed += shrink_zone(priority, zone, sc);
945 } 959 }
946 return nr_reclaimed; 960 return nr_reclaimed;
@@ -1021,6 +1035,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1021 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1035 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1022 blk_congestion_wait(WRITE, HZ/10); 1036 blk_congestion_wait(WRITE, HZ/10);
1023 } 1037 }
1038 /* top priority shrink_caches still had more to do? don't OOM, then */
1039 if (!sc.all_unreclaimable)
1040 ret = 1;
1024out: 1041out:
1025 for (i = 0; zones[i] != 0; i++) { 1042 for (i = 0; zones[i] != 0; i++) {
1026 struct zone *zone = zones[i]; 1043 struct zone *zone = zones[i];
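scan_control.all_unreclaimable starts each shrink_zones pass pessimistically at 1 and is cleared by any zone that actually gets scanned; try_to_free_pages then refuses to report failure (and so trigger the OOM killer) as long as at least one zone still had work left at top priority. A reduced model of the aggregation:

#include <stdio.h>

struct zone { int all_unreclaimable; };
struct scan_control { int all_unreclaimable; };

static void shrink_zones(struct scan_control *sc, struct zone *zones, int n,
                         int priority, int def_priority)
{
        int i;

        sc->all_unreclaimable = 1;              /* assume the worst */
        for (i = 0; i < n; i++) {
                if (zones[i].all_unreclaimable && priority != def_priority)
                        continue;               /* let kswapd poll it */
                sc->all_unreclaimable = 0;      /* we scanned something */
                /* ... shrink_zone(priority, &zones[i], sc) ... */
        }
}

int main(void)
{
        struct zone zones[2] = { { 1 }, { 1 } };
        struct scan_control sc;

        shrink_zones(&sc, zones, 2, 0, 12);
        /* 1 here means every zone was hopeless, so the OOM path may proceed */
        printf("all_unreclaimable=%d\n", sc.all_unreclaimable);
        return 0;
}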
@@ -1153,7 +1170,7 @@ scan:
1153 if (zone->all_unreclaimable) 1170 if (zone->all_unreclaimable)
1154 continue; 1171 continue;
1155 if (nr_slab == 0 && zone->pages_scanned >= 1172 if (nr_slab == 0 && zone->pages_scanned >=
1156 (zone->nr_active + zone->nr_inactive) * 4) 1173 (zone->nr_active + zone->nr_inactive) * 6)
1157 zone->all_unreclaimable = 1; 1174 zone->all_unreclaimable = 1;
1158 /* 1175 /*
1159 * If we've done a decent amount of scanning and 1176 * If we've done a decent amount of scanning and
@@ -1361,7 +1378,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1361 for_each_zone(zone) 1378 for_each_zone(zone)
1362 lru_pages += zone->nr_active + zone->nr_inactive; 1379 lru_pages += zone->nr_active + zone->nr_inactive;
1363 1380
1364 nr_slab = global_page_state(NR_SLAB); 1381 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1365 /* If slab caches are huge, it's better to hit them first */ 1382 /* If slab caches are huge, it's better to hit them first */
1366 while (nr_slab >= lru_pages) { 1383 while (nr_slab >= lru_pages) {
1367 reclaim_state.reclaimed_slab = 0; 1384 reclaim_state.reclaimed_slab = 0;
@@ -1510,7 +1527,6 @@ int zone_reclaim_mode __read_mostly;
1510#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1527#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1511#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1528#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1512#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1529#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1513#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1514 1530
1515/* 1531/*
1516 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1532 * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1526,6 +1542,12 @@ int zone_reclaim_mode __read_mostly;
1526int sysctl_min_unmapped_ratio = 1; 1542int sysctl_min_unmapped_ratio = 1;
1527 1543
1528/* 1544/*
1545 * If the number of slab pages in a zone grows beyond this percentage then
1546 * slab reclaim needs to occur.
1547 */
1548int sysctl_min_slab_ratio = 5;
1549
1550/*
1529 * Try to free up some pages from this zone through reclaim. 1551 * Try to free up some pages from this zone through reclaim.
1530 */ 1552 */
1531static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1553static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1544,6 +1566,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1544 .gfp_mask = gfp_mask, 1566 .gfp_mask = gfp_mask,
1545 .swappiness = vm_swappiness, 1567 .swappiness = vm_swappiness,
1546 }; 1568 };
1569 unsigned long slab_reclaimable;
1547 1570
1548 disable_swap_token(); 1571 disable_swap_token();
1549 cond_resched(); 1572 cond_resched();
@@ -1556,29 +1579,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1556 reclaim_state.reclaimed_slab = 0; 1579 reclaim_state.reclaimed_slab = 0;
1557 p->reclaim_state = &reclaim_state; 1580 p->reclaim_state = &reclaim_state;
1558 1581
1559 /* 1582 if (zone_page_state(zone, NR_FILE_PAGES) -
1560 * Free memory by calling shrink zone with increasing priorities 1583 zone_page_state(zone, NR_FILE_MAPPED) >
1561 * until we have enough memory freed. 1584 zone->min_unmapped_pages) {
1562 */ 1585 /*
1563 priority = ZONE_RECLAIM_PRIORITY; 1586 * Free memory by calling shrink zone with increasing
1564 do { 1587 * priorities until we have enough memory freed.
1565 nr_reclaimed += shrink_zone(priority, zone, &sc); 1588 */
1566 priority--; 1589 priority = ZONE_RECLAIM_PRIORITY;
1567 } while (priority >= 0 && nr_reclaimed < nr_pages); 1590 do {
1591 nr_reclaimed += shrink_zone(priority, zone, &sc);
1592 priority--;
1593 } while (priority >= 0 && nr_reclaimed < nr_pages);
1594 }
1568 1595
1569 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { 1596 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1597 if (slab_reclaimable > zone->min_slab_pages) {
1570 /* 1598 /*
1571 * shrink_slab() does not currently allow us to determine how 1599 * shrink_slab() does not currently allow us to determine how
1572 * many pages were freed in this zone. So we just shake the slab 1600 * many pages were freed in this zone. So we take the current
1573 * a bit and then go off node for this particular allocation 1601 * number of slab pages and shake the slab until it is reduced
1574 * despite possibly having freed enough memory to allocate in 1602 * by the same nr_pages that we used for reclaiming unmapped
1575 * this zone. If we freed local memory then the next 1603 * pages.
1576 * allocations will be local again.
1577 * 1604 *
1578 * shrink_slab will free memory on all zones and may take 1605 * Note that shrink_slab will free memory on all zones and may
1579 * a long time. 1606 * take a long time.
1607 */
1608 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
1609 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
1610 slab_reclaimable - nr_pages)
1611 ;
1612
1613 /*
1614 * Update nr_reclaimed by the number of slab pages we
1615 * reclaimed from this zone.
1580 */ 1616 */
1581 shrink_slab(sc.nr_scanned, gfp_mask, order); 1617 nr_reclaimed += slab_reclaimable -
1618 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1582 } 1619 }
1583 1620
1584 p->reclaim_state = NULL; 1621 p->reclaim_state = NULL;
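Since shrink_slab cannot report per-zone numbers, __zone_reclaim brackets it with NR_SLAB_RECLAIMABLE readings: loop until the zone's reclaimable-slab count has dropped by nr_pages (or the shrinker gives up), then credit the observed delta to nr_reclaimed. The same bracketing, with a fake shrinker standing in for shrink_slab:

#include <stdio.h>

static unsigned long slab_pages = 100;   /* NR_SLAB_RECLAIMABLE stand-in */

/* Pretend shrinker: frees a little each call, returns 0 when exhausted. */
static int shrink_slab_stub(void)
{
        if (slab_pages < 5)
                return 0;
        slab_pages -= 5;
        return 1;
}

int main(void)
{
        unsigned long nr_pages = 32;     /* reclaim target */
        unsigned long before = slab_pages;
        unsigned long nr_reclaimed = 0;

        /* shake the slab until it shrank by nr_pages, or nothing more comes */
        while (shrink_slab_stub() && slab_pages > before - nr_pages)
                ;

        /* credit whatever actually went away, measured around the loop */
        nr_reclaimed += before - slab_pages;
        printf("reclaimed %lu slab pages\n", nr_reclaimed);
        return 0;
}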
@@ -1592,7 +1629,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1592 int node_id; 1629 int node_id;
1593 1630
1594 /* 1631 /*
1595 * Zone reclaim reclaims unmapped file backed pages. 1632 * Zone reclaim reclaims unmapped file backed pages and
1633 * slab pages if we are over the defined limits.
1596 * 1634 *
1597 * A small portion of unmapped file backed pages is needed for 1635 * A small portion of unmapped file backed pages is needed for
1598 * file I/O otherwise pages read by file I/O will be immediately 1636 * file I/O otherwise pages read by file I/O will be immediately
@@ -1601,7 +1639,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1601 * unmapped file backed pages. 1639 * unmapped file backed pages.
1602 */ 1640 */
1603 if (zone_page_state(zone, NR_FILE_PAGES) - 1641 if (zone_page_state(zone, NR_FILE_PAGES) -
1604 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) 1642 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
1643 && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
1644 <= zone->min_slab_pages)
1605 return 0; 1645 return 0;
1606 1646
1607 /* 1647 /*
@@ -1621,7 +1661,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1621 * over remote processors and spread off node memory allocations 1661 * over remote processors and spread off node memory allocations
1622 * as wide as possible. 1662 * as wide as possible.
1623 */ 1663 */
1624 node_id = zone->zone_pgdat->node_id; 1664 node_id = zone_to_nid(zone);
1625 mask = node_to_cpumask(node_id); 1665 mask = node_to_cpumask(node_id);
1626 if (!cpus_empty(mask) && node_id != numa_node_id()) 1666 if (!cpus_empty(mask) && node_id != numa_node_id())
1627 return 0; 1667 return 0;
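Putting the two thresholds together, zone_reclaim now bails out only when both pressure signals are below their floors: unmapped file pages under min_unmapped_pages and reclaimable slab under min_slab_pages (each derived from its sysctl ratio). The gate, as a standalone predicate with illustrative numbers:

#include <stdio.h>

struct zone {
        unsigned long nr_file_pages, nr_file_mapped;
        unsigned long nr_slab_reclaimable;
        unsigned long min_unmapped_pages, min_slab_pages;
};

/* Nonzero when either the file-page or the slab threshold is exceeded. */
static int zone_reclaim_worthwhile(const struct zone *z)
{
        unsigned long unmapped = z->nr_file_pages - z->nr_file_mapped;

        return unmapped > z->min_unmapped_pages ||
               z->nr_slab_reclaimable > z->min_slab_pages;
}

int main(void)
{
        struct zone z = { .nr_file_pages = 500, .nr_file_mapped = 490,
                          .nr_slab_reclaimable = 900,
                          .min_unmapped_pages = 100, .min_slab_pages = 800 };

        /* few unmapped file pages, but slab is over its limit: reclaim runs */
        printf("%d\n", zone_reclaim_worthwhile(&z));
        return 0;
}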
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c1b5f4106b38..490d8c1a0ded 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,6 +321,9 @@ void refresh_cpu_vm_stats(int cpu)
321 for_each_zone(zone) { 321 for_each_zone(zone) {
322 struct per_cpu_pageset *pcp; 322 struct per_cpu_pageset *pcp;
323 323
324 if (!populated_zone(zone))
325 continue;
326
324 pcp = zone_pcp(zone, cpu); 327 pcp = zone_pcp(zone, cpu);
325 328
326 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 329 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -435,12 +438,28 @@ struct seq_operations fragmentation_op = {
435 .show = frag_show, 438 .show = frag_show,
436}; 439};
437 440
441#ifdef CONFIG_ZONE_DMA32
442#define TEXT_FOR_DMA32(xx) xx "_dma32",
443#else
444#define TEXT_FOR_DMA32(xx)
445#endif
446
447#ifdef CONFIG_HIGHMEM
448#define TEXT_FOR_HIGHMEM(xx) xx "_high",
449#else
450#define TEXT_FOR_HIGHMEM(xx)
451#endif
452
453#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
454 TEXT_FOR_HIGHMEM(xx)
455
438static char *vmstat_text[] = { 456static char *vmstat_text[] = {
439 /* Zoned VM counters */ 457 /* Zoned VM counters */
440 "nr_anon_pages", 458 "nr_anon_pages",
441 "nr_mapped", 459 "nr_mapped",
442 "nr_file_pages", 460 "nr_file_pages",
443 "nr_slab", 461 "nr_slab_reclaimable",
462 "nr_slab_unreclaimable",
444 "nr_page_table_pages", 463 "nr_page_table_pages",
445 "nr_dirty", 464 "nr_dirty",
446 "nr_writeback", 465 "nr_writeback",
@@ -462,10 +481,7 @@ static char *vmstat_text[] = {
462 "pswpin", 481 "pswpin",
463 "pswpout", 482 "pswpout",
464 483
465 "pgalloc_dma", 484 TEXTS_FOR_ZONES("pgalloc")
466 "pgalloc_dma32",
467 "pgalloc_normal",
468 "pgalloc_high",
469 485
470 "pgfree", 486 "pgfree",
471 "pgactivate", 487 "pgactivate",
@@ -474,25 +490,10 @@ static char *vmstat_text[] = {
474 "pgfault", 490 "pgfault",
475 "pgmajfault", 491 "pgmajfault",
476 492
477 "pgrefill_dma", 493 TEXTS_FOR_ZONES("pgrefill")
478 "pgrefill_dma32", 494 TEXTS_FOR_ZONES("pgsteal")
479 "pgrefill_normal", 495 TEXTS_FOR_ZONES("pgscan_kswapd")
480 "pgrefill_high", 496 TEXTS_FOR_ZONES("pgscan_direct")
481
482 "pgsteal_dma",
483 "pgsteal_dma32",
484 "pgsteal_normal",
485 "pgsteal_high",
486
487 "pgscan_kswapd_dma",
488 "pgscan_kswapd_dma32",
489 "pgscan_kswapd_normal",
490 "pgscan_kswapd_high",
491
492 "pgscan_direct_dma",
493 "pgscan_direct_dma32",
494 "pgscan_direct_normal",
495 "pgscan_direct_high",
496 497
497 "pginodesteal", 498 "pginodesteal",
498 "slabs_scanned", 499 "slabs_scanned",