author     Steven Whitehouse <swhiteho@redhat.com>    2006-09-28 08:29:59 -0400
committer  Steven Whitehouse <swhiteho@redhat.com>    2006-09-28 08:29:59 -0400
commit     185a257f2f73bcd89050ad02da5bedbc28fc43fa (patch)
tree       5e32586114534ed3f2165614cba3d578f5d87307 /mm
parent     3f1a9aaeffd8d1cbc5ab9776c45cbd66af1c9699 (diff)
parent     a77c64c1a641950626181b4857abb701d8f38ccc (diff)
Merge branch 'master' into gfs2
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |    2
-rw-r--r--  mm/allocpercpu.c     |  129
-rw-r--r--  mm/bootmem.c         |  202
-rw-r--r--  mm/filemap.c         |   25
-rw-r--r--  mm/fremap.c          |    4
-rw-r--r--  mm/highmem.c         |   13
-rw-r--r--  mm/hugetlb.c         |   10
-rw-r--r--  mm/internal.h        |    4
-rw-r--r--  mm/memory.c          |  194
-rw-r--r--  mm/mempolicy.c       |   23
-rw-r--r--  mm/migrate.c         |    2
-rw-r--r--  mm/mmap.c            |   12
-rw-r--r--  mm/mprotect.c        |   51
-rw-r--r--  mm/msync.c           |  196
-rw-r--r--  mm/nommu.c           |  247
-rw-r--r--  mm/oom_kill.c        |   97
-rw-r--r--  mm/page-writeback.c  |   29
-rw-r--r--  mm/page_alloc.c      |  974
-rw-r--r--  mm/page_io.c         |   48
-rw-r--r--  mm/rmap.c            |   65
-rw-r--r--  mm/shmem.c           |    5
-rw-r--r--  mm/slab.c            |  434
-rw-r--r--  mm/slob.c            |   52
-rw-r--r--  mm/swap.c            |   49
-rw-r--r--  mm/truncate.c        |   25
-rw-r--r--  mm/vmalloc.c         |   38
-rw-r--r--  mm/vmscan.c          |  140
-rw-r--r--  mm/vmstat.c          |   52
28 files changed, 2180 insertions(+), 942 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 9dd824c11eeb..60c56c0b5e10 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,4 +23,4 @@ obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-
+obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 000000000000..eaa9abeea536
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,129 @@
+/*
+ * linux/mm/allocpercpu.c
+ *
+ * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+
+/**
+ * percpu_depopulate - depopulate per-cpu data for given cpu
+ * @__pdata: per-cpu data to depopulate
+ * @cpu: depopulate per-cpu data for this cpu
+ *
+ * Depopulating per-cpu data for a cpu going offline would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ */
+void percpu_depopulate(void *__pdata, int cpu)
+{
+        struct percpu_data *pdata = __percpu_disguise(__pdata);
+        if (pdata->ptrs[cpu]) {
+                kfree(pdata->ptrs[cpu]);
+                pdata->ptrs[cpu] = NULL;
+        }
+}
+EXPORT_SYMBOL_GPL(percpu_depopulate);
+
+/**
+ * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
+ * @__pdata: per-cpu data to depopulate
+ * @mask: depopulate per-cpu data for cpu's selected through mask bits
+ */
+void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+{
+        int cpu;
+        for_each_cpu_mask(cpu, *mask)
+                percpu_depopulate(__pdata, cpu);
+}
+EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+/**
+ * percpu_populate - populate per-cpu data for given cpu
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @cpu: populate per-data for this cpu
+ *
+ * Populating per-cpu data for a cpu coming online would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ * Per-cpu object is populated with zeroed buffer.
+ */
+void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+{
+        struct percpu_data *pdata = __percpu_disguise(__pdata);
+        int node = cpu_to_node(cpu);
+
+        BUG_ON(pdata->ptrs[cpu]);
+        if (node_online(node)) {
+                /* FIXME: kzalloc_node(size, gfp, node) */
+                pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
+                if (pdata->ptrs[cpu])
+                        memset(pdata->ptrs[cpu], 0, size);
+        } else
+                pdata->ptrs[cpu] = kzalloc(size, gfp);
+        return pdata->ptrs[cpu];
+}
+EXPORT_SYMBOL_GPL(percpu_populate);
+
+/**
+ * percpu_populate_mask - populate per-cpu data for more cpu's
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+                           cpumask_t *mask)
+{
+        cpumask_t populated = CPU_MASK_NONE;
+        int cpu;
+
+        for_each_cpu_mask(cpu, *mask)
+                if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
+                        __percpu_depopulate_mask(__pdata, &populated);
+                        return -ENOMEM;
+                } else
+                        cpu_set(cpu, populated);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+/**
+ * percpu_alloc_mask - initial setup of per-cpu data
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-data for cpu's selected through mask bits
+ *
+ * Populating per-cpu data for all online cpu's would be a typical use case,
+ * which is simplified by the percpu_alloc() wrapper.
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+{
+        void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+        void *__pdata = __percpu_disguise(pdata);
+
+        if (unlikely(!pdata))
+                return NULL;
+        if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+                return __pdata;
+        kfree(pdata);
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+
+/**
+ * percpu_free - final cleanup of per-cpu data
+ * @__pdata: object to clean up
+ *
+ * We simply clean up any per-cpu object left. No need for the client to
+ * track and specify through a bis mask which per-cpu objects are to free.
+ */
+void percpu_free(void *__pdata)
+{
+        __percpu_depopulate_mask(__pdata, &cpu_possible_map);
+        kfree(__percpu_disguise(__pdata));
+}
+EXPORT_SYMBOL_GPL(percpu_free);
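
For readers unfamiliar with this interface, a minimal usage sketch of the per-cpu allocator that the new file above exports. It is illustrative only and not part of the patch; it assumes the percpu_alloc()/percpu_free() wrappers and the per_cpu_ptr() accessor declared in <linux/percpu.h> of this kernel generation, and struct foo_stats plus the foo_* functions are made-up names.

/*
 * Illustrative sketch only -- not part of the patch.  Assumes the
 * percpu_alloc()/percpu_free() wrappers and per_cpu_ptr() accessor from
 * <linux/percpu.h>; struct foo_stats and the foo_* functions are made up.
 */
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/errno.h>

struct foo_stats {
        unsigned long hits;
        unsigned long misses;
};

static struct foo_stats *foo_stats;     /* disguised per-cpu pointer */

static int __init foo_stats_init(void)
{
        int cpu;

        /* allocates one zeroed object per currently online cpu */
        foo_stats = percpu_alloc(sizeof(struct foo_stats), GFP_KERNEL);
        if (!foo_stats)
                return -ENOMEM;

        /* each cpu's copy is then reached through per_cpu_ptr() */
        for_each_online_cpu(cpu)
                per_cpu_ptr(foo_stats, cpu)->hits = 0;
        return 0;
}

static void __exit foo_stats_exit(void)
{
        /* drops every populated per-cpu object plus the descriptor itself */
        percpu_free(foo_stats);
}
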
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 50353e0dac12..d53112fcb404 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -8,17 +8,15 @@
8 * free memory collector. It's used to deal with reserved 8 * free memory collector. It's used to deal with reserved
9 * system memory and memory holes as well. 9 * system memory and memory holes as well.
10 */ 10 */
11
12#include <linux/mm.h>
13#include <linux/kernel_stat.h>
14#include <linux/swap.h>
15#include <linux/interrupt.h>
16#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h>
17#include <linux/bootmem.h> 13#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/module.h> 14#include <linux/module.h>
20#include <asm/dma.h> 15
16#include <asm/bug.h>
21#include <asm/io.h> 17#include <asm/io.h>
18#include <asm/processor.h>
19
22#include "internal.h" 20#include "internal.h"
23 21
24/* 22/*
@@ -41,7 +39,7 @@ unsigned long saved_max_pfn;
41#endif 39#endif
42 40
43/* return the number of _pages_ that will be allocated for the boot bitmap */ 41/* return the number of _pages_ that will be allocated for the boot bitmap */
44unsigned long __init bootmem_bootmap_pages (unsigned long pages) 42unsigned long __init bootmem_bootmap_pages(unsigned long pages)
45{ 43{
46 unsigned long mapsize; 44 unsigned long mapsize;
47 45
@@ -51,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
51 49
52 return mapsize; 50 return mapsize;
53} 51}
52
54/* 53/*
55 * link bdata in order 54 * link bdata in order
56 */ 55 */
57static void link_bootmem(bootmem_data_t *bdata) 56static void __init link_bootmem(bootmem_data_t *bdata)
58{ 57{
59 bootmem_data_t *ent; 58 bootmem_data_t *ent;
59
60 if (list_empty(&bdata_list)) { 60 if (list_empty(&bdata_list)) {
61 list_add(&bdata->list, &bdata_list); 61 list_add(&bdata->list, &bdata_list);
62 return; 62 return;
@@ -69,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata)
69 } 69 }
70 } 70 }
71 list_add_tail(&bdata->list, &bdata_list); 71 list_add_tail(&bdata->list, &bdata_list);
72 return;
73} 72}
74 73
74/*
75 * Given an initialised bdata, it returns the size of the boot bitmap
76 */
77static unsigned long __init get_mapsize(bootmem_data_t *bdata)
78{
79 unsigned long mapsize;
80 unsigned long start = PFN_DOWN(bdata->node_boot_start);
81 unsigned long end = bdata->node_low_pfn;
82
83 mapsize = ((end - start) + 7) / 8;
84 return ALIGN(mapsize, sizeof(long));
85}
75 86
76/* 87/*
77 * Called once to set up the allocator itself. 88 * Called once to set up the allocator itself.
78 */ 89 */
79static unsigned long __init init_bootmem_core (pg_data_t *pgdat, 90static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
80 unsigned long mapstart, unsigned long start, unsigned long end) 91 unsigned long mapstart, unsigned long start, unsigned long end)
81{ 92{
82 bootmem_data_t *bdata = pgdat->bdata; 93 bootmem_data_t *bdata = pgdat->bdata;
83 unsigned long mapsize = ((end - start)+7)/8; 94 unsigned long mapsize;
84 95
85 mapsize = ALIGN(mapsize, sizeof(long)); 96 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
86 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 97 bdata->node_boot_start = PFN_PHYS(start);
87 bdata->node_boot_start = (start << PAGE_SHIFT);
88 bdata->node_low_pfn = end; 98 bdata->node_low_pfn = end;
89 link_bootmem(bdata); 99 link_bootmem(bdata);
90 100
@@ -92,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
92 * Initially all pages are reserved - setup_arch() has to 102 * Initially all pages are reserved - setup_arch() has to
93 * register free RAM areas explicitly. 103 * register free RAM areas explicitly.
94 */ 104 */
105 mapsize = get_mapsize(bdata);
95 memset(bdata->node_bootmem_map, 0xff, mapsize); 106 memset(bdata->node_bootmem_map, 0xff, mapsize);
96 107
97 return mapsize; 108 return mapsize;
@@ -102,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
102 * might be used for boot-time allocations - or it might get added 113 * might be used for boot-time allocations - or it might get added
103 * to the free page pool later on. 114 * to the free page pool later on.
104 */ 115 */
105static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) 116static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
117 unsigned long size)
106{ 118{
119 unsigned long sidx, eidx;
107 unsigned long i; 120 unsigned long i;
121
108 /* 122 /*
109 * round up, partially reserved pages are considered 123 * round up, partially reserved pages are considered
110 * fully reserved. 124 * fully reserved.
111 */ 125 */
112 unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
113 unsigned long eidx = (addr + size - bdata->node_boot_start +
114 PAGE_SIZE-1)/PAGE_SIZE;
115 unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
116
117 BUG_ON(!size); 126 BUG_ON(!size);
118 BUG_ON(sidx >= eidx); 127 BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
119 BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); 128 BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
120 BUG_ON(end > bdata->node_low_pfn); 129
130 sidx = PFN_DOWN(addr - bdata->node_boot_start);
131 eidx = PFN_UP(addr + size - bdata->node_boot_start);
121 132
122 for (i = sidx; i < eidx; i++) 133 for (i = sidx; i < eidx; i++)
123 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 134 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
@@ -127,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
127 } 138 }
128} 139}
129 140
130static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) 141static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
142 unsigned long size)
131{ 143{
144 unsigned long sidx, eidx;
132 unsigned long i; 145 unsigned long i;
133 unsigned long start; 146
134 /* 147 /*
135 * round down end of usable mem, partially free pages are 148 * round down end of usable mem, partially free pages are
136 * considered reserved. 149 * considered reserved.
137 */ 150 */
138 unsigned long sidx;
139 unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
140 unsigned long end = (addr + size)/PAGE_SIZE;
141
142 BUG_ON(!size); 151 BUG_ON(!size);
143 BUG_ON(end > bdata->node_low_pfn); 152 BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
144 153
145 if (addr < bdata->last_success) 154 if (addr < bdata->last_success)
146 bdata->last_success = addr; 155 bdata->last_success = addr;
@@ -148,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
148 /* 157 /*
149 * Round up the beginning of the address. 158 * Round up the beginning of the address.
150 */ 159 */
151 start = (addr + PAGE_SIZE-1) / PAGE_SIZE; 160 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
152 sidx = start - (bdata->node_boot_start/PAGE_SIZE); 161 eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
153 162
154 for (i = sidx; i < eidx; i++) { 163 for (i = sidx; i < eidx; i++) {
155 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 164 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
@@ -175,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
175 unsigned long align, unsigned long goal, unsigned long limit) 184 unsigned long align, unsigned long goal, unsigned long limit)
176{ 185{
177 unsigned long offset, remaining_size, areasize, preferred; 186 unsigned long offset, remaining_size, areasize, preferred;
178 unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn; 187 unsigned long i, start = 0, incr, eidx, end_pfn;
179 void *ret; 188 void *ret;
180 189
181 if(!size) { 190 if (!size) {
182 printk("__alloc_bootmem_core(): zero-sized request\n"); 191 printk("__alloc_bootmem_core(): zero-sized request\n");
183 BUG(); 192 BUG();
184 } 193 }
@@ -187,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
187 if (limit && bdata->node_boot_start >= limit) 196 if (limit && bdata->node_boot_start >= limit)
188 return NULL; 197 return NULL;
189 198
190 limit >>=PAGE_SHIFT; 199 end_pfn = bdata->node_low_pfn;
200 limit = PFN_DOWN(limit);
191 if (limit && end_pfn > limit) 201 if (limit && end_pfn > limit)
192 end_pfn = limit; 202 end_pfn = limit;
193 203
194 eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 204 eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
195 offset = 0; 205 offset = 0;
196 if (align && 206 if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
197 (bdata->node_boot_start & (align - 1UL)) != 0) 207 offset = align - (bdata->node_boot_start & (align - 1UL));
198 offset = (align - (bdata->node_boot_start & (align - 1UL))); 208 offset = PFN_DOWN(offset);
199 offset >>= PAGE_SHIFT;
200 209
201 /* 210 /*
202 * We try to allocate bootmem pages above 'goal' 211 * We try to allocate bootmem pages above 'goal'
203 * first, then we try to allocate lower pages. 212 * first, then we try to allocate lower pages.
204 */ 213 */
205 if (goal && (goal >= bdata->node_boot_start) && 214 if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
206 ((goal >> PAGE_SHIFT) < end_pfn)) {
207 preferred = goal - bdata->node_boot_start; 215 preferred = goal - bdata->node_boot_start;
208 216
209 if (bdata->last_success >= preferred) 217 if (bdata->last_success >= preferred)
@@ -212,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
212 } else 220 } else
213 preferred = 0; 221 preferred = 0;
214 222
215 preferred = ALIGN(preferred, align) >> PAGE_SHIFT; 223 preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
216 preferred += offset; 224 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
217 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
218 incr = align >> PAGE_SHIFT ? : 1; 225 incr = align >> PAGE_SHIFT ? : 1;
219 226
220restart_scan: 227restart_scan:
@@ -229,7 +236,7 @@ restart_scan:
229 for (j = i + 1; j < i + areasize; ++j) { 236 for (j = i + 1; j < i + areasize; ++j) {
230 if (j >= eidx) 237 if (j >= eidx)
231 goto fail_block; 238 goto fail_block;
232 if (test_bit (j, bdata->node_bootmem_map)) 239 if (test_bit(j, bdata->node_bootmem_map))
233 goto fail_block; 240 goto fail_block;
234 } 241 }
235 start = i; 242 start = i;
@@ -245,7 +252,7 @@ restart_scan:
245 return NULL; 252 return NULL;
246 253
247found: 254found:
248 bdata->last_success = start << PAGE_SHIFT; 255 bdata->last_success = PFN_PHYS(start);
249 BUG_ON(start >= eidx); 256 BUG_ON(start >= eidx);
250 257
251 /* 258 /*
@@ -257,19 +264,21 @@ found:
257 bdata->last_offset && bdata->last_pos+1 == start) { 264 bdata->last_offset && bdata->last_pos+1 == start) {
258 offset = ALIGN(bdata->last_offset, align); 265 offset = ALIGN(bdata->last_offset, align);
259 BUG_ON(offset > PAGE_SIZE); 266 BUG_ON(offset > PAGE_SIZE);
260 remaining_size = PAGE_SIZE-offset; 267 remaining_size = PAGE_SIZE - offset;
261 if (size < remaining_size) { 268 if (size < remaining_size) {
262 areasize = 0; 269 areasize = 0;
263 /* last_pos unchanged */ 270 /* last_pos unchanged */
264 bdata->last_offset = offset+size; 271 bdata->last_offset = offset + size;
265 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 272 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
266 bdata->node_boot_start); 273 offset +
274 bdata->node_boot_start);
267 } else { 275 } else {
268 remaining_size = size - remaining_size; 276 remaining_size = size - remaining_size;
269 areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; 277 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
270 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 278 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
271 bdata->node_boot_start); 279 offset +
272 bdata->last_pos = start+areasize-1; 280 bdata->node_boot_start);
281 bdata->last_pos = start + areasize - 1;
273 bdata->last_offset = remaining_size; 282 bdata->last_offset = remaining_size;
274 } 283 }
275 bdata->last_offset &= ~PAGE_MASK; 284 bdata->last_offset &= ~PAGE_MASK;
@@ -282,7 +291,7 @@ found:
282 /* 291 /*
283 * Reserve the area now: 292 * Reserve the area now:
284 */ 293 */
285 for (i = start; i < start+areasize; i++) 294 for (i = start; i < start + areasize; i++)
286 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) 295 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
287 BUG(); 296 BUG();
288 memset(ret, 0, size); 297 memset(ret, 0, size);
@@ -303,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
303 312
304 count = 0; 313 count = 0;
305 /* first extant page of the node */ 314 /* first extant page of the node */
306 pfn = bdata->node_boot_start >> PAGE_SHIFT; 315 pfn = PFN_DOWN(bdata->node_boot_start);
307 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 316 idx = bdata->node_low_pfn - pfn;
308 map = bdata->node_bootmem_map; 317 map = bdata->node_bootmem_map;
309 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ 318 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
310 if (bdata->node_boot_start == 0 || 319 if (bdata->node_boot_start == 0 ||
@@ -333,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
333 } 342 }
334 } 343 }
335 } else { 344 } else {
336 i+=BITS_PER_LONG; 345 i += BITS_PER_LONG;
337 } 346 }
338 pfn += BITS_PER_LONG; 347 pfn += BITS_PER_LONG;
339 } 348 }
@@ -345,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
345 */ 354 */
346 page = virt_to_page(bdata->node_bootmem_map); 355 page = virt_to_page(bdata->node_bootmem_map);
347 count = 0; 356 count = 0;
348 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 357 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
349 count++; 358 for (i = 0; i < idx; i++, page++) {
350 __free_pages_bootmem(page, 0); 359 __free_pages_bootmem(page, 0);
360 count++;
351 } 361 }
352 total += count; 362 total += count;
353 bdata->node_bootmem_map = NULL; 363 bdata->node_bootmem_map = NULL;
@@ -355,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
355 return total; 365 return total;
356} 366}
357 367
358unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) 368unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
369 unsigned long startpfn, unsigned long endpfn)
359{ 370{
360 return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); 371 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
361} 372}
362 373
363void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) 374void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size)
364{ 376{
365 reserve_bootmem_core(pgdat->bdata, physaddr, size); 377 reserve_bootmem_core(pgdat->bdata, physaddr, size);
366} 378}
367 379
368void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) 380void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
381 unsigned long size)
369{ 382{
370 free_bootmem_core(pgdat->bdata, physaddr, size); 383 free_bootmem_core(pgdat->bdata, physaddr, size);
371} 384}
372 385
373unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) 386unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
374{ 387{
375 return(free_all_bootmem_core(pgdat)); 388 return free_all_bootmem_core(pgdat);
376} 389}
377 390
378unsigned long __init init_bootmem (unsigned long start, unsigned long pages) 391unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
379{ 392{
380 max_low_pfn = pages; 393 max_low_pfn = pages;
381 min_low_pfn = start; 394 min_low_pfn = start;
382 return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); 395 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
383} 396}
384 397
385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 398#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386void __init reserve_bootmem (unsigned long addr, unsigned long size) 399void __init reserve_bootmem(unsigned long addr, unsigned long size)
387{ 400{
388 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); 401 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
389} 402}
390#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 403#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
391 404
392void __init free_bootmem (unsigned long addr, unsigned long size) 405void __init free_bootmem(unsigned long addr, unsigned long size)
393{ 406{
394 free_bootmem_core(NODE_DATA(0)->bdata, addr, size); 407 free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
395} 408}
396 409
397unsigned long __init free_all_bootmem (void) 410unsigned long __init free_all_bootmem(void)
398{ 411{
399 return(free_all_bootmem_core(NODE_DATA(0))); 412 return free_all_bootmem_core(NODE_DATA(0));
400} 413}
401 414
402void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) 415void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
416 unsigned long goal)
403{ 417{
404 bootmem_data_t *bdata; 418 bootmem_data_t *bdata;
405 void *ptr; 419 void *ptr;
406 420
407 list_for_each_entry(bdata, &bdata_list, list) 421 list_for_each_entry(bdata, &bdata_list, list) {
408 if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) 422 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
409 return(ptr); 423 if (ptr)
424 return ptr;
425 }
410 return NULL; 426 return NULL;
411} 427}
412 428
413void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) 429void * __init __alloc_bootmem(unsigned long size, unsigned long align,
430 unsigned long goal)
414{ 431{
415 void *mem = __alloc_bootmem_nopanic(size,align,goal); 432 void *mem = __alloc_bootmem_nopanic(size,align,goal);
433
416 if (mem) 434 if (mem)
417 return mem; 435 return mem;
418 /* 436 /*
@@ -424,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned
424} 442}
425 443
426 444
427void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, 445void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
428 unsigned long goal) 446 unsigned long align, unsigned long goal)
429{ 447{
430 void *ptr; 448 void *ptr;
431 449
432 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 450 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
433 if (ptr) 451 if (ptr)
434 return (ptr); 452 return ptr;
435 453
436 return __alloc_bootmem(size, align, goal); 454 return __alloc_bootmem(size, align, goal);
437} 455}
438 456
439#define LOW32LIMIT 0xffffffff 457#ifndef ARCH_LOW_ADDRESS_LIMIT
458#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
459#endif
440 460
441void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) 461void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
462 unsigned long goal)
442{ 463{
443 bootmem_data_t *bdata; 464 bootmem_data_t *bdata;
444 void *ptr; 465 void *ptr;
445 466
446 list_for_each_entry(bdata, &bdata_list, list) 467 list_for_each_entry(bdata, &bdata_list, list) {
447 if ((ptr = __alloc_bootmem_core(bdata, size, 468 ptr = __alloc_bootmem_core(bdata, size, align, goal,
448 align, goal, LOW32LIMIT))) 469 ARCH_LOW_ADDRESS_LIMIT);
449 return(ptr); 470 if (ptr)
471 return ptr;
472 }
450 473
451 /* 474 /*
452 * Whoops, we cannot satisfy the allocation request. 475 * Whoops, we cannot satisfy the allocation request.
@@ -459,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig
459void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 482void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
460 unsigned long align, unsigned long goal) 483 unsigned long align, unsigned long goal)
461{ 484{
462 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); 485 return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
486 ARCH_LOW_ADDRESS_LIMIT);
463} 487}
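
Much of the bootmem.c churn above consists of replacing open-coded PAGE_SHIFT arithmetic with the PFN_* helpers from the newly included <linux/pfn.h>. As a reading aid, their conventional definitions are roughly the following (reproduced from memory, so an assumption rather than a quote from this tree):

/* Approximate <linux/pfn.h> helpers relied on above (from memory). */
#define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) /* round up to a page frame number */
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)                   /* round down to a page frame number */
#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)                   /* page frame number back to a physical address */
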
diff --git a/mm/filemap.c b/mm/filemap.c
index 3195806d78e0..87d4a398cd16 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -488,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x)
488EXPORT_SYMBOL(page_cache_alloc_cold); 488EXPORT_SYMBOL(page_cache_alloc_cold);
489#endif 489#endif
490 490
491static int __sleep_on_page_lock(void *word)
492{
493 io_schedule();
494 return 0;
495}
496
491/* 497/*
492 * In order to wait for pages to become available there must be 498 * In order to wait for pages to become available there must be
493 * waitqueues associated with pages. By using a hash table of 499 * waitqueues associated with pages. By using a hash table of
@@ -577,13 +583,24 @@ void fastcall __lock_page(struct page *page)
577} 583}
578EXPORT_SYMBOL(__lock_page); 584EXPORT_SYMBOL(__lock_page);
579 585
586/*
587 * Variant of lock_page that does not require the caller to hold a reference
588 * on the page's mapping.
589 */
590void fastcall __lock_page_nosync(struct page *page)
591{
592 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
593 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
594 TASK_UNINTERRUPTIBLE);
595}
596
580/** 597/**
581 * find_get_page - find and get a page reference 598 * find_get_page - find and get a page reference
582 * @mapping: the address_space to search 599 * @mapping: the address_space to search
583 * @offset: the page index 600 * @offset: the page index
584 * 601 *
585 * A rather lightweight function, finding and getting a reference to a 602 * Is there a pagecache struct page at the given (mapping, offset) tuple?
586 * hashed page atomically. 603 * If yes, increment its refcount and return it; if no, return NULL.
587 */ 604 */
588struct page * find_get_page(struct address_space *mapping, unsigned long offset) 605struct page * find_get_page(struct address_space *mapping, unsigned long offset)
589{ 606{
@@ -970,7 +987,7 @@ page_not_up_to_date:
970 /* Get exclusive access to the page ... */ 987 /* Get exclusive access to the page ... */
971 lock_page(page); 988 lock_page(page);
972 989
973 /* Did it get unhashed before we got the lock? */ 990 /* Did it get truncated before we got the lock? */
974 if (!page->mapping) { 991 if (!page->mapping) {
975 unlock_page(page); 992 unlock_page(page);
976 page_cache_release(page); 993 page_cache_release(page);
@@ -1612,7 +1629,7 @@ no_cached_page:
1612page_not_uptodate: 1629page_not_uptodate:
1613 lock_page(page); 1630 lock_page(page);
1614 1631
1615 /* Did it get unhashed while we waited for it? */ 1632 /* Did it get truncated while we waited for it? */
1616 if (!page->mapping) { 1633 if (!page->mapping) {
1617 unlock_page(page); 1634 unlock_page(page);
1618 goto err; 1635 goto err;
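
The __lock_page_nosync() primitive added above is presumably paired with a lock_page_nosync() inline in <linux/pagemap.h>, mirroring the existing lock_page()/__lock_page() pair. A hedged sketch of what that wrapper would look like (an assumption, not quoted from the patch):

/* Assumed <linux/pagemap.h> counterpart of __lock_page_nosync(); sketch only. */
static inline void lock_page_nosync(struct page *page)
{
        might_sleep();
        if (TestSetPageLocked(page))
                __lock_page_nosync(page);
}
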
diff --git a/mm/fremap.c b/mm/fremap.c
index 21b7d0cbc98c..aa30618ec6b2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	lazy_mmu_prot_update(pte_val);
 	err = 0;
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b2a5403c447..ee5519b176ee 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,6 +46,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
  */
 #ifdef CONFIG_HIGHMEM
 
+unsigned long totalhigh_pages __read_mostly;
+
+unsigned int nr_free_highpages (void)
+{
+        pg_data_t *pgdat;
+        unsigned int pages = 0;
+
+        for_each_online_pgdat(pgdat)
+                pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+        return pages;
+}
+
 static int pkmap_count[LAST_PKMAP];
 static unsigned int last_pkmap_nr;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df499973255f..7c7d03dbf73d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
-		nid = (*z)->zone_pgdat->node_id;
+		nid = zone_to_nid(*z);
 		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
 				!list_empty(&hugepage_freelists[nid]))
 			break;
@@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page)
177{ 177{
178 int i; 178 int i;
179 nr_huge_pages--; 179 nr_huge_pages--;
180 nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; 180 nr_huge_pages_node[page_to_nid(page)]--;
181 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 181 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
182 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 182 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
183 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 183 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page)
191#ifdef CONFIG_HIGHMEM 191#ifdef CONFIG_HIGHMEM
192static void try_to_free_low(unsigned long count) 192static void try_to_free_low(unsigned long count)
193{ 193{
194 int i, nid; 194 int i;
195
195 for (i = 0; i < MAX_NUMNODES; ++i) { 196 for (i = 0; i < MAX_NUMNODES; ++i) {
196 struct page *page, *next; 197 struct page *page, *next;
197 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 198 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
@@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count)
199 continue; 200 continue;
200 list_del(&page->lru); 201 list_del(&page->lru);
201 update_and_free_page(page); 202 update_and_free_page(page);
202 nid = page_zone(page)->zone_pgdat->node_id;
203 free_huge_pages--; 203 free_huge_pages--;
204 free_huge_pages_node[nid]--; 204 free_huge_pages_node[page_to_nid(page)]--;
205 if (count >= nr_huge_pages) 205 if (count >= nr_huge_pages)
206 return; 206 return;
207 } 207 }
diff --git a/mm/internal.h b/mm/internal.h
index d20e3cc4aef0..d527b80b292f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-	BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
-	BUG_ON(atomic_read(&page->_count));
+	VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+	VM_BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237e..601159a46ab6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/delayacct.h> 50#include <linux/delayacct.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h>
52 53
53#include <asm/pgalloc.h> 54#include <asm/pgalloc.h>
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
@@ -1226,7 +1227,12 @@ out:
1226 return retval; 1227 return retval;
1227} 1228}
1228 1229
1229/* 1230/**
1231 * vm_insert_page - insert single page into user vma
1232 * @vma: user vma to map to
1233 * @addr: target user address of this page
1234 * @page: source kernel page
1235 *
1230 * This allows drivers to insert individual pages they've allocated 1236 * This allows drivers to insert individual pages they've allocated
1231 * into a user vma. 1237 * into a user vma.
1232 * 1238 *
@@ -1318,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1318 return 0; 1324 return 0;
1319} 1325}
1320 1326
1321/* Note: this is only safe if the mm semaphore is held when called. */ 1327/**
1328 * remap_pfn_range - remap kernel memory to userspace
1329 * @vma: user vma to map to
1330 * @addr: target user address to start at
1331 * @pfn: physical address of kernel memory
1332 * @size: size of map area
1333 * @prot: page protection flags for this mapping
1334 *
1335 * Note: this is only safe if the mm semaphore is held when called.
1336 */
1322int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1337int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1323 unsigned long pfn, unsigned long size, pgprot_t prot) 1338 unsigned long pfn, unsigned long size, pgprot_t prot)
1324{ 1339{
@@ -1458,14 +1473,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1458{ 1473{
1459 struct page *old_page, *new_page; 1474 struct page *old_page, *new_page;
1460 pte_t entry; 1475 pte_t entry;
1461 int reuse, ret = VM_FAULT_MINOR; 1476 int reuse = 0, ret = VM_FAULT_MINOR;
1477 struct page *dirty_page = NULL;
1462 1478
1463 old_page = vm_normal_page(vma, address, orig_pte); 1479 old_page = vm_normal_page(vma, address, orig_pte);
1464 if (!old_page) 1480 if (!old_page)
1465 goto gotten; 1481 goto gotten;
1466 1482
1467 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == 1483 /*
1468 (VM_SHARED|VM_WRITE))) { 1484 * Take out anonymous pages first, anonymous shared vmas are
1485 * not dirty accountable.
1486 */
1487 if (PageAnon(old_page)) {
1488 if (!TestSetPageLocked(old_page)) {
1489 reuse = can_share_swap_page(old_page);
1490 unlock_page(old_page);
1491 }
1492 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1493 (VM_WRITE|VM_SHARED))) {
1494 /*
1495 * Only catch write-faults on shared writable pages,
1496 * read-only shared pages can get COWed by
1497 * get_user_pages(.write=1, .force=1).
1498 */
1469 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1499 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1470 /* 1500 /*
1471 * Notify the address space that the page is about to 1501 * Notify the address space that the page is about to
@@ -1494,13 +1524,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1494 if (!pte_same(*page_table, orig_pte)) 1524 if (!pte_same(*page_table, orig_pte))
1495 goto unlock; 1525 goto unlock;
1496 } 1526 }
1497 1527 dirty_page = old_page;
1528 get_page(dirty_page);
1498 reuse = 1; 1529 reuse = 1;
1499 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1500 reuse = can_share_swap_page(old_page);
1501 unlock_page(old_page);
1502 } else {
1503 reuse = 0;
1504 } 1530 }
1505 1531
1506 if (reuse) { 1532 if (reuse) {
@@ -1566,6 +1592,10 @@ gotten:
1566 page_cache_release(old_page); 1592 page_cache_release(old_page);
1567unlock: 1593unlock:
1568 pte_unmap_unlock(page_table, ptl); 1594 pte_unmap_unlock(page_table, ptl);
1595 if (dirty_page) {
1596 set_page_dirty_balance(dirty_page);
1597 put_page(dirty_page);
1598 }
1569 return ret; 1599 return ret;
1570oom: 1600oom:
1571 if (old_page) 1601 if (old_page)
@@ -1785,9 +1815,10 @@ void unmap_mapping_range(struct address_space *mapping,
1785} 1815}
1786EXPORT_SYMBOL(unmap_mapping_range); 1816EXPORT_SYMBOL(unmap_mapping_range);
1787 1817
1788/* 1818/**
1789 * Handle all mappings that got truncated by a "truncate()" 1819 * vmtruncate - unmap mappings "freed" by truncate() syscall
1790 * system call. 1820 * @inode: inode of the file used
1821 * @offset: file offset to start truncating
1791 * 1822 *
1792 * NOTE! We have to be ready to update the memory sharing 1823 * NOTE! We have to be ready to update the memory sharing
1793 * between the file and the memory map for a potential last 1824 * between the file and the memory map for a potential last
@@ -1856,11 +1887,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1856} 1887}
1857EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ 1888EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
1858 1889
1859/* 1890/**
1891 * swapin_readahead - swap in pages in hope we need them soon
1892 * @entry: swap entry of this memory
1893 * @addr: address to start
1894 * @vma: user vma this addresses belong to
1895 *
1860 * Primitive swap readahead code. We simply read an aligned block of 1896 * Primitive swap readahead code. We simply read an aligned block of
1861 * (1 << page_cluster) entries in the swap area. This method is chosen 1897 * (1 << page_cluster) entries in the swap area. This method is chosen
1862 * because it doesn't cost us any seek time. We also make sure to queue 1898 * because it doesn't cost us any seek time. We also make sure to queue
1863 * the 'original' request together with the readahead ones... 1899 * the 'original' request together with the readahead ones...
1864 * 1900 *
1865 * This has been extended to use the NUMA policies from the mm triggering 1901 * This has been extended to use the NUMA policies from the mm triggering
1866 * the readahead. 1902 * the readahead.
@@ -2098,6 +2134,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2098 unsigned int sequence = 0; 2134 unsigned int sequence = 0;
2099 int ret = VM_FAULT_MINOR; 2135 int ret = VM_FAULT_MINOR;
2100 int anon = 0; 2136 int anon = 0;
2137 struct page *dirty_page = NULL;
2101 2138
2102 pte_unmap(page_table); 2139 pte_unmap(page_table);
2103 BUG_ON(vma->vm_flags & VM_PFNMAP); 2140 BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2229,10 @@ retry:
2192 } else { 2229 } else {
2193 inc_mm_counter(mm, file_rss); 2230 inc_mm_counter(mm, file_rss);
2194 page_add_file_rmap(new_page); 2231 page_add_file_rmap(new_page);
2232 if (write_access) {
2233 dirty_page = new_page;
2234 get_page(dirty_page);
2235 }
2195 } 2236 }
2196 } else { 2237 } else {
2197 /* One of our sibling threads was faster, back out. */ 2238 /* One of our sibling threads was faster, back out. */
@@ -2204,6 +2245,10 @@ retry:
2204 lazy_mmu_prot_update(entry); 2245 lazy_mmu_prot_update(entry);
2205unlock: 2246unlock:
2206 pte_unmap_unlock(page_table, ptl); 2247 pte_unmap_unlock(page_table, ptl);
2248 if (dirty_page) {
2249 set_page_dirty_balance(dirty_page);
2250 put_page(dirty_page);
2251 }
2207 return ret; 2252 return ret;
2208oom: 2253oom:
2209 page_cache_release(new_page); 2254 page_cache_release(new_page);
@@ -2211,6 +2256,54 @@ oom:
2211} 2256}
2212 2257
2213/* 2258/*
2259 * do_no_pfn() tries to create a new page mapping for a page without
2260 * a struct_page backing it
2261 *
2262 * As this is called only for pages that do not currently exist, we
2263 * do not need to flush old virtual caches or the TLB.
2264 *
2265 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2266 * but allow concurrent faults), and pte mapped but not yet locked.
2267 * We return with mmap_sem still held, but pte unmapped and unlocked.
2268 *
2269 * It is expected that the ->nopfn handler always returns the same pfn
2270 * for a given virtual mapping.
2271 *
2272 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2273 */
2274static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2275 unsigned long address, pte_t *page_table, pmd_t *pmd,
2276 int write_access)
2277{
2278 spinlock_t *ptl;
2279 pte_t entry;
2280 unsigned long pfn;
2281 int ret = VM_FAULT_MINOR;
2282
2283 pte_unmap(page_table);
2284 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2285 BUG_ON(is_cow_mapping(vma->vm_flags));
2286
2287 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2288 if (pfn == NOPFN_OOM)
2289 return VM_FAULT_OOM;
2290 if (pfn == NOPFN_SIGBUS)
2291 return VM_FAULT_SIGBUS;
2292
2293 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2294
2295 /* Only go through if we didn't race with anybody else... */
2296 if (pte_none(*page_table)) {
2297 entry = pfn_pte(pfn, vma->vm_page_prot);
2298 if (write_access)
2299 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2300 set_pte_at(mm, address, page_table, entry);
2301 }
2302 pte_unmap_unlock(page_table, ptl);
2303 return ret;
2304}
2305
2306/*
2214 * Fault of a previously existing named mapping. Repopulate the pte 2307 * Fault of a previously existing named mapping. Repopulate the pte
2215 * from the encoded file_pte if possible. This enables swappable 2308 * from the encoded file_pte if possible. This enables swappable
2216 * nonlinear vmas. 2309 * nonlinear vmas.
@@ -2272,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2272 old_entry = entry = *pte; 2365 old_entry = entry = *pte;
2273 if (!pte_present(entry)) { 2366 if (!pte_present(entry)) {
2274 if (pte_none(entry)) { 2367 if (pte_none(entry)) {
2275 if (!vma->vm_ops || !vma->vm_ops->nopage) 2368 if (vma->vm_ops) {
2276 return do_anonymous_page(mm, vma, address, 2369 if (vma->vm_ops->nopage)
2277 pte, pmd, write_access); 2370 return do_no_page(mm, vma, address,
2278 return do_no_page(mm, vma, address, 2371 pte, pmd,
2279 pte, pmd, write_access); 2372 write_access);
2373 if (unlikely(vma->vm_ops->nopfn))
2374 return do_no_pfn(mm, vma, address, pte,
2375 pmd, write_access);
2376 }
2377 return do_anonymous_page(mm, vma, address,
2378 pte, pmd, write_access);
2280 } 2379 }
2281 if (pte_file(entry)) 2380 if (pte_file(entry))
2282 return do_file_page(mm, vma, address, 2381 return do_file_page(mm, vma, address,
@@ -2505,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr)
2505} 2604}
2506 2605
2507#endif /* __HAVE_ARCH_GATE_AREA */ 2606#endif /* __HAVE_ARCH_GATE_AREA */
2607
2608/*
2609 * Access another process' address space.
2610 * Source/target buffer must be kernel space,
2611 * Do not walk the page table directly, use get_user_pages
2612 */
2613int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2614{
2615 struct mm_struct *mm;
2616 struct vm_area_struct *vma;
2617 struct page *page;
2618 void *old_buf = buf;
2619
2620 mm = get_task_mm(tsk);
2621 if (!mm)
2622 return 0;
2623
2624 down_read(&mm->mmap_sem);
2625 /* ignore errors, just check how much was sucessfully transfered */
2626 while (len) {
2627 int bytes, ret, offset;
2628 void *maddr;
2629
2630 ret = get_user_pages(tsk, mm, addr, 1,
2631 write, 1, &page, &vma);
2632 if (ret <= 0)
2633 break;
2634
2635 bytes = len;
2636 offset = addr & (PAGE_SIZE-1);
2637 if (bytes > PAGE_SIZE-offset)
2638 bytes = PAGE_SIZE-offset;
2639
2640 maddr = kmap(page);
2641 if (write) {
2642 copy_to_user_page(vma, page, addr,
2643 maddr + offset, buf, bytes);
2644 set_page_dirty_lock(page);
2645 } else {
2646 copy_from_user_page(vma, page, addr,
2647 buf, maddr + offset, bytes);
2648 }
2649 kunmap(page);
2650 page_cache_release(page);
2651 len -= bytes;
2652 buf += bytes;
2653 addr += bytes;
2654 }
2655 up_read(&mm->mmap_sem);
2656 mmput(mm);
2657
2658 return buf - old_buf;
2659}
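
The access_process_vm() helper added above copies data into or out of another task's address space through get_user_pages() rather than by walking page tables directly. A minimal, hypothetical caller (illustrative only; a ptrace-style read, with peek_word being a made-up name) could look like this:

/* Illustrative only: read one word from another task's address space. */
#include <linux/sched.h>
#include <linux/errno.h>

static int peek_word(struct task_struct *child, unsigned long addr,
                     unsigned long *val)
{
        int copied;

        /* write == 0: copy from the child's address space into *val */
        copied = access_process_vm(child, addr, val, sizeof(*val), 0);
        return copied == sizeof(*val) ? 0 : -EIO;
}
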
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a9963ceddd65..cf18f0942553 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache;
105 105
106/* Highest zone. An specific allocation for a zone below that is not 106/* Highest zone. An specific allocation for a zone below that is not
107 policied. */ 107 policied. */
108int policy_zone = ZONE_DMA; 108enum zone_type policy_zone = ZONE_DMA;
109 109
110struct mempolicy default_policy = { 110struct mempolicy default_policy = {
111 .refcnt = ATOMIC_INIT(1), /* never free it */ 111 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
137static struct zonelist *bind_zonelist(nodemask_t *nodes) 137static struct zonelist *bind_zonelist(nodemask_t *nodes)
138{ 138{
139 struct zonelist *zl; 139 struct zonelist *zl;
140 int num, max, nd, k; 140 int num, max, nd;
141 enum zone_type k;
141 142
142 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 143 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
143 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 144 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
@@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
148 lower zones etc. Avoid empty zones because the memory allocator 149 lower zones etc. Avoid empty zones because the memory allocator
149 doesn't like them. If you implement node hot removal you 150 doesn't like them. If you implement node hot removal you
150 have to fix that. */ 151 have to fix that. */
151 for (k = policy_zone; k >= 0; k--) { 152 k = policy_zone;
153 while (1) {
152 for_each_node_mask(nd, *nodes) { 154 for_each_node_mask(nd, *nodes) {
153 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 155 struct zone *z = &NODE_DATA(nd)->node_zones[k];
154 if (z->present_pages > 0) 156 if (z->present_pages > 0)
155 zl->zones[num++] = z; 157 zl->zones[num++] = z;
156 } 158 }
159 if (k == 0)
160 break;
161 k--;
157 } 162 }
158 zl->zones[num] = NULL; 163 zl->zones[num] = NULL;
159 return zl; 164 return zl;
@@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
482 switch (p->policy) { 487 switch (p->policy) {
483 case MPOL_BIND: 488 case MPOL_BIND:
484 for (i = 0; p->v.zonelist->zones[i]; i++) 489 for (i = 0; p->v.zonelist->zones[i]; i++)
485 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, 490 node_set(zone_to_nid(p->v.zonelist->zones[i]),
486 *nodes); 491 *nodes);
487 break; 492 break;
488 case MPOL_DEFAULT: 493 case MPOL_DEFAULT:
@@ -1131,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1131 */ 1136 */
1132unsigned slab_node(struct mempolicy *policy) 1137unsigned slab_node(struct mempolicy *policy)
1133{ 1138{
1134 switch (policy->policy) { 1139 int pol = policy ? policy->policy : MPOL_DEFAULT;
1140
1141 switch (pol) {
1135 case MPOL_INTERLEAVE: 1142 case MPOL_INTERLEAVE:
1136 return interleave_nodes(policy); 1143 return interleave_nodes(policy);
1137 1144
@@ -1140,7 +1147,7 @@ unsigned slab_node(struct mempolicy *policy)
1140 * Follow bind policy behavior and start allocation at the 1147 * Follow bind policy behavior and start allocation at the
1141 * first node. 1148 * first node.
1142 */ 1149 */
1143 return policy->v.zonelist->zones[0]->zone_pgdat->node_id; 1150 return zone_to_nid(policy->v.zonelist->zones[0]);
1144 1151
1145 case MPOL_PREFERRED: 1152 case MPOL_PREFERRED:
1146 if (policy->v.preferred_node >= 0) 1153 if (policy->v.preferred_node >= 0)
@@ -1285,7 +1292,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1285 1292
1286 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1293 if ((gfp & __GFP_WAIT) && !in_interrupt())
1287 cpuset_update_task_memory_state(); 1294 cpuset_update_task_memory_state();
1288 if (!pol || in_interrupt()) 1295 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1289 pol = &default_policy; 1296 pol = &default_policy;
1290 if (pol->policy == MPOL_INTERLEAVE) 1297 if (pol->policy == MPOL_INTERLEAVE)
1291 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1298 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
@@ -1644,7 +1651,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1644 1651
1645 nodes_clear(nodes); 1652 nodes_clear(nodes);
1646 for (z = pol->v.zonelist->zones; *z; z++) 1653 for (z = pol->v.zonelist->zones; *z; z++)
1647 node_set((*z)->zone_pgdat->node_id, nodes); 1654 node_set(zone_to_nid(*z), nodes);
1648 nodes_remap(tmp, nodes, *mpolmask, *newmask); 1655 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1649 nodes = tmp; 1656 nodes = tmp;
1650 1657
diff --git a/mm/migrate.c b/mm/migrate.c
index 3f1e0c2c942c..20a8c2687b1e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -741,7 +741,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
 
 	*result = &pm->status;
 
-	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+	return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
 }
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index d799d896d74a..eea8eefd51a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -116,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	 * which are reclaimable, under pressure. The dentry
 	 * cache and most inode caches should fall into this
 	 */
-	free += atomic_read(&slab_reclaim_pages);
+	free += global_page_state(NR_SLAB_RECLAIMABLE);
 
 	/*
 	 * Leave the last 3% for root
@@ -1105,12 +1105,6 @@ munmap_back:
1105 goto free_vma; 1105 goto free_vma;
1106 } 1106 }
1107 1107
1108 /* Don't make the VMA automatically writable if it's shared, but the
1109 * backer wishes to know when pages are first written to */
1110 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1111 vma->vm_page_prot =
1112 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1113
1114 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 1108 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1115 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 1109 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1116 * that memory reservation must be checked; but that reservation 1110 * that memory reservation must be checked; but that reservation
@@ -1128,6 +1122,10 @@ munmap_back:
1128 pgoff = vma->vm_pgoff; 1122 pgoff = vma->vm_pgoff;
1129 vm_flags = vma->vm_flags; 1123 vm_flags = vma->vm_flags;
1130 1124
1125 if (vma_wants_writenotify(vma))
1126 vma->vm_page_prot =
1127 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1128
1131 if (!file || !vma_merge(mm, prev, addr, vma->vm_end, 1129 if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
1132 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1130 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1133 file = vma->vm_file; 1131 file = vma->vm_file;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff71..955f9d0e38aa 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,7 +27,8 @@
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
30 unsigned long addr, unsigned long end, pgprot_t newprot) 30 unsigned long addr, unsigned long end, pgprot_t newprot,
31 int dirty_accountable)
31{ 32{
32 pte_t *pte, oldpte; 33 pte_t *pte, oldpte;
33 spinlock_t *ptl; 34 spinlock_t *ptl;
@@ -42,7 +43,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
42 * bits by wiping the pte and then setting the new pte 43 * bits by wiping the pte and then setting the new pte
43 * into place. 44 * into place.
44 */ 45 */
45 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); 46 ptent = ptep_get_and_clear(mm, addr, pte);
47 ptent = pte_modify(ptent, newprot);
48 /*
49 * Avoid taking write faults for pages we know to be
50 * dirty.
51 */
52 if (dirty_accountable && pte_dirty(ptent))
53 ptent = pte_mkwrite(ptent);
46 set_pte_at(mm, addr, pte, ptent); 54 set_pte_at(mm, addr, pte, ptent);
47 lazy_mmu_prot_update(ptent); 55 lazy_mmu_prot_update(ptent);
48#ifdef CONFIG_MIGRATION 56#ifdef CONFIG_MIGRATION
@@ -66,7 +74,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
66} 74}
67 75
68static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 76static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
69 unsigned long addr, unsigned long end, pgprot_t newprot) 77 unsigned long addr, unsigned long end, pgprot_t newprot,
78 int dirty_accountable)
70{ 79{
71 pmd_t *pmd; 80 pmd_t *pmd;
72 unsigned long next; 81 unsigned long next;
@@ -76,12 +85,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
76 next = pmd_addr_end(addr, end); 85 next = pmd_addr_end(addr, end);
77 if (pmd_none_or_clear_bad(pmd)) 86 if (pmd_none_or_clear_bad(pmd))
78 continue; 87 continue;
79 change_pte_range(mm, pmd, addr, next, newprot); 88 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
80 } while (pmd++, addr = next, addr != end); 89 } while (pmd++, addr = next, addr != end);
81} 90}
82 91
83static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 92static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
84 unsigned long addr, unsigned long end, pgprot_t newprot) 93 unsigned long addr, unsigned long end, pgprot_t newprot,
94 int dirty_accountable)
85{ 95{
86 pud_t *pud; 96 pud_t *pud;
87 unsigned long next; 97 unsigned long next;
@@ -91,12 +101,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
91 next = pud_addr_end(addr, end); 101 next = pud_addr_end(addr, end);
92 if (pud_none_or_clear_bad(pud)) 102 if (pud_none_or_clear_bad(pud))
93 continue; 103 continue;
94 change_pmd_range(mm, pud, addr, next, newprot); 104 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
95 } while (pud++, addr = next, addr != end); 105 } while (pud++, addr = next, addr != end);
96} 106}
97 107
98static void change_protection(struct vm_area_struct *vma, 108static void change_protection(struct vm_area_struct *vma,
99 unsigned long addr, unsigned long end, pgprot_t newprot) 109 unsigned long addr, unsigned long end, pgprot_t newprot,
110 int dirty_accountable)
100{ 111{
101 struct mm_struct *mm = vma->vm_mm; 112 struct mm_struct *mm = vma->vm_mm;
102 pgd_t *pgd; 113 pgd_t *pgd;
@@ -110,7 +121,7 @@ static void change_protection(struct vm_area_struct *vma,
110 next = pgd_addr_end(addr, end); 121 next = pgd_addr_end(addr, end);
111 if (pgd_none_or_clear_bad(pgd)) 122 if (pgd_none_or_clear_bad(pgd))
112 continue; 123 continue;
113 change_pud_range(mm, pgd, addr, next, newprot); 124 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
114 } while (pgd++, addr = next, addr != end); 125 } while (pgd++, addr = next, addr != end);
115 flush_tlb_range(vma, start, end); 126 flush_tlb_range(vma, start, end);
116} 127}
@@ -123,10 +134,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
123 unsigned long oldflags = vma->vm_flags; 134 unsigned long oldflags = vma->vm_flags;
124 long nrpages = (end - start) >> PAGE_SHIFT; 135 long nrpages = (end - start) >> PAGE_SHIFT;
125 unsigned long charged = 0; 136 unsigned long charged = 0;
126 unsigned int mask;
127 pgprot_t newprot;
128 pgoff_t pgoff; 137 pgoff_t pgoff;
129 int error; 138 int error;
139 int dirty_accountable = 0;
130 140
131 if (newflags == oldflags) { 141 if (newflags == oldflags) {
132 *pprev = vma; 142 *pprev = vma;
@@ -176,24 +186,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
176 } 186 }
177 187
178success: 188success:
179 /* Don't make the VMA automatically writable if it's shared, but the
180 * backer wishes to know when pages are first written to */
181 mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
182 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
183 mask &= ~VM_SHARED;
184
185 newprot = protection_map[newflags & mask];
186
187 /* 189 /*
188 * vm_flags and vm_page_prot are protected by the mmap_sem 190 * vm_flags and vm_page_prot are protected by the mmap_sem
189 * held in write mode. 191 * held in write mode.
190 */ 192 */
191 vma->vm_flags = newflags; 193 vma->vm_flags = newflags;
192 vma->vm_page_prot = newprot; 194 vma->vm_page_prot = protection_map[newflags &
195 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
196 if (vma_wants_writenotify(vma)) {
197 vma->vm_page_prot = protection_map[newflags &
198 (VM_READ|VM_WRITE|VM_EXEC)];
199 dirty_accountable = 1;
200 }
201
193 if (is_vm_hugetlb_page(vma)) 202 if (is_vm_hugetlb_page(vma))
194 hugetlb_change_protection(vma, start, end, newprot); 203 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
195 else 204 else
196 change_protection(vma, start, end, newprot); 205 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
197 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 206 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
198 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 207 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
199 return 0; 208 return 0;
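
For context on the dirty_accountable flag threaded through the calls above: at the leaf of that call chain, change_pte_range() can leave an already-dirty page writable so it does not take another write fault before writeback cleans it. A rough reconstruction of that leaf-level logic, inferred from the call chain rather than quoted from this hunk:

    ptent = ptep_get_and_clear(mm, addr, pte);
    ptent = pte_modify(ptent, newprot);
    /* avoid taking write faults for pages we know to be dirty */
    if (dirty_accountable && pte_dirty(ptent))
            ptent = pte_mkwrite(ptent);
    set_pte_at(mm, addr, pte, ptent);
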
diff --git a/mm/msync.c b/mm/msync.c
index d083544df21b..358d73cf7b78 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -7,149 +7,33 @@
7/* 7/*
8 * The msync() system call. 8 * The msync() system call.
9 */ 9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h>
12#include <linux/fs.h> 10#include <linux/fs.h>
13#include <linux/mm.h> 11#include <linux/mm.h>
14#include <linux/mman.h> 12#include <linux/mman.h>
15#include <linux/hugetlb.h>
16#include <linux/writeback.h>
17#include <linux/file.h> 13#include <linux/file.h>
18#include <linux/syscalls.h> 14#include <linux/syscalls.h>
19 15
20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
22
23static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
24 unsigned long addr, unsigned long end)
25{
26 pte_t *pte;
27 spinlock_t *ptl;
28 int progress = 0;
29 unsigned long ret = 0;
30
31again:
32 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
33 do {
34 struct page *page;
35
36 if (progress >= 64) {
37 progress = 0;
38 if (need_resched() || need_lockbreak(ptl))
39 break;
40 }
41 progress++;
42 if (!pte_present(*pte))
43 continue;
44 if (!pte_maybe_dirty(*pte))
45 continue;
46 page = vm_normal_page(vma, addr, *pte);
47 if (!page)
48 continue;
49 if (ptep_clear_flush_dirty(vma, addr, pte) ||
50 page_test_and_clear_dirty(page))
51 ret += set_page_dirty(page);
52 progress += 3;
53 } while (pte++, addr += PAGE_SIZE, addr != end);
54 pte_unmap_unlock(pte - 1, ptl);
55 cond_resched();
56 if (addr != end)
57 goto again;
58 return ret;
59}
60
61static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
62 pud_t *pud, unsigned long addr, unsigned long end)
63{
64 pmd_t *pmd;
65 unsigned long next;
66 unsigned long ret = 0;
67
68 pmd = pmd_offset(pud, addr);
69 do {
70 next = pmd_addr_end(addr, end);
71 if (pmd_none_or_clear_bad(pmd))
72 continue;
73 ret += msync_pte_range(vma, pmd, addr, next);
74 } while (pmd++, addr = next, addr != end);
75 return ret;
76}
77
78static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
79 pgd_t *pgd, unsigned long addr, unsigned long end)
80{
81 pud_t *pud;
82 unsigned long next;
83 unsigned long ret = 0;
84
85 pud = pud_offset(pgd, addr);
86 do {
87 next = pud_addr_end(addr, end);
88 if (pud_none_or_clear_bad(pud))
89 continue;
90 ret += msync_pmd_range(vma, pud, addr, next);
91 } while (pud++, addr = next, addr != end);
92 return ret;
93}
94
95static unsigned long msync_page_range(struct vm_area_struct *vma,
96 unsigned long addr, unsigned long end)
97{
98 pgd_t *pgd;
99 unsigned long next;
100 unsigned long ret = 0;
101
102 /* For hugepages we can't go walking the page table normally,
103 * but that's ok, hugetlbfs is memory based, so we don't need
104 * to do anything more on an msync().
105 */
106 if (vma->vm_flags & VM_HUGETLB)
107 return 0;
108
109 BUG_ON(addr >= end);
110 pgd = pgd_offset(vma->vm_mm, addr);
111 flush_cache_range(vma, addr, end);
112 do {
113 next = pgd_addr_end(addr, end);
114 if (pgd_none_or_clear_bad(pgd))
115 continue;
116 ret += msync_pud_range(vma, pgd, addr, next);
117 } while (pgd++, addr = next, addr != end);
118 return ret;
119}
120
121/* 16/*
122 * MS_SYNC syncs the entire file - including mappings. 17 * MS_SYNC syncs the entire file - including mappings.
123 * 18 *
124 * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just 19 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
125 marks the relevant pages dirty. The application may now run fsync() to 20 * Nor does it mark the relevant pages dirty (it used to, up to 2.6.17).
21 * Now it doesn't do anything, since dirty pages are properly tracked.
22 *
23 * The application may now run fsync() to
126 * write out the dirty pages and wait on the writeout and check the result. 24 * write out the dirty pages and wait on the writeout and check the result.
127 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start 25 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
128 * async writeout immediately. 26 * async writeout immediately.
129 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to 27 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
130 * applications. 28 * applications.
131 */ 29 */
132static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
133 unsigned long end, int flags,
134 unsigned long *nr_pages_dirtied)
135{
136 struct file *file = vma->vm_file;
137
138 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
139 return -EBUSY;
140
141 if (file && (vma->vm_flags & VM_SHARED))
142 *nr_pages_dirtied = msync_page_range(vma, addr, end);
143 return 0;
144}
145
146asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 30asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
147{ 31{
148 unsigned long end; 32 unsigned long end;
33 struct mm_struct *mm = current->mm;
149 struct vm_area_struct *vma; 34 struct vm_area_struct *vma;
150 int unmapped_error = 0; 35 int unmapped_error = 0;
151 int error = -EINVAL; 36 int error = -EINVAL;
152 int done = 0;
153 37
154 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 38 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
155 goto out; 39 goto out;
@@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
169 * If the interval [start,end) covers some unmapped address ranges, 53 * If the interval [start,end) covers some unmapped address ranges,
170 * just ignore them, but return -ENOMEM at the end. 54 * just ignore them, but return -ENOMEM at the end.
171 */ 55 */
172 down_read(&current->mm->mmap_sem); 56 down_read(&mm->mmap_sem);
173 vma = find_vma(current->mm, start); 57 vma = find_vma(mm, start);
174 if (!vma) { 58 for (;;) {
175 error = -ENOMEM;
176 goto out_unlock;
177 }
178 do {
179 unsigned long nr_pages_dirtied = 0;
180 struct file *file; 59 struct file *file;
181 60
61 /* Still start < end. */
62 error = -ENOMEM;
63 if (!vma)
64 goto out_unlock;
182 /* Here start < vma->vm_end. */ 65 /* Here start < vma->vm_end. */
183 if (start < vma->vm_start) { 66 if (start < vma->vm_start) {
184 unmapped_error = -ENOMEM;
185 start = vma->vm_start; 67 start = vma->vm_start;
68 if (start >= end)
69 goto out_unlock;
70 unmapped_error = -ENOMEM;
186 } 71 }
187 /* Here vma->vm_start <= start < vma->vm_end. */ 72 /* Here vma->vm_start <= start < vma->vm_end. */
188 if (end <= vma->vm_end) { 73 if ((flags & MS_INVALIDATE) &&
189 if (start < end) { 74 (vma->vm_flags & VM_LOCKED)) {
190 error = msync_interval(vma, start, end, flags, 75 error = -EBUSY;
191 &nr_pages_dirtied); 76 goto out_unlock;
192 if (error)
193 goto out_unlock;
194 }
195 error = unmapped_error;
196 done = 1;
197 } else {
198 /* Here vma->vm_start <= start < vma->vm_end < end. */
199 error = msync_interval(vma, start, vma->vm_end, flags,
200 &nr_pages_dirtied);
201 if (error)
202 goto out_unlock;
203 } 77 }
204 file = vma->vm_file; 78 file = vma->vm_file;
205 start = vma->vm_end; 79 start = vma->vm_end;
206 if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { 80 if ((flags & MS_SYNC) && file &&
207 get_file(file);
208 up_read(&current->mm->mmap_sem);
209 balance_dirty_pages_ratelimited_nr(file->f_mapping,
210 nr_pages_dirtied);
211 fput(file);
212 down_read(&current->mm->mmap_sem);
213 vma = find_vma(current->mm, start);
214 } else if ((flags & MS_SYNC) && file &&
215 (vma->vm_flags & VM_SHARED)) { 81 (vma->vm_flags & VM_SHARED)) {
216 get_file(file); 82 get_file(file);
217 up_read(&current->mm->mmap_sem); 83 up_read(&mm->mmap_sem);
218 error = do_fsync(file, 0); 84 error = do_fsync(file, 0);
219 fput(file); 85 fput(file);
220 down_read(&current->mm->mmap_sem); 86 if (error || start >= end)
221 if (error) 87 goto out;
222 goto out_unlock; 88 down_read(&mm->mmap_sem);
223 vma = find_vma(current->mm, start); 89 vma = find_vma(mm, start);
224 } else { 90 } else {
91 if (start >= end) {
92 error = 0;
93 goto out_unlock;
94 }
225 vma = vma->vm_next; 95 vma = vma->vm_next;
226 } 96 }
227 } while (vma && !done); 97 }
228out_unlock: 98out_unlock:
229 up_read(&current->mm->mmap_sem); 99 up_read(&mm->mmap_sem);
230out: 100out:
231 return error; 101 return error ? : unmapped_error;
232} 102}
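
Net effect for userspace: MS_ASYNC is now essentially a no-op because dirty pages are already tracked, while MS_SYNC still runs do_fsync() on every shared file mapping in the range. A minimal POSIX usage sketch, independent of this patch:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int flush_record(const char *path, size_t len)
    {
            int fd = open(path, O_RDWR);
            if (fd < 0)
                    return -1;
            char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (map == MAP_FAILED) {
                    close(fd);
                    return -1;
            }
            memcpy(map, "hello", 5);            /* dirties the shared mapping */
            int rc = msync(map, len, MS_SYNC);  /* blocks until the data is written out */
            munmap(map, len);
            close(fd);
            return rc;
    }
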
diff --git a/mm/nommu.c b/mm/nommu.c
index c576df71e3bb..564540662192 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
122} 122}
123 123
124/* 124/*
125 * The nommu dodgy version :-) 125 * get a list of pages in an address range belonging to the specified process
126 * and indicate the VMA that covers each page
127 * - this is potentially dodgy as we may end up incrementing the page count of a
128 * slab page or a secondary page from a compound page
129 * - don't permit access to VMAs that don't support it, such as I/O mappings
126 */ 130 */
127int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 131int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int len, int write, int force, 132 unsigned long start, int len, int write, int force,
129 struct page **pages, struct vm_area_struct **vmas) 133 struct page **pages, struct vm_area_struct **vmas)
130{ 134{
135 struct vm_area_struct *vma;
136 unsigned long vm_flags;
131 int i; 137 int i;
132 static struct vm_area_struct dummy_vma; 138
139 /* calculate required read or write permissions.
140 * - if 'force' is set, we only require the "MAY" flags.
141 */
142 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
143 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
133 144
134 for (i = 0; i < len; i++) { 145 for (i = 0; i < len; i++) {
146 vma = find_vma(mm, start);
147 if (!vma)
148 goto finish_or_fault;
149
150 /* protect what we can, including chardevs */
151 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
152 !(vm_flags & vma->vm_flags))
153 goto finish_or_fault;
154
135 if (pages) { 155 if (pages) {
136 pages[i] = virt_to_page(start); 156 pages[i] = virt_to_page(start);
137 if (pages[i]) 157 if (pages[i])
138 page_cache_get(pages[i]); 158 page_cache_get(pages[i]);
139 } 159 }
140 if (vmas) 160 if (vmas)
141 vmas[i] = &dummy_vma; 161 vmas[i] = vma;
142 start += PAGE_SIZE; 162 start += PAGE_SIZE;
143 } 163 }
144 return(i); 164
165 return i;
166
167finish_or_fault:
168 return i ? : -EFAULT;
145} 169}
146 170
147EXPORT_SYMBOL(get_user_pages); 171EXPORT_SYMBOL(get_user_pages);
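
The permission calculation above boils down to: a normal access needs the corresponding VM_READ/VM_WRITE bit on the VMA, while a forced access only needs the VM_MAYREAD/VM_MAYWRITE capability bit. A hypothetical stand-alone helper expressing the same test (purely illustrative, not in the kernel):

    static unsigned long gup_required_flags(int write, int force)
    {
            unsigned long flags = write ? (VM_WRITE | VM_MAYWRITE)
                                        : (VM_READ | VM_MAYREAD);

            /* with 'force', only the MAY* capability bit must be present */
            return flags & (force ? (VM_MAYREAD | VM_MAYWRITE)
                                  : (VM_READ | VM_WRITE));
    }

A VMA then passes when (gup_required_flags(write, force) & vma->vm_flags) is non-zero and the VMA is not a VM_IO/VM_PFNMAP mapping.
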
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
286} 310}
287#endif /* DEBUG */ 311#endif /* DEBUG */
288 312
313/*
314 * add a VMA into a process's mm_struct in the appropriate place in the list
315 * - should be called with mm->mmap_sem held writelocked
316 */
317static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
318{
319 struct vm_list_struct **ppv;
320
321 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
322 if ((*ppv)->vma->vm_start > vml->vma->vm_start)
323 break;
324
325 vml->next = *ppv;
326 *ppv = vml;
327}
328
329/*
330 * look up the first VMA in which addr resides, NULL if none
331 * - should be called with mm->mmap_sem at least held readlocked
332 */
333struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
334{
335 struct vm_list_struct *loop, *vml;
336
337 /* search the vm_start ordered list */
338 vml = NULL;
339 for (loop = mm->context.vmlist; loop; loop = loop->next) {
340 if (loop->vma->vm_start > addr)
341 break;
342 vml = loop;
343 }
344
345 if (vml && vml->vma->vm_end > addr)
346 return vml->vma;
347
348 return NULL;
349}
350EXPORT_SYMBOL(find_vma);
351
352/*
353 * find a VMA
354 * - we don't extend stack VMAs under NOMMU conditions
355 */
356struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
357{
358 return find_vma(mm, addr);
359}
360
361/*
362 * look up the first VMA that exactly matches addr
363 * - should be called with mm->mmap_sem at least held readlocked
364 */
365static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
366 unsigned long addr)
367{
368 struct vm_list_struct *vml;
369
370 /* search the vm_start ordered list */
371 for (vml = mm->context.vmlist; vml; vml = vml->next) {
372 if (vml->vma->vm_start == addr)
373 return vml->vma;
374 if (vml->vma->vm_start > addr)
375 break;
376 }
377
378 return NULL;
379}
380
381/*
382 * find a VMA in the global tree
383 */
289static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 384static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
290{ 385{
291 struct vm_area_struct *vma; 386 struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
305 return NULL; 400 return NULL;
306} 401}
307 402
403/*
404 * add a VMA in the global tree
405 */
308static void add_nommu_vma(struct vm_area_struct *vma) 406static void add_nommu_vma(struct vm_area_struct *vma)
309{ 407{
310 struct vm_area_struct *pvma; 408 struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
351 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 449 rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
352} 450}
353 451
452/*
453 * delete a VMA from the global list
454 */
354static void delete_nommu_vma(struct vm_area_struct *vma) 455static void delete_nommu_vma(struct vm_area_struct *vma)
355{ 456{
356 struct address_space *mapping; 457 struct address_space *mapping;
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
828 realalloc += kobjsize(vml); 929 realalloc += kobjsize(vml);
829 askedalloc += sizeof(*vml); 930 askedalloc += sizeof(*vml);
830 931
831 vml->next = current->mm->context.vmlist; 932 add_vma_to_mm(current->mm, vml);
832 current->mm->context.vmlist = vml;
833 933
834 up_write(&nommu_vma_sem); 934 up_write(&nommu_vma_sem);
835 935
@@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma)
908 } 1008 }
909} 1009}
910 1010
1011/*
1012 * release a mapping
1013 * - under NOMMU conditions the parameters must exactly match the mapping to
1014 * be removed
1015 */
911int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1016int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
912{ 1017{
913 struct vm_list_struct *vml, **parent; 1018 struct vm_list_struct *vml, **parent;
@@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
917 printk("do_munmap:\n"); 1022 printk("do_munmap:\n");
918#endif 1023#endif
919 1024
920 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) 1025 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
1026 if ((*parent)->vma->vm_start > addr)
1027 break;
921 if ((*parent)->vma->vm_start == addr && 1028 if ((*parent)->vma->vm_start == addr &&
922 ((len == 0) || ((*parent)->vma->vm_end == end))) 1029 ((len == 0) || ((*parent)->vma->vm_end == end)))
923 goto found; 1030 goto found;
1031 }
924 1032
925 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1033 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
926 current->pid, current->comm, (void *) addr); 1034 current->pid, current->comm, (void *) addr);
@@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
946 return 0; 1054 return 0;
947} 1055}
948 1056
949/* Release all mmaps. */ 1057asmlinkage long sys_munmap(unsigned long addr, size_t len)
1058{
1059 int ret;
1060 struct mm_struct *mm = current->mm;
1061
1062 down_write(&mm->mmap_sem);
1063 ret = do_munmap(mm, addr, len);
1064 up_write(&mm->mmap_sem);
1065 return ret;
1066}
1067
1068/*
1069 * Release all mappings
1070 */
950void exit_mmap(struct mm_struct * mm) 1071void exit_mmap(struct mm_struct * mm)
951{ 1072{
952 struct vm_list_struct *tmp; 1073 struct vm_list_struct *tmp;
@@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm)
973 } 1094 }
974} 1095}
975 1096
976asmlinkage long sys_munmap(unsigned long addr, size_t len)
977{
978 int ret;
979 struct mm_struct *mm = current->mm;
980
981 down_write(&mm->mmap_sem);
982 ret = do_munmap(mm, addr, len);
983 up_write(&mm->mmap_sem);
984 return ret;
985}
986
987unsigned long do_brk(unsigned long addr, unsigned long len) 1097unsigned long do_brk(unsigned long addr, unsigned long len)
988{ 1098{
989 return -ENOMEM; 1099 return -ENOMEM;
990} 1100}
991 1101
992/* 1102/*
993 * Expand (or shrink) an existing mapping, potentially moving it at the 1103 * expand (or shrink) an existing mapping, potentially moving it at the same
994 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1104 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
995 * 1105 *
996 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 1106 * under NOMMU conditions, we only permit changing a mapping's size, and only
997 * This option implies MREMAP_MAYMOVE. 1107 * as long as it stays within the hole allocated by the kmalloc() call in
1108 * do_mmap_pgoff() and the block is not shareable
998 * 1109 *
999 * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the 1110 * MREMAP_FIXED is not supported under NOMMU conditions
1000 * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
1001 */ 1111 */
1002unsigned long do_mremap(unsigned long addr, 1112unsigned long do_mremap(unsigned long addr,
1003 unsigned long old_len, unsigned long new_len, 1113 unsigned long old_len, unsigned long new_len,
1004 unsigned long flags, unsigned long new_addr) 1114 unsigned long flags, unsigned long new_addr)
1005{ 1115{
1006 struct vm_list_struct *vml = NULL; 1116 struct vm_area_struct *vma;
1007 1117
1008 /* insanity checks first */ 1118 /* insanity checks first */
1009 if (new_len == 0) 1119 if (new_len == 0)
@@ -1012,58 +1122,46 @@ unsigned long do_mremap(unsigned long addr,
1012 if (flags & MREMAP_FIXED && new_addr != addr) 1122 if (flags & MREMAP_FIXED && new_addr != addr)
1013 return (unsigned long) -EINVAL; 1123 return (unsigned long) -EINVAL;
1014 1124
1015 for (vml = current->mm->context.vmlist; vml; vml = vml->next) 1125 vma = find_vma_exact(current->mm, addr);
1016 if (vml->vma->vm_start == addr) 1126 if (!vma)
1017 goto found; 1127 return (unsigned long) -EINVAL;
1018
1019 return (unsigned long) -EINVAL;
1020 1128
1021 found: 1129 if (vma->vm_end != vma->vm_start + old_len)
1022 if (vml->vma->vm_end != vml->vma->vm_start + old_len)
1023 return (unsigned long) -EFAULT; 1130 return (unsigned long) -EFAULT;
1024 1131
1025 if (vml->vma->vm_flags & VM_MAYSHARE) 1132 if (vma->vm_flags & VM_MAYSHARE)
1026 return (unsigned long) -EPERM; 1133 return (unsigned long) -EPERM;
1027 1134
1028 if (new_len > kobjsize((void *) addr)) 1135 if (new_len > kobjsize((void *) addr))
1029 return (unsigned long) -ENOMEM; 1136 return (unsigned long) -ENOMEM;
1030 1137
1031 /* all checks complete - do it */ 1138 /* all checks complete - do it */
1032 vml->vma->vm_end = vml->vma->vm_start + new_len; 1139 vma->vm_end = vma->vm_start + new_len;
1033 1140
1034 askedalloc -= old_len; 1141 askedalloc -= old_len;
1035 askedalloc += new_len; 1142 askedalloc += new_len;
1036 1143
1037 return vml->vma->vm_start; 1144 return vma->vm_start;
1038} 1145}
1039 1146
1040/* 1147asmlinkage unsigned long sys_mremap(unsigned long addr,
1041 * Look up the first VMA which satisfies addr < vm_end, NULL if none 1148 unsigned long old_len, unsigned long new_len,
1042 */ 1149 unsigned long flags, unsigned long new_addr)
1043struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1044{ 1150{
1045 struct vm_list_struct *vml; 1151 unsigned long ret;
1046
1047 for (vml = mm->context.vmlist; vml; vml = vml->next)
1048 if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
1049 return vml->vma;
1050 1152
1051 return NULL; 1153 down_write(&current->mm->mmap_sem);
1154 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1155 up_write(&current->mm->mmap_sem);
1156 return ret;
1052} 1157}
1053 1158
1054EXPORT_SYMBOL(find_vma);
1055
1056struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1159struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1057 unsigned int foll_flags) 1160 unsigned int foll_flags)
1058{ 1161{
1059 return NULL; 1162 return NULL;
1060} 1163}
1061 1164
1062struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1063{
1064 return NULL;
1065}
1066
1067int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1165int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1068 unsigned long to, unsigned long size, pgprot_t prot) 1166 unsigned long to, unsigned long size, pgprot_t prot)
1069{ 1167{
@@ -1133,7 +1231,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
1133 * which are reclaimable, under pressure. The dentry 1231 * which are reclaimable, under pressure. The dentry
1134 * cache and most inode caches should fall into this 1232 * cache and most inode caches should fall into this
1135 */ 1233 */
1136 free += atomic_read(&slab_reclaim_pages); 1234 free += global_page_state(NR_SLAB_RECLAIMABLE);
1137 1235
1138 /* 1236 /*
1139 * Leave the last 3% for root 1237 * Leave the last 3% for root
@@ -1206,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1206 BUG(); 1304 BUG();
1207 return NULL; 1305 return NULL;
1208} 1306}
1307
1308/*
1309 * Access another process' address space.
1310 * - source/target buffer must be kernel space
1311 */
1312int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1313{
1314 struct vm_area_struct *vma;
1315 struct mm_struct *mm;
1316
1317 if (addr + len < addr)
1318 return 0;
1319
1320 mm = get_task_mm(tsk);
1321 if (!mm)
1322 return 0;
1323
1324 down_read(&mm->mmap_sem);
1325
1326 /* the access must start within one of the target process's mappings */
1327 vma = find_vma(mm, addr);
1328 if (vma) {
1329 /* don't overrun this mapping */
1330 if (addr + len >= vma->vm_end)
1331 len = vma->vm_end - addr;
1332
1333 /* only read or write mappings where it is permitted */
1334 if (write && vma->vm_flags & VM_MAYWRITE)
1335 len -= copy_to_user((void *) addr, buf, len);
1336 else if (!write && vma->vm_flags & VM_MAYREAD)
1337 len -= copy_from_user(buf, (void *) addr, len);
1338 else
1339 len = 0;
1340 } else {
1341 len = 0;
1342 }
1343
1344 up_read(&mm->mmap_sem);
1345 mmput(mm);
1346 return len;
1347}
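
access_process_vm() is the primitive that ptrace() peek/poke requests are built on, copying through a kernel buffer instead of mapping the other process. An illustrative kernel-side caller (fragment, not taken from this diff):

    int val;

    /* read sizeof(val) bytes from tsk's address space at addr;
     * the return value is the number of bytes actually copied */
    if (access_process_vm(tsk, addr, &val, sizeof(val), 0) != sizeof(val))
            return -EIO;
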
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b9af136e5cfa..bada3d03119f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -21,6 +21,8 @@
21#include <linux/timex.h> 21#include <linux/timex.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/module.h>
25#include <linux/notifier.h>
24 26
25int sysctl_panic_on_oom; 27int sysctl_panic_on_oom;
26/* #define DEBUG */ 28/* #define DEBUG */
@@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 } 60 }
59 61
60 /* 62 /*
63 * swapoff can easily use up all memory, so kill those first.
64 */
65 if (p->flags & PF_SWAPOFF)
66 return ULONG_MAX;
67
68 /*
61 * The memory size of the process is the basis for the badness. 69 * The memory size of the process is the basis for the badness.
62 */ 70 */
63 points = mm->total_vm; 71 points = mm->total_vm;
@@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
127 points /= 4; 135 points /= 4;
128 136
129 /* 137 /*
138 * If p's nodes don't overlap ours, it may still help to kill p
139 * because p may have allocated or otherwise mapped memory on
140 * this node before. However it will be less likely.
141 */
142 if (!cpuset_excl_nodes_overlap(p))
143 points /= 8;
144
145 /*
130 * Adjust the score by oomkilladj. 146 * Adjust the score by oomkilladj.
131 */ 147 */
132 if (p->oomkilladj) { 148 if (p->oomkilladj) {
@@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
161 177
162 for (z = zonelist->zones; *z; z++) 178 for (z = zonelist->zones; *z; z++)
163 if (cpuset_zone_allowed(*z, gfp_mask)) 179 if (cpuset_zone_allowed(*z, gfp_mask))
164 node_clear((*z)->zone_pgdat->node_id, 180 node_clear(zone_to_nid(*z), nodes);
165 nodes);
166 else 181 else
167 return CONSTRAINT_CPUSET; 182 return CONSTRAINT_CPUSET;
168 183
@@ -191,25 +206,38 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
191 unsigned long points; 206 unsigned long points;
192 int releasing; 207 int releasing;
193 208
209 /* skip kernel threads */
210 if (!p->mm)
211 continue;
194 /* skip the init task with pid == 1 */ 212 /* skip the init task with pid == 1 */
195 if (p->pid == 1) 213 if (p->pid == 1)
196 continue; 214 continue;
197 if (p->oomkilladj == OOM_DISABLE)
198 continue;
199 /* If p's nodes don't overlap ours, it won't help to kill p. */
200 if (!cpuset_excl_nodes_overlap(p))
201 continue;
202 215
203 /* 216 /*
204 * This is in the process of releasing memory so wait for it 217 * This is in the process of releasing memory so wait for it
205 * to finish before killing some other task by mistake. 218 * to finish before killing some other task by mistake.
219 *
220 * However, if p is the current task, we allow the 'kill' to
221 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
222 * which will allow it to gain access to memory reserves in
223 * the process of exiting and releasing its resources.
224 * Otherwise we could get an OOM deadlock.
206 */ 225 */
207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 226 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
208 p->flags & PF_EXITING; 227 p->flags & PF_EXITING;
209 if (releasing && !(p->flags & PF_DEAD)) 228 if (releasing) {
229 /* PF_DEAD tasks have already released their mm */
230 if (p->flags & PF_DEAD)
231 continue;
232 if (p->flags & PF_EXITING && p == current) {
233 chosen = p;
234 *ppoints = ULONG_MAX;
235 break;
236 }
210 return ERR_PTR(-1UL); 237 return ERR_PTR(-1UL);
211 if (p->flags & PF_SWAPOFF) 238 }
212 return p; 239 if (p->oomkilladj == OOM_DISABLE)
240 continue;
213 241
214 points = badness(p, uptime.tv_sec); 242 points = badness(p, uptime.tv_sec);
215 if (points > *ppoints || !chosen) { 243 if (points > *ppoints || !chosen) {
@@ -221,9 +249,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
221} 249}
222 250
223/** 251/**
224 * We must be careful though to never send SIGKILL a process with 252 * Send SIGKILL to the selected process irrespective of the CAP_SYS_RAW_IO
225 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that 253 * flag, though it's unlikely that we select a process with CAP_SYS_RAW_IO
226 * we select a process with CAP_SYS_RAW_IO set). 254 * set.
227 */ 255 */
228static void __oom_kill_task(struct task_struct *p, const char *message) 256static void __oom_kill_task(struct task_struct *p, const char *message)
229{ 257{
@@ -241,8 +269,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
241 return; 269 return;
242 } 270 }
243 task_unlock(p); 271 task_unlock(p);
244 printk(KERN_ERR "%s: Killed process %d (%s).\n", 272
273 if (message) {
274 printk(KERN_ERR "%s: Killed process %d (%s).\n",
245 message, p->pid, p->comm); 275 message, p->pid, p->comm);
276 }
246 277
247 /* 278 /*
248 * We give our sacrificial lamb high priority and access to 279 * We give our sacrificial lamb high priority and access to
@@ -293,8 +324,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
293 struct task_struct *c; 324 struct task_struct *c;
294 struct list_head *tsk; 325 struct list_head *tsk;
295 326
296 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " 327 /*
297 "children.\n", p->pid, p->comm, points); 328 * If the task is already exiting, don't alarm the sysadmin or kill
329 * its children or threads, just set TIF_MEMDIE so it can die quickly
330 */
331 if (p->flags & PF_EXITING) {
332 __oom_kill_task(p, NULL);
333 return 0;
334 }
335
336 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
337 " and children.\n", p->pid, p->comm, points);
298 /* Try to kill a child first */ 338 /* Try to kill a child first */
299 list_for_each(tsk, &p->children) { 339 list_for_each(tsk, &p->children) {
300 c = list_entry(tsk, struct task_struct, sibling); 340 c = list_entry(tsk, struct task_struct, sibling);
@@ -306,6 +346,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306 return oom_kill_task(p, message); 346 return oom_kill_task(p, message);
307} 347}
308 348
349static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
350
351int register_oom_notifier(struct notifier_block *nb)
352{
353 return blocking_notifier_chain_register(&oom_notify_list, nb);
354}
355EXPORT_SYMBOL_GPL(register_oom_notifier);
356
357int unregister_oom_notifier(struct notifier_block *nb)
358{
359 return blocking_notifier_chain_unregister(&oom_notify_list, nb);
360}
361EXPORT_SYMBOL_GPL(unregister_oom_notifier);
362
309/** 363/**
310 * out_of_memory - kill the "best" process when we run out of memory 364 * out_of_memory - kill the "best" process when we run out of memory
311 * 365 *
@@ -318,10 +372,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
318{ 372{
319 struct task_struct *p; 373 struct task_struct *p;
320 unsigned long points = 0; 374 unsigned long points = 0;
375 unsigned long freed = 0;
376
377 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
378 if (freed > 0)
379 /* Got some memory back in the last second. */
380 return;
321 381
322 if (printk_ratelimit()) { 382 if (printk_ratelimit()) {
323 printk("oom-killer: gfp_mask=0x%x, order=%d\n", 383 printk(KERN_WARNING "%s invoked oom-killer: "
324 gfp_mask, order); 384 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
385 current->comm, gfp_mask, order, current->oomkilladj);
325 dump_stack(); 386 dump_stack();
326 show_mem(); 387 show_mem();
327 } 388 }
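
The notifier added above gives drivers a chance to hand memory back before a victim is chosen: out_of_memory() passes a pointer to 'freed' down the chain and backs off if anything was reclaimed. A sketch of a client (fragment; my_reclaim_cache() is a hypothetical helper returning the number of pages it released):

    static int my_oom_notify(struct notifier_block *self,
                             unsigned long dummy, void *parm)
    {
            unsigned long *freed = parm;

            *freed += my_reclaim_cache();
            return NOTIFY_OK;
    }

    static struct notifier_block my_oom_nb = {
            .notifier_call = my_oom_notify,
    };

    /* in module init */
    register_oom_notifier(&my_oom_nb);
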
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 77a0bc4e261a..555752907dc3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/mpage.h> 25#include <linux/mpage.h>
26#include <linux/rmap.h>
26#include <linux/percpu.h> 27#include <linux/percpu.h>
27#include <linux/notifier.h> 28#include <linux/notifier.h>
28#include <linux/smp.h> 29#include <linux/smp.h>
@@ -243,6 +244,16 @@ static void balance_dirty_pages(struct address_space *mapping)
243 pdflush_operation(background_writeout, 0); 244 pdflush_operation(background_writeout, 0);
244} 245}
245 246
247void set_page_dirty_balance(struct page *page)
248{
249 if (set_page_dirty(page)) {
250 struct address_space *mapping = page_mapping(page);
251
252 if (mapping)
253 balance_dirty_pages_ratelimited(mapping);
254 }
255}
256
246/** 257/**
247 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 258 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
248 * @mapping: address_space which was dirtied 259 * @mapping: address_space which was dirtied
@@ -550,7 +561,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
550 return 0; 561 return 0;
551 wbc->for_writepages = 1; 562 wbc->for_writepages = 1;
552 if (mapping->a_ops->writepages) 563 if (mapping->a_ops->writepages)
553 ret = mapping->a_ops->writepages(mapping, wbc); 564 ret = mapping->a_ops->writepages(mapping, wbc);
554 else 565 else
555 ret = generic_writepages(mapping, wbc); 566 ret = generic_writepages(mapping, wbc);
556 wbc->for_writepages = 0; 567 wbc->for_writepages = 0;
@@ -690,7 +701,7 @@ int set_page_dirty_lock(struct page *page)
690{ 701{
691 int ret; 702 int ret;
692 703
693 lock_page(page); 704 lock_page_nosync(page);
694 ret = set_page_dirty(page); 705 ret = set_page_dirty(page);
695 unlock_page(page); 706 unlock_page(page);
696 return ret; 707 return ret;
@@ -712,9 +723,15 @@ int test_clear_page_dirty(struct page *page)
712 radix_tree_tag_clear(&mapping->page_tree, 723 radix_tree_tag_clear(&mapping->page_tree,
713 page_index(page), 724 page_index(page),
714 PAGECACHE_TAG_DIRTY); 725 PAGECACHE_TAG_DIRTY);
715 if (mapping_cap_account_dirty(mapping))
716 __dec_zone_page_state(page, NR_FILE_DIRTY);
717 write_unlock_irqrestore(&mapping->tree_lock, flags); 726 write_unlock_irqrestore(&mapping->tree_lock, flags);
727 /*
728 * We can continue to use `mapping' here because the
729 * page is locked, which pins the address_space
730 */
731 if (mapping_cap_account_dirty(mapping)) {
732 page_mkclean(page);
733 dec_zone_page_state(page, NR_FILE_DIRTY);
734 }
718 return 1; 735 return 1;
719 } 736 }
720 write_unlock_irqrestore(&mapping->tree_lock, flags); 737 write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -744,8 +761,10 @@ int clear_page_dirty_for_io(struct page *page)
744 761
745 if (mapping) { 762 if (mapping) {
746 if (TestClearPageDirty(page)) { 763 if (TestClearPageDirty(page)) {
747 if (mapping_cap_account_dirty(mapping)) 764 if (mapping_cap_account_dirty(mapping)) {
765 page_mkclean(page);
748 dec_zone_page_state(page, NR_FILE_DIRTY); 766 dec_zone_page_state(page, NR_FILE_DIRTY);
767 }
749 return 1; 768 return 1;
750 } 769 }
751 return 0; 770 return 0;
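
Why page_mkclean() now runs in the accounting branch: the usual writeback-side pairing looks roughly like the fragment below (illustrative pattern, not part of this diff). It relies on the PTEs being write-protected again when the dirty state is cleared, so a racing store simply re-faults and re-dirties the page instead of being lost.

    if (clear_page_dirty_for_io(page)) {
            /* the page was dirty; its PTEs were cleaned and write-protected
             * via page_mkclean(), so a concurrent userspace store will fault
             * and mark the page dirty again rather than race with writeout */
            ret = mapping->a_ops->writepage(page, wbc);
    }
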
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b5358a0561f..4f59d90b81e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
37#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
38#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h>
41#include <linux/pfn.h>
40 42
41#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
42#include <asm/div64.h> 44#include <asm/div64.h>
@@ -51,7 +53,6 @@ EXPORT_SYMBOL(node_online_map);
51nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 53nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
52EXPORT_SYMBOL(node_possible_map); 54EXPORT_SYMBOL(node_possible_map);
53unsigned long totalram_pages __read_mostly; 55unsigned long totalram_pages __read_mostly;
54unsigned long totalhigh_pages __read_mostly;
55unsigned long totalreserve_pages __read_mostly; 56unsigned long totalreserve_pages __read_mostly;
56long nr_swap_pages; 57long nr_swap_pages;
57int percpu_pagelist_fraction; 58int percpu_pagelist_fraction;
@@ -69,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order);
69 * TBD: should special case ZONE_DMA32 machines here - in those we normally 70 * TBD: should special case ZONE_DMA32 machines here - in those we normally
70 * don't need any ZONE_NORMAL reservation 71 * don't need any ZONE_NORMAL reservation
71 */ 72 */
72int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; 73int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
74 256,
75#ifdef CONFIG_ZONE_DMA32
76 256,
77#endif
78#ifdef CONFIG_HIGHMEM
79 32
80#endif
81};
73 82
74EXPORT_SYMBOL(totalram_pages); 83EXPORT_SYMBOL(totalram_pages);
75 84
@@ -80,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages);
80struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 89struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
81EXPORT_SYMBOL(zone_table); 90EXPORT_SYMBOL(zone_table);
82 91
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 92static char *zone_names[MAX_NR_ZONES] = {
93 "DMA",
94#ifdef CONFIG_ZONE_DMA32
95 "DMA32",
96#endif
97 "Normal",
98#ifdef CONFIG_HIGHMEM
99 "HighMem"
100#endif
101};
102
84int min_free_kbytes = 1024; 103int min_free_kbytes = 1024;
85 104
86unsigned long __meminitdata nr_kernel_pages; 105unsigned long __meminitdata nr_kernel_pages;
87unsigned long __meminitdata nr_all_pages; 106unsigned long __meminitdata nr_all_pages;
107static unsigned long __initdata dma_reserve;
108
109#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
110 /*
111 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
112 * ranges of memory (RAM) that may be registered with add_active_range().
113 * Ranges passed to add_active_range() will be merged if possible
114 * so the number of times add_active_range() can be called is
115 * related to the number of nodes and the number of holes
116 */
117 #ifdef CONFIG_MAX_ACTIVE_REGIONS
118 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
119 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
120 #else
121 #if MAX_NUMNODES >= 32
122 /* If there can be many nodes, allow up to 50 holes per node */
123 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
124 #else
125 /* By default, allow up to 256 distinct regions */
126 #define MAX_ACTIVE_REGIONS 256
127 #endif
128 #endif
129
130 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
131 int __initdata nr_nodemap_entries;
132 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
133 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
134#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
135 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
136 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
137#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
138#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
88 139
89#ifdef CONFIG_DEBUG_VM 140#ifdef CONFIG_DEBUG_VM
90static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 141static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
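
The early_node_map[] and arch_zone_*_possible_pfn[] arrays above back the new arch-independent zone sizing. The intended flow is roughly as follows; note that the helper names add_active_range() and free_area_init_nodes() come from the same patch series and are an assumption here, not visible in this hunk:

    /* arch setup: register every usable RAM range, per node */
    add_active_range(nid, start_pfn, end_pfn);

    /* then hand over only the highest PFN each zone may reach;
     * node/zone extents are derived from the registered ranges */
    unsigned long max_zone_pfns[MAX_NR_ZONES] = {
            [ZONE_DMA]    = dma_limit_pfn,     /* placeholder, arch specific */
            [ZONE_NORMAL] = lowmem_limit_pfn,  /* placeholder, arch specific */
    };
    free_area_init_nodes(max_zone_pfns);
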
@@ -127,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page)
127 178
128 return 0; 179 return 0;
129} 180}
130
131#else 181#else
132static inline int bad_range(struct zone *zone, struct page *page) 182static inline int bad_range(struct zone *zone, struct page *page)
133{ 183{
@@ -218,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
218{ 268{
219 int i; 269 int i;
220 270
221 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 271 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
222 /* 272 /*
223 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 273 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
224 * and __GFP_HIGHMEM from hard or soft interrupt context. 274 * and __GFP_HIGHMEM from hard or soft interrupt context.
225 */ 275 */
226 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 276 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
227 for (i = 0; i < (1 << order); i++) 277 for (i = 0; i < (1 << order); i++)
228 clear_highpage(page + i); 278 clear_highpage(page + i);
229} 279}
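
The BUG_ON()-to-VM_BUG_ON() conversions throughout this file make these sanity checks conditional: VM_BUG_ON() only expands to a real check when CONFIG_DEBUG_VM is set, so production builds pay nothing for them. The macro is roughly (quoted from memory, include/linux/mm.h of the same era):

    #ifdef CONFIG_DEBUG_VM
    #define VM_BUG_ON(cond) BUG_ON(cond)
    #else
    #define VM_BUG_ON(cond) do { } while (0)
    #endif
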
@@ -347,8 +397,8 @@ static inline void __free_one_page(struct page *page,
347 397
348 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 398 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
349 399
350 BUG_ON(page_idx & (order_size - 1)); 400 VM_BUG_ON(page_idx & (order_size - 1));
351 BUG_ON(bad_range(zone, page)); 401 VM_BUG_ON(bad_range(zone, page));
352 402
353 zone->free_pages += order_size; 403 zone->free_pages += order_size;
354 while (order < MAX_ORDER-1) { 404 while (order < MAX_ORDER-1) {
@@ -421,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count,
421 while (count--) { 471 while (count--) {
422 struct page *page; 472 struct page *page;
423 473
424 BUG_ON(list_empty(list)); 474 VM_BUG_ON(list_empty(list));
425 page = list_entry(list->prev, struct page, lru); 475 page = list_entry(list->prev, struct page, lru);
426 /* have to delete it as __free_one_page list manipulates */ 476 /* have to delete it as __free_one_page list manipulates */
427 list_del(&page->lru); 477 list_del(&page->lru);
@@ -432,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count,
432 482
433static void free_one_page(struct zone *zone, struct page *page, int order) 483static void free_one_page(struct zone *zone, struct page *page, int order)
434{ 484{
435 LIST_HEAD(list); 485 spin_lock(&zone->lock);
436 list_add(&page->lru, &list); 486 zone->all_unreclaimable = 0;
437 free_pages_bulk(zone, 1, &list, order); 487 zone->pages_scanned = 0;
488 __free_one_page(page, zone ,order);
489 spin_unlock(&zone->lock);
438} 490}
439 491
440static void __free_pages_ok(struct page *page, unsigned int order) 492static void __free_pages_ok(struct page *page, unsigned int order)
@@ -512,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page,
512 area--; 564 area--;
513 high--; 565 high--;
514 size >>= 1; 566 size >>= 1;
515 BUG_ON(bad_range(zone, &page[size])); 567 VM_BUG_ON(bad_range(zone, &page[size]));
516 list_add(&page[size].lru, &area->free_list); 568 list_add(&page[size].lru, &area->free_list);
517 area->nr_free++; 569 area->nr_free++;
518 set_page_order(&page[size], high); 570 set_page_order(&page[size], high);
@@ -615,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
615#ifdef CONFIG_NUMA 667#ifdef CONFIG_NUMA
616/* 668/*
617 * Called from the slab reaper to drain pagesets on a particular node that 669 * Called from the slab reaper to drain pagesets on a particular node that
618 * belong to the currently executing processor. 670 * belongs to the currently executing processor.
619 * Note that this function must be called with the thread pinned to 671 * Note that this function must be called with the thread pinned to
620 * a single processor. 672 * a single processor.
621 */ 673 */
622void drain_node_pages(int nodeid) 674void drain_node_pages(int nodeid)
623{ 675{
624 int i, z; 676 int i;
677 enum zone_type z;
625 unsigned long flags; 678 unsigned long flags;
626 679
627 for (z = 0; z < MAX_NR_ZONES; z++) { 680 for (z = 0; z < MAX_NR_ZONES; z++) {
628 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 681 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
629 struct per_cpu_pageset *pset; 682 struct per_cpu_pageset *pset;
630 683
684 if (!populated_zone(zone))
685 continue;
686
631 pset = zone_pcp(zone, smp_processor_id()); 687 pset = zone_pcp(zone, smp_processor_id());
632 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 688 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
633 struct per_cpu_pages *pcp; 689 struct per_cpu_pages *pcp;
@@ -672,7 +728,8 @@ static void __drain_pages(unsigned int cpu)
672 728
673void mark_free_pages(struct zone *zone) 729void mark_free_pages(struct zone *zone)
674{ 730{
675 unsigned long zone_pfn, flags; 731 unsigned long pfn, max_zone_pfn;
732 unsigned long flags;
676 int order; 733 int order;
677 struct list_head *curr; 734 struct list_head *curr;
678 735
@@ -680,18 +737,25 @@ void mark_free_pages(struct zone *zone)
680 return; 737 return;
681 738
682 spin_lock_irqsave(&zone->lock, flags); 739 spin_lock_irqsave(&zone->lock, flags);
683 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 740
684 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 741 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
742 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
743 if (pfn_valid(pfn)) {
744 struct page *page = pfn_to_page(pfn);
745
746 if (!PageNosave(page))
747 ClearPageNosaveFree(page);
748 }
685 749
686 for (order = MAX_ORDER - 1; order >= 0; --order) 750 for (order = MAX_ORDER - 1; order >= 0; --order)
687 list_for_each(curr, &zone->free_area[order].free_list) { 751 list_for_each(curr, &zone->free_area[order].free_list) {
688 unsigned long start_pfn, i; 752 unsigned long i;
689 753
690 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 754 pfn = page_to_pfn(list_entry(curr, struct page, lru));
755 for (i = 0; i < (1UL << order); i++)
756 SetPageNosaveFree(pfn_to_page(pfn + i));
757 }
691 758
692 for (i=0; i < (1<<order); i++)
693 SetPageNosaveFree(pfn_to_page(start_pfn+i));
694 }
695 spin_unlock_irqrestore(&zone->lock, flags); 759 spin_unlock_irqrestore(&zone->lock, flags);
696} 760}
697 761
@@ -761,8 +825,8 @@ void split_page(struct page *page, unsigned int order)
761{ 825{
762 int i; 826 int i;
763 827
764 BUG_ON(PageCompound(page)); 828 VM_BUG_ON(PageCompound(page));
765 BUG_ON(!page_count(page)); 829 VM_BUG_ON(!page_count(page));
766 for (i = 1; i < (1 << order); i++) 830 for (i = 1; i < (1 << order); i++)
767 set_page_refcounted(page + i); 831 set_page_refcounted(page + i);
768} 832}
@@ -809,7 +873,7 @@ again:
809 local_irq_restore(flags); 873 local_irq_restore(flags);
810 put_cpu(); 874 put_cpu();
811 875
812 BUG_ON(bad_range(zone, page)); 876 VM_BUG_ON(bad_range(zone, page));
813 if (prep_new_page(page, order, gfp_flags)) 877 if (prep_new_page(page, order, gfp_flags))
814 goto again; 878 goto again;
815 return page; 879 return page;
@@ -870,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
870 struct zone **z = zonelist->zones; 934 struct zone **z = zonelist->zones;
871 struct page *page = NULL; 935 struct page *page = NULL;
872 int classzone_idx = zone_idx(*z); 936 int classzone_idx = zone_idx(*z);
937 struct zone *zone;
873 938
874 /* 939 /*
875 * Go through the zonelist once, looking for a zone with enough free. 940 * Go through the zonelist once, looking for a zone with enough free.
876 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 941 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
877 */ 942 */
878 do { 943 do {
944 zone = *z;
945 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
946 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
947 break;
879 if ((alloc_flags & ALLOC_CPUSET) && 948 if ((alloc_flags & ALLOC_CPUSET) &&
880 !cpuset_zone_allowed(*z, gfp_mask)) 949 !cpuset_zone_allowed(zone, gfp_mask))
881 continue; 950 continue;
882 951
883 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 952 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
884 unsigned long mark; 953 unsigned long mark;
885 if (alloc_flags & ALLOC_WMARK_MIN) 954 if (alloc_flags & ALLOC_WMARK_MIN)
886 mark = (*z)->pages_min; 955 mark = zone->pages_min;
887 else if (alloc_flags & ALLOC_WMARK_LOW) 956 else if (alloc_flags & ALLOC_WMARK_LOW)
888 mark = (*z)->pages_low; 957 mark = zone->pages_low;
889 else 958 else
890 mark = (*z)->pages_high; 959 mark = zone->pages_high;
891 if (!zone_watermark_ok(*z, order, mark, 960 if (!zone_watermark_ok(zone , order, mark,
892 classzone_idx, alloc_flags)) 961 classzone_idx, alloc_flags))
893 if (!zone_reclaim_mode || 962 if (!zone_reclaim_mode ||
894 !zone_reclaim(*z, gfp_mask, order)) 963 !zone_reclaim(zone, gfp_mask, order))
895 continue; 964 continue;
896 } 965 }
897 966
898 page = buffered_rmqueue(zonelist, *z, order, gfp_mask); 967 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
899 if (page) { 968 if (page) {
900 break; 969 break;
901 } 970 }
@@ -1083,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1083 * get_zeroed_page() returns a 32-bit address, which cannot represent 1152 * get_zeroed_page() returns a 32-bit address, which cannot represent
1084 * a highmem page 1153 * a highmem page
1085 */ 1154 */
1086 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1155 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1087 1156
1088 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1157 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1089 if (page) 1158 if (page)
@@ -1116,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages);
1116fastcall void free_pages(unsigned long addr, unsigned int order) 1185fastcall void free_pages(unsigned long addr, unsigned int order)
1117{ 1186{
1118 if (addr != 0) { 1187 if (addr != 0) {
1119 BUG_ON(!virt_addr_valid((void *)addr)); 1188 VM_BUG_ON(!virt_addr_valid((void *)addr));
1120 __free_pages(virt_to_page((void *)addr), order); 1189 __free_pages(virt_to_page((void *)addr), order);
1121 } 1190 }
1122} 1191}
@@ -1142,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages);
1142#ifdef CONFIG_NUMA 1211#ifdef CONFIG_NUMA
1143unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1212unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1144{ 1213{
1145 unsigned int i, sum = 0; 1214 unsigned int sum = 0;
1215 enum zone_type i;
1146 1216
1147 for (i = 0; i < MAX_NR_ZONES; i++) 1217 for (i = 0; i < MAX_NR_ZONES; i++)
1148 sum += pgdat->node_zones[i].free_pages; 1218 sum += pgdat->node_zones[i].free_pages;
@@ -1187,27 +1257,11 @@ unsigned int nr_free_pagecache_pages(void)
1187 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1257 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1188} 1258}
1189 1259
1190#ifdef CONFIG_HIGHMEM 1260static inline void show_node(struct zone *zone)
1191unsigned int nr_free_highpages (void)
1192{ 1261{
1193 pg_data_t *pgdat; 1262 if (NUMA_BUILD)
1194 unsigned int pages = 0; 1263 printk("Node %ld ", zone_to_nid(zone));
1195
1196 for_each_online_pgdat(pgdat)
1197 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1198
1199 return pages;
1200} 1264}
1201#endif
1202
1203#ifdef CONFIG_NUMA
1204static void show_node(struct zone *zone)
1205{
1206 printk("Node %d ", zone->zone_pgdat->node_id);
1207}
1208#else
1209#define show_node(zone) do { } while (0)
1210#endif
1211 1265
1212void si_meminfo(struct sysinfo *val) 1266void si_meminfo(struct sysinfo *val)
1213{ 1267{
@@ -1215,13 +1269,8 @@ void si_meminfo(struct sysinfo *val)
1215 val->sharedram = 0; 1269 val->sharedram = 0;
1216 val->freeram = nr_free_pages(); 1270 val->freeram = nr_free_pages();
1217 val->bufferram = nr_blockdev_pages(); 1271 val->bufferram = nr_blockdev_pages();
1218#ifdef CONFIG_HIGHMEM
1219 val->totalhigh = totalhigh_pages; 1272 val->totalhigh = totalhigh_pages;
1220 val->freehigh = nr_free_highpages(); 1273 val->freehigh = nr_free_highpages();
1221#else
1222 val->totalhigh = 0;
1223 val->freehigh = 0;
1224#endif
1225 val->mem_unit = PAGE_SIZE; 1274 val->mem_unit = PAGE_SIZE;
1226} 1275}
1227 1276
@@ -1234,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1234 1283
1235 val->totalram = pgdat->node_present_pages; 1284 val->totalram = pgdat->node_present_pages;
1236 val->freeram = nr_free_pages_pgdat(pgdat); 1285 val->freeram = nr_free_pages_pgdat(pgdat);
1286#ifdef CONFIG_HIGHMEM
1237 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1287 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1238 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1288 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1289#else
1290 val->totalhigh = 0;
1291 val->freehigh = 0;
1292#endif
1239 val->mem_unit = PAGE_SIZE; 1293 val->mem_unit = PAGE_SIZE;
1240} 1294}
1241#endif 1295#endif
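
The totalhigh/freehigh values that si_meminfo() fills in above surface in userspace through sysinfo(2); a small check program (plain glibc, nothing specific to this patch):

    #include <stdio.h>
    #include <sys/sysinfo.h>

    int main(void)
    {
            struct sysinfo si;

            if (sysinfo(&si))
                    return 1;
            printf("high total: %llu kB, high free: %llu kB\n",
                   (unsigned long long)si.totalhigh * si.mem_unit / 1024,
                   (unsigned long long)si.freehigh * si.mem_unit / 1024);
            return 0;
    }
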
@@ -1249,43 +1303,35 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1249 */ 1303 */
1250void show_free_areas(void) 1304void show_free_areas(void)
1251{ 1305{
1252 int cpu, temperature; 1306 int cpu;
1253 unsigned long active; 1307 unsigned long active;
1254 unsigned long inactive; 1308 unsigned long inactive;
1255 unsigned long free; 1309 unsigned long free;
1256 struct zone *zone; 1310 struct zone *zone;
1257 1311
1258 for_each_zone(zone) { 1312 for_each_zone(zone) {
1259 show_node(zone); 1313 if (!populated_zone(zone))
1260 printk("%s per-cpu:", zone->name);
1261
1262 if (!populated_zone(zone)) {
1263 printk(" empty\n");
1264 continue; 1314 continue;
1265 } else 1315
1266 printk("\n"); 1316 show_node(zone);
1317 printk("%s per-cpu:\n", zone->name);
1267 1318
1268 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1269 struct per_cpu_pageset *pageset; 1320 struct per_cpu_pageset *pageset;
1270 1321
1271 pageset = zone_pcp(zone, cpu); 1322 pageset = zone_pcp(zone, cpu);
1272 1323
1273 for (temperature = 0; temperature < 2; temperature++) 1324 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1274 printk("cpu %d %s: high %d, batch %d used:%d\n", 1325 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1275 cpu, 1326 cpu, pageset->pcp[0].high,
1276 temperature ? "cold" : "hot", 1327 pageset->pcp[0].batch, pageset->pcp[0].count,
1277 pageset->pcp[temperature].high, 1328 pageset->pcp[1].high, pageset->pcp[1].batch,
1278 pageset->pcp[temperature].batch, 1329 pageset->pcp[1].count);
1279 pageset->pcp[temperature].count);
1280 } 1330 }
1281 } 1331 }
1282 1332
1283 get_zone_counts(&active, &inactive, &free); 1333 get_zone_counts(&active, &inactive, &free);
1284 1334
1285 printk("Free pages: %11ukB (%ukB HighMem)\n",
1286 K(nr_free_pages()),
1287 K(nr_free_highpages()));
1288
1289 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1335 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1290 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1336 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1291 active, 1337 active,
@@ -1294,13 +1340,17 @@ void show_free_areas(void)
1294 global_page_state(NR_WRITEBACK), 1340 global_page_state(NR_WRITEBACK),
1295 global_page_state(NR_UNSTABLE_NFS), 1341 global_page_state(NR_UNSTABLE_NFS),
1296 nr_free_pages(), 1342 nr_free_pages(),
1297 global_page_state(NR_SLAB), 1343 global_page_state(NR_SLAB_RECLAIMABLE) +
1344 global_page_state(NR_SLAB_UNRECLAIMABLE),
1298 global_page_state(NR_FILE_MAPPED), 1345 global_page_state(NR_FILE_MAPPED),
1299 global_page_state(NR_PAGETABLE)); 1346 global_page_state(NR_PAGETABLE));
1300 1347
1301 for_each_zone(zone) { 1348 for_each_zone(zone) {
1302 int i; 1349 int i;
1303 1350
1351 if (!populated_zone(zone))
1352 continue;
1353
1304 show_node(zone); 1354 show_node(zone);
1305 printk("%s" 1355 printk("%s"
1306 " free:%lukB" 1356 " free:%lukB"
@@ -1333,12 +1383,11 @@ void show_free_areas(void)
1333 for_each_zone(zone) { 1383 for_each_zone(zone) {
1334 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1384 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1335 1385
1386 if (!populated_zone(zone))
1387 continue;
1388
1336 show_node(zone); 1389 show_node(zone);
1337 printk("%s: ", zone->name); 1390 printk("%s: ", zone->name);
1338 if (!populated_zone(zone)) {
1339 printk("empty\n");
1340 continue;
1341 }
1342 1391
1343 spin_lock_irqsave(&zone->lock, flags); 1392 spin_lock_irqsave(&zone->lock, flags);
1344 for (order = 0; order < MAX_ORDER; order++) { 1393 for (order = 0; order < MAX_ORDER; order++) {
@@ -1360,39 +1409,25 @@ void show_free_areas(void)
1360 * Add all populated zones of a node to the zonelist. 1409 * Add all populated zones of a node to the zonelist.
1361 */ 1410 */
1362static int __meminit build_zonelists_node(pg_data_t *pgdat, 1411static int __meminit build_zonelists_node(pg_data_t *pgdat,
1363 struct zonelist *zonelist, int nr_zones, int zone_type) 1412 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1364{ 1413{
1365 struct zone *zone; 1414 struct zone *zone;
1366 1415
1367 BUG_ON(zone_type > ZONE_HIGHMEM); 1416 BUG_ON(zone_type >= MAX_NR_ZONES);
1417 zone_type++;
1368 1418
1369 do { 1419 do {
1420 zone_type--;
1370 zone = pgdat->node_zones + zone_type; 1421 zone = pgdat->node_zones + zone_type;
1371 if (populated_zone(zone)) { 1422 if (populated_zone(zone)) {
1372#ifndef CONFIG_HIGHMEM
1373 BUG_ON(zone_type > ZONE_NORMAL);
1374#endif
1375 zonelist->zones[nr_zones++] = zone; 1423 zonelist->zones[nr_zones++] = zone;
1376 check_highest_zone(zone_type); 1424 check_highest_zone(zone_type);
1377 } 1425 }
1378 zone_type--;
1379 1426
1380 } while (zone_type >= 0); 1427 } while (zone_type);
1381 return nr_zones; 1428 return nr_zones;
1382} 1429}
1383 1430
1384static inline int highest_zone(int zone_bits)
1385{
1386 int res = ZONE_NORMAL;
1387 if (zone_bits & (__force int)__GFP_HIGHMEM)
1388 res = ZONE_HIGHMEM;
1389 if (zone_bits & (__force int)__GFP_DMA32)
1390 res = ZONE_DMA32;
1391 if (zone_bits & (__force int)__GFP_DMA)
1392 res = ZONE_DMA;
1393 return res;
1394}
1395
1396#ifdef CONFIG_NUMA 1431#ifdef CONFIG_NUMA
1397#define MAX_NODE_LOAD (num_online_nodes()) 1432#define MAX_NODE_LOAD (num_online_nodes())
1398static int __meminitdata node_load[MAX_NUMNODES]; 1433static int __meminitdata node_load[MAX_NUMNODES];
@@ -1458,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1458 1493
1459static void __meminit build_zonelists(pg_data_t *pgdat) 1494static void __meminit build_zonelists(pg_data_t *pgdat)
1460{ 1495{
1461 int i, j, k, node, local_node; 1496 int j, node, local_node;
1497 enum zone_type i;
1462 int prev_node, load; 1498 int prev_node, load;
1463 struct zonelist *zonelist; 1499 struct zonelist *zonelist;
1464 nodemask_t used_mask; 1500 nodemask_t used_mask;
1465 1501
1466 /* initialize zonelists */ 1502 /* initialize zonelists */
1467 for (i = 0; i < GFP_ZONETYPES; i++) { 1503 for (i = 0; i < MAX_NR_ZONES; i++) {
1468 zonelist = pgdat->node_zonelists + i; 1504 zonelist = pgdat->node_zonelists + i;
1469 zonelist->zones[0] = NULL; 1505 zonelist->zones[0] = NULL;
1470 } 1506 }
@@ -1494,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1494 node_load[node] += load; 1530 node_load[node] += load;
1495 prev_node = node; 1531 prev_node = node;
1496 load--; 1532 load--;
1497 for (i = 0; i < GFP_ZONETYPES; i++) { 1533 for (i = 0; i < MAX_NR_ZONES; i++) {
1498 zonelist = pgdat->node_zonelists + i; 1534 zonelist = pgdat->node_zonelists + i;
1499 for (j = 0; zonelist->zones[j] != NULL; j++); 1535 for (j = 0; zonelist->zones[j] != NULL; j++);
1500 1536
1501 k = highest_zone(i); 1537 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1502
1503 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1504 zonelist->zones[j] = NULL; 1538 zonelist->zones[j] = NULL;
1505 } 1539 }
1506 } 1540 }
@@ -1510,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1510 1544
1511static void __meminit build_zonelists(pg_data_t *pgdat) 1545static void __meminit build_zonelists(pg_data_t *pgdat)
1512{ 1546{
1513 int i, j, k, node, local_node; 1547 int node, local_node;
1548 enum zone_type i,j;
1514 1549
1515 local_node = pgdat->node_id; 1550 local_node = pgdat->node_id;
1516 for (i = 0; i < GFP_ZONETYPES; i++) { 1551 for (i = 0; i < MAX_NR_ZONES; i++) {
1517 struct zonelist *zonelist; 1552 struct zonelist *zonelist;
1518 1553
1519 zonelist = pgdat->node_zonelists + i; 1554 zonelist = pgdat->node_zonelists + i;
1520 1555
1521 j = 0; 1556 j = build_zonelists_node(pgdat, zonelist, 0, i);
1522 k = highest_zone(i);
1523 j = build_zonelists_node(pgdat, zonelist, j, k);
1524 /* 1557 /*
1525 * Now we build the zonelist so that it contains the zones 1558 * Now we build the zonelist so that it contains the zones
1526 * of all the other nodes. 1559 * of all the other nodes.
@@ -1532,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1532 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1565 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1533 if (!node_online(node)) 1566 if (!node_online(node))
1534 continue; 1567 continue;
1535 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1568 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1536 } 1569 }
1537 for (node = 0; node < local_node; node++) { 1570 for (node = 0; node < local_node; node++) {
1538 if (!node_online(node)) 1571 if (!node_online(node))
1539 continue; 1572 continue;
1540 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1573 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1541 } 1574 }
1542 1575
1543 zonelist->zones[j] = NULL; 1576 zonelist->zones[j] = NULL;
@@ -1558,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
1558void __meminit build_all_zonelists(void) 1591void __meminit build_all_zonelists(void)
1559{ 1592{
1560 if (system_state == SYSTEM_BOOTING) { 1593 if (system_state == SYSTEM_BOOTING) {
1561 __build_all_zonelists(0); 1594 __build_all_zonelists(NULL);
1562 cpuset_init_current_mems_allowed(); 1595 cpuset_init_current_mems_allowed();
1563 } else { 1596 } else {
1564 /* we have to stop all cpus to guarantee there is no user 1597 /* we have to stop all cpus to guarantee there is no user
@@ -1639,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
1639 1672
1640#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1673#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1641 1674
1642static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1643 unsigned long *zones_size, unsigned long *zholes_size)
1644{
1645 unsigned long realtotalpages, totalpages = 0;
1646 int i;
1647
1648 for (i = 0; i < MAX_NR_ZONES; i++)
1649 totalpages += zones_size[i];
1650 pgdat->node_spanned_pages = totalpages;
1651
1652 realtotalpages = totalpages;
1653 if (zholes_size)
1654 for (i = 0; i < MAX_NR_ZONES; i++)
1655 realtotalpages -= zholes_size[i];
1656 pgdat->node_present_pages = realtotalpages;
1657 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1658}
1659
1660
1661/* 1675/*
1662 * Initially all pages are reserved - free ones are freed 1676 * Initially all pages are reserved - free ones are freed
1663 * up by free_all_bootmem() once the early boot process is 1677 * up by free_all_bootmem() once the early boot process is
@@ -1698,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1698} 1712}
1699 1713
1700#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1714#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1701void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1715void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1702 unsigned long size) 1716 unsigned long pfn, unsigned long size)
1703{ 1717{
1704 unsigned long snum = pfn_to_section_nr(pfn); 1718 unsigned long snum = pfn_to_section_nr(pfn);
1705 unsigned long end = pfn_to_section_nr(pfn + size); 1719 unsigned long end = pfn_to_section_nr(pfn + size);
@@ -1815,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
1815 1829
1816 for_each_zone(zone) { 1830 for_each_zone(zone) {
1817 1831
1832 if (!populated_zone(zone))
1833 continue;
1834
1818 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 1835 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1819 GFP_KERNEL, cpu_to_node(cpu)); 1836 GFP_KERNEL, cpu_to_node(cpu));
1820 if (!zone_pcp(zone, cpu)) 1837 if (!zone_pcp(zone, cpu))
@@ -1845,8 +1862,10 @@ static inline void free_zone_pagesets(int cpu)
1845 for_each_zone(zone) { 1862 for_each_zone(zone) {
1846 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1863 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1847 1864
1865 /* Free per_cpu_pageset if it is slab allocated */
1866 if (pset != &boot_pageset[cpu])
1867 kfree(pset);
1848 zone_pcp(zone, cpu) = NULL; 1868 zone_pcp(zone, cpu) = NULL;
1849 kfree(pset);
1850 } 1869 }
1851} 1870}
1852 1871
@@ -1972,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1972 return 0; 1991 return 0;
1973} 1992}
1974 1993
1994#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
1995/*
1996 * Basic iterator support. Return the first range of PFNs for a node
1997 * Note: nid == MAX_NUMNODES returns first region regardless of node
1998 */
1999static int __init first_active_region_index_in_nid(int nid)
2000{
2001 int i;
2002
2003 for (i = 0; i < nr_nodemap_entries; i++)
2004 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2005 return i;
2006
2007 return -1;
2008}
2009
2010/*
2011 * Basic iterator support. Return the next active range of PFNs for a node
2012 * Note: nid == MAX_NUMNODES returns next region regardless of node
2013 */
2014static int __init next_active_region_index_in_nid(int index, int nid)
2015{
2016 for (index = index + 1; index < nr_nodemap_entries; index++)
2017 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2018 return index;
2019
2020 return -1;
2021}
2022
2023#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2024/*
2025 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2026 * Architectures may implement their own version but if add_active_range()
2027 * was used and there are no special requirements, this is a convenient
2028 * alternative
2029 */
2030int __init early_pfn_to_nid(unsigned long pfn)
2031{
2032 int i;
2033
2034 for (i = 0; i < nr_nodemap_entries; i++) {
2035 unsigned long start_pfn = early_node_map[i].start_pfn;
2036 unsigned long end_pfn = early_node_map[i].end_pfn;
2037
2038 if (start_pfn <= pfn && pfn < end_pfn)
2039 return early_node_map[i].nid;
2040 }
2041
2042 return 0;
2043}
2044#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2045
2046/* Basic iterator support to walk early_node_map[] */
2047#define for_each_active_range_index_in_nid(i, nid) \
2048 for (i = first_active_region_index_in_nid(nid); i != -1; \
2049 i = next_active_region_index_in_nid(i, nid))
2050
2051/**
2052 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2053 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
2054 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2055 *
2056 * If an architecture guarantees that all ranges registered with
2057 * add_active_ranges() contain no holes and may be freed,
2058 * this function may be used instead of calling free_bootmem() manually.
2059 */
2060void __init free_bootmem_with_active_regions(int nid,
2061 unsigned long max_low_pfn)
2062{
2063 int i;
2064
2065 for_each_active_range_index_in_nid(i, nid) {
2066 unsigned long size_pages = 0;
2067 unsigned long end_pfn = early_node_map[i].end_pfn;
2068
2069 if (early_node_map[i].start_pfn >= max_low_pfn)
2070 continue;
2071
2072 if (end_pfn > max_low_pfn)
2073 end_pfn = max_low_pfn;
2074
2075 size_pages = end_pfn - early_node_map[i].start_pfn;
2076 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2077 PFN_PHYS(early_node_map[i].start_pfn),
2078 size_pages << PAGE_SHIFT);
2079 }
2080}
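
A hedged sketch of how an architecture might use the helpers above during early boot; only add_active_range() and free_bootmem_with_active_regions() come from this patch (declared under CONFIG_ARCH_POPULATES_NODE_MAP), while the wrapper function, node id and PFN values are purely illustrative:

#include <linux/mm.h>
#include <linux/bootmem.h>

/* Hypothetical arch setup path: register the RAM that exists, then
 * hand every registered range below max_low_pfn back to bootmem. */
static void __init example_register_and_free(void)
{
	add_active_range(0, 0x100, 0x8000);	/* node 0: PFNs 0x100-0x8000 */
	free_bootmem_with_active_regions(MAX_NUMNODES, max_low_pfn);
}
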
2081
2082/**
2083 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2084 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
2085 *
2086 * If an architecture guarantees that all ranges registered with
2087 * add_active_ranges() contain no holes and may be freed,
2088 * this function may be used instead of calling memory_present() manually.
2089 */
2090void __init sparse_memory_present_with_active_regions(int nid)
2091{
2092 int i;
2093
2094 for_each_active_range_index_in_nid(i, nid)
2095 memory_present(early_node_map[i].nid,
2096 early_node_map[i].start_pfn,
2097 early_node_map[i].end_pfn);
2098}
2099
2100/**
2101 * push_node_boundaries - Push node boundaries to at least the requested boundary
2102 * @nid: The nid of the node to push the boundary for
2103 * @start_pfn: The start pfn of the node
2104 * @end_pfn: The end pfn of the node
2105 *
2106 * In reserve-based hot-add, mem_map is allocated for memory that remains
2107 * unused until hotadd time. Specifically, on x86_64, SRAT will report ranges
2108 * that can potentially be hotplugged even though no physical memory exists.
2109 * This function allows an arch to push out the node boundaries so that
2110 * mem_map covering the reserved area is allocated now and can be used later.
2111 */
2112#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2113void __init push_node_boundaries(unsigned int nid,
2114 unsigned long start_pfn, unsigned long end_pfn)
2115{
2116 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2117 nid, start_pfn, end_pfn);
2118
2119 /* Initialise the boundary for this node if necessary */
2120 if (node_boundary_end_pfn[nid] == 0)
2121 node_boundary_start_pfn[nid] = -1UL;
2122
2123 /* Update the boundaries */
2124 if (node_boundary_start_pfn[nid] > start_pfn)
2125 node_boundary_start_pfn[nid] = start_pfn;
2126 if (node_boundary_end_pfn[nid] < end_pfn)
2127 node_boundary_end_pfn[nid] = end_pfn;
2128}
2129
2130/* If necessary, push the node boundary out for reserve hotadd */
2131static void __init account_node_boundary(unsigned int nid,
2132 unsigned long *start_pfn, unsigned long *end_pfn)
2133{
2134 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2135 nid, *start_pfn, *end_pfn);
2136
2137 /* Return if boundary information has not been provided */
2138 if (node_boundary_end_pfn[nid] == 0)
2139 return;
2140
2141 /* Check the boundaries and update if necessary */
2142 if (node_boundary_start_pfn[nid] < *start_pfn)
2143 *start_pfn = node_boundary_start_pfn[nid];
2144 if (node_boundary_end_pfn[nid] > *end_pfn)
2145 *end_pfn = node_boundary_end_pfn[nid];
2146}
2147#else
2148void __init push_node_boundaries(unsigned int nid,
2149 unsigned long start_pfn, unsigned long end_pfn) {}
2150
2151static void __init account_node_boundary(unsigned int nid,
2152 unsigned long *start_pfn, unsigned long *end_pfn) {}
2153#endif
2154
2155
2156/**
2157 * get_pfn_range_for_nid - Return the start and end page frames for a node
2158 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
2159 * @start_pfn: Passed by reference. On return, it will have the node start_pfn
2160 * @end_pfn: Passed by reference. On return, it will have the node end_pfn
2161 *
2162 * It returns the start and end page frame of a node based on information
2163 * provided by an arch calling add_active_range(). If called for a node
2164 * with no available memory, a warning is printed and the start and end
2165 * PFNs will be 0
2166 */
2167void __init get_pfn_range_for_nid(unsigned int nid,
2168 unsigned long *start_pfn, unsigned long *end_pfn)
2169{
2170 int i;
2171 *start_pfn = -1UL;
2172 *end_pfn = 0;
2173
2174 for_each_active_range_index_in_nid(i, nid) {
2175 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2176 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2177 }
2178
2179 if (*start_pfn == -1UL) {
2180 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2181 *start_pfn = 0;
2182 }
2183
2184 /* Push the node boundaries out if requested */
2185 account_node_boundary(nid, start_pfn, end_pfn);
2186}
2187
2188/*
2189 * Return the number of pages a zone spans in a node, including holes
2190 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2191 */
2192unsigned long __init zone_spanned_pages_in_node(int nid,
2193 unsigned long zone_type,
2194 unsigned long *ignored)
2195{
2196 unsigned long node_start_pfn, node_end_pfn;
2197 unsigned long zone_start_pfn, zone_end_pfn;
2198
2199 /* Get the start and end of the node and zone */
2200 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2201 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2202 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2203
2204 /* Check that this node has pages within the zone's required range */
2205 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2206 return 0;
2207
2208 /* Move the zone boundaries inside the node if necessary */
2209 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2210 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2211
2212 /* Return the spanned pages */
2213 return zone_end_pfn - zone_start_pfn;
2214}
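
The clamping done by zone_spanned_pages_in_node() above can be checked with a small standalone program; all PFN values below are made up and the zone/node layout is only illustrative:

#include <stdio.h>

/* Intersect a zone's global PFN range with a node's PFN range and
 * report how many page frames the zone spans on that node. */
int main(void)
{
	unsigned long node_start_pfn = 0x1000, node_end_pfn = 0x9000;
	unsigned long zone_start_pfn = 0x0, zone_end_pfn = 0x4000;

	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) {
		printf("zone has no pages on this node\n");
		return 0;
	}

	if (zone_end_pfn > node_end_pfn)
		zone_end_pfn = node_end_pfn;
	if (zone_start_pfn < node_start_pfn)
		zone_start_pfn = node_start_pfn;

	/* 0x4000 - 0x1000 = 0x3000 spanned pages */
	printf("spanned pages: 0x%lx\n", zone_end_pfn - zone_start_pfn);
	return 0;
}
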
2215
2216/*
2217 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2218 * then all holes in the requested range will be accounted for
2219 */
2220unsigned long __init __absent_pages_in_range(int nid,
2221 unsigned long range_start_pfn,
2222 unsigned long range_end_pfn)
2223{
2224 int i = 0;
2225 unsigned long prev_end_pfn = 0, hole_pages = 0;
2226 unsigned long start_pfn;
2227
2228 /* Find the end_pfn of the first active range of pfns in the node */
2229 i = first_active_region_index_in_nid(nid);
2230 if (i == -1)
2231 return 0;
2232
2233 /* Account for ranges before physical memory on this node */
2234 if (early_node_map[i].start_pfn > range_start_pfn)
2235 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2236
2237 prev_end_pfn = early_node_map[i].start_pfn;
2238
2239 /* Find all holes for the zone within the node */
2240 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2241
2242 /* No need to continue if prev_end_pfn is outside the zone */
2243 if (prev_end_pfn >= range_end_pfn)
2244 break;
2245
2246 /* Make sure the end of the zone is not within the hole */
2247 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2248 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2249
2250 /* Update the hole size count and move on */
2251 if (start_pfn > range_start_pfn) {
2252 BUG_ON(prev_end_pfn > start_pfn);
2253 hole_pages += start_pfn - prev_end_pfn;
2254 }
2255 prev_end_pfn = early_node_map[i].end_pfn;
2256 }
2257
2258 /* Account for ranges past physical memory on this node */
2259 if (range_end_pfn > prev_end_pfn)
2260 hole_pages += range_end_pfn -
2261 max(range_start_pfn, prev_end_pfn);
2262
2263 return hole_pages;
2264}
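
A standalone sketch of the idea behind __absent_pages_in_range(): walk a sorted set of active ranges and sum every PFN in the requested window that none of them covers. This is a simplified reimplementation for illustration, not the kernel routine itself:

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* Sum the PFNs in [start, end) that no active range covers.
 * The ranges handed in must be sorted by start_pfn. */
static unsigned long holes(struct range *map, int n,
			   unsigned long start, unsigned long end)
{
	unsigned long prev_end = start, hole_pages = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long s = map[i].start_pfn < start ? start : map[i].start_pfn;
		unsigned long e = map[i].end_pfn > end ? end : map[i].end_pfn;

		if (e <= prev_end)
			continue;
		if (s > prev_end)
			hole_pages += s - prev_end;
		prev_end = e;
	}
	if (end > prev_end)
		hole_pages += end - prev_end;
	return hole_pages;
}

int main(void)
{
	struct range map[] = { { 0x100, 0x200 }, { 0x300, 0x400 } };

	/* holes before, between and after the two ranges: 3 * 0x100 */
	printf("holes: 0x%lx\n", holes(map, 2, 0x0, 0x500));
	return 0;
}
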
2265
2266/**
2267 * absent_pages_in_range - Return number of page frames in holes within a range
2268 * @start_pfn: The start PFN to start searching for holes
2269 * @end_pfn: The end PFN to stop searching for holes
2270 *
2271 * It returns the number of page frames in memory holes within a range
2272 */
2273unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2274 unsigned long end_pfn)
2275{
2276 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2277}
2278
2279/* Return the number of page frames in holes in a zone on a node */
2280unsigned long __init zone_absent_pages_in_node(int nid,
2281 unsigned long zone_type,
2282 unsigned long *ignored)
2283{
2284 unsigned long node_start_pfn, node_end_pfn;
2285 unsigned long zone_start_pfn, zone_end_pfn;
2286
2287 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2288 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2289 node_start_pfn);
2290 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2291 node_end_pfn);
2292
2293 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2294}
2295
2296/* Return the zone index a PFN is in */
2297int memmap_zone_idx(struct page *lmem_map)
2298{
2299 int i;
2300 unsigned long phys_addr = virt_to_phys(lmem_map);
2301 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2302
2303 for (i = 0; i < MAX_NR_ZONES; i++)
2304 if (pfn < arch_zone_highest_possible_pfn[i])
2305 break;
2306
2307 return i;
2308}
2309#else
2310static inline unsigned long zone_spanned_pages_in_node(int nid,
2311 unsigned long zone_type,
2312 unsigned long *zones_size)
2313{
2314 return zones_size[zone_type];
2315}
2316
2317static inline unsigned long zone_absent_pages_in_node(int nid,
2318 unsigned long zone_type,
2319 unsigned long *zholes_size)
2320{
2321 if (!zholes_size)
2322 return 0;
2323
2324 return zholes_size[zone_type];
2325}
2326
2327static inline int memmap_zone_idx(struct page *lmem_map)
2328{
2329 return MAX_NR_ZONES;
2330}
2331#endif
2332
2333static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2334 unsigned long *zones_size, unsigned long *zholes_size)
2335{
2336 unsigned long realtotalpages, totalpages = 0;
2337 enum zone_type i;
2338
2339 for (i = 0; i < MAX_NR_ZONES; i++)
2340 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2341 zones_size);
2342 pgdat->node_spanned_pages = totalpages;
2343
2344 realtotalpages = totalpages;
2345 for (i = 0; i < MAX_NR_ZONES; i++)
2346 realtotalpages -=
2347 zone_absent_pages_in_node(pgdat->node_id, i,
2348 zholes_size);
2349 pgdat->node_present_pages = realtotalpages;
2350 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2351 realtotalpages);
2352}
2353
1975/* 2354/*
1976 * Set up the zone data structures: 2355 * Set up the zone data structures:
1977 * - mark all pages reserved 2356 * - mark all pages reserved
@@ -1981,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1981static void __meminit free_area_init_core(struct pglist_data *pgdat, 2360static void __meminit free_area_init_core(struct pglist_data *pgdat,
1982 unsigned long *zones_size, unsigned long *zholes_size) 2361 unsigned long *zones_size, unsigned long *zholes_size)
1983{ 2362{
1984 unsigned long j; 2363 enum zone_type j;
1985 int nid = pgdat->node_id; 2364 int nid = pgdat->node_id;
1986 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2365 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1987 int ret; 2366 int ret;
@@ -1993,21 +2372,46 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
1993 2372
1994 for (j = 0; j < MAX_NR_ZONES; j++) { 2373 for (j = 0; j < MAX_NR_ZONES; j++) {
1995 struct zone *zone = pgdat->node_zones + j; 2374 struct zone *zone = pgdat->node_zones + j;
1996 unsigned long size, realsize; 2375 unsigned long size, realsize, memmap_pages;
1997 2376
1998 realsize = size = zones_size[j]; 2377 size = zone_spanned_pages_in_node(nid, j, zones_size);
1999 if (zholes_size) 2378 realsize = size - zone_absent_pages_in_node(nid, j,
2000 realsize -= zholes_size[j]; 2379 zholes_size);
2001 2380
2002 if (j < ZONE_HIGHMEM) 2381 /*
2382 * Adjust realsize so that it accounts for how much memory
2383 * is used by this zone for memmap. This affects the watermark
2384 * and per-cpu initialisations
2385 */
2386 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2387 if (realsize >= memmap_pages) {
2388 realsize -= memmap_pages;
2389 printk(KERN_DEBUG
2390 " %s zone: %lu pages used for memmap\n",
2391 zone_names[j], memmap_pages);
2392 } else
2393 printk(KERN_WARNING
2394 " %s zone: %lu pages exceeds realsize %lu\n",
2395 zone_names[j], memmap_pages, realsize);
2396
2397 /* Account for reserved DMA pages */
2398 if (j == ZONE_DMA && realsize > dma_reserve) {
2399 realsize -= dma_reserve;
2400 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2401 dma_reserve);
2402 }
2403
2404 if (!is_highmem_idx(j))
2003 nr_kernel_pages += realsize; 2405 nr_kernel_pages += realsize;
2004 nr_all_pages += realsize; 2406 nr_all_pages += realsize;
2005 2407
2006 zone->spanned_pages = size; 2408 zone->spanned_pages = size;
2007 zone->present_pages = realsize; 2409 zone->present_pages = realsize;
2008#ifdef CONFIG_NUMA 2410#ifdef CONFIG_NUMA
2009 zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) 2411 zone->node = nid;
2412 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2010 / 100; 2413 / 100;
2414 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2011#endif 2415#endif
2012 zone->name = zone_names[j]; 2416 zone->name = zone_names[j];
2013 spin_lock_init(&zone->lock); 2417 spin_lock_init(&zone->lock);
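
The memmap accounting added in the hunk above can be sanity-checked with simple arithmetic. The struct page size is configuration dependent, so the 32 bytes used below is only an assumed value for illustration:

#include <stdio.h>

/* Back-of-the-envelope check of the memmap adjustment: how many page
 * frames does the memmap for a zone of 'size' pages consume? */
int main(void)
{
	unsigned long size = 262144;			/* 1 GiB of 4 KiB pages */
	unsigned long sizeof_struct_page = 32;		/* assumed value */
	unsigned long page_shift = 12;

	unsigned long memmap_pages =
		(size * sizeof_struct_page) >> page_shift;

	/* prints 2048, i.e. 8 MiB of memmap for 1 GiB of memory */
	printf("%lu pages used for memmap\n", memmap_pages);
	return 0;
}
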
@@ -2067,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2067 /* 2471 /*
2068 * With no DISCONTIG, the global mem_map is just set as node 0's 2472 * With no DISCONTIG, the global mem_map is just set as node 0's
2069 */ 2473 */
2070 if (pgdat == NODE_DATA(0)) 2474 if (pgdat == NODE_DATA(0)) {
2071 mem_map = NODE_DATA(0)->node_mem_map; 2475 mem_map = NODE_DATA(0)->node_mem_map;
2476#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2477 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2478 mem_map -= pgdat->node_start_pfn;
2479#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2480 }
2072#endif 2481#endif
2073#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2482#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2074} 2483}
@@ -2079,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2079{ 2488{
2080 pgdat->node_id = nid; 2489 pgdat->node_id = nid;
2081 pgdat->node_start_pfn = node_start_pfn; 2490 pgdat->node_start_pfn = node_start_pfn;
2082 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2491 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2083 2492
2084 alloc_node_mem_map(pgdat); 2493 alloc_node_mem_map(pgdat);
2085 2494
2086 free_area_init_core(pgdat, zones_size, zholes_size); 2495 free_area_init_core(pgdat, zones_size, zholes_size);
2087} 2496}
2088 2497
2498#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2499/**
2500 * add_active_range - Register a range of PFNs backed by physical memory
2501 * @nid: The node ID the range resides on
2502 * @start_pfn: The start PFN of the available physical memory
2503 * @end_pfn: The end PFN of the available physical memory
2504 *
2505 * These ranges are stored in an early_node_map[] and later used by
2506 * free_area_init_nodes() to calculate zone sizes and holes. If the
2507 * range spans a memory hole, it is up to the architecture to ensure
2508 * the memory is not freed by the bootmem allocator. If possible
2509 * the range being registered will be merged with existing ranges.
2510 */
2511void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2512 unsigned long end_pfn)
2513{
2514 int i;
2515
2516 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2517 "%d entries of %d used\n",
2518 nid, start_pfn, end_pfn,
2519 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2520
2521 /* Merge with existing active regions if possible */
2522 for (i = 0; i < nr_nodemap_entries; i++) {
2523 if (early_node_map[i].nid != nid)
2524 continue;
2525
2526 /* Skip if an existing region covers this new one */
2527 if (start_pfn >= early_node_map[i].start_pfn &&
2528 end_pfn <= early_node_map[i].end_pfn)
2529 return;
2530
2531 /* Merge forward if suitable */
2532 if (start_pfn <= early_node_map[i].end_pfn &&
2533 end_pfn > early_node_map[i].end_pfn) {
2534 early_node_map[i].end_pfn = end_pfn;
2535 return;
2536 }
2537
2538 /* Merge backward if suitable */
2539 if (start_pfn < early_node_map[i].end_pfn &&
2540 end_pfn >= early_node_map[i].start_pfn) {
2541 early_node_map[i].start_pfn = start_pfn;
2542 return;
2543 }
2544 }
2545
2546 /* Check that early_node_map is large enough */
2547 if (i >= MAX_ACTIVE_REGIONS) {
2548 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2549 MAX_ACTIVE_REGIONS);
2550 return;
2551 }
2552
2553 early_node_map[i].nid = nid;
2554 early_node_map[i].start_pfn = start_pfn;
2555 early_node_map[i].end_pfn = end_pfn;
2556 nr_nodemap_entries = i + 1;
2557}
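
A hedged illustration of the merge behaviour of add_active_range(); the PFN values and the wrapper function are hypothetical, and the snippet assumes the declaration added by this series is visible:

#include <linux/mm.h>

/* The second registration overlaps the first, so early_node_map ends
 * up with a single entry for node 0 covering PFNs 0x100-0x300 rather
 * than two entries (the "merge forward" case in the loop above). */
static void __init example_overlapping_ranges(void)
{
	add_active_range(0, 0x100, 0x200);
	add_active_range(0, 0x180, 0x300);
}
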
2558
2559/**
2560 * shrink_active_range - Shrink an existing registered range of PFNs
2561 * @nid: The node id the range is on that should be shrunk
2562 * @old_end_pfn: The old end PFN of the range
2563 * @new_end_pfn: The new end PFN of the range
2564 *
2565 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
2566 * The map is kept at the end of the physical page range that has already been
2567 * registered with add_active_range(). This function allows an arch to shrink
2568 * an existing registered range.
2569 */
2570void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2571 unsigned long new_end_pfn)
2572{
2573 int i;
2574
2575 /* Find the old active region end and shrink */
2576 for_each_active_range_index_in_nid(i, nid)
2577 if (early_node_map[i].end_pfn == old_end_pfn) {
2578 early_node_map[i].end_pfn = new_end_pfn;
2579 break;
2580 }
2581}
2582
2583/**
2584 * remove_all_active_ranges - Remove all currently registered regions
2585 * During discovery, it may be found that a table like SRAT is invalid
2586 * and an alternative discovery method must be used. This function removes
2587 * all currently registered regions.
2588 */
2589void __init remove_all_active_ranges()
2590{
2591 memset(early_node_map, 0, sizeof(early_node_map));
2592 nr_nodemap_entries = 0;
2593#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2594 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2595 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2596#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2597}
2598
2599/* Compare two active node_active_regions */
2600static int __init cmp_node_active_region(const void *a, const void *b)
2601{
2602 struct node_active_region *arange = (struct node_active_region *)a;
2603 struct node_active_region *brange = (struct node_active_region *)b;
2604
2605 /* Done this way to avoid overflows */
2606 if (arange->start_pfn > brange->start_pfn)
2607 return 1;
2608 if (arange->start_pfn < brange->start_pfn)
2609 return -1;
2610
2611 return 0;
2612}
2613
2614/* sort the node_map by start_pfn */
2615static void __init sort_node_map(void)
2616{
2617 sort(early_node_map, (size_t)nr_nodemap_entries,
2618 sizeof(struct node_active_region),
2619 cmp_node_active_region, NULL);
2620}
2621
2622/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2623unsigned long __init find_min_pfn_for_node(unsigned long nid)
2624{
2625 int i;
2626
2627 /* Assuming a sorted map, the first range found has the starting pfn */
2628 for_each_active_range_index_in_nid(i, nid)
2629 return early_node_map[i].start_pfn;
2630
2631 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2632 return 0;
2633}
2634
2635/**
2636 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2637 *
2638 * It returns the minimum PFN based on information provided via
2639 * add_active_range()
2640 */
2641unsigned long __init find_min_pfn_with_active_regions(void)
2642{
2643 return find_min_pfn_for_node(MAX_NUMNODES);
2644}
2645
2646/**
2647 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2648 *
2649 * It returns the maximum PFN based on information provided via
2650 * add_active_range()
2651 */
2652unsigned long __init find_max_pfn_with_active_regions(void)
2653{
2654 int i;
2655 unsigned long max_pfn = 0;
2656
2657 for (i = 0; i < nr_nodemap_entries; i++)
2658 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2659
2660 return max_pfn;
2661}
2662
2663/**
2664 * free_area_init_nodes - Initialise all pg_data_t and zone data
2665 * @max_zone_pfn: an array of max PFNs, indexed by zone; for example,
2666 * max_zone_pfn[ZONE_DMA] is the maximum PFN usable for ZONE_DMA,
2667 * max_zone_pfn[ZONE_DMA32] the maximum PFN usable for ZONE_DMA32,
2668 * and so on for ZONE_NORMAL and ZONE_HIGHMEM
2669 *
2670 * This will call free_area_init_node() for each active node in the system.
2671 * Using the page ranges provided by add_active_range(), the size of each
2672 * zone in each node and its holes are calculated. If the maximum PFNs of
2673 * two adjacent zones match, the higher zone is assumed to be empty.
2674 * For example, if max_zone_pfn[ZONE_DMA] == max_zone_pfn[ZONE_DMA32], it is
2675 * assumed that ZONE_DMA32 has no pages. It is also assumed that a zone
2676 * starts where the previous one ended. For example, ZONE_DMA32 starts
2677 * at max_zone_pfn[ZONE_DMA].
2678 */
2679void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2680{
2681 unsigned long nid;
2682 enum zone_type i;
2683
2684 /* Record where the zone boundaries are */
2685 memset(arch_zone_lowest_possible_pfn, 0,
2686 sizeof(arch_zone_lowest_possible_pfn));
2687 memset(arch_zone_highest_possible_pfn, 0,
2688 sizeof(arch_zone_highest_possible_pfn));
2689 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2690 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2691 for (i = 1; i < MAX_NR_ZONES; i++) {
2692 arch_zone_lowest_possible_pfn[i] =
2693 arch_zone_highest_possible_pfn[i-1];
2694 arch_zone_highest_possible_pfn[i] =
2695 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2696 }
2697
2698 /* Regions in the early_node_map can be in any order */
2699 sort_node_map();
2700
2701 /* Print out the zone ranges */
2702 printk("Zone PFN ranges:\n");
2703 for (i = 0; i < MAX_NR_ZONES; i++)
2704 printk(" %-8s %8lu -> %8lu\n",
2705 zone_names[i],
2706 arch_zone_lowest_possible_pfn[i],
2707 arch_zone_highest_possible_pfn[i]);
2708
2709 /* Print out the early_node_map[] */
2710 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2711 for (i = 0; i < nr_nodemap_entries; i++)
2712 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2713 early_node_map[i].start_pfn,
2714 early_node_map[i].end_pfn);
2715
2716 /* Initialise every node */
2717 for_each_online_node(nid) {
2718 pg_data_t *pgdat = NODE_DATA(nid);
2719 free_area_init_node(nid, pgdat, NULL,
2720 find_min_pfn_for_node(nid), NULL);
2721 }
2722}
2723#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
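
A standalone illustration of how free_area_init_nodes() derives per-zone PFN boundaries from max_zone_pfn[]: each zone starts where the previous one ended, and a zone whose maximum does not exceed that start ends up empty. The zone list and PFN values below are made-up example numbers, not taken from any real machine:

#include <stdio.h>

int main(void)
{
	enum { DMA, DMA32, NORMAL, HIGHMEM, NR };
	const char *names[NR] = { "DMA", "DMA32", "Normal", "HighMem" };
	unsigned long max_zone_pfn[NR] = { 0x1000, 0x100000, 0x100000, 0x240000 };
	unsigned long lo[NR], hi[NR];
	int i;

	lo[0] = 0;	/* stands in for find_min_pfn_with_active_regions() */
	hi[0] = max_zone_pfn[0];
	for (i = 1; i < NR; i++) {
		lo[i] = hi[i - 1];
		hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
	}

	/* Normal comes out empty because its max equals DMA32's max */
	for (i = 0; i < NR; i++)
		printf("%-8s %8lx -> %8lx\n", names[i], lo[i], hi[i]);
	return 0;
}
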
2724
2725/**
2726 * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
2727 * @new_dma_reserve - The number of pages to mark reserved
2728 *
2729 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2730 * In the DMA zone, a significant percentage may be consumed by kernel image
2731 * and other unfreeable allocations which can skew the watermarks badly. This
2732 * function may optionally be used to account for unfreeable pages in
2733 * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
2734 */
2735void __init set_dma_reserve(unsigned long new_dma_reserve)
2736{
2737 dma_reserve = new_dma_reserve;
2738}
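
A hedged sketch of a caller of set_dma_reserve(); only set_dma_reserve() comes from this patch, the helper name and the way the page count is obtained are hypothetical. The call has to happen before the zones are initialised so that present_pages, and hence the watermarks and per-cpu batch sizes, reflect only freeable memory:

#include <linux/mm.h>

/* Hypothetical arch boot code: account the ZONE_DMA pages pinned by
 * the kernel image before free_area_init_node() runs. */
static void __init example_reserve_dma(unsigned long kernel_dma_pages)
{
	set_dma_reserve(kernel_dma_pages);
}
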
2739
2089#ifndef CONFIG_NEED_MULTIPLE_NODES 2740#ifndef CONFIG_NEED_MULTIPLE_NODES
2090static bootmem_data_t contig_bootmem_data; 2741static bootmem_data_t contig_bootmem_data;
2091struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2742struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -2129,7 +2780,7 @@ static void calculate_totalreserve_pages(void)
2129{ 2780{
2130 struct pglist_data *pgdat; 2781 struct pglist_data *pgdat;
2131 unsigned long reserve_pages = 0; 2782 unsigned long reserve_pages = 0;
2132 int i, j; 2783 enum zone_type i, j;
2133 2784
2134 for_each_online_pgdat(pgdat) { 2785 for_each_online_pgdat(pgdat) {
2135 for (i = 0; i < MAX_NR_ZONES; i++) { 2786 for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -2162,7 +2813,7 @@ static void calculate_totalreserve_pages(void)
2162static void setup_per_zone_lowmem_reserve(void) 2813static void setup_per_zone_lowmem_reserve(void)
2163{ 2814{
2164 struct pglist_data *pgdat; 2815 struct pglist_data *pgdat;
2165 int j, idx; 2816 enum zone_type j, idx;
2166 2817
2167 for_each_online_pgdat(pgdat) { 2818 for_each_online_pgdat(pgdat) {
2168 for (j = 0; j < MAX_NR_ZONES; j++) { 2819 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -2171,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void)
2171 2822
2172 zone->lowmem_reserve[j] = 0; 2823 zone->lowmem_reserve[j] = 0;
2173 2824
2174 for (idx = j-1; idx >= 0; idx--) { 2825 idx = j;
2826 while (idx) {
2175 struct zone *lower_zone; 2827 struct zone *lower_zone;
2176 2828
2829 idx--;
2830
2177 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2831 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2178 sysctl_lowmem_reserve_ratio[idx] = 1; 2832 sysctl_lowmem_reserve_ratio[idx] = 1;
2179 2833
@@ -2314,10 +2968,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
2314 return rc; 2968 return rc;
2315 2969
2316 for_each_zone(zone) 2970 for_each_zone(zone)
2317 zone->min_unmapped_ratio = (zone->present_pages * 2971 zone->min_unmapped_pages = (zone->present_pages *
2318 sysctl_min_unmapped_ratio) / 100; 2972 sysctl_min_unmapped_ratio) / 100;
2319 return 0; 2973 return 0;
2320} 2974}
2975
2976int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
2977 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2978{
2979 struct zone *zone;
2980 int rc;
2981
2982 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2983 if (rc)
2984 return rc;
2985
2986 for_each_zone(zone)
2987 zone->min_slab_pages = (zone->present_pages *
2988 sysctl_min_slab_ratio) / 100;
2989 return 0;
2990}
2321#endif 2991#endif
2322 2992
2323/* 2993/*
diff --git a/mm/page_io.c b/mm/page_io.c
index 88029948d00a..d4840ecbf8f9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
52 if (bio->bi_size) 52 if (bio->bi_size)
53 return 1; 53 return 1;
54 54
55 if (!uptodate) 55 if (!uptodate) {
56 SetPageError(page); 56 SetPageError(page);
57 /*
58 * We failed to write the page out to swap-space.
59 * Re-dirty the page in order to avoid it being reclaimed.
60 * Also print a dire warning that things will go BAD (tm)
61 * very quickly.
62 *
63 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
64 */
65 set_page_dirty(page);
66 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
67 imajor(bio->bi_bdev->bd_inode),
68 iminor(bio->bi_bdev->bd_inode),
69 (unsigned long long)bio->bi_sector);
70 ClearPageReclaim(page);
71 }
57 end_page_writeback(page); 72 end_page_writeback(page);
58 bio_put(bio); 73 bio_put(bio);
59 return 0; 74 return 0;
60} 75}
61 76
62static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) 77int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
63{ 78{
64 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 79 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
65 struct page *page = bio->bi_io_vec[0].bv_page; 80 struct page *page = bio->bi_io_vec[0].bv_page;
@@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
70 if (!uptodate) { 85 if (!uptodate) {
71 SetPageError(page); 86 SetPageError(page);
72 ClearPageUptodate(page); 87 ClearPageUptodate(page);
88 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
89 imajor(bio->bi_bdev->bd_inode),
90 iminor(bio->bi_bdev->bd_inode),
91 (unsigned long long)bio->bi_sector);
73 } else { 92 } else {
74 SetPageUptodate(page); 93 SetPageUptodate(page);
75 } 94 }
@@ -137,10 +156,12 @@ out:
137 * We use end_swap_bio_read() even for writes, because it happens to do what 156 * We use end_swap_bio_read() even for writes, because it happens to do what
138 * we want. 157 * we want.
139 */ 158 */
140int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) 159int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
160 struct bio **bio_chain)
141{ 161{
142 struct bio *bio; 162 struct bio *bio;
143 int ret = 0; 163 int ret = 0;
164 int bio_rw;
144 165
145 lock_page(page); 166 lock_page(page);
146 167
@@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
151 goto out; 172 goto out;
152 } 173 }
153 174
154 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 175 bio_rw = rw;
155 wait_on_page_locked(page); 176 if (!bio_chain)
156 177 bio_rw |= (1 << BIO_RW_SYNC);
157 if (!PageUptodate(page) || PageError(page)) 178 if (bio_chain)
158 ret = -EIO; 179 bio_get(bio);
180 submit_bio(bio_rw, bio);
181 if (bio_chain == NULL) {
182 wait_on_page_locked(page);
183
184 if (!PageUptodate(page) || PageError(page))
185 ret = -EIO;
186 }
187 if (bio_chain) {
188 bio->bi_private = *bio_chain;
189 *bio_chain = bio;
190 }
159out: 191out:
160 return ret; 192 return ret;
161} 193}
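
A hedged sketch of the asynchronous mode added above: with a non-NULL bio_chain the bios are submitted without BIO_RW_SYNC and threaded through bi_private, so a caller such as swsusp can queue many pages and wait once at the end. The wait loop below is illustrative, not the in-tree helper, and omits per-page error checking via PageError():

#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>

static int example_write_swap_pages(swp_entry_t *entries,
				    struct page **pages, int n)
{
	struct bio *bio, *chain = NULL;
	int i, err, ret = 0;

	for (i = 0; i < n; i++) {
		err = rw_swap_page_sync(WRITE, entries[i], pages[i], &chain);
		if (err)
			ret = err;
	}

	/* each bio holds a reference and points at the previous one */
	while ((bio = chain) != NULL) {
		chain = bio->bi_private;
		wait_on_page_locked(bio->bi_io_vec[0].bv_page);
		bio_put(bio);
	}
	return ret;
}
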
diff --git a/mm/rmap.c b/mm/rmap.c
index 40158b59729e..e2155d791d99 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
434 return referenced; 434 return referenced;
435} 435}
436 436
437static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
438{
439 struct mm_struct *mm = vma->vm_mm;
440 unsigned long address;
441 pte_t *pte, entry;
442 spinlock_t *ptl;
443 int ret = 0;
444
445 address = vma_address(page, vma);
446 if (address == -EFAULT)
447 goto out;
448
449 pte = page_check_address(page, mm, address, &ptl);
450 if (!pte)
451 goto out;
452
453 if (!pte_dirty(*pte) && !pte_write(*pte))
454 goto unlock;
455
456 entry = ptep_get_and_clear(mm, address, pte);
457 entry = pte_mkclean(entry);
458 entry = pte_wrprotect(entry);
459 ptep_establish(vma, address, pte, entry);
460 lazy_mmu_prot_update(entry);
461 ret = 1;
462
463unlock:
464 pte_unmap_unlock(pte, ptl);
465out:
466 return ret;
467}
468
469static int page_mkclean_file(struct address_space *mapping, struct page *page)
470{
471 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
472 struct vm_area_struct *vma;
473 struct prio_tree_iter iter;
474 int ret = 0;
475
476 BUG_ON(PageAnon(page));
477
478 spin_lock(&mapping->i_mmap_lock);
479 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
480 if (vma->vm_flags & VM_SHARED)
481 ret += page_mkclean_one(page, vma);
482 }
483 spin_unlock(&mapping->i_mmap_lock);
484 return ret;
485}
486
487int page_mkclean(struct page *page)
488{
489 int ret = 0;
490
491 BUG_ON(!PageLocked(page));
492
493 if (page_mapped(page)) {
494 struct address_space *mapping = page_mapping(page);
495 if (mapping)
496 ret = page_mkclean_file(mapping, page);
497 }
498
499 return ret;
500}
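
A hedged sketch of a caller of the new page_mkclean(); the helper below only shows the calling convention (a locked, mapped file page) and is illustrative, not the dirty-tracking code that will eventually use it:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Write-protect and clean every shared file mapping of a locked page
 * so that a later store faults and re-dirties it. Returns the number
 * of ptes that were actually cleaned. */
static int example_mkclean(struct page *page)
{
	BUG_ON(!PageLocked(page));
	return page_mkclean(page);
}
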
501
437/** 502/**
438 * page_set_anon_rmap - setup new anonymous rmap 503 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to 504 * @page: the page to add the mapping to
diff --git a/mm/shmem.c b/mm/shmem.c
index db21c51531ca..eda907c3a86a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
45#include <linux/namei.h> 45#include <linux/namei.h>
46#include <linux/ctype.h> 46#include <linux/ctype.h>
47#include <linux/migrate.h> 47#include <linux/migrate.h>
48#include <linux/highmem.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50#include <asm/div64.h> 51#include <asm/div64.h>
@@ -1350,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1350 inode->i_mode = mode; 1351 inode->i_mode = mode;
1351 inode->i_uid = current->fsuid; 1352 inode->i_uid = current->fsuid;
1352 inode->i_gid = current->fsgid; 1353 inode->i_gid = current->fsgid;
1353 inode->i_blksize = PAGE_CACHE_SIZE;
1354 inode->i_blocks = 0; 1354 inode->i_blocks = 0;
1355 inode->i_mapping->a_ops = &shmem_aops; 1355 inode->i_mapping->a_ops = &shmem_aops;
1356 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1356 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -2156,8 +2156,7 @@ static int init_inodecache(void)
2156 2156
2157static void destroy_inodecache(void) 2157static void destroy_inodecache(void)
2158{ 2158{
2159 if (kmem_cache_destroy(shmem_inode_cachep)) 2159 kmem_cache_destroy(shmem_inode_cachep);
2160 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2161} 2160}
2162 2161
2163static const struct address_space_operations shmem_aops = { 2162static const struct address_space_operations shmem_aops = {
diff --git a/mm/slab.c b/mm/slab.c
index 21ba06035700..792bfe320a8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree); 313 struct kmem_list3 *l3, int tofree);
314static void free_block(struct kmem_cache *cachep, void **objpp, int len, 314static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node); 315 int node);
316static void enable_cpucache(struct kmem_cache *cachep); 316static int enable_cpucache(struct kmem_cache *cachep);
317static void cache_reap(void *unused); 317static void cache_reap(void *unused);
318 318
319/* 319/*
@@ -674,6 +674,8 @@ static struct kmem_cache cache_cache = {
674#endif 674#endif
675}; 675};
676 676
677#define BAD_ALIEN_MAGIC 0x01020304ul
678
677#ifdef CONFIG_LOCKDEP 679#ifdef CONFIG_LOCKDEP
678 680
679/* 681/*
@@ -682,42 +684,58 @@ static struct kmem_cache cache_cache = {
682 * The locking for this is tricky in that it nests within the locks 684 * The locking for this is tricky in that it nests within the locks
683 * of all other slabs in a few places; to deal with this special 685 * of all other slabs in a few places; to deal with this special
684 * locking we put on-slab caches into a separate lock-class. 686 * locking we put on-slab caches into a separate lock-class.
687 *
688 * We set lock class for alien array caches which are up during init.
689 * The lock annotation will be lost if all cpus of a node go down and
690 * then come back up during hotplug
685 */ 691 */
686static struct lock_class_key on_slab_key; 692static struct lock_class_key on_slab_l3_key;
693static struct lock_class_key on_slab_alc_key;
694
695static inline void init_lock_keys(void)
687 696
688static inline void init_lock_keys(struct cache_sizes *s)
689{ 697{
690 int q; 698 int q;
691 699 struct cache_sizes *s = malloc_sizes;
692 for (q = 0; q < MAX_NUMNODES; q++) { 700
693 if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) 701 while (s->cs_size != ULONG_MAX) {
694 continue; 702 for_each_node(q) {
695 lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, 703 struct array_cache **alc;
696 &on_slab_key); 704 int r;
705 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
706 if (!l3 || OFF_SLAB(s->cs_cachep))
707 continue;
708 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
709 alc = l3->alien;
710 /*
711 * FIXME: This check for BAD_ALIEN_MAGIC
712 * should go away when common slab code is taught to
713 * work even without alien caches.
714 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
715 * for alloc_alien_cache.
716 */
717 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
718 continue;
719 for_each_node(r) {
720 if (alc[r])
721 lockdep_set_class(&alc[r]->lock,
722 &on_slab_alc_key);
723 }
724 }
725 s++;
697 } 726 }
698} 727}
699
700#else 728#else
701static inline void init_lock_keys(struct cache_sizes *s) 729static inline void init_lock_keys(void)
702{ 730{
703} 731}
704#endif 732#endif
705 733
706
707
708/* Guard access to the cache-chain. */ 734/* Guard access to the cache-chain. */
709static DEFINE_MUTEX(cache_chain_mutex); 735static DEFINE_MUTEX(cache_chain_mutex);
710static struct list_head cache_chain; 736static struct list_head cache_chain;
711 737
712/* 738/*
713 * vm_enough_memory() looks at this to determine how many slab-allocated pages
714 * are possibly freeable under pressure
715 *
716 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
717 */
718atomic_t slab_reclaim_pages;
719
720/*
721 * chicken and egg problem: delay the per-cpu array allocation 739 * chicken and egg problem: delay the per-cpu array allocation
722 * until the general caches are up. 740 * until the general caches are up.
723 */ 741 */
@@ -768,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
768 return csizep->cs_cachep; 786 return csizep->cs_cachep;
769} 787}
770 788
771struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 789static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
772{ 790{
773 return __find_general_cachep(size, gfpflags); 791 return __find_general_cachep(size, gfpflags);
774} 792}
775EXPORT_SYMBOL(kmem_find_general_cachep);
776 793
777static size_t slab_mgmt_size(size_t nr_objs, size_t align) 794static size_t slab_mgmt_size(size_t nr_objs, size_t align)
778{ 795{
@@ -955,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
955 return nr; 972 return nr;
956} 973}
957 974
958#ifdef CONFIG_NUMA 975#ifndef CONFIG_NUMA
976
977#define drain_alien_cache(cachep, alien) do { } while (0)
978#define reap_alien(cachep, l3) do { } while (0)
979
980static inline struct array_cache **alloc_alien_cache(int node, int limit)
981{
982 return (struct array_cache **)BAD_ALIEN_MAGIC;
983}
984
985static inline void free_alien_cache(struct array_cache **ac_ptr)
986{
987}
988
989static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
990{
991 return 0;
992}
993
994static inline void *alternate_node_alloc(struct kmem_cache *cachep,
995 gfp_t flags)
996{
997 return NULL;
998}
999
1000static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1001 gfp_t flags, int nodeid)
1002{
1003 return NULL;
1004}
1005
1006#else /* CONFIG_NUMA */
1007
959static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1008static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
960static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1009static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
961 1010
@@ -1084,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1084 } 1133 }
1085 return 1; 1134 return 1;
1086} 1135}
1087
1088#else
1089
1090#define drain_alien_cache(cachep, alien) do { } while (0)
1091#define reap_alien(cachep, l3) do { } while (0)
1092
1093static inline struct array_cache **alloc_alien_cache(int node, int limit)
1094{
1095 return (struct array_cache **) 0x01020304ul;
1096}
1097
1098static inline void free_alien_cache(struct array_cache **ac_ptr)
1099{
1100}
1101
1102static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1103{
1104 return 0;
1105}
1106
1107#endif 1136#endif
1108 1137
1109static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1138static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1422,7 +1451,6 @@ void __init kmem_cache_init(void)
1422 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1451 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1423 NULL, NULL); 1452 NULL, NULL);
1424 } 1453 }
1425 init_lock_keys(sizes);
1426 1454
1427 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1455 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1428 sizes->cs_size, 1456 sizes->cs_size,
@@ -1491,10 +1519,15 @@ void __init kmem_cache_init(void)
1491 struct kmem_cache *cachep; 1519 struct kmem_cache *cachep;
1492 mutex_lock(&cache_chain_mutex); 1520 mutex_lock(&cache_chain_mutex);
1493 list_for_each_entry(cachep, &cache_chain, next) 1521 list_for_each_entry(cachep, &cache_chain, next)
1494 enable_cpucache(cachep); 1522 if (enable_cpucache(cachep))
1523 BUG();
1495 mutex_unlock(&cache_chain_mutex); 1524 mutex_unlock(&cache_chain_mutex);
1496 } 1525 }
1497 1526
1527 /* Annotate slab for lockdep -- annotate the malloc caches */
1528 init_lock_keys();
1529
1530
1498 /* Done! */ 1531 /* Done! */
1499 g_cpucache_up = FULL; 1532 g_cpucache_up = FULL;
1500 1533
@@ -1543,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1543 */ 1576 */
1544 flags |= __GFP_COMP; 1577 flags |= __GFP_COMP;
1545#endif 1578#endif
1546 flags |= cachep->gfpflags; 1579
1580 /*
1581 * Under NUMA we want memory on the indicated node. We will handle
1582 * the needed fallback ourselves since we want to serve from our
1583 * per node object lists first for other nodes.
1584 */
1585 flags |= cachep->gfpflags | GFP_THISNODE;
1547 1586
1548 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1587 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1549 if (!page) 1588 if (!page)
@@ -1551,8 +1590,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1551 1590
1552 nr_pages = (1 << cachep->gfporder); 1591 nr_pages = (1 << cachep->gfporder);
1553 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1592 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1554 atomic_add(nr_pages, &slab_reclaim_pages); 1593 add_zone_page_state(page_zone(page),
1555 add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); 1594 NR_SLAB_RECLAIMABLE, nr_pages);
1595 else
1596 add_zone_page_state(page_zone(page),
1597 NR_SLAB_UNRECLAIMABLE, nr_pages);
1556 for (i = 0; i < nr_pages; i++) 1598 for (i = 0; i < nr_pages; i++)
1557 __SetPageSlab(page + i); 1599 __SetPageSlab(page + i);
1558 return page_address(page); 1600 return page_address(page);
@@ -1567,7 +1609,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1567 struct page *page = virt_to_page(addr); 1609 struct page *page = virt_to_page(addr);
1568 const unsigned long nr_freed = i; 1610 const unsigned long nr_freed = i;
1569 1611
1570 sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); 1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1613 sub_zone_page_state(page_zone(page),
1614 NR_SLAB_RECLAIMABLE, nr_freed);
1615 else
1616 sub_zone_page_state(page_zone(page),
1617 NR_SLAB_UNRECLAIMABLE, nr_freed);
1571 while (i--) { 1618 while (i--) {
1572 BUG_ON(!PageSlab(page)); 1619 BUG_ON(!PageSlab(page));
1573 __ClearPageSlab(page); 1620 __ClearPageSlab(page);
@@ -1576,8 +1623,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1576 if (current->reclaim_state) 1623 if (current->reclaim_state)
1577 current->reclaim_state->reclaimed_slab += nr_freed; 1624 current->reclaim_state->reclaimed_slab += nr_freed;
1578 free_pages((unsigned long)addr, cachep->gfporder); 1625 free_pages((unsigned long)addr, cachep->gfporder);
1579 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1580 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1581} 1626}
1582 1627
1583static void kmem_rcu_free(struct rcu_head *head) 1628static void kmem_rcu_free(struct rcu_head *head)
@@ -1834,6 +1879,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1834 } 1879 }
1835} 1880}
1836 1881
1882static void __kmem_cache_destroy(struct kmem_cache *cachep)
1883{
1884 int i;
1885 struct kmem_list3 *l3;
1886
1887 for_each_online_cpu(i)
1888 kfree(cachep->array[i]);
1889
1890 /* NUMA: free the list3 structures */
1891 for_each_online_node(i) {
1892 l3 = cachep->nodelists[i];
1893 if (l3) {
1894 kfree(l3->shared);
1895 free_alien_cache(l3->alien);
1896 kfree(l3);
1897 }
1898 }
1899 kmem_cache_free(&cache_cache, cachep);
1900}
1901
1902
1837/** 1903/**
1838 * calculate_slab_order - calculate size (page order) of slabs 1904 * calculate_slab_order - calculate size (page order) of slabs
1839 * @cachep: pointer to the cache that is being created 1905 * @cachep: pointer to the cache that is being created
@@ -1904,12 +1970,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1904 return left_over; 1970 return left_over;
1905} 1971}
1906 1972
1907static void setup_cpu_cache(struct kmem_cache *cachep) 1973static int setup_cpu_cache(struct kmem_cache *cachep)
1908{ 1974{
1909 if (g_cpucache_up == FULL) { 1975 if (g_cpucache_up == FULL)
1910 enable_cpucache(cachep); 1976 return enable_cpucache(cachep);
1911 return; 1977
1912 }
1913 if (g_cpucache_up == NONE) { 1978 if (g_cpucache_up == NONE) {
1914 /* 1979 /*
1915 * Note: the first kmem_cache_create must create the cache 1980 * Note: the first kmem_cache_create must create the cache
@@ -1956,6 +2021,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
1956 cpu_cache_get(cachep)->touched = 0; 2021 cpu_cache_get(cachep)->touched = 0;
1957 cachep->batchcount = 1; 2022 cachep->batchcount = 1;
1958 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2023 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2024 return 0;
1959} 2025}
1960 2026
1961/** 2027/**
@@ -2097,6 +2163,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2097 } else { 2163 } else {
2098 ralign = BYTES_PER_WORD; 2164 ralign = BYTES_PER_WORD;
2099 } 2165 }
2166
2167 /*
2168 * Redzoning and user store require word alignment. Note this will be
2169 * overridden by architecture or caller mandated alignment if either
2170 * is greater than BYTES_PER_WORD.
2171 */
2172 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2173 ralign = BYTES_PER_WORD;
2174
2100 /* 2) arch mandated alignment: disables debug if necessary */ 2175 /* 2) arch mandated alignment: disables debug if necessary */
2101 if (ralign < ARCH_SLAB_MINALIGN) { 2176 if (ralign < ARCH_SLAB_MINALIGN) {
2102 ralign = ARCH_SLAB_MINALIGN; 2177 ralign = ARCH_SLAB_MINALIGN;
@@ -2110,8 +2185,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2110 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2185 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2111 } 2186 }
2112 /* 2187 /*
2113 * 4) Store it. Note that the debug code below can reduce 2188 * 4) Store it.
2114 * the alignment to BYTES_PER_WORD.
2115 */ 2189 */
2116 align = ralign; 2190 align = ralign;
2117 2191
@@ -2123,20 +2197,19 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2123#if DEBUG 2197#if DEBUG
2124 cachep->obj_size = size; 2198 cachep->obj_size = size;
2125 2199
2200 /*
2201 * Both debugging options require word-alignment which is calculated
2202 * into align above.
2203 */
2126 if (flags & SLAB_RED_ZONE) { 2204 if (flags & SLAB_RED_ZONE) {
2127 /* redzoning only works with word aligned caches */
2128 align = BYTES_PER_WORD;
2129
2130 /* add space for red zone words */ 2205 /* add space for red zone words */
2131 cachep->obj_offset += BYTES_PER_WORD; 2206 cachep->obj_offset += BYTES_PER_WORD;
2132 size += 2 * BYTES_PER_WORD; 2207 size += 2 * BYTES_PER_WORD;
2133 } 2208 }
2134 if (flags & SLAB_STORE_USER) { 2209 if (flags & SLAB_STORE_USER) {
2135 /* user store requires word alignment and 2210 /* user store requires one word storage behind the end of
2136 * one word storage behind the end of the real 2211 * the real object.
2137 * object.
2138 */ 2212 */
2139 align = BYTES_PER_WORD;
2140 size += BYTES_PER_WORD; 2213 size += BYTES_PER_WORD;
2141 } 2214 }
2142#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2215#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
@@ -2200,14 +2273,26 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2200 cachep->gfpflags |= GFP_DMA; 2273 cachep->gfpflags |= GFP_DMA;
2201 cachep->buffer_size = size; 2274 cachep->buffer_size = size;
2202 2275
2203 if (flags & CFLGS_OFF_SLAB) 2276 if (flags & CFLGS_OFF_SLAB) {
2204 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2277 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2278 /*
2279 * This is a possibility for one of the malloc_sizes caches.
2280 * But since we go off slab only for object size greater than
2281 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2282 * this should not happen at all.
2283 * But leave a BUG_ON for some lucky dude.
2284 */
2285 BUG_ON(!cachep->slabp_cache);
2286 }
2205 cachep->ctor = ctor; 2287 cachep->ctor = ctor;
2206 cachep->dtor = dtor; 2288 cachep->dtor = dtor;
2207 cachep->name = name; 2289 cachep->name = name;
2208 2290
2209 2291 if (setup_cpu_cache(cachep)) {
2210 setup_cpu_cache(cachep); 2292 __kmem_cache_destroy(cachep);
2293 cachep = NULL;
2294 goto oops;
2295 }
2211 2296
2212 /* cache setup completed, link it into the list */ 2297 /* cache setup completed, link it into the list */
2213 list_add(&cachep->next, &cache_chain); 2298 list_add(&cachep->next, &cache_chain);
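With setup_cpu_cache() now returning an error code, kmem_cache_create() can fail cleanly when the per-cpu array caches cannot be allocated: the half-built cache is torn down with __kmem_cache_destroy() and the caller sees NULL. Callers already have to treat NULL as failure; a minimal caller-side sketch (the module, my_cache and struct my_obj are illustrative, not part of this patch):

	#include <linux/module.h>
	#include <linux/slab.h>

	struct my_obj {
		int a, b;
	};

	static struct kmem_cache *my_cache;

	static int __init my_init(void)
	{
		/* creation may now also fail because the cpu caches could not be set up */
		my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
					     0, 0, NULL, NULL);
		if (!my_cache)
			return -ENOMEM;
		return 0;
	}
	module_init(my_init);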
@@ -2375,7 +2460,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2375 * @cachep: the cache to destroy 2460 * @cachep: the cache to destroy
2376 * 2461 *
2377 * Remove a struct kmem_cache object from the slab cache. 2462 * Remove a struct kmem_cache object from the slab cache.
2378 * Returns 0 on success.
2379 * 2463 *
2380 * It is expected this function will be called by a module when it is 2464 * It is expected this function will be called by a module when it is
2381 * unloaded. This will remove the cache completely, and avoid a duplicate 2465 * unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2387,11 +2471,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2387 * The caller must guarantee that no one will allocate memory from the cache 2471
2388 * during the kmem_cache_destroy(). 2472 * during the kmem_cache_destroy().
2389 */ 2473 */
2390int kmem_cache_destroy(struct kmem_cache *cachep) 2474void kmem_cache_destroy(struct kmem_cache *cachep)
2391{ 2475{
2392 int i;
2393 struct kmem_list3 *l3;
2394
2395 BUG_ON(!cachep || in_interrupt()); 2476 BUG_ON(!cachep || in_interrupt());
2396 2477
2397 /* Don't let CPUs come and go */ 2478
@@ -2411,31 +2492,28 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2411 list_add(&cachep->next, &cache_chain); 2492 list_add(&cachep->next, &cache_chain);
2412 mutex_unlock(&cache_chain_mutex); 2493 mutex_unlock(&cache_chain_mutex);
2413 unlock_cpu_hotplug(); 2494 unlock_cpu_hotplug();
2414 return 1; 2495 return;
2415 } 2496 }
2416 2497
2417 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2498 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2418 synchronize_rcu(); 2499 synchronize_rcu();
2419 2500
2420 for_each_online_cpu(i) 2501 __kmem_cache_destroy(cachep);
2421 kfree(cachep->array[i]);
2422
2423 /* NUMA: free the list3 structures */
2424 for_each_online_node(i) {
2425 l3 = cachep->nodelists[i];
2426 if (l3) {
2427 kfree(l3->shared);
2428 free_alien_cache(l3->alien);
2429 kfree(l3);
2430 }
2431 }
2432 kmem_cache_free(&cache_cache, cachep);
2433 unlock_cpu_hotplug(); 2502 unlock_cpu_hotplug();
2434 return 0;
2435} 2503}
2436EXPORT_SYMBOL(kmem_cache_destroy); 2504EXPORT_SYMBOL(kmem_cache_destroy);
2437 2505
2438/* Get the memory for a slab management obj. */ 2506/*
2507 * Get the memory for a slab management obj.
2508 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2509 * always come from malloc_sizes caches. The slab descriptor cannot
2510 * come from the same cache which is getting created because,
2511 * when we are searching for an appropriate cache for these
2512 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2513 * If we are creating a malloc_sizes cache here it would not be visible to
2514 * kmem_find_general_cachep till the initialization is complete.
2515 * Hence we cannot have slabp_cache same as the original cache.
2516 */
2439static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2517static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2440 int colour_off, gfp_t local_flags, 2518 int colour_off, gfp_t local_flags,
2441 int nodeid) 2519 int nodeid)
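Two interface notes on the kmem_cache_destroy() hunk above: the function now returns void, so callers can no longer check whether destruction actually succeeded (a cache that still has live objects is simply left on the cache chain), and the teardown work is shared with the new kmem_cache_create() error path via __kmem_cache_destroy(). A matching module-exit sketch, continuing the illustrative my_cache from the previous example:

	static void __exit my_exit(void)
	{
		/* must only be called once all objects have been freed;
		 * there is no longer a return value to check */
		kmem_cache_destroy(my_cache);
	}
	module_exit(my_exit);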
@@ -2968,14 +3046,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2968 void *objp; 3046 void *objp;
2969 struct array_cache *ac; 3047 struct array_cache *ac;
2970 3048
2971#ifdef CONFIG_NUMA
2972 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2973 objp = alternate_node_alloc(cachep, flags);
2974 if (objp != NULL)
2975 return objp;
2976 }
2977#endif
2978
2979 check_irq_off(); 3049 check_irq_off();
2980 ac = cpu_cache_get(cachep); 3050 ac = cpu_cache_get(cachep);
2981 if (likely(ac->avail)) { 3051 if (likely(ac->avail)) {
@@ -2993,12 +3063,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2993 gfp_t flags, void *caller) 3063 gfp_t flags, void *caller)
2994{ 3064{
2995 unsigned long save_flags; 3065 unsigned long save_flags;
2996 void *objp; 3066 void *objp = NULL;
2997 3067
2998 cache_alloc_debugcheck_before(cachep, flags); 3068 cache_alloc_debugcheck_before(cachep, flags);
2999 3069
3000 local_irq_save(save_flags); 3070 local_irq_save(save_flags);
3001 objp = ____cache_alloc(cachep, flags); 3071
3072 if (unlikely(NUMA_BUILD &&
3073 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3074 objp = alternate_node_alloc(cachep, flags);
3075
3076 if (!objp)
3077 objp = ____cache_alloc(cachep, flags);
3078 /*
3079 * We may just have run out of memory on the local node.
3080 * __cache_alloc_node() knows how to locate memory on other nodes
3081 */
3082 if (NUMA_BUILD && !objp)
3083 objp = __cache_alloc_node(cachep, flags, numa_node_id());
3002 local_irq_restore(save_flags); 3084 local_irq_restore(save_flags);
3003 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3085 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3004 caller); 3086 caller);
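The net effect of the two hunks above is that the NUMA policy check moves out of the fast path ____cache_alloc() and into __cache_alloc(), which now also falls back to other nodes when the local node is exhausted. Restated as a control-flow sketch (kernel-internal names as in the hunk, shown only to make the ordering explicit):

	void *objp = NULL;

	/* 1) cpuset/mempolicy spreading may redirect the allocation elsewhere */
	if (NUMA_BUILD && (current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
		objp = alternate_node_alloc(cachep, flags);

	/* 2) normal fast path: per-cpu array cache, then local node lists */
	if (!objp)
		objp = ____cache_alloc(cachep, flags);

	/* 3) local node out of memory: the node-aware path knows how to
	 *    locate memory on other nodes */
	if (NUMA_BUILD && !objp)
		objp = __cache_alloc_node(cachep, flags, numa_node_id());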
@@ -3017,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3017{ 3099{
3018 int nid_alloc, nid_here; 3100 int nid_alloc, nid_here;
3019 3101
3020 if (in_interrupt()) 3102 if (in_interrupt() || (flags & __GFP_THISNODE))
3021 return NULL; 3103 return NULL;
3022 nid_alloc = nid_here = numa_node_id(); 3104 nid_alloc = nid_here = numa_node_id();
3023 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3105 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3030,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3030} 3112}
3031 3113
3032/* 3114/*
3115 * Fallback function if there was no memory available and no objects on a
3116 * certain node and we are allowed to fall back. We mimic the behavior of
3117 * the page allocator. We fall back according to a zonelist determined by
3118 * the policy layer while obeying cpuset constraints.
3119 */
3120void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3121{
3122 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3123 ->node_zonelists[gfp_zone(flags)];
3124 struct zone **z;
3125 void *obj = NULL;
3126
3127 for (z = zonelist->zones; *z && !obj; z++)
3128 if (zone_idx(*z) <= ZONE_NORMAL &&
3129 cpuset_zone_allowed(*z, flags))
3130 obj = __cache_alloc_node(cache,
3131 flags | __GFP_THISNODE,
3132 zone_to_nid(*z));
3133 return obj;
3134}
3135
3136/*
3033 * An interface to enable slab creation on nodeid 3137
3034 */ 3138 */
3035static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3139static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3082,11 +3186,15 @@ retry:
3082must_grow: 3186must_grow:
3083 spin_unlock(&l3->list_lock); 3187 spin_unlock(&l3->list_lock);
3084 x = cache_grow(cachep, flags, nodeid); 3188 x = cache_grow(cachep, flags, nodeid);
3189 if (x)
3190 goto retry;
3085 3191
3086 if (!x) 3192 if (!(flags & __GFP_THISNODE))
3087 return NULL; 3193 /* Unable to grow the cache. Fall back to other nodes. */
3194 return fallback_alloc(cachep, flags);
3195
3196 return NULL;
3088 3197
3089 goto retry;
3090done: 3198done:
3091 return obj; 3199 return obj;
3092} 3200}
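Together with fallback_alloc() above, this gives __GFP_THISNODE a precise meaning for slab allocations: without it, an allocation that cannot grow the cache on the requested node walks the policy zonelist for another node; with it, the caller gets NULL and decides what to do. An illustrative caller (the retry policy here is hypothetical):

	/* strictly node-local first; NULL here means that node really is out */
	objp = kmem_cache_alloc_node(cachep, GFP_KERNEL | __GFP_THISNODE, nid);
	if (!objp)
		/* accept an off-node object rather than failing outright */
		objp = kmem_cache_alloc_node(cachep, GFP_KERNEL, nid);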
@@ -3119,6 +3227,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3119 if (slabp->inuse == 0) { 3227 if (slabp->inuse == 0) {
3120 if (l3->free_objects > l3->free_limit) { 3228 if (l3->free_objects > l3->free_limit) {
3121 l3->free_objects -= cachep->num; 3229 l3->free_objects -= cachep->num;
3230 /* No need to drop any previously held
3231 * lock here, even if we have an off-slab slab
3232 * descriptor it is guaranteed to come from
3233 * a different cache, refer to comments before
3234 * alloc_slabmgmt.
3235 */
3122 slab_destroy(cachep, slabp); 3236 slab_destroy(cachep, slabp);
3123 } else { 3237 } else {
3124 list_add(&slabp->list, &l3->slabs_free); 3238 list_add(&slabp->list, &l3->slabs_free);
@@ -3317,7 +3431,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3317} 3431}
3318EXPORT_SYMBOL(kmem_cache_alloc_node); 3432EXPORT_SYMBOL(kmem_cache_alloc_node);
3319 3433
3320void *kmalloc_node(size_t size, gfp_t flags, int node) 3434void *__kmalloc_node(size_t size, gfp_t flags, int node)
3321{ 3435{
3322 struct kmem_cache *cachep; 3436 struct kmem_cache *cachep;
3323 3437
@@ -3326,7 +3440,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node)
3326 return NULL; 3440 return NULL;
3327 return kmem_cache_alloc_node(cachep, flags, node); 3441 return kmem_cache_alloc_node(cachep, flags, node);
3328} 3442}
3329EXPORT_SYMBOL(kmalloc_node); 3443EXPORT_SYMBOL(__kmalloc_node);
3330#endif 3444#endif
3331 3445
3332/** 3446/**
@@ -3370,55 +3484,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3370EXPORT_SYMBOL(__kmalloc_track_caller); 3484EXPORT_SYMBOL(__kmalloc_track_caller);
3371#endif 3485#endif
3372 3486
3373#ifdef CONFIG_SMP
3374/**
3375 * __alloc_percpu - allocate one copy of the object for every present
3376 * cpu in the system, zeroing them.
3377 * Objects should be dereferenced using the per_cpu_ptr macro only.
3378 *
3379 * @size: how many bytes of memory are required.
3380 */
3381void *__alloc_percpu(size_t size)
3382{
3383 int i;
3384 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3385
3386 if (!pdata)
3387 return NULL;
3388
3389 /*
3390 * Cannot use for_each_online_cpu since a cpu may come online
3391 * and we have no way of figuring out how to fix the array
3392 * that we have allocated then....
3393 */
3394 for_each_possible_cpu(i) {
3395 int node = cpu_to_node(i);
3396
3397 if (node_online(node))
3398 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3399 else
3400 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3401
3402 if (!pdata->ptrs[i])
3403 goto unwind_oom;
3404 memset(pdata->ptrs[i], 0, size);
3405 }
3406
3407 /* Catch derefs w/o wrappers */
3408 return (void *)(~(unsigned long)pdata);
3409
3410unwind_oom:
3411 while (--i >= 0) {
3412 if (!cpu_possible(i))
3413 continue;
3414 kfree(pdata->ptrs[i]);
3415 }
3416 kfree(pdata);
3417 return NULL;
3418}
3419EXPORT_SYMBOL(__alloc_percpu);
3420#endif
3421
3422/** 3487/**
3423 * kmem_cache_free - Deallocate an object 3488 * kmem_cache_free - Deallocate an object
3424 * @cachep: The cache the allocation was from. 3489 * @cachep: The cache the allocation was from.
@@ -3464,29 +3529,6 @@ void kfree(const void *objp)
3464} 3529}
3465EXPORT_SYMBOL(kfree); 3530EXPORT_SYMBOL(kfree);
3466 3531
3467#ifdef CONFIG_SMP
3468/**
3469 * free_percpu - free previously allocated percpu memory
3470 * @objp: pointer returned by alloc_percpu.
3471 *
3472 * Don't free memory not originally allocated by alloc_percpu()
3473 * The complemented objp is to check for that.
3474 */
3475void free_percpu(const void *objp)
3476{
3477 int i;
3478 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3479
3480 /*
3481 * We allocate for all cpus so we cannot use for online cpu here.
3482 */
3483 for_each_possible_cpu(i)
3484 kfree(p->ptrs[i]);
3485 kfree(p);
3486}
3487EXPORT_SYMBOL(free_percpu);
3488#endif
3489
3490unsigned int kmem_cache_size(struct kmem_cache *cachep) 3532unsigned int kmem_cache_size(struct kmem_cache *cachep)
3491{ 3533{
3492 return obj_size(cachep); 3534 return obj_size(cachep);
@@ -3603,22 +3645,26 @@ static void do_ccupdate_local(void *info)
3603static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3645static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3604 int batchcount, int shared) 3646 int batchcount, int shared)
3605{ 3647{
3606 struct ccupdate_struct new; 3648 struct ccupdate_struct *new;
3607 int i, err; 3649 int i;
3650
3651 new = kzalloc(sizeof(*new), GFP_KERNEL);
3652 if (!new)
3653 return -ENOMEM;
3608 3654
3609 memset(&new.new, 0, sizeof(new.new));
3610 for_each_online_cpu(i) { 3655 for_each_online_cpu(i) {
3611 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, 3656 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3612 batchcount); 3657 batchcount);
3613 if (!new.new[i]) { 3658 if (!new->new[i]) {
3614 for (i--; i >= 0; i--) 3659 for (i--; i >= 0; i--)
3615 kfree(new.new[i]); 3660 kfree(new->new[i]);
3661 kfree(new);
3616 return -ENOMEM; 3662 return -ENOMEM;
3617 } 3663 }
3618 } 3664 }
3619 new.cachep = cachep; 3665 new->cachep = cachep;
3620 3666
3621 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); 3667 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3622 3668
3623 check_irq_on(); 3669 check_irq_on();
3624 cachep->batchcount = batchcount; 3670 cachep->batchcount = batchcount;
@@ -3626,7 +3672,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3626 cachep->shared = shared; 3672 cachep->shared = shared;
3627 3673
3628 for_each_online_cpu(i) { 3674 for_each_online_cpu(i) {
3629 struct array_cache *ccold = new.new[i]; 3675 struct array_cache *ccold = new->new[i];
3630 if (!ccold) 3676 if (!ccold)
3631 continue; 3677 continue;
3632 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3678 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
@@ -3634,18 +3680,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3634 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3680 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3635 kfree(ccold); 3681 kfree(ccold);
3636 } 3682 }
3637 3683 kfree(new);
3638 err = alloc_kmemlist(cachep); 3684 return alloc_kmemlist(cachep);
3639 if (err) {
3640 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3641 cachep->name, -err);
3642 BUG();
3643 }
3644 return 0;
3645} 3685}
3646 3686
3647/* Called with cache_chain_mutex held always */ 3687/* Called with cache_chain_mutex held always */
3648static void enable_cpucache(struct kmem_cache *cachep) 3688static int enable_cpucache(struct kmem_cache *cachep)
3649{ 3689{
3650 int err; 3690 int err;
3651 int limit, shared; 3691 int limit, shared;
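The do_tune_cpucache() change above is about kernel stack usage: struct ccupdate_struct carries an array_cache pointer per CPU (the new[] array in the hunk), which gets large on big-NR_CPUS configurations, so the temporary now lives in kzalloc()ed memory and is freed on every path. The hunk also stops BUG()ing when alloc_kmemlist() fails and propagates the error instead, which is what lets enable_cpucache() and, in turn, setup_cpu_cache() report failure back to kmem_cache_create(). The general pattern, with an illustrative structure (not the kernel one):

	struct big_update {			/* stand-in, sized by the CPU count */
		void *per_cpu_ptrs[NR_CPUS];
	};

	static int tune_something(void)
	{
		struct big_update *new;

		/* too large to live on the kernel stack once NR_CPUS is big */
		new = kzalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return -ENOMEM;

		/* ... populate and apply *new ... */

		kfree(new);
		return 0;
	}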
@@ -3697,6 +3737,7 @@ static void enable_cpucache(struct kmem_cache *cachep)
3697 if (err) 3737 if (err)
3698 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3738 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3699 cachep->name, -err); 3739 cachep->name, -err);
3740 return err;
3700} 3741}
3701 3742
3702/* 3743/*
@@ -4157,6 +4198,7 @@ static int leaks_show(struct seq_file *m, void *p)
4157 show_symbol(m, n[2*i+2]); 4198 show_symbol(m, n[2*i+2]);
4158 seq_putc(m, '\n'); 4199 seq_putc(m, '\n');
4159 } 4200 }
4201
4160 return 0; 4202 return 0;
4161} 4203}
4162 4204
diff --git a/mm/slob.c b/mm/slob.c
index 7b52b20b9607..542394184a58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
270} 270}
271EXPORT_SYMBOL(kmem_cache_create); 271EXPORT_SYMBOL(kmem_cache_create);
272 272
273int kmem_cache_destroy(struct kmem_cache *c) 273void kmem_cache_destroy(struct kmem_cache *c)
274{ 274{
275 slob_free(c, sizeof(struct kmem_cache)); 275 slob_free(c, sizeof(struct kmem_cache));
276 return 0;
277} 276}
278EXPORT_SYMBOL(kmem_cache_destroy); 277EXPORT_SYMBOL(kmem_cache_destroy);
279 278
@@ -339,52 +338,3 @@ void kmem_cache_init(void)
339 338
340 mod_timer(&slob_timer, jiffies + HZ); 339 mod_timer(&slob_timer, jiffies + HZ);
341} 340}
342
343atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
344EXPORT_SYMBOL(slab_reclaim_pages);
345
346#ifdef CONFIG_SMP
347
348void *__alloc_percpu(size_t size)
349{
350 int i;
351 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
352
353 if (!pdata)
354 return NULL;
355
356 for_each_possible_cpu(i) {
357 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
358 if (!pdata->ptrs[i])
359 goto unwind_oom;
360 memset(pdata->ptrs[i], 0, size);
361 }
362
363 /* Catch derefs w/o wrappers */
364 return (void *) (~(unsigned long) pdata);
365
366unwind_oom:
367 while (--i >= 0) {
368 if (!cpu_possible(i))
369 continue;
370 kfree(pdata->ptrs[i]);
371 }
372 kfree(pdata);
373 return NULL;
374}
375EXPORT_SYMBOL(__alloc_percpu);
376
377void
378free_percpu(const void *objp)
379{
380 int i;
381 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
382
383 for_each_possible_cpu(i)
384 kfree(p->ptrs[i]);
385
386 kfree(p);
387}
388EXPORT_SYMBOL(free_percpu);
389
390#endif
diff --git a/mm/swap.c b/mm/swap.c
index 687686a61f7c..2e0e871f542f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,25 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37/*
38 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking.
40 */
41static void fastcall __page_cache_release(struct page *page)
42{
43 if (PageLRU(page)) {
44 unsigned long flags;
45 struct zone *zone = page_zone(page);
46
47 spin_lock_irqsave(&zone->lru_lock, flags);
48 VM_BUG_ON(!PageLRU(page));
49 __ClearPageLRU(page);
50 del_page_from_lru(zone, page);
51 spin_unlock_irqrestore(&zone->lru_lock, flags);
52 }
53 free_hot_page(page);
54}
55
37static void put_compound_page(struct page *page) 56static void put_compound_page(struct page *page)
38{ 57{
39 page = (struct page *)page_private(page); 58 page = (struct page *)page_private(page);
@@ -223,26 +242,6 @@ int lru_add_drain_all(void)
223#endif 242#endif
224 243
225/* 244/*
226 * This path almost never happens for VM activity - pages are normally
227 * freed via pagevecs. But it gets used by networking.
228 */
229void fastcall __page_cache_release(struct page *page)
230{
231 if (PageLRU(page)) {
232 unsigned long flags;
233 struct zone *zone = page_zone(page);
234
235 spin_lock_irqsave(&zone->lru_lock, flags);
236 BUG_ON(!PageLRU(page));
237 __ClearPageLRU(page);
238 del_page_from_lru(zone, page);
239 spin_unlock_irqrestore(&zone->lru_lock, flags);
240 }
241 free_hot_page(page);
242}
243EXPORT_SYMBOL(__page_cache_release);
244
245/*
246 * Batched page_cache_release(). Decrement the reference count on all the 245 * Batched page_cache_release(). Decrement the reference count on all the
247 * passed pages. If it fell to zero then remove the page from the LRU and 246 * passed pages. If it fell to zero then remove the page from the LRU and
248 * free it. 247 * free it.
@@ -284,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold)
284 zone = pagezone; 283 zone = pagezone;
285 spin_lock_irq(&zone->lru_lock); 284 spin_lock_irq(&zone->lru_lock);
286 } 285 }
287 BUG_ON(!PageLRU(page)); 286 VM_BUG_ON(!PageLRU(page));
288 __ClearPageLRU(page); 287 __ClearPageLRU(page);
289 del_page_from_lru(zone, page); 288 del_page_from_lru(zone, page);
290 } 289 }
@@ -337,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
337 for (i = 0; i < pagevec_count(pvec); i++) { 336 for (i = 0; i < pagevec_count(pvec); i++) {
338 struct page *page = pvec->pages[i]; 337 struct page *page = pvec->pages[i];
339 338
340 BUG_ON(PageLRU(page)); 339 VM_BUG_ON(PageLRU(page));
341 if (put_page_testzero(page)) 340 if (put_page_testzero(page))
342 pagevec_add(&pages_to_free, page); 341 pagevec_add(&pages_to_free, page);
343 } 342 }
@@ -364,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
364 zone = pagezone; 363 zone = pagezone;
365 spin_lock_irq(&zone->lru_lock); 364 spin_lock_irq(&zone->lru_lock);
366 } 365 }
367 BUG_ON(PageLRU(page)); 366 VM_BUG_ON(PageLRU(page));
368 SetPageLRU(page); 367 SetPageLRU(page);
369 add_page_to_inactive_list(zone, page); 368 add_page_to_inactive_list(zone, page);
370 } 369 }
@@ -391,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
391 zone = pagezone; 390 zone = pagezone;
392 spin_lock_irq(&zone->lru_lock); 391 spin_lock_irq(&zone->lru_lock);
393 } 392 }
394 BUG_ON(PageLRU(page)); 393 VM_BUG_ON(PageLRU(page));
395 SetPageLRU(page); 394 SetPageLRU(page);
396 BUG_ON(PageActive(page)); 395 VM_BUG_ON(PageActive(page));
397 SetPageActive(page); 396 SetPageActive(page);
398 add_page_to_active_list(zone, page); 397 add_page_to_active_list(zone, page);
399 } 398 }
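The BUG_ON() to VM_BUG_ON() conversions in this file (and in mm/vmscan.c further down) put these LRU sanity checks behind CONFIG_DEBUG_VM, so production kernels no longer pay for an assertion on every page they release. The usual shape of such a macro, assuming the standard CONFIG_DEBUG_VM arrangement:

	#ifdef CONFIG_DEBUG_VM
	#define VM_BUG_ON(cond)		BUG_ON(cond)
	#else
	#define VM_BUG_ON(cond)		do { } while (0)
	#endif

__page_cache_release() also becomes static and moves above its first caller in this file, dropping its EXPORT_SYMBOL.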
diff --git a/mm/truncate.c b/mm/truncate.c
index c6ab55ec6883..a654928323dc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/pagevec.h> 15#include <linux/pagevec.h>
@@ -52,36 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
52/* 53/*
53 * This is for invalidate_inode_pages(). That function can be called at 54 * This is for invalidate_inode_pages(). That function can be called at
54 * any time, and is not supposed to throw away dirty pages. But pages can 55 * any time, and is not supposed to throw away dirty pages. But pages can
55 * be marked dirty at any time too. So we re-check the dirtiness inside 56 * be marked dirty at any time too, so use remove_mapping which safely
56 * ->tree_lock. That provides exclusion against the __set_page_dirty 57 * discards clean, unused pages.
57 * functions.
58 * 58 *
59 * Returns non-zero if the page was successfully invalidated. 59 * Returns non-zero if the page was successfully invalidated.
60 */ 60 */
61static int 61static int
62invalidate_complete_page(struct address_space *mapping, struct page *page) 62invalidate_complete_page(struct address_space *mapping, struct page *page)
63{ 63{
64 int ret;
65
64 if (page->mapping != mapping) 66 if (page->mapping != mapping)
65 return 0; 67 return 0;
66 68
67 if (PagePrivate(page) && !try_to_release_page(page, 0)) 69 if (PagePrivate(page) && !try_to_release_page(page, 0))
68 return 0; 70 return 0;
69 71
70 write_lock_irq(&mapping->tree_lock); 72 ret = remove_mapping(mapping, page);
71 if (PageDirty(page))
72 goto failed;
73 if (page_count(page) != 2) /* caller's ref + pagecache ref */
74 goto failed;
75
76 BUG_ON(PagePrivate(page));
77 __remove_from_page_cache(page);
78 write_unlock_irq(&mapping->tree_lock);
79 ClearPageUptodate(page); 73 ClearPageUptodate(page);
80 page_cache_release(page); /* pagecache ref */ 74
81 return 1; 75 return ret;
82failed:
83 write_unlock_irq(&mapping->tree_lock);
84 return 0;
85} 76}
86 77
87/** 78/**
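Read from the right-hand column, the rewritten invalidate_complete_page() is now a thin wrapper: the open-coded tree_lock/dirty/page_count dance is gone, and remove_mapping() (see the mm/vmscan.c hunk below) performs those checks instead. Assembled for readability, the post-patch function reads:

	static int
	invalidate_complete_page(struct address_space *mapping, struct page *page)
	{
		int ret;

		if (page->mapping != mapping)
			return 0;

		if (PagePrivate(page) && !try_to_release_page(page, 0))
			return 0;

		ret = remove_mapping(mapping, page);
		ClearPageUptodate(page);

		return ret;
	}

The new #include <linux/swap.h> at the top of the file is presumably what makes remove_mapping() visible here.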
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 266162d2ba28..1ac191ce5641 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,9 @@
24DEFINE_RWLOCK(vmlist_lock); 24DEFINE_RWLOCK(vmlist_lock);
25struct vm_struct *vmlist; 25struct vm_struct *vmlist;
26 26
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28 int node);
29
27static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
28{ 31{
29 pte_t *pte; 32 pte_t *pte;
@@ -238,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
238 241
239/** 242/**
240 * get_vm_area - reserve a contiguous kernel virtual area 243
241 *
242 * @size: size of the area 244 * @size: size of the area
243 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 245 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
244 * 246 *
@@ -270,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr)
270} 272}
271 273
272/* Caller must hold vmlist_lock */ 274/* Caller must hold vmlist_lock */
273struct vm_struct *__remove_vm_area(void *addr) 275static struct vm_struct *__remove_vm_area(void *addr)
274{ 276{
275 struct vm_struct **p, *tmp; 277 struct vm_struct **p, *tmp;
276 278
@@ -293,7 +295,6 @@ found:
293 295
294/** 296/**
295 * remove_vm_area - find and remove a contiguous kernel virtual area 297
296 *
297 * @addr: base address 298 * @addr: base address
298 * 299 *
299 * Search for the kernel VM area starting at @addr, and remove it. 300 * Search for the kernel VM area starting at @addr, and remove it.
@@ -352,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages)
352 353
353/** 354/**
354 * vfree - release memory allocated by vmalloc() 355 * vfree - release memory allocated by vmalloc()
355 *
356 * @addr: memory base address 356 * @addr: memory base address
357 * 357 *
358 * Free the virtually contiguous memory area starting at @addr, as 358 * Free the virtually contiguous memory area starting at @addr, as
@@ -370,7 +370,6 @@ EXPORT_SYMBOL(vfree);
370 370
371/** 371/**
372 * vunmap - release virtual mapping obtained by vmap() 372 * vunmap - release virtual mapping obtained by vmap()
373 *
374 * @addr: memory base address 373 * @addr: memory base address
375 * 374 *
376 * Free the virtually contiguous memory area starting at @addr, 375 * Free the virtually contiguous memory area starting at @addr,
@@ -387,7 +386,6 @@ EXPORT_SYMBOL(vunmap);
387 386
388/** 387/**
389 * vmap - map an array of pages into virtually contiguous space 388 * vmap - map an array of pages into virtually contiguous space
390 *
391 * @pages: array of page pointers 389 * @pages: array of page pointers
392 * @count: number of pages to map 390 * @count: number of pages to map
393 * @flags: vm_area->flags 391 * @flags: vm_area->flags
@@ -468,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
468 466
469/** 467/**
470 * __vmalloc_node - allocate virtually contiguous memory 468 * __vmalloc_node - allocate virtually contiguous memory
471 *
472 * @size: allocation size 469 * @size: allocation size
473 * @gfp_mask: flags for the page level allocator 470 * @gfp_mask: flags for the page level allocator
474 * @prot: protection mask for the allocated pages 471 * @prot: protection mask for the allocated pages
@@ -478,8 +475,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
478 * allocator with @gfp_mask flags. Map them into contiguous 475 * allocator with @gfp_mask flags. Map them into contiguous
479 * kernel virtual space, using a pagetable protection of @prot. 476 * kernel virtual space, using a pagetable protection of @prot.
480 */ 477 */
481void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 478static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
482 int node) 479 int node)
483{ 480{
484 struct vm_struct *area; 481 struct vm_struct *area;
485 482
@@ -493,7 +490,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
493 490
494 return __vmalloc_area_node(area, gfp_mask, prot, node); 491 return __vmalloc_area_node(area, gfp_mask, prot, node);
495} 492}
496EXPORT_SYMBOL(__vmalloc_node);
497 493
498void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 494void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
499{ 495{
@@ -503,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc);
503 499
504/** 500/**
505 * vmalloc - allocate virtually contiguous memory 501 * vmalloc - allocate virtually contiguous memory
506 *
507 * @size: allocation size 502 * @size: allocation size
508 *
509 * Allocate enough pages to cover @size from the page level 503 * Allocate enough pages to cover @size from the page level
510 * allocator and map them into contiguous kernel virtual space. 504 * allocator and map them into contiguous kernel virtual space.
511 * 505 *
@@ -519,11 +513,11 @@ void *vmalloc(unsigned long size)
519EXPORT_SYMBOL(vmalloc); 513EXPORT_SYMBOL(vmalloc);
520 514
521/** 515/**
522 * vmalloc_user - allocate virtually contiguous memory which has 516 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
523 * been zeroed so it can be mapped to userspace without 517 * @size: allocation size
524 * leaking data.
525 * 518 *
526 * @size: allocation size 519 * The resulting memory area is zeroed so it can be mapped to userspace
520 * without leaking data.
527 */ 521 */
528void *vmalloc_user(unsigned long size) 522void *vmalloc_user(unsigned long size)
529{ 523{
@@ -542,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user);
542 536
543/** 537/**
544 * vmalloc_node - allocate memory on a specific node 538 * vmalloc_node - allocate memory on a specific node
545 *
546 * @size: allocation size 539 * @size: allocation size
547 * @node: numa node 540 * @node: numa node
548 * 541 *
@@ -564,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node);
564 557
565/** 558/**
566 * vmalloc_exec - allocate virtually contiguous, executable memory 559 * vmalloc_exec - allocate virtually contiguous, executable memory
567 *
568 * @size: allocation size 560 * @size: allocation size
569 * 561 *
570 * Kernel-internal function to allocate enough pages to cover @size 562 * Kernel-internal function to allocate enough pages to cover @size
@@ -582,7 +574,6 @@ void *vmalloc_exec(unsigned long size)
582 574
583/** 575/**
584 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 576 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
585 *
586 * @size: allocation size 577 * @size: allocation size
587 * 578 *
588 * Allocate enough 32bit PA addressable pages to cover @size from the 579 * Allocate enough 32bit PA addressable pages to cover @size from the
@@ -595,11 +586,11 @@ void *vmalloc_32(unsigned long size)
595EXPORT_SYMBOL(vmalloc_32); 586EXPORT_SYMBOL(vmalloc_32);
596 587
597/** 588/**
598 * vmalloc_32_user - allocate virtually contiguous memory (32bit 589 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
599 * addressable) which is zeroed so it can be
600 * mapped to userspace without leaking data.
601 *
602 * @size: allocation size 590 * @size: allocation size
591 *
592 * The resulting memory area is 32bit addressable and zeroed so it can be
593 * mapped to userspace without leaking data.
603 */ 594 */
604void *vmalloc_32_user(unsigned long size) 595void *vmalloc_32_user(unsigned long size)
605{ 596{
@@ -693,7 +684,6 @@ finished:
693 684
694/** 685/**
695 * remap_vmalloc_range - map vmalloc pages to userspace 686 * remap_vmalloc_range - map vmalloc pages to userspace
696 *
697 * @vma: vma to cover (map full range of vma) 687 * @vma: vma to cover (map full range of vma)
698 * @addr: vmalloc memory 688 * @addr: vmalloc memory
699 * @pgoff: number of pages into addr before first page to map 689 * @pgoff: number of pages into addr before first page to map
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d02254d..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmstat.h>
22#include <linux/file.h> 23#include <linux/file.h>
23#include <linux/writeback.h> 24#include <linux/writeback.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
@@ -62,6 +63,8 @@ struct scan_control {
62 int swap_cluster_max; 63 int swap_cluster_max;
63 64
64 int swappiness; 65 int swappiness;
66
67 int all_unreclaimable;
65}; 68};
66 69
67/* 70/*
@@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
368 /* synchronous write or broken a_ops? */ 371 /* synchronous write or broken a_ops? */
369 ClearPageReclaim(page); 372 ClearPageReclaim(page);
370 } 373 }
371 374 inc_zone_page_state(page, NR_VMSCAN_WRITE);
372 return PAGE_SUCCESS; 375 return PAGE_SUCCESS;
373 } 376 }
374 377
@@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
377 380
378int remove_mapping(struct address_space *mapping, struct page *page) 381int remove_mapping(struct address_space *mapping, struct page *page)
379{ 382{
380 if (!mapping) 383 BUG_ON(!PageLocked(page));
381 return 0; /* truncate got there first */ 384 BUG_ON(mapping != page_mapping(page));
382 385
383 write_lock_irq(&mapping->tree_lock); 386 write_lock_irq(&mapping->tree_lock);
384
385 /* 387 /*
386 * The non-racy check for busy page. It is critical to check 388 * The non-racy check for a busy page.
387 * PageDirty _after_ making sure that the page is freeable and 389 *
388 * not in use by anybody. (pagecache + us == 2) 390 * Must be careful with the order of the tests. When someone has
391 * a ref to the page, it may be possible that they dirty it then
392 * drop the reference. So if PageDirty is tested before page_count
393 * here, then the following race may occur:
394 *
395 * get_user_pages(&page);
396 * [user mapping goes away]
397 * write_to(page);
398 * !PageDirty(page) [good]
399 * SetPageDirty(page);
400 * put_page(page);
401 * !page_count(page) [good, discard it]
402 *
403 * [oops, our write_to data is lost]
404 *
405 * Reversing the order of the tests ensures such a situation cannot
406 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
407 * load is not satisfied before that of page->_count.
408 *
409 * Note that if SetPageDirty is always performed via set_page_dirty,
410 * and thus under tree_lock, then this ordering is not required.
389 */ 411 */
390 if (unlikely(page_count(page) != 2)) 412 if (unlikely(page_count(page) != 2))
391 goto cannot_free; 413 goto cannot_free;
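The enlarged comment is the substance of this hunk: the elevated-refcount test has to come before the PageDirty test, with a read barrier between them, or a get_user_pages() user racing with reclaim could dirty the page and drop its reference in the window between the two checks. The check sequence being described looks like this (fragment of remove_mapping(), shown only for the ordering):

	/* caller's reference + pagecache reference, nobody else */
	if (unlikely(page_count(page) != 2))
		goto cannot_free;
	smp_rmb();	/* don't let the page->flags load pass the _count load */
	if (unlikely(PageDirty(page)))
		goto cannot_free;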
@@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
440 if (TestSetPageLocked(page)) 462 if (TestSetPageLocked(page))
441 goto keep; 463 goto keep;
442 464
443 BUG_ON(PageActive(page)); 465 VM_BUG_ON(PageActive(page));
444 466
445 sc->nr_scanned++; 467 sc->nr_scanned++;
446 468
@@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
547 goto free_it; 569 goto free_it;
548 } 570 }
549 571
550 if (!remove_mapping(mapping, page)) 572 if (!mapping || !remove_mapping(mapping, page))
551 goto keep_locked; 573 goto keep_locked;
552 574
553free_it: 575free_it:
@@ -564,7 +586,7 @@ keep_locked:
564 unlock_page(page); 586 unlock_page(page);
565keep: 587keep:
566 list_add(&page->lru, &ret_pages); 588 list_add(&page->lru, &ret_pages);
567 BUG_ON(PageLRU(page)); 589 VM_BUG_ON(PageLRU(page));
568 } 590 }
569 list_splice(&ret_pages, page_list); 591 list_splice(&ret_pages, page_list);
570 if (pagevec_count(&freed_pvec)) 592 if (pagevec_count(&freed_pvec))
@@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
603 page = lru_to_page(src); 625 page = lru_to_page(src);
604 prefetchw_prev_lru_page(page, src, flags); 626 prefetchw_prev_lru_page(page, src, flags);
605 627
606 BUG_ON(!PageLRU(page)); 628 VM_BUG_ON(!PageLRU(page));
607 629
608 list_del(&page->lru); 630 list_del(&page->lru);
609 target = src; 631 target = src;
@@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
674 */ 696 */
675 while (!list_empty(&page_list)) { 697 while (!list_empty(&page_list)) {
676 page = lru_to_page(&page_list); 698 page = lru_to_page(&page_list);
677 BUG_ON(PageLRU(page)); 699 VM_BUG_ON(PageLRU(page));
678 SetPageLRU(page); 700 SetPageLRU(page);
679 list_del(&page->lru); 701 list_del(&page->lru);
680 if (PageActive(page)) 702 if (PageActive(page))
@@ -695,6 +717,11 @@ done:
695 return nr_reclaimed; 717 return nr_reclaimed;
696} 718}
697 719
720static inline int zone_is_near_oom(struct zone *zone)
721{
722 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
723}
724
698/* 725/*
699 * This moves pages from the active list to the inactive list. 726 * This moves pages from the active list to the inactive list.
700 * 727 *
@@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
730 long distress; 757 long distress;
731 long swap_tendency; 758 long swap_tendency;
732 759
760 if (zone_is_near_oom(zone))
761 goto force_reclaim_mapped;
762
733 /* 763 /*
734 * `distress' is a measure of how much trouble we're having 764 * `distress' is a measure of how much trouble we're having
735 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 765 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
@@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
765 * memory onto the inactive list. 795 * memory onto the inactive list.
766 */ 796 */
767 if (swap_tendency >= 100) 797 if (swap_tendency >= 100)
798force_reclaim_mapped:
768 reclaim_mapped = 1; 799 reclaim_mapped = 1;
769 } 800 }
770 801
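zone_is_near_oom() is a blunt heuristic: once a zone has scanned three times as many pages as it has on its LRU lists, shrink_active_list() stops weighing distress/swappiness and jumps straight to force_reclaim_mapped, so mapped pages become reclaim candidates too. A worked example with illustrative numbers:

	/* say nr_active = 100000 and nr_inactive = 50000 */
	threshold = (100000 + 50000) * 3;	/* = 450000 */
	/* once zone->pages_scanned reaches 450000 the zone counts as
	 * near-OOM and reclaim_mapped is forced to 1 */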
@@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
797 while (!list_empty(&l_inactive)) { 828 while (!list_empty(&l_inactive)) {
798 page = lru_to_page(&l_inactive); 829 page = lru_to_page(&l_inactive);
799 prefetchw_prev_lru_page(page, &l_inactive, flags); 830 prefetchw_prev_lru_page(page, &l_inactive, flags);
800 BUG_ON(PageLRU(page)); 831 VM_BUG_ON(PageLRU(page));
801 SetPageLRU(page); 832 SetPageLRU(page);
802 BUG_ON(!PageActive(page)); 833 VM_BUG_ON(!PageActive(page));
803 ClearPageActive(page); 834 ClearPageActive(page);
804 835
805 list_move(&page->lru, &zone->inactive_list); 836 list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
827 while (!list_empty(&l_active)) { 858 while (!list_empty(&l_active)) {
828 page = lru_to_page(&l_active); 859 page = lru_to_page(&l_active);
829 prefetchw_prev_lru_page(page, &l_active, flags); 860 prefetchw_prev_lru_page(page, &l_active, flags);
830 BUG_ON(PageLRU(page)); 861 VM_BUG_ON(PageLRU(page));
831 SetPageLRU(page); 862 SetPageLRU(page);
832 BUG_ON(!PageActive(page)); 863 VM_BUG_ON(!PageActive(page));
833 list_move(&page->lru, &zone->active_list); 864 list_move(&page->lru, &zone->active_list);
834 pgmoved++; 865 pgmoved++;
835 if (!pagevec_add(&pvec, page)) { 866 if (!pagevec_add(&pvec, page)) {
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
925 unsigned long nr_reclaimed = 0; 956 unsigned long nr_reclaimed = 0;
926 int i; 957 int i;
927 958
959 sc->all_unreclaimable = 1;
928 for (i = 0; zones[i] != NULL; i++) { 960 for (i = 0; zones[i] != NULL; i++) {
929 struct zone *zone = zones[i]; 961 struct zone *zone = zones[i];
930 962
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
941 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 973 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
942 continue; /* Let kswapd poll it */ 974 continue; /* Let kswapd poll it */
943 975
976 sc->all_unreclaimable = 0;
977
944 nr_reclaimed += shrink_zone(priority, zone, sc); 978 nr_reclaimed += shrink_zone(priority, zone, sc);
945 } 979 }
946 return nr_reclaimed; 980 return nr_reclaimed;
@@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1021 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1055 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1022 blk_congestion_wait(WRITE, HZ/10); 1056 blk_congestion_wait(WRITE, HZ/10);
1023 } 1057 }
1058 /* top priority shrink_caches still had more to do? don't OOM, then */
1059 if (!sc.all_unreclaimable)
1060 ret = 1;
1024out: 1061out:
1025 for (i = 0; zones[i] != 0; i++) { 1062 for (i = 0; zones[i] != 0; i++) {
1026 struct zone *zone = zones[i]; 1063 struct zone *zone = zones[i];
@@ -1153,7 +1190,7 @@ scan:
1153 if (zone->all_unreclaimable) 1190 if (zone->all_unreclaimable)
1154 continue; 1191 continue;
1155 if (nr_slab == 0 && zone->pages_scanned >= 1192 if (nr_slab == 0 && zone->pages_scanned >=
1156 (zone->nr_active + zone->nr_inactive) * 4) 1193 (zone->nr_active + zone->nr_inactive) * 6)
1157 zone->all_unreclaimable = 1; 1194 zone->all_unreclaimable = 1;
1158 /* 1195 /*
1159 * If we've done a decent amount of scanning and 1196 * If we've done a decent amount of scanning and
@@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1361 for_each_zone(zone) 1398 for_each_zone(zone)
1362 lru_pages += zone->nr_active + zone->nr_inactive; 1399 lru_pages += zone->nr_active + zone->nr_inactive;
1363 1400
1364 nr_slab = global_page_state(NR_SLAB); 1401 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1365 /* If slab caches are huge, it's better to hit them first */ 1402 /* If slab caches are huge, it's better to hit them first */
1366 while (nr_slab >= lru_pages) { 1403 while (nr_slab >= lru_pages) {
1367 reclaim_state.reclaimed_slab = 0; 1404 reclaim_state.reclaimed_slab = 0;
@@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly;
1510#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1547#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1511#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1548#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1512#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1549#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1513#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1514 1550
1515/* 1551/*
1516 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1552 * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly;
1526int sysctl_min_unmapped_ratio = 1; 1562int sysctl_min_unmapped_ratio = 1;
1527 1563
1528/* 1564/*
1565 * If the number of slab pages in a zone grows beyond this percentage then
1566 * slab reclaim needs to occur.
1567 */
1568int sysctl_min_slab_ratio = 5;
1569
1570/*
1529 * Try to free up some pages from this zone through reclaim. 1571 * Try to free up some pages from this zone through reclaim.
1530 */ 1572 */
1531static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1573static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1544 .gfp_mask = gfp_mask, 1586 .gfp_mask = gfp_mask,
1545 .swappiness = vm_swappiness, 1587 .swappiness = vm_swappiness,
1546 }; 1588 };
1589 unsigned long slab_reclaimable;
1547 1590
1548 disable_swap_token(); 1591 disable_swap_token();
1549 cond_resched(); 1592 cond_resched();
@@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1556 reclaim_state.reclaimed_slab = 0; 1599 reclaim_state.reclaimed_slab = 0;
1557 p->reclaim_state = &reclaim_state; 1600 p->reclaim_state = &reclaim_state;
1558 1601
1559 /* 1602 if (zone_page_state(zone, NR_FILE_PAGES) -
1560 * Free memory by calling shrink zone with increasing priorities 1603 zone_page_state(zone, NR_FILE_MAPPED) >
1561 * until we have enough memory freed. 1604 zone->min_unmapped_pages) {
1562 */ 1605 /*
1563 priority = ZONE_RECLAIM_PRIORITY; 1606 * Free memory by calling shrink zone with increasing
1564 do { 1607 * priorities until we have enough memory freed.
1565 nr_reclaimed += shrink_zone(priority, zone, &sc); 1608 */
1566 priority--; 1609 priority = ZONE_RECLAIM_PRIORITY;
1567 } while (priority >= 0 && nr_reclaimed < nr_pages); 1610 do {
1611 nr_reclaimed += shrink_zone(priority, zone, &sc);
1612 priority--;
1613 } while (priority >= 0 && nr_reclaimed < nr_pages);
1614 }
1568 1615
1569 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { 1616 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1617 if (slab_reclaimable > zone->min_slab_pages) {
1570 /* 1618 /*
1571 * shrink_slab() does not currently allow us to determine how 1619 * shrink_slab() does not currently allow us to determine how
1572 * many pages were freed in this zone. So we just shake the slab 1620 * many pages were freed in this zone. So we take the current
1573 * a bit and then go off node for this particular allocation 1621 * number of slab pages and shake the slab until it is reduced
1574 * despite possibly having freed enough memory to allocate in 1622 * by the same nr_pages that we used for reclaiming unmapped
1575 * this zone. If we freed local memory then the next 1623 * pages.
1576 * allocations will be local again.
1577 * 1624 *
1578 * shrink_slab will free memory on all zones and may take 1625 * Note that shrink_slab will free memory on all zones and may
1579 * a long time. 1626 * take a long time.
1627 */
1628 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
1629 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
1630 slab_reclaimable - nr_pages)
1631 ;
1632
1633 /*
1634 * Update nr_reclaimed by the number of slab pages we
1635 * reclaimed from this zone.
1580 */ 1636 */
1581 shrink_slab(sc.nr_scanned, gfp_mask, order); 1637 nr_reclaimed += slab_reclaimable -
1638 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1582 } 1639 }
1583 1640
1584 p->reclaim_state = NULL; 1641 p->reclaim_state = NULL;
@@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1592 int node_id; 1649 int node_id;
1593 1650
1594 /* 1651 /*
1595 * Zone reclaim reclaims unmapped file backed pages. 1652 * Zone reclaim reclaims unmapped file backed pages and
1653 * slab pages if we are over the defined limits.
1596 * 1654 *
1597 * A small portion of unmapped file backed pages is needed for 1655 * A small portion of unmapped file backed pages is needed for
1598 * file I/O otherwise pages read by file I/O will be immediately 1656 * file I/O otherwise pages read by file I/O will be immediately
@@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1601 * unmapped file backed pages. 1659 * unmapped file backed pages.
1602 */ 1660 */
1603 if (zone_page_state(zone, NR_FILE_PAGES) - 1661 if (zone_page_state(zone, NR_FILE_PAGES) -
1604 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) 1662 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
1663 && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
1664 <= zone->min_slab_pages)
1605 return 0; 1665 return 0;
1606 1666
1607 /* 1667 /*
@@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1621 * over remote processors and spread off node memory allocations 1681 * over remote processors and spread off node memory allocations
1622 * as wide as possible. 1682 * as wide as possible.
1623 */ 1683 */
1624 node_id = zone->zone_pgdat->node_id; 1684 node_id = zone_to_nid(zone);
1625 mask = node_to_cpumask(node_id); 1685 mask = node_to_cpumask(node_id);
1626 if (!cpus_empty(mask) && node_id != numa_node_id()) 1686 if (!cpus_empty(mask) && node_id != numa_node_id())
1627 return 0; 1687 return 0;
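The zone-reclaim rework splits the work by what is actually over its limit: unmapped pagecache is only shrunk while it exceeds zone->min_unmapped_pages, and reclaimable slab is only shaken while NR_SLAB_RECLAIMABLE exceeds zone->min_slab_pages, the latter derived from the new sysctl_min_slab_ratio (5 percent by default). Assuming both limits are percentages of the zone's pages, a worked example with made-up sizes:

	/* illustrative 1,000,000-page zone */
	min_unmapped_pages = 1000000 * 1 / 100;	/* sysctl_min_unmapped_ratio = 1 -> 10,000 pages */
	min_slab_pages     = 1000000 * 5 / 100;	/* sysctl_min_slab_ratio = 5     -> 50,000 pages */

	/* zone_reclaim() bails out early unless
	 *	NR_FILE_PAGES - NR_FILE_MAPPED > 10,000	(unmapped pagecache), or
	 *	NR_SLAB_RECLAIMABLE            > 50,000	(reclaimable slab)
	 * and __zone_reclaim() then only works on the side that is over. */

Slab progress is measured by re-reading NR_SLAB_RECLAIMABLE after each shrink_slab() pass, since shrink_slab() cannot report per-zone numbers itself; the difference is added to nr_reclaimed.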
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c1b5f4106b38..a2b6a9f96e5c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,6 +321,9 @@ void refresh_cpu_vm_stats(int cpu)
321 for_each_zone(zone) { 321 for_each_zone(zone) {
322 struct per_cpu_pageset *pcp; 322 struct per_cpu_pageset *pcp;
323 323
324 if (!populated_zone(zone))
325 continue;
326
324 pcp = zone_pcp(zone, cpu); 327 pcp = zone_pcp(zone, cpu);
325 328
326 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 329 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -368,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
368 __inc_zone_state(z, NUMA_MISS); 371 __inc_zone_state(z, NUMA_MISS);
369 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); 372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
370 } 373 }
371 if (z->zone_pgdat == NODE_DATA(numa_node_id())) 374 if (z->node == numa_node_id())
372 __inc_zone_state(z, NUMA_LOCAL); 375 __inc_zone_state(z, NUMA_LOCAL);
373 else 376 else
374 __inc_zone_state(z, NUMA_OTHER); 377 __inc_zone_state(z, NUMA_OTHER);
@@ -435,17 +438,34 @@ struct seq_operations fragmentation_op = {
435 .show = frag_show, 438 .show = frag_show,
436}; 439};
437 440
441#ifdef CONFIG_ZONE_DMA32
442#define TEXT_FOR_DMA32(xx) xx "_dma32",
443#else
444#define TEXT_FOR_DMA32(xx)
445#endif
446
447#ifdef CONFIG_HIGHMEM
448#define TEXT_FOR_HIGHMEM(xx) xx "_high",
449#else
450#define TEXT_FOR_HIGHMEM(xx)
451#endif
452
453#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
454 TEXT_FOR_HIGHMEM(xx)
455
438static char *vmstat_text[] = { 456static char *vmstat_text[] = {
439 /* Zoned VM counters */ 457 /* Zoned VM counters */
440 "nr_anon_pages", 458 "nr_anon_pages",
441 "nr_mapped", 459 "nr_mapped",
442 "nr_file_pages", 460 "nr_file_pages",
443 "nr_slab", 461 "nr_slab_reclaimable",
462 "nr_slab_unreclaimable",
444 "nr_page_table_pages", 463 "nr_page_table_pages",
445 "nr_dirty", 464 "nr_dirty",
446 "nr_writeback", 465 "nr_writeback",
447 "nr_unstable", 466 "nr_unstable",
448 "nr_bounce", 467 "nr_bounce",
468 "nr_vmscan_write",
449 469
450#ifdef CONFIG_NUMA 470#ifdef CONFIG_NUMA
451 "numa_hit", 471 "numa_hit",
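TEXTS_FOR_ZONES() keeps the /proc/vmstat names in step with whichever zones the kernel is configured with, instead of hard-coding the four-zone layout. For example, with both optional zones configured in, TEXTS_FOR_ZONES("pgalloc") expands to the same strings the old hand-written list had:

	"pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_high",

and on a configuration without CONFIG_HIGHMEM the "_high" entry simply disappears, matching the counters the zoned VM actually maintains.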
@@ -462,10 +482,7 @@ static char *vmstat_text[] = {
462 "pswpin", 482 "pswpin",
463 "pswpout", 483 "pswpout",
464 484
465 "pgalloc_dma", 485 TEXTS_FOR_ZONES("pgalloc")
466 "pgalloc_dma32",
467 "pgalloc_normal",
468 "pgalloc_high",
469 486
470 "pgfree", 487 "pgfree",
471 "pgactivate", 488 "pgactivate",
@@ -474,25 +491,10 @@ static char *vmstat_text[] = {
474 "pgfault", 491 "pgfault",
475 "pgmajfault", 492 "pgmajfault",
476 493
477 "pgrefill_dma", 494 TEXTS_FOR_ZONES("pgrefill")
478 "pgrefill_dma32", 495 TEXTS_FOR_ZONES("pgsteal")
479 "pgrefill_normal", 496 TEXTS_FOR_ZONES("pgscan_kswapd")
480 "pgrefill_high", 497 TEXTS_FOR_ZONES("pgscan_direct")
481
482 "pgsteal_dma",
483 "pgsteal_dma32",
484 "pgsteal_normal",
485 "pgsteal_high",
486
487 "pgscan_kswapd_dma",
488 "pgscan_kswapd_dma32",
489 "pgscan_kswapd_normal",
490 "pgscan_kswapd_high",
491
492 "pgscan_direct_dma",
493 "pgscan_direct_dma32",
494 "pgscan_direct_normal",
495 "pgscan_direct_high",
496 498
497 "pginodesteal", 499 "pginodesteal",
498 "slabs_scanned", 500 "slabs_scanned",