author     David Woodhouse <dwmw2@infradead.org>   2006-10-01 12:55:53 -0400
committer  David Woodhouse <dwmw2@infradead.org>   2006-10-01 12:55:53 -0400
commit     8a84fc15ae5cafcc366dd85cf8e1ab2040679abc (patch)
tree       5d8dce194c9667fa92e9ec9f545cec867a9a1e0d /mm
parent     28b79ff9661b22e4c41c0d00d4ab8503e810f13d (diff)
parent     82965addad66fce61a92c5f03104ea90b0b87124 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Manually resolve conflict in include/mtd/Kbuild
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    7
-rw-r--r--  mm/Makefile           |    6
-rw-r--r--  mm/allocpercpu.c      |  129
-rw-r--r--  mm/bootmem.c          |  202
-rw-r--r--  mm/bounce.c           |  302
-rw-r--r--  mm/filemap.c          |  211
-rw-r--r--  mm/fremap.c           |    6
-rw-r--r--  mm/highmem.c          |  294
-rw-r--r--  mm/hugetlb.c          |   10
-rw-r--r--  mm/internal.h         |    4
-rw-r--r--  mm/memory.c           |  215
-rw-r--r--  mm/memory_hotplug.c   |   71
-rw-r--r--  mm/mempolicy.c        |   26
-rw-r--r--  mm/migrate.c          |    6
-rw-r--r--  mm/mmap.c             |   19
-rw-r--r--  mm/mprotect.c         |   53
-rw-r--r--  mm/mremap.c           |    2
-rw-r--r--  mm/msync.c            |  196
-rw-r--r--  mm/nommu.c            |  250
-rw-r--r--  mm/oom_kill.c         |  126
-rw-r--r--  mm/page-writeback.c   |  198
-rw-r--r--  mm/page_alloc.c       |  976
-rw-r--r--  mm/page_io.c          |   48
-rw-r--r--  mm/rmap.c             |   65
-rw-r--r--  mm/shmem.c            |  122
-rw-r--r--  mm/shmem_acl.c        |  197
-rw-r--r--  mm/slab.c             |  458
-rw-r--r--  mm/slob.c             |   52
-rw-r--r--  mm/swap.c             |   49
-rw-r--r--  mm/swapfile.c         |    7
-rw-r--r--  mm/truncate.c         |   85
-rw-r--r--  mm/util.c             |   18
-rw-r--r--  mm/vmalloc.c          |   38
-rw-r--r--  mm/vmscan.c           |  140
-rw-r--r--  mm/vmstat.c           |   52
35 files changed, 3199 insertions(+), 1441 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 8f5b45615f..5d88489ef2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
-	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on SPARSEMEM || X86_64_ACPI_NUMA
+	depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC64)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
 	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
 
+config MEMORY_HOTPLUG_SPARSE
+	def_bool y
+	depends on SPARSEMEM && MEMORY_HOTPLUG
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
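
The new MEMORY_HOTPLUG_SPARSE symbol lets C code tell "hotplug backed by sparsemem" apart from memory hotplug in general, which now also covers the X86_64_ACPI_NUMA case. A minimal, purely illustrative sketch of how a header might key off it — the helper name is hypothetical, not something this commit adds:

	#include <linux/types.h>
	#include <linux/errno.h>

	#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
	extern int arch_add_sparse_section(u64 start, u64 size);	/* hypothetical helper */
	#else
	static inline int arch_add_sparse_section(u64 start, u64 size)
	{
		return -ENOSYS;		/* built without sparsemem-backed hotplug */
	}
	#endif
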
diff --git a/mm/Makefile b/mm/Makefile
index 9dd824c11e..12b3a4eee8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,15 +12,19 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
 
+ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
+obj-y += bounce.o
+endif
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
+obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-
+obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 0000000000..eaa9abeea5
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,129 @@
1/*
2 * linux/mm/allocpercpu.c
3 *
4 * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
5 */
6#include <linux/mm.h>
7#include <linux/module.h>
8
9/**
10 * percpu_depopulate - depopulate per-cpu data for given cpu
11 * @__pdata: per-cpu data to depopulate
12 * @cpu: depopulate per-cpu data for this cpu
13 *
14 * Depopulating per-cpu data for a cpu going offline would be a typical
15 * use case. You need to register a cpu hotplug handler for that purpose.
16 */
17void percpu_depopulate(void *__pdata, int cpu)
18{
19 struct percpu_data *pdata = __percpu_disguise(__pdata);
20 if (pdata->ptrs[cpu]) {
21 kfree(pdata->ptrs[cpu]);
22 pdata->ptrs[cpu] = NULL;
23 }
24}
25EXPORT_SYMBOL_GPL(percpu_depopulate);
26
27/**
28 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
29 * @__pdata: per-cpu data to depopulate
30 * @mask: depopulate per-cpu data for cpu's selected through mask bits
31 */
32void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
33{
34 int cpu;
35 for_each_cpu_mask(cpu, *mask)
36 percpu_depopulate(__pdata, cpu);
37}
38EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
39
40/**
41 * percpu_populate - populate per-cpu data for given cpu
42 * @__pdata: per-cpu data to populate further
43 * @size: size of per-cpu object
44 * @gfp: may sleep or not etc.
45 * @cpu: populate per-data for this cpu
46 *
47 * Populating per-cpu data for a cpu coming online would be a typical
48 * use case. You need to register a cpu hotplug handler for that purpose.
49 * Per-cpu object is populated with zeroed buffer.
50 */
51void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
52{
53 struct percpu_data *pdata = __percpu_disguise(__pdata);
54 int node = cpu_to_node(cpu);
55
56 BUG_ON(pdata->ptrs[cpu]);
57 if (node_online(node)) {
58 /* FIXME: kzalloc_node(size, gfp, node) */
59 pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
60 if (pdata->ptrs[cpu])
61 memset(pdata->ptrs[cpu], 0, size);
62 } else
63 pdata->ptrs[cpu] = kzalloc(size, gfp);
64 return pdata->ptrs[cpu];
65}
66EXPORT_SYMBOL_GPL(percpu_populate);
67
68/**
69 * percpu_populate_mask - populate per-cpu data for more cpu's
70 * @__pdata: per-cpu data to populate further
71 * @size: size of per-cpu object
72 * @gfp: may sleep or not etc.
73 * @mask: populate per-cpu data for cpu's selected through mask bits
74 *
75 * Per-cpu objects are populated with zeroed buffers.
76 */
77int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
78 cpumask_t *mask)
79{
80 cpumask_t populated = CPU_MASK_NONE;
81 int cpu;
82
83 for_each_cpu_mask(cpu, *mask)
84 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
85 __percpu_depopulate_mask(__pdata, &populated);
86 return -ENOMEM;
87 } else
88 cpu_set(cpu, populated);
89 return 0;
90}
91EXPORT_SYMBOL_GPL(__percpu_populate_mask);
92
93/**
94 * percpu_alloc_mask - initial setup of per-cpu data
95 * @size: size of per-cpu object
96 * @gfp: may sleep or not etc.
97 * @mask: populate per-data for cpu's selected through mask bits
98 *
99 * Populating per-cpu data for all online cpu's would be a typical use case,
100 * which is simplified by the percpu_alloc() wrapper.
101 * Per-cpu objects are populated with zeroed buffers.
102 */
103void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
104{
105 void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
106 void *__pdata = __percpu_disguise(pdata);
107
108 if (unlikely(!pdata))
109 return NULL;
110 if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
111 return __pdata;
112 kfree(pdata);
113 return NULL;
114}
115EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
116
117/**
118 * percpu_free - final cleanup of per-cpu data
119 * @__pdata: object to clean up
120 *
121 * We simply clean up any per-cpu object left. No need for the client to
122 * track and specify through a bis mask which per-cpu objects are to free.
123 */
124void percpu_free(void *__pdata)
125{
126 __percpu_depopulate_mask(__pdata, &cpu_possible_map);
127 kfree(__percpu_disguise(__pdata));
128}
129EXPORT_SYMBOL_GPL(percpu_free);
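
The kernel-doc above spells out the populate/depopulate contract; for the common case the wrappers in <linux/percpu.h> are enough. A minimal client sketch, assuming the percpu_alloc()/percpu_free() wrappers and the per_cpu_ptr() accessor from that header in this era — the my_stats structure and function names are illustrative:

	#include <linux/percpu.h>
	#include <linux/smp.h>
	#include <linux/gfp.h>
	#include <linux/errno.h>

	struct my_stats {			/* illustrative per-cpu payload */
		unsigned long events;
	};

	static struct my_stats *stats;		/* "disguised" per-cpu pointer */

	static int my_stats_init(void)
	{
		/* one zeroed object per online cpu; NULL if any allocation fails */
		stats = percpu_alloc(sizeof(*stats), GFP_KERNEL);
		return stats ? 0 : -ENOMEM;
	}

	static void my_stats_hit(void)
	{
		int cpu = get_cpu();		/* stay on this cpu while touching its copy */

		per_cpu_ptr(stats, cpu)->events++;
		put_cpu();
	}

	static void my_stats_exit(void)
	{
		percpu_free(stats);		/* depopulates every cpu, then frees the descriptor */
	}
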
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 50353e0dac..d53112fcb4 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -8,17 +8,15 @@
8 * free memory collector. It's used to deal with reserved 8 * free memory collector. It's used to deal with reserved
9 * system memory and memory holes as well. 9 * system memory and memory holes as well.
10 */ 10 */
11
12#include <linux/mm.h>
13#include <linux/kernel_stat.h>
14#include <linux/swap.h>
15#include <linux/interrupt.h>
16#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h>
17#include <linux/bootmem.h> 13#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/module.h> 14#include <linux/module.h>
20#include <asm/dma.h> 15
16#include <asm/bug.h>
21#include <asm/io.h> 17#include <asm/io.h>
18#include <asm/processor.h>
19
22#include "internal.h" 20#include "internal.h"
23 21
24/* 22/*
@@ -41,7 +39,7 @@ unsigned long saved_max_pfn;
41#endif 39#endif
42 40
43/* return the number of _pages_ that will be allocated for the boot bitmap */ 41/* return the number of _pages_ that will be allocated for the boot bitmap */
44unsigned long __init bootmem_bootmap_pages (unsigned long pages) 42unsigned long __init bootmem_bootmap_pages(unsigned long pages)
45{ 43{
46 unsigned long mapsize; 44 unsigned long mapsize;
47 45
@@ -51,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
51 49
52 return mapsize; 50 return mapsize;
53} 51}
52
54/* 53/*
55 * link bdata in order 54 * link bdata in order
56 */ 55 */
57static void link_bootmem(bootmem_data_t *bdata) 56static void __init link_bootmem(bootmem_data_t *bdata)
58{ 57{
59 bootmem_data_t *ent; 58 bootmem_data_t *ent;
59
60 if (list_empty(&bdata_list)) { 60 if (list_empty(&bdata_list)) {
61 list_add(&bdata->list, &bdata_list); 61 list_add(&bdata->list, &bdata_list);
62 return; 62 return;
@@ -69,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata)
69 } 69 }
70 } 70 }
71 list_add_tail(&bdata->list, &bdata_list); 71 list_add_tail(&bdata->list, &bdata_list);
72 return;
73} 72}
74 73
74/*
75 * Given an initialised bdata, it returns the size of the boot bitmap
76 */
77static unsigned long __init get_mapsize(bootmem_data_t *bdata)
78{
79 unsigned long mapsize;
80 unsigned long start = PFN_DOWN(bdata->node_boot_start);
81 unsigned long end = bdata->node_low_pfn;
82
83 mapsize = ((end - start) + 7) / 8;
84 return ALIGN(mapsize, sizeof(long));
85}
75 86
76/* 87/*
77 * Called once to set up the allocator itself. 88 * Called once to set up the allocator itself.
78 */ 89 */
79static unsigned long __init init_bootmem_core (pg_data_t *pgdat, 90static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
80 unsigned long mapstart, unsigned long start, unsigned long end) 91 unsigned long mapstart, unsigned long start, unsigned long end)
81{ 92{
82 bootmem_data_t *bdata = pgdat->bdata; 93 bootmem_data_t *bdata = pgdat->bdata;
83 unsigned long mapsize = ((end - start)+7)/8; 94 unsigned long mapsize;
84 95
85 mapsize = ALIGN(mapsize, sizeof(long)); 96 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
86 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 97 bdata->node_boot_start = PFN_PHYS(start);
87 bdata->node_boot_start = (start << PAGE_SHIFT);
88 bdata->node_low_pfn = end; 98 bdata->node_low_pfn = end;
89 link_bootmem(bdata); 99 link_bootmem(bdata);
90 100
@@ -92,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
92 * Initially all pages are reserved - setup_arch() has to 102 * Initially all pages are reserved - setup_arch() has to
93 * register free RAM areas explicitly. 103 * register free RAM areas explicitly.
94 */ 104 */
105 mapsize = get_mapsize(bdata);
95 memset(bdata->node_bootmem_map, 0xff, mapsize); 106 memset(bdata->node_bootmem_map, 0xff, mapsize);
96 107
97 return mapsize; 108 return mapsize;
@@ -102,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
102 * might be used for boot-time allocations - or it might get added 113 * might be used for boot-time allocations - or it might get added
103 * to the free page pool later on. 114 * to the free page pool later on.
104 */ 115 */
105static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) 116static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
117 unsigned long size)
106{ 118{
119 unsigned long sidx, eidx;
107 unsigned long i; 120 unsigned long i;
121
108 /* 122 /*
109 * round up, partially reserved pages are considered 123 * round up, partially reserved pages are considered
110 * fully reserved. 124 * fully reserved.
111 */ 125 */
112 unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
113 unsigned long eidx = (addr + size - bdata->node_boot_start +
114 PAGE_SIZE-1)/PAGE_SIZE;
115 unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
116
117 BUG_ON(!size); 126 BUG_ON(!size);
118 BUG_ON(sidx >= eidx); 127 BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
119 BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); 128 BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
120 BUG_ON(end > bdata->node_low_pfn); 129
130 sidx = PFN_DOWN(addr - bdata->node_boot_start);
131 eidx = PFN_UP(addr + size - bdata->node_boot_start);
121 132
122 for (i = sidx; i < eidx; i++) 133 for (i = sidx; i < eidx; i++)
123 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 134 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
@@ -127,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
127 } 138 }
128} 139}
129 140
130static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) 141static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
142 unsigned long size)
131{ 143{
144 unsigned long sidx, eidx;
132 unsigned long i; 145 unsigned long i;
133 unsigned long start; 146
134 /* 147 /*
135 * round down end of usable mem, partially free pages are 148 * round down end of usable mem, partially free pages are
136 * considered reserved. 149 * considered reserved.
137 */ 150 */
138 unsigned long sidx;
139 unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
140 unsigned long end = (addr + size)/PAGE_SIZE;
141
142 BUG_ON(!size); 151 BUG_ON(!size);
143 BUG_ON(end > bdata->node_low_pfn); 152 BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
144 153
145 if (addr < bdata->last_success) 154 if (addr < bdata->last_success)
146 bdata->last_success = addr; 155 bdata->last_success = addr;
@@ -148,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
148 /* 157 /*
149 * Round up the beginning of the address. 158 * Round up the beginning of the address.
150 */ 159 */
151 start = (addr + PAGE_SIZE-1) / PAGE_SIZE; 160 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
152 sidx = start - (bdata->node_boot_start/PAGE_SIZE); 161 eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
153 162
154 for (i = sidx; i < eidx; i++) { 163 for (i = sidx; i < eidx; i++) {
155 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 164 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
@@ -175,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
175 unsigned long align, unsigned long goal, unsigned long limit) 184 unsigned long align, unsigned long goal, unsigned long limit)
176{ 185{
177 unsigned long offset, remaining_size, areasize, preferred; 186 unsigned long offset, remaining_size, areasize, preferred;
178 unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn; 187 unsigned long i, start = 0, incr, eidx, end_pfn;
179 void *ret; 188 void *ret;
180 189
181 if(!size) { 190 if (!size) {
182 printk("__alloc_bootmem_core(): zero-sized request\n"); 191 printk("__alloc_bootmem_core(): zero-sized request\n");
183 BUG(); 192 BUG();
184 } 193 }
@@ -187,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
187 if (limit && bdata->node_boot_start >= limit) 196 if (limit && bdata->node_boot_start >= limit)
188 return NULL; 197 return NULL;
189 198
190 limit >>=PAGE_SHIFT; 199 end_pfn = bdata->node_low_pfn;
200 limit = PFN_DOWN(limit);
191 if (limit && end_pfn > limit) 201 if (limit && end_pfn > limit)
192 end_pfn = limit; 202 end_pfn = limit;
193 203
194 eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 204 eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
195 offset = 0; 205 offset = 0;
196 if (align && 206 if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
197 (bdata->node_boot_start & (align - 1UL)) != 0) 207 offset = align - (bdata->node_boot_start & (align - 1UL));
198 offset = (align - (bdata->node_boot_start & (align - 1UL))); 208 offset = PFN_DOWN(offset);
199 offset >>= PAGE_SHIFT;
200 209
201 /* 210 /*
202 * We try to allocate bootmem pages above 'goal' 211 * We try to allocate bootmem pages above 'goal'
203 * first, then we try to allocate lower pages. 212 * first, then we try to allocate lower pages.
204 */ 213 */
205 if (goal && (goal >= bdata->node_boot_start) && 214 if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
206 ((goal >> PAGE_SHIFT) < end_pfn)) {
207 preferred = goal - bdata->node_boot_start; 215 preferred = goal - bdata->node_boot_start;
208 216
209 if (bdata->last_success >= preferred) 217 if (bdata->last_success >= preferred)
@@ -212,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
212 } else 220 } else
213 preferred = 0; 221 preferred = 0;
214 222
215 preferred = ALIGN(preferred, align) >> PAGE_SHIFT; 223 preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
216 preferred += offset; 224 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
217 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
218 incr = align >> PAGE_SHIFT ? : 1; 225 incr = align >> PAGE_SHIFT ? : 1;
219 226
220restart_scan: 227restart_scan:
@@ -229,7 +236,7 @@ restart_scan:
229 for (j = i + 1; j < i + areasize; ++j) { 236 for (j = i + 1; j < i + areasize; ++j) {
230 if (j >= eidx) 237 if (j >= eidx)
231 goto fail_block; 238 goto fail_block;
232 if (test_bit (j, bdata->node_bootmem_map)) 239 if (test_bit(j, bdata->node_bootmem_map))
233 goto fail_block; 240 goto fail_block;
234 } 241 }
235 start = i; 242 start = i;
@@ -245,7 +252,7 @@ restart_scan:
245 return NULL; 252 return NULL;
246 253
247found: 254found:
248 bdata->last_success = start << PAGE_SHIFT; 255 bdata->last_success = PFN_PHYS(start);
249 BUG_ON(start >= eidx); 256 BUG_ON(start >= eidx);
250 257
251 /* 258 /*
@@ -257,19 +264,21 @@ found:
257 bdata->last_offset && bdata->last_pos+1 == start) { 264 bdata->last_offset && bdata->last_pos+1 == start) {
258 offset = ALIGN(bdata->last_offset, align); 265 offset = ALIGN(bdata->last_offset, align);
259 BUG_ON(offset > PAGE_SIZE); 266 BUG_ON(offset > PAGE_SIZE);
260 remaining_size = PAGE_SIZE-offset; 267 remaining_size = PAGE_SIZE - offset;
261 if (size < remaining_size) { 268 if (size < remaining_size) {
262 areasize = 0; 269 areasize = 0;
263 /* last_pos unchanged */ 270 /* last_pos unchanged */
264 bdata->last_offset = offset+size; 271 bdata->last_offset = offset + size;
265 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 272 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
266 bdata->node_boot_start); 273 offset +
274 bdata->node_boot_start);
267 } else { 275 } else {
268 remaining_size = size - remaining_size; 276 remaining_size = size - remaining_size;
269 areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; 277 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
270 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 278 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
271 bdata->node_boot_start); 279 offset +
272 bdata->last_pos = start+areasize-1; 280 bdata->node_boot_start);
281 bdata->last_pos = start + areasize - 1;
273 bdata->last_offset = remaining_size; 282 bdata->last_offset = remaining_size;
274 } 283 }
275 bdata->last_offset &= ~PAGE_MASK; 284 bdata->last_offset &= ~PAGE_MASK;
@@ -282,7 +291,7 @@ found:
282 /* 291 /*
283 * Reserve the area now: 292 * Reserve the area now:
284 */ 293 */
285 for (i = start; i < start+areasize; i++) 294 for (i = start; i < start + areasize; i++)
286 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) 295 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
287 BUG(); 296 BUG();
288 memset(ret, 0, size); 297 memset(ret, 0, size);
@@ -303,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
303 312
304 count = 0; 313 count = 0;
305 /* first extant page of the node */ 314 /* first extant page of the node */
306 pfn = bdata->node_boot_start >> PAGE_SHIFT; 315 pfn = PFN_DOWN(bdata->node_boot_start);
307 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 316 idx = bdata->node_low_pfn - pfn;
308 map = bdata->node_bootmem_map; 317 map = bdata->node_bootmem_map;
309 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ 318 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
310 if (bdata->node_boot_start == 0 || 319 if (bdata->node_boot_start == 0 ||
@@ -333,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
333 } 342 }
334 } 343 }
335 } else { 344 } else {
336 i+=BITS_PER_LONG; 345 i += BITS_PER_LONG;
337 } 346 }
338 pfn += BITS_PER_LONG; 347 pfn += BITS_PER_LONG;
339 } 348 }
@@ -345,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
345 */ 354 */
346 page = virt_to_page(bdata->node_bootmem_map); 355 page = virt_to_page(bdata->node_bootmem_map);
347 count = 0; 356 count = 0;
348 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 357 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
349 count++; 358 for (i = 0; i < idx; i++, page++) {
350 __free_pages_bootmem(page, 0); 359 __free_pages_bootmem(page, 0);
360 count++;
351 } 361 }
352 total += count; 362 total += count;
353 bdata->node_bootmem_map = NULL; 363 bdata->node_bootmem_map = NULL;
@@ -355,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
355 return total; 365 return total;
356} 366}
357 367
358unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) 368unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
369 unsigned long startpfn, unsigned long endpfn)
359{ 370{
360 return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); 371 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
361} 372}
362 373
363void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) 374void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size)
364{ 376{
365 reserve_bootmem_core(pgdat->bdata, physaddr, size); 377 reserve_bootmem_core(pgdat->bdata, physaddr, size);
366} 378}
367 379
368void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) 380void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
381 unsigned long size)
369{ 382{
370 free_bootmem_core(pgdat->bdata, physaddr, size); 383 free_bootmem_core(pgdat->bdata, physaddr, size);
371} 384}
372 385
373unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) 386unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
374{ 387{
375 return(free_all_bootmem_core(pgdat)); 388 return free_all_bootmem_core(pgdat);
376} 389}
377 390
378unsigned long __init init_bootmem (unsigned long start, unsigned long pages) 391unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
379{ 392{
380 max_low_pfn = pages; 393 max_low_pfn = pages;
381 min_low_pfn = start; 394 min_low_pfn = start;
382 return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); 395 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
383} 396}
384 397
385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 398#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386void __init reserve_bootmem (unsigned long addr, unsigned long size) 399void __init reserve_bootmem(unsigned long addr, unsigned long size)
387{ 400{
388 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); 401 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
389} 402}
390#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 403#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
391 404
392void __init free_bootmem (unsigned long addr, unsigned long size) 405void __init free_bootmem(unsigned long addr, unsigned long size)
393{ 406{
394 free_bootmem_core(NODE_DATA(0)->bdata, addr, size); 407 free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
395} 408}
396 409
397unsigned long __init free_all_bootmem (void) 410unsigned long __init free_all_bootmem(void)
398{ 411{
399 return(free_all_bootmem_core(NODE_DATA(0))); 412 return free_all_bootmem_core(NODE_DATA(0));
400} 413}
401 414
402void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) 415void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
416 unsigned long goal)
403{ 417{
404 bootmem_data_t *bdata; 418 bootmem_data_t *bdata;
405 void *ptr; 419 void *ptr;
406 420
407 list_for_each_entry(bdata, &bdata_list, list) 421 list_for_each_entry(bdata, &bdata_list, list) {
408 if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) 422 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
409 return(ptr); 423 if (ptr)
424 return ptr;
425 }
410 return NULL; 426 return NULL;
411} 427}
412 428
413void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) 429void * __init __alloc_bootmem(unsigned long size, unsigned long align,
430 unsigned long goal)
414{ 431{
415 void *mem = __alloc_bootmem_nopanic(size,align,goal); 432 void *mem = __alloc_bootmem_nopanic(size,align,goal);
433
416 if (mem) 434 if (mem)
417 return mem; 435 return mem;
418 /* 436 /*
@@ -424,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned
424} 442}
425 443
426 444
427void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, 445void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
428 unsigned long goal) 446 unsigned long align, unsigned long goal)
429{ 447{
430 void *ptr; 448 void *ptr;
431 449
432 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 450 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
433 if (ptr) 451 if (ptr)
434 return (ptr); 452 return ptr;
435 453
436 return __alloc_bootmem(size, align, goal); 454 return __alloc_bootmem(size, align, goal);
437} 455}
438 456
439#define LOW32LIMIT 0xffffffff 457#ifndef ARCH_LOW_ADDRESS_LIMIT
458#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
459#endif
440 460
441void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) 461void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
462 unsigned long goal)
442{ 463{
443 bootmem_data_t *bdata; 464 bootmem_data_t *bdata;
444 void *ptr; 465 void *ptr;
445 466
446 list_for_each_entry(bdata, &bdata_list, list) 467 list_for_each_entry(bdata, &bdata_list, list) {
447 if ((ptr = __alloc_bootmem_core(bdata, size, 468 ptr = __alloc_bootmem_core(bdata, size, align, goal,
448 align, goal, LOW32LIMIT))) 469 ARCH_LOW_ADDRESS_LIMIT);
449 return(ptr); 470 if (ptr)
471 return ptr;
472 }
450 473
451 /* 474 /*
452 * Whoops, we cannot satisfy the allocation request. 475 * Whoops, we cannot satisfy the allocation request.
@@ -459,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig
459void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 482void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
460 unsigned long align, unsigned long goal) 483 unsigned long align, unsigned long goal)
461{ 484{
462 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); 485 return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
486 ARCH_LOW_ADDRESS_LIMIT);
463} 487}
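
Most of the bootmem.c churn above is a mechanical conversion from open-coded PAGE_SHIFT arithmetic to the PFN_* helpers pulled in by the new <linux/pfn.h> include. For reference, those helpers reduce to roughly the following, as defined around this kernel version:

	/* <linux/pfn.h>, approximately: address <-> page-frame-number conversions */
	#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
	#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)	/* round up to the next pfn */
	#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)			/* truncate to the containing pfn */
	#define PFN_PHYS(x)	((x) << PAGE_SHIFT)			/* pfn back to a physical address */
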
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 0000000000..e4b62d2a40
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,302 @@
1/* bounce buffer handling for block devices
2 *
3 * - Split from highmem.c
4 */
5
6#include <linux/mm.h>
7#include <linux/module.h>
8#include <linux/swap.h>
9#include <linux/bio.h>
10#include <linux/pagemap.h>
11#include <linux/mempool.h>
12#include <linux/blkdev.h>
13#include <linux/init.h>
14#include <linux/hash.h>
15#include <linux/highmem.h>
16#include <linux/blktrace_api.h>
17#include <asm/tlbflush.h>
18
19#define POOL_SIZE 64
20#define ISA_POOL_SIZE 16
21
22static mempool_t *page_pool, *isa_page_pool;
23
24#ifdef CONFIG_HIGHMEM
25static __init int init_emergency_pool(void)
26{
27 struct sysinfo i;
28 si_meminfo(&i);
29 si_swapinfo(&i);
30
31 if (!i.totalhigh)
32 return 0;
33
34 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
35 BUG_ON(!page_pool);
36 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
37
38 return 0;
39}
40
41__initcall(init_emergency_pool);
42
43/*
44 * highmem version, map in to vec
45 */
46static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
47{
48 unsigned long flags;
49 unsigned char *vto;
50
51 local_irq_save(flags);
52 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
53 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
54 kunmap_atomic(vto, KM_BOUNCE_READ);
55 local_irq_restore(flags);
56}
57
58#else /* CONFIG_HIGHMEM */
59
60#define bounce_copy_vec(to, vfrom) \
61 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
62
63#endif /* CONFIG_HIGHMEM */
64
65/*
66 * allocate pages in the DMA region for the ISA pool
67 */
68static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
69{
70 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
71}
72
73/*
74 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
75 * as the max address, so check if the pool has already been created.
76 */
77int init_emergency_isa_pool(void)
78{
79 if (isa_page_pool)
80 return 0;
81
82 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
83 mempool_free_pages, (void *) 0);
84 BUG_ON(!isa_page_pool);
85
86 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
87 return 0;
88}
89
90/*
91 * Simple bounce buffer support for highmem pages. Depending on the
92 * queue gfp mask set, *to may or may not be a highmem page. kmap it
93 * always, it will do the Right Thing
94 */
95static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
96{
97 unsigned char *vfrom;
98 struct bio_vec *tovec, *fromvec;
99 int i;
100
101 __bio_for_each_segment(tovec, to, i, 0) {
102 fromvec = from->bi_io_vec + i;
103
104 /*
105 * not bounced
106 */
107 if (tovec->bv_page == fromvec->bv_page)
108 continue;
109
110 /*
111 * fromvec->bv_offset and fromvec->bv_len might have been
112 * modified by the block layer, so use the original copy,
113 * bounce_copy_vec already uses tovec->bv_len
114 */
115 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
116
117 flush_dcache_page(tovec->bv_page);
118 bounce_copy_vec(tovec, vfrom);
119 }
120}
121
122static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
123{
124 struct bio *bio_orig = bio->bi_private;
125 struct bio_vec *bvec, *org_vec;
126 int i;
127
128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
130
131 /*
132 * free up bounce indirect pages used
133 */
134 __bio_for_each_segment(bvec, bio, i, 0) {
135 org_vec = bio_orig->bi_io_vec + i;
136 if (bvec->bv_page == org_vec->bv_page)
137 continue;
138
139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
140 mempool_free(bvec->bv_page, pool);
141 }
142
143 bio_endio(bio_orig, bio_orig->bi_size, err);
144 bio_put(bio);
145}
146
147static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
148{
149 if (bio->bi_size)
150 return 1;
151
152 bounce_end_io(bio, page_pool, err);
153 return 0;
154}
155
156static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
157{
158 if (bio->bi_size)
159 return 1;
160
161 bounce_end_io(bio, isa_page_pool, err);
162 return 0;
163}
164
165static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
166{
167 struct bio *bio_orig = bio->bi_private;
168
169 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
170 copy_to_high_bio_irq(bio_orig, bio);
171
172 bounce_end_io(bio, pool, err);
173}
174
175static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
176{
177 if (bio->bi_size)
178 return 1;
179
180 __bounce_end_io_read(bio, page_pool, err);
181 return 0;
182}
183
184static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
185{
186 if (bio->bi_size)
187 return 1;
188
189 __bounce_end_io_read(bio, isa_page_pool, err);
190 return 0;
191}
192
193static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
194 mempool_t *pool)
195{
196 struct page *page;
197 struct bio *bio = NULL;
198 int i, rw = bio_data_dir(*bio_orig);
199 struct bio_vec *to, *from;
200
201 bio_for_each_segment(from, *bio_orig, i) {
202 page = from->bv_page;
203
204 /*
205 * is destination page below bounce pfn?
206 */
207 if (page_to_pfn(page) < q->bounce_pfn)
208 continue;
209
210 /*
211 * irk, bounce it
212 */
213 if (!bio)
214 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
215
216 to = bio->bi_io_vec + i;
217
218 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
219 to->bv_len = from->bv_len;
220 to->bv_offset = from->bv_offset;
221 inc_zone_page_state(to->bv_page, NR_BOUNCE);
222
223 if (rw == WRITE) {
224 char *vto, *vfrom;
225
226 flush_dcache_page(from->bv_page);
227 vto = page_address(to->bv_page) + to->bv_offset;
228 vfrom = kmap(from->bv_page) + from->bv_offset;
229 memcpy(vto, vfrom, to->bv_len);
230 kunmap(from->bv_page);
231 }
232 }
233
234 /*
235 * no pages bounced
236 */
237 if (!bio)
238 return;
239
240 /*
241 * at least one page was bounced, fill in possible non-highmem
242 * pages
243 */
244 __bio_for_each_segment(from, *bio_orig, i, 0) {
245 to = bio_iovec_idx(bio, i);
246 if (!to->bv_page) {
247 to->bv_page = from->bv_page;
248 to->bv_len = from->bv_len;
249 to->bv_offset = from->bv_offset;
250 }
251 }
252
253 bio->bi_bdev = (*bio_orig)->bi_bdev;
254 bio->bi_flags |= (1 << BIO_BOUNCED);
255 bio->bi_sector = (*bio_orig)->bi_sector;
256 bio->bi_rw = (*bio_orig)->bi_rw;
257
258 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
259 bio->bi_idx = (*bio_orig)->bi_idx;
260 bio->bi_size = (*bio_orig)->bi_size;
261
262 if (pool == page_pool) {
263 bio->bi_end_io = bounce_end_io_write;
264 if (rw == READ)
265 bio->bi_end_io = bounce_end_io_read;
266 } else {
267 bio->bi_end_io = bounce_end_io_write_isa;
268 if (rw == READ)
269 bio->bi_end_io = bounce_end_io_read_isa;
270 }
271
272 bio->bi_private = *bio_orig;
273 *bio_orig = bio;
274}
275
276void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
277{
278 mempool_t *pool;
279
280 /*
281 * for non-isa bounce case, just check if the bounce pfn is equal
282 * to or bigger than the highest pfn in the system -- in that case,
283 * don't waste time iterating over bio segments
284 */
285 if (!(q->bounce_gfp & GFP_DMA)) {
286 if (q->bounce_pfn >= blk_max_pfn)
287 return;
288 pool = page_pool;
289 } else {
290 BUG_ON(!isa_page_pool);
291 pool = isa_page_pool;
292 }
293
294 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
295
296 /*
297 * slow path
298 */
299 __blk_queue_bounce(q, bio_orig, pool);
300}
301
302EXPORT_SYMBOL(blk_queue_bounce);
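
blk_queue_bounce() is the only entry point bounce.c exports; request-based drivers get it called for them by the block layer, while a bio-based (make_request) driver that cannot reach highmem may invoke it directly. A rough sketch under that assumption, using the request_queue_t-era block API — my_make_request and my_setup_queue are illustrative:

	#include <linux/blkdev.h>
	#include <linux/bio.h>
	#include <linux/gfp.h>

	static int my_make_request(request_queue_t *q, struct bio *bio)
	{
		/* may replace *bio with a clone whose highmem segments point at
		 * bounce pages (write data is copied in before we see it) */
		blk_queue_bounce(q, &bio);

		/* ... hand the (possibly substituted) bio to the hardware ... */
		bio_endio(bio, bio->bi_size, 0);
		return 0;
	}

	static request_queue_t *my_setup_queue(void)
	{
		request_queue_t *q = blk_alloc_queue(GFP_KERNEL);

		if (!q)
			return NULL;
		blk_queue_make_request(q, my_make_request);
		blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);	/* bounce anything above lowmem */
		return q;
	}
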
diff --git a/mm/filemap.c b/mm/filemap.c
index b9a60c43b6..ec46923598 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -488,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x)
488EXPORT_SYMBOL(page_cache_alloc_cold); 488EXPORT_SYMBOL(page_cache_alloc_cold);
489#endif 489#endif
490 490
491static int __sleep_on_page_lock(void *word)
492{
493 io_schedule();
494 return 0;
495}
496
491/* 497/*
492 * In order to wait for pages to become available there must be 498 * In order to wait for pages to become available there must be
493 * waitqueues associated with pages. By using a hash table of 499 * waitqueues associated with pages. By using a hash table of
@@ -577,13 +583,24 @@ void fastcall __lock_page(struct page *page)
577} 583}
578EXPORT_SYMBOL(__lock_page); 584EXPORT_SYMBOL(__lock_page);
579 585
586/*
587 * Variant of lock_page that does not require the caller to hold a reference
588 * on the page's mapping.
589 */
590void fastcall __lock_page_nosync(struct page *page)
591{
592 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
593 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
594 TASK_UNINTERRUPTIBLE);
595}
596
580/** 597/**
581 * find_get_page - find and get a page reference 598 * find_get_page - find and get a page reference
582 * @mapping: the address_space to search 599 * @mapping: the address_space to search
583 * @offset: the page index 600 * @offset: the page index
584 * 601 *
585 * A rather lightweight function, finding and getting a reference to a 602 * Is there a pagecache struct page at the given (mapping, offset) tuple?
586 * hashed page atomically. 603 * If yes, increment its refcount and return it; if no, return NULL.
587 */ 604 */
588struct page * find_get_page(struct address_space *mapping, unsigned long offset) 605struct page * find_get_page(struct address_space *mapping, unsigned long offset)
589{ 606{
@@ -970,7 +987,7 @@ page_not_up_to_date:
970 /* Get exclusive access to the page ... */ 987 /* Get exclusive access to the page ... */
971 lock_page(page); 988 lock_page(page);
972 989
973 /* Did it get unhashed before we got the lock? */ 990 /* Did it get truncated before we got the lock? */
974 if (!page->mapping) { 991 if (!page->mapping) {
975 unlock_page(page); 992 unlock_page(page);
976 page_cache_release(page); 993 page_cache_release(page);
@@ -1132,13 +1149,14 @@ success:
1132 * that can use the page cache directly. 1149 * that can use the page cache directly.
1133 */ 1150 */
1134ssize_t 1151ssize_t
1135__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 1152generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1136 unsigned long nr_segs, loff_t *ppos) 1153 unsigned long nr_segs, loff_t pos)
1137{ 1154{
1138 struct file *filp = iocb->ki_filp; 1155 struct file *filp = iocb->ki_filp;
1139 ssize_t retval; 1156 ssize_t retval;
1140 unsigned long seg; 1157 unsigned long seg;
1141 size_t count; 1158 size_t count;
1159 loff_t *ppos = &iocb->ki_pos;
1142 1160
1143 count = 0; 1161 count = 0;
1144 for (seg = 0; seg < nr_segs; seg++) { 1162 for (seg = 0; seg < nr_segs; seg++) {
@@ -1162,7 +1180,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1162 1180
1163 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1181 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1164 if (filp->f_flags & O_DIRECT) { 1182 if (filp->f_flags & O_DIRECT) {
1165 loff_t pos = *ppos, size; 1183 loff_t size;
1166 struct address_space *mapping; 1184 struct address_space *mapping;
1167 struct inode *inode; 1185 struct inode *inode;
1168 1186
@@ -1206,33 +1224,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1206out: 1224out:
1207 return retval; 1225 return retval;
1208} 1226}
1209EXPORT_SYMBOL(__generic_file_aio_read);
1210
1211ssize_t
1212generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1213{
1214 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1215
1216 BUG_ON(iocb->ki_pos != pos);
1217 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1218}
1219EXPORT_SYMBOL(generic_file_aio_read); 1227EXPORT_SYMBOL(generic_file_aio_read);
1220 1228
1221ssize_t
1222generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1223{
1224 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1225 struct kiocb kiocb;
1226 ssize_t ret;
1227
1228 init_sync_kiocb(&kiocb, filp);
1229 ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1230 if (-EIOCBQUEUED == ret)
1231 ret = wait_on_sync_kiocb(&kiocb);
1232 return ret;
1233}
1234EXPORT_SYMBOL(generic_file_read);
1235
1236int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) 1229int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1237{ 1230{
1238 ssize_t written; 1231 ssize_t written;
@@ -1454,7 +1447,7 @@ outside_data_content:
1454 * accessible.. 1447 * accessible..
1455 */ 1448 */
1456 if (area->vm_mm == current->mm) 1449 if (area->vm_mm == current->mm)
1457 return NULL; 1450 return NOPAGE_SIGBUS;
1458 /* Fall through to the non-read-ahead case */ 1451 /* Fall through to the non-read-ahead case */
1459no_cached_page: 1452no_cached_page:
1460 /* 1453 /*
@@ -1479,7 +1472,7 @@ no_cached_page:
1479 */ 1472 */
1480 if (error == -ENOMEM) 1473 if (error == -ENOMEM)
1481 return NOPAGE_OOM; 1474 return NOPAGE_OOM;
1482 return NULL; 1475 return NOPAGE_SIGBUS;
1483 1476
1484page_not_uptodate: 1477page_not_uptodate:
1485 if (!did_readaround) { 1478 if (!did_readaround) {
@@ -1548,7 +1541,7 @@ page_not_uptodate:
1548 */ 1541 */
1549 shrink_readahead_size_eio(file, ra); 1542 shrink_readahead_size_eio(file, ra);
1550 page_cache_release(page); 1543 page_cache_release(page);
1551 return NULL; 1544 return NOPAGE_SIGBUS;
1552} 1545}
1553EXPORT_SYMBOL(filemap_nopage); 1546EXPORT_SYMBOL(filemap_nopage);
1554 1547
@@ -1610,7 +1603,7 @@ no_cached_page:
1610page_not_uptodate: 1603page_not_uptodate:
1611 lock_page(page); 1604 lock_page(page);
1612 1605
1613 /* Did it get unhashed while we waited for it? */ 1606 /* Did it get truncated while we waited for it? */
1614 if (!page->mapping) { 1607 if (!page->mapping) {
1615 unlock_page(page); 1608 unlock_page(page);
1616 goto err; 1609 goto err;
@@ -2003,6 +1996,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
2003 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) 1996 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2004 *count = inode->i_sb->s_maxbytes - *pos; 1997 *count = inode->i_sb->s_maxbytes - *pos;
2005 } else { 1998 } else {
1999#ifdef CONFIG_BLOCK
2006 loff_t isize; 2000 loff_t isize;
2007 if (bdev_read_only(I_BDEV(inode))) 2001 if (bdev_read_only(I_BDEV(inode)))
2008 return -EPERM; 2002 return -EPERM;
@@ -2014,6 +2008,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
2014 2008
2015 if (*pos + *count > isize) 2009 if (*pos + *count > isize)
2016 *count = isize - *pos; 2010 *count = isize - *pos;
2011#else
2012 return -EPERM;
2013#endif
2017 } 2014 }
2018 return 0; 2015 return 0;
2019} 2016}
@@ -2294,22 +2291,22 @@ out:
2294 current->backing_dev_info = NULL; 2291 current->backing_dev_info = NULL;
2295 return written ? written : err; 2292 return written ? written : err;
2296} 2293}
2297EXPORT_SYMBOL(generic_file_aio_write_nolock);
2298 2294
2299ssize_t 2295ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2300generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2296 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2301 unsigned long nr_segs, loff_t *ppos)
2302{ 2297{
2303 struct file *file = iocb->ki_filp; 2298 struct file *file = iocb->ki_filp;
2304 struct address_space *mapping = file->f_mapping; 2299 struct address_space *mapping = file->f_mapping;
2305 struct inode *inode = mapping->host; 2300 struct inode *inode = mapping->host;
2306 ssize_t ret; 2301 ssize_t ret;
2307 loff_t pos = *ppos;
2308 2302
2309 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); 2303 BUG_ON(iocb->ki_pos != pos);
2304
2305 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2306 &iocb->ki_pos);
2310 2307
2311 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2308 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2312 int err; 2309 ssize_t err;
2313 2310
2314 err = sync_page_range_nolock(inode, mapping, pos, ret); 2311 err = sync_page_range_nolock(inode, mapping, pos, ret);
2315 if (err < 0) 2312 if (err < 0)
@@ -2317,51 +2314,21 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2317 } 2314 }
2318 return ret; 2315 return ret;
2319} 2316}
2317EXPORT_SYMBOL(generic_file_aio_write_nolock);
2320 2318
2321static ssize_t 2319ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2322__generic_file_write_nolock(struct file *file, const struct iovec *iov, 2320 unsigned long nr_segs, loff_t pos)
2323 unsigned long nr_segs, loff_t *ppos)
2324{
2325 struct kiocb kiocb;
2326 ssize_t ret;
2327
2328 init_sync_kiocb(&kiocb, file);
2329 ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2330 if (ret == -EIOCBQUEUED)
2331 ret = wait_on_sync_kiocb(&kiocb);
2332 return ret;
2333}
2334
2335ssize_t
2336generic_file_write_nolock(struct file *file, const struct iovec *iov,
2337 unsigned long nr_segs, loff_t *ppos)
2338{
2339 struct kiocb kiocb;
2340 ssize_t ret;
2341
2342 init_sync_kiocb(&kiocb, file);
2343 ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2344 if (-EIOCBQUEUED == ret)
2345 ret = wait_on_sync_kiocb(&kiocb);
2346 return ret;
2347}
2348EXPORT_SYMBOL(generic_file_write_nolock);
2349
2350ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2351 size_t count, loff_t pos)
2352{ 2321{
2353 struct file *file = iocb->ki_filp; 2322 struct file *file = iocb->ki_filp;
2354 struct address_space *mapping = file->f_mapping; 2323 struct address_space *mapping = file->f_mapping;
2355 struct inode *inode = mapping->host; 2324 struct inode *inode = mapping->host;
2356 ssize_t ret; 2325 ssize_t ret;
2357 struct iovec local_iov = { .iov_base = (void __user *)buf,
2358 .iov_len = count };
2359 2326
2360 BUG_ON(iocb->ki_pos != pos); 2327 BUG_ON(iocb->ki_pos != pos);
2361 2328
2362 mutex_lock(&inode->i_mutex); 2329 mutex_lock(&inode->i_mutex);
2363 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, 2330 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2364 &iocb->ki_pos); 2331 &iocb->ki_pos);
2365 mutex_unlock(&inode->i_mutex); 2332 mutex_unlock(&inode->i_mutex);
2366 2333
2367 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2334 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2375,66 +2342,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2375} 2342}
2376EXPORT_SYMBOL(generic_file_aio_write); 2343EXPORT_SYMBOL(generic_file_aio_write);
2377 2344
2378ssize_t generic_file_write(struct file *file, const char __user *buf,
2379 size_t count, loff_t *ppos)
2380{
2381 struct address_space *mapping = file->f_mapping;
2382 struct inode *inode = mapping->host;
2383 ssize_t ret;
2384 struct iovec local_iov = { .iov_base = (void __user *)buf,
2385 .iov_len = count };
2386
2387 mutex_lock(&inode->i_mutex);
2388 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2389 mutex_unlock(&inode->i_mutex);
2390
2391 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2392 ssize_t err;
2393
2394 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2395 if (err < 0)
2396 ret = err;
2397 }
2398 return ret;
2399}
2400EXPORT_SYMBOL(generic_file_write);
2401
2402ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2403 unsigned long nr_segs, loff_t *ppos)
2404{
2405 struct kiocb kiocb;
2406 ssize_t ret;
2407
2408 init_sync_kiocb(&kiocb, filp);
2409 ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2410 if (-EIOCBQUEUED == ret)
2411 ret = wait_on_sync_kiocb(&kiocb);
2412 return ret;
2413}
2414EXPORT_SYMBOL(generic_file_readv);
2415
2416ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2417 unsigned long nr_segs, loff_t *ppos)
2418{
2419 struct address_space *mapping = file->f_mapping;
2420 struct inode *inode = mapping->host;
2421 ssize_t ret;
2422
2423 mutex_lock(&inode->i_mutex);
2424 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2425 mutex_unlock(&inode->i_mutex);
2426
2427 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2428 int err;
2429
2430 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2431 if (err < 0)
2432 ret = err;
2433 }
2434 return ret;
2435}
2436EXPORT_SYMBOL(generic_file_writev);
2437
2438/* 2345/*
2439 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something 2346 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2440 * went wrong during pagecache shootdown. 2347 * went wrong during pagecache shootdown.
@@ -2474,3 +2381,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2474 } 2381 }
2475 return retval; 2382 return retval;
2476} 2383}
2384
2385/**
2386 * try_to_release_page() - release old fs-specific metadata on a page
2387 *
2388 * @page: the page which the kernel is trying to free
2389 * @gfp_mask: memory allocation flags (and I/O mode)
2390 *
2391 * The address_space is to try to release any data against the page
2392 * (presumably at page->private). If the release was successful, return `1'.
2393 * Otherwise return zero.
2394 *
2395 * The @gfp_mask argument specifies whether I/O may be performed to release
2396 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
2397 *
2398 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2399 */
2400int try_to_release_page(struct page *page, gfp_t gfp_mask)
2401{
2402 struct address_space * const mapping = page->mapping;
2403
2404 BUG_ON(!PageLocked(page));
2405 if (PageWriteback(page))
2406 return 0;
2407
2408 if (mapping && mapping->a_ops->releasepage)
2409 return mapping->a_ops->releasepage(page, gfp_mask);
2410 return try_to_free_buffers(page);
2411}
2412
2413EXPORT_SYMBOL(try_to_release_page);
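
The filemap.c portion above is part of the wider switch to a single iovec-based AIO entry point: generic_file_read/readv/write/writev and __generic_file_aio_read disappear, and generic_file_aio_read/aio_write now take (iov, nr_segs, pos) directly. After that conversion a filesystem typically wires its file_operations as in the sketch below, with do_sync_read/do_sync_write bridging the plain read/write paths onto the aio methods; "myfs" is illustrative, not part of this commit:

	#include <linux/fs.h>

	const struct file_operations myfs_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,		/* builds a kiocb, calls .aio_read */
		.write		= do_sync_write,	/* builds a kiocb, calls .aio_write */
		.aio_read	= generic_file_aio_read,
		.aio_write	= generic_file_aio_write,
		.mmap		= generic_file_mmap,
		.splice_read	= generic_file_splice_read,
	};
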
diff --git a/mm/fremap.c b/mm/fremap.c
index 21b7d0cbc9..7a9d0f5d24 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
39 } else { 39 } else {
40 if (!pte_file(pte)) 40 if (!pte_file(pte))
41 free_swap_and_cache(pte_to_swp_entry(pte)); 41 free_swap_and_cache(pte_to_swp_entry(pte));
42 pte_clear(mm, addr, ptep); 42 pte_clear_not_present_full(mm, addr, ptep, 0);
43 } 43 }
44 return !!page; 44 return !!page;
45} 45}
@@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
79 inc_mm_counter(mm, file_rss); 79 inc_mm_counter(mm, file_rss);
80 80
81 flush_icache_page(vma, page); 81 flush_icache_page(vma, page);
82 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 82 pte_val = mk_pte(page, prot);
83 set_pte_at(mm, addr, pte, pte_val);
83 page_add_file_rmap(page); 84 page_add_file_rmap(page);
84 pte_val = *pte;
85 update_mmu_cache(vma, addr, pte_val); 85 update_mmu_cache(vma, addr, pte_val);
86 lazy_mmu_prot_update(pte_val); 86 lazy_mmu_prot_update(pte_val);
87 err = 0; 87 err = 0;
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b2a5403c4..0206e7e501 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,13 +29,6 @@
29#include <linux/blktrace_api.h> 29#include <linux/blktrace_api.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32static mempool_t *page_pool, *isa_page_pool;
33
34static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
35{
36 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
37}
38
39/* 32/*
40 * Virtual_count is not a pure "count". 33 * Virtual_count is not a pure "count".
41 * 0 means that it is not mapped, and has not been mapped 34 * 0 means that it is not mapped, and has not been mapped
@@ -46,6 +39,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
46 */ 39 */
47#ifdef CONFIG_HIGHMEM 40#ifdef CONFIG_HIGHMEM
48 41
42unsigned long totalhigh_pages __read_mostly;
43
44unsigned int nr_free_highpages (void)
45{
46 pg_data_t *pgdat;
47 unsigned int pages = 0;
48
49 for_each_online_pgdat(pgdat)
50 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
51
52 return pages;
53}
54
49static int pkmap_count[LAST_PKMAP]; 55static int pkmap_count[LAST_PKMAP];
50static unsigned int last_pkmap_nr; 56static unsigned int last_pkmap_nr;
51static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); 57static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -204,282 +210,8 @@ void fastcall kunmap_high(struct page *page)
204} 210}
205 211
206EXPORT_SYMBOL(kunmap_high); 212EXPORT_SYMBOL(kunmap_high);
207
208#define POOL_SIZE 64
209
210static __init int init_emergency_pool(void)
211{
212 struct sysinfo i;
213 si_meminfo(&i);
214 si_swapinfo(&i);
215
216 if (!i.totalhigh)
217 return 0;
218
219 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
220 BUG_ON(!page_pool);
221 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
222
223 return 0;
224}
225
226__initcall(init_emergency_pool);
227
228/*
229 * highmem version, map in to vec
230 */
231static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
232{
233 unsigned long flags;
234 unsigned char *vto;
235
236 local_irq_save(flags);
237 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
238 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
239 kunmap_atomic(vto, KM_BOUNCE_READ);
240 local_irq_restore(flags);
241}
242
243#else /* CONFIG_HIGHMEM */
244
245#define bounce_copy_vec(to, vfrom) \
246 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
247
248#endif 213#endif
249 214
250#define ISA_POOL_SIZE 16
251
252/*
253 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
254 * as the max address, so check if the pool has already been created.
255 */
256int init_emergency_isa_pool(void)
257{
258 if (isa_page_pool)
259 return 0;
260
261 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
262 mempool_free_pages, (void *) 0);
263 BUG_ON(!isa_page_pool);
264
265 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
266 return 0;
267}
268
269/*
270 * Simple bounce buffer support for highmem pages. Depending on the
271 * queue gfp mask set, *to may or may not be a highmem page. kmap it
272 * always, it will do the Right Thing
273 */
274static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
275{
276 unsigned char *vfrom;
277 struct bio_vec *tovec, *fromvec;
278 int i;
279
280 __bio_for_each_segment(tovec, to, i, 0) {
281 fromvec = from->bi_io_vec + i;
282
283 /*
284 * not bounced
285 */
286 if (tovec->bv_page == fromvec->bv_page)
287 continue;
288
289 /*
290 * fromvec->bv_offset and fromvec->bv_len might have been
291 * modified by the block layer, so use the original copy,
292 * bounce_copy_vec already uses tovec->bv_len
293 */
294 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
295
296 flush_dcache_page(tovec->bv_page);
297 bounce_copy_vec(tovec, vfrom);
298 }
299}
300
301static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
302{
303 struct bio *bio_orig = bio->bi_private;
304 struct bio_vec *bvec, *org_vec;
305 int i;
306
307 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
308 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
309
310 /*
311 * free up bounce indirect pages used
312 */
313 __bio_for_each_segment(bvec, bio, i, 0) {
314 org_vec = bio_orig->bi_io_vec + i;
315 if (bvec->bv_page == org_vec->bv_page)
316 continue;
317
318 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
319 mempool_free(bvec->bv_page, pool);
320 }
321
322 bio_endio(bio_orig, bio_orig->bi_size, err);
323 bio_put(bio);
324}
325
326static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
327{
328 if (bio->bi_size)
329 return 1;
330
331 bounce_end_io(bio, page_pool, err);
332 return 0;
333}
334
335static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
336{
337 if (bio->bi_size)
338 return 1;
339
340 bounce_end_io(bio, isa_page_pool, err);
341 return 0;
342}
343
344static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
345{
346 struct bio *bio_orig = bio->bi_private;
347
348 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
349 copy_to_high_bio_irq(bio_orig, bio);
350
351 bounce_end_io(bio, pool, err);
352}
353
354static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
355{
356 if (bio->bi_size)
357 return 1;
358
359 __bounce_end_io_read(bio, page_pool, err);
360 return 0;
361}
362
363static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
364{
365 if (bio->bi_size)
366 return 1;
367
368 __bounce_end_io_read(bio, isa_page_pool, err);
369 return 0;
370}
371
372static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
373 mempool_t *pool)
374{
375 struct page *page;
376 struct bio *bio = NULL;
377 int i, rw = bio_data_dir(*bio_orig);
378 struct bio_vec *to, *from;
379
380 bio_for_each_segment(from, *bio_orig, i) {
381 page = from->bv_page;
382
383 /*
384 * is destination page below bounce pfn?
385 */
386 if (page_to_pfn(page) < q->bounce_pfn)
387 continue;
388
389 /*
390 * irk, bounce it
391 */
392 if (!bio)
393 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
394
395 to = bio->bi_io_vec + i;
396
397 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
398 to->bv_len = from->bv_len;
399 to->bv_offset = from->bv_offset;
400 inc_zone_page_state(to->bv_page, NR_BOUNCE);
401
402 if (rw == WRITE) {
403 char *vto, *vfrom;
404
405 flush_dcache_page(from->bv_page);
406 vto = page_address(to->bv_page) + to->bv_offset;
407 vfrom = kmap(from->bv_page) + from->bv_offset;
408 memcpy(vto, vfrom, to->bv_len);
409 kunmap(from->bv_page);
410 }
411 }
412
413 /*
414 * no pages bounced
415 */
416 if (!bio)
417 return;
418
419 /*
420 * at least one page was bounced, fill in possible non-highmem
421 * pages
422 */
423 __bio_for_each_segment(from, *bio_orig, i, 0) {
424 to = bio_iovec_idx(bio, i);
425 if (!to->bv_page) {
426 to->bv_page = from->bv_page;
427 to->bv_len = from->bv_len;
428 to->bv_offset = from->bv_offset;
429 }
430 }
431
432 bio->bi_bdev = (*bio_orig)->bi_bdev;
433 bio->bi_flags |= (1 << BIO_BOUNCED);
434 bio->bi_sector = (*bio_orig)->bi_sector;
435 bio->bi_rw = (*bio_orig)->bi_rw;
436
437 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
438 bio->bi_idx = (*bio_orig)->bi_idx;
439 bio->bi_size = (*bio_orig)->bi_size;
440
441 if (pool == page_pool) {
442 bio->bi_end_io = bounce_end_io_write;
443 if (rw == READ)
444 bio->bi_end_io = bounce_end_io_read;
445 } else {
446 bio->bi_end_io = bounce_end_io_write_isa;
447 if (rw == READ)
448 bio->bi_end_io = bounce_end_io_read_isa;
449 }
450
451 bio->bi_private = *bio_orig;
452 *bio_orig = bio;
453}
454
455void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
456{
457 mempool_t *pool;
458
459 /*
460 * for non-isa bounce case, just check if the bounce pfn is equal
461 * to or bigger than the highest pfn in the system -- in that case,
462 * don't waste time iterating over bio segments
463 */
464 if (!(q->bounce_gfp & GFP_DMA)) {
465 if (q->bounce_pfn >= blk_max_pfn)
466 return;
467 pool = page_pool;
468 } else {
469 BUG_ON(!isa_page_pool);
470 pool = isa_page_pool;
471 }
472
473 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
474
475 /*
476 * slow path
477 */
478 __blk_queue_bounce(q, bio_orig, pool);
479}
480
481EXPORT_SYMBOL(blk_queue_bounce);
482
483#if defined(HASHED_PAGE_VIRTUAL) 215#if defined(HASHED_PAGE_VIRTUAL)
484 216
485#define PA_HASH_ORDER 7 217#define PA_HASH_ORDER 7
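The bounce machinery removed above (moved into mm/bounce.c per the diffstat) is only exercised for queues whose bounce limit leaves some pages above q->bounce_pfn. As a minimal sketch, not part of the patch and with the driver name purely hypothetical, a block driver opts in roughly like this:

#include <linux/blkdev.h>

/* hypothetical driver setup: any page the device cannot reach gets
 * copied through the pool created by init_emergency_pool() when
 * blk_queue_bounce() runs on the bio */
static void mydrv_init_queue(request_queue_t *q)
{
        /* bounce everything above lowmem */
        blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}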
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df49997325..7c7d03dbf7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
72 struct zone **z; 72 struct zone **z;
73 73
74 for (z = zonelist->zones; *z; z++) { 74 for (z = zonelist->zones; *z; z++) {
75 nid = (*z)->zone_pgdat->node_id; 75 nid = zone_to_nid(*z);
76 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && 76 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
77 !list_empty(&hugepage_freelists[nid])) 77 !list_empty(&hugepage_freelists[nid]))
78 break; 78 break;
@@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page)
177{ 177{
178 int i; 178 int i;
179 nr_huge_pages--; 179 nr_huge_pages--;
180 nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; 180 nr_huge_pages_node[page_to_nid(page)]--;
181 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 181 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
182 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 182 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
183 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 183 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page)
191#ifdef CONFIG_HIGHMEM 191#ifdef CONFIG_HIGHMEM
192static void try_to_free_low(unsigned long count) 192static void try_to_free_low(unsigned long count)
193{ 193{
194 int i, nid; 194 int i;
195
195 for (i = 0; i < MAX_NUMNODES; ++i) { 196 for (i = 0; i < MAX_NUMNODES; ++i) {
196 struct page *page, *next; 197 struct page *page, *next;
197 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 198 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
@@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count)
199 continue; 200 continue;
200 list_del(&page->lru); 201 list_del(&page->lru);
201 update_and_free_page(page); 202 update_and_free_page(page);
202 nid = page_zone(page)->zone_pgdat->node_id;
203 free_huge_pages--; 203 free_huge_pages--;
204 free_huge_pages_node[nid]--; 204 free_huge_pages_node[page_to_nid(page)]--;
205 if (count >= nr_huge_pages) 205 if (count >= nr_huge_pages)
206 return; 206 return;
207 } 207 }
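The hugetlb hunks above swap open-coded node lookups for zone_to_nid()/page_to_nid(). As a rough sketch (the sketch_* names are illustrative, and the real page_to_nid() may read the node from page->flags instead), they stand for:

/* sketch only: the generic forms the new helpers correspond to */
static inline int sketch_zone_to_nid(struct zone *zone)
{
        return zone->zone_pgdat->node_id;
}

static inline int sketch_page_to_nid(struct page *page)
{
        return sketch_zone_to_nid(page_zone(page));
}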
diff --git a/mm/internal.h b/mm/internal.h
index d20e3cc4ae..d527b80b29 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v)
24 */ 24 */
25static inline void set_page_refcounted(struct page *page) 25static inline void set_page_refcounted(struct page *page)
26{ 26{
27 BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); 27 VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
28 BUG_ON(atomic_read(&page->_count)); 28 VM_BUG_ON(atomic_read(&page->_count));
29 set_page_count(page, 1); 29 set_page_count(page, 1);
30} 30}
31 31
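The switch from BUG_ON to VM_BUG_ON makes these refcount sanity checks free in production builds: VM_BUG_ON is assumed to be defined roughly as below, compiling away unless CONFIG_DEBUG_VM is set.

/* assumed definition, shown for context only */
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)         BUG_ON(cond)
#else
#define VM_BUG_ON(cond)         do { } while (0)
#endif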
diff --git a/mm/memory.c b/mm/memory.c
index 109e986623..9cf3f341a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/delayacct.h> 50#include <linux/delayacct.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h>
52 53
53#include <asm/pgalloc.h> 54#include <asm/pgalloc.h>
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
@@ -466,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
466 */ 467 */
467 if (is_cow_mapping(vm_flags)) { 468 if (is_cow_mapping(vm_flags)) {
468 ptep_set_wrprotect(src_mm, addr, src_pte); 469 ptep_set_wrprotect(src_mm, addr, src_pte);
469 pte = *src_pte; 470 pte = pte_wrprotect(pte);
470 } 471 }
471 472
472 /* 473 /*
@@ -505,6 +506,7 @@ again:
505 src_pte = pte_offset_map_nested(src_pmd, addr); 506 src_pte = pte_offset_map_nested(src_pmd, addr);
506 src_ptl = pte_lockptr(src_mm, src_pmd); 507 src_ptl = pte_lockptr(src_mm, src_pmd);
507 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 508 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
509 arch_enter_lazy_mmu_mode();
508 510
509 do { 511 do {
510 /* 512 /*
@@ -526,6 +528,7 @@ again:
526 progress += 8; 528 progress += 8;
527 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 529 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
528 530
531 arch_leave_lazy_mmu_mode();
529 spin_unlock(src_ptl); 532 spin_unlock(src_ptl);
530 pte_unmap_nested(src_pte - 1); 533 pte_unmap_nested(src_pte - 1);
531 add_mm_rss(dst_mm, rss[0], rss[1]); 534 add_mm_rss(dst_mm, rss[0], rss[1]);
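The arch_enter/leave_lazy_mmu_mode() brackets added around this and the following PTE loops let paravirtualized architectures batch page-table updates; elsewhere they are expected to compile to nothing, roughly as in this assumed generic fallback:

/* assumed asm-generic fallback; batching architectures override it */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode()      do { } while (0)
#define arch_leave_lazy_mmu_mode()      do { } while (0)
#endif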
@@ -627,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
627 int anon_rss = 0; 630 int anon_rss = 0;
628 631
629 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 632 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
633 arch_enter_lazy_mmu_mode();
630 do { 634 do {
631 pte_t ptent = *pte; 635 pte_t ptent = *pte;
632 if (pte_none(ptent)) { 636 if (pte_none(ptent)) {
@@ -689,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
689 continue; 693 continue;
690 if (!pte_file(ptent)) 694 if (!pte_file(ptent))
691 free_swap_and_cache(pte_to_swp_entry(ptent)); 695 free_swap_and_cache(pte_to_swp_entry(ptent));
692 pte_clear_full(mm, addr, pte, tlb->fullmm); 696 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
693 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 697 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
694 698
695 add_mm_rss(mm, file_rss, anon_rss); 699 add_mm_rss(mm, file_rss, anon_rss);
700 arch_leave_lazy_mmu_mode();
696 pte_unmap_unlock(pte - 1, ptl); 701 pte_unmap_unlock(pte - 1, ptl);
697 702
698 return addr; 703 return addr;
@@ -1108,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1108 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1113 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1109 if (!pte) 1114 if (!pte)
1110 return -ENOMEM; 1115 return -ENOMEM;
1116 arch_enter_lazy_mmu_mode();
1111 do { 1117 do {
1112 struct page *page = ZERO_PAGE(addr); 1118 struct page *page = ZERO_PAGE(addr);
1113 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); 1119 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
@@ -1117,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1117 BUG_ON(!pte_none(*pte)); 1123 BUG_ON(!pte_none(*pte));
1118 set_pte_at(mm, addr, pte, zero_pte); 1124 set_pte_at(mm, addr, pte, zero_pte);
1119 } while (pte++, addr += PAGE_SIZE, addr != end); 1125 } while (pte++, addr += PAGE_SIZE, addr != end);
1126 arch_leave_lazy_mmu_mode();
1120 pte_unmap_unlock(pte - 1, ptl); 1127 pte_unmap_unlock(pte - 1, ptl);
1121 return 0; 1128 return 0;
1122} 1129}
@@ -1226,7 +1233,12 @@ out:
1226 return retval; 1233 return retval;
1227} 1234}
1228 1235
1229/* 1236/**
1237 * vm_insert_page - insert single page into user vma
1238 * @vma: user vma to map to
1239 * @addr: target user address of this page
1240 * @page: source kernel page
1241 *
1230 * This allows drivers to insert individual pages they've allocated 1242 * This allows drivers to insert individual pages they've allocated
1231 * into a user vma. 1243 * into a user vma.
1232 * 1244 *
@@ -1269,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1269 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1281 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1270 if (!pte) 1282 if (!pte)
1271 return -ENOMEM; 1283 return -ENOMEM;
1284 arch_enter_lazy_mmu_mode();
1272 do { 1285 do {
1273 BUG_ON(!pte_none(*pte)); 1286 BUG_ON(!pte_none(*pte));
1274 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1287 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1275 pfn++; 1288 pfn++;
1276 } while (pte++, addr += PAGE_SIZE, addr != end); 1289 } while (pte++, addr += PAGE_SIZE, addr != end);
1290 arch_leave_lazy_mmu_mode();
1277 pte_unmap_unlock(pte - 1, ptl); 1291 pte_unmap_unlock(pte - 1, ptl);
1278 return 0; 1292 return 0;
1279} 1293}
@@ -1318,7 +1332,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1318 return 0; 1332 return 0;
1319} 1333}
1320 1334
1321/* Note: this is only safe if the mm semaphore is held when called. */ 1335/**
1336 * remap_pfn_range - remap kernel memory to userspace
1337 * @vma: user vma to map to
1338 * @addr: target user address to start at
1339 * @pfn: physical address of kernel memory
1340 * @size: size of map area
1341 * @prot: page protection flags for this mapping
1342 *
1343 * Note: this is only safe if the mm semaphore is held when called.
1344 */
1322int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1345int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1323 unsigned long pfn, unsigned long size, pgprot_t prot) 1346 unsigned long pfn, unsigned long size, pgprot_t prot)
1324{ 1347{
@@ -1458,14 +1481,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1458{ 1481{
1459 struct page *old_page, *new_page; 1482 struct page *old_page, *new_page;
1460 pte_t entry; 1483 pte_t entry;
1461 int reuse, ret = VM_FAULT_MINOR; 1484 int reuse = 0, ret = VM_FAULT_MINOR;
1485 struct page *dirty_page = NULL;
1462 1486
1463 old_page = vm_normal_page(vma, address, orig_pte); 1487 old_page = vm_normal_page(vma, address, orig_pte);
1464 if (!old_page) 1488 if (!old_page)
1465 goto gotten; 1489 goto gotten;
1466 1490
1467 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == 1491 /*
1468 (VM_SHARED|VM_WRITE))) { 1492 * Take out anonymous pages first, anonymous shared vmas are
1493 * not dirty accountable.
1494 */
1495 if (PageAnon(old_page)) {
1496 if (!TestSetPageLocked(old_page)) {
1497 reuse = can_share_swap_page(old_page);
1498 unlock_page(old_page);
1499 }
1500 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1501 (VM_WRITE|VM_SHARED))) {
1502 /*
1503 * Only catch write-faults on shared writable pages,
1504 * read-only shared pages can get COWed by
1505 * get_user_pages(.write=1, .force=1).
1506 */
1469 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1507 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1470 /* 1508 /*
1471 * Notify the address space that the page is about to 1509 * Notify the address space that the page is about to
@@ -1494,13 +1532,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1494 if (!pte_same(*page_table, orig_pte)) 1532 if (!pte_same(*page_table, orig_pte))
1495 goto unlock; 1533 goto unlock;
1496 } 1534 }
1497 1535 dirty_page = old_page;
1536 get_page(dirty_page);
1498 reuse = 1; 1537 reuse = 1;
1499 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1500 reuse = can_share_swap_page(old_page);
1501 unlock_page(old_page);
1502 } else {
1503 reuse = 0;
1504 } 1538 }
1505 1539
1506 if (reuse) { 1540 if (reuse) {
@@ -1551,7 +1585,14 @@ gotten:
1551 entry = mk_pte(new_page, vma->vm_page_prot); 1585 entry = mk_pte(new_page, vma->vm_page_prot);
1552 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1586 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1553 lazy_mmu_prot_update(entry); 1587 lazy_mmu_prot_update(entry);
1554 ptep_establish(vma, address, page_table, entry); 1588 /*
1589 * Clear the pte entry and flush it first, before updating the
1590 * pte with the new entry. This will avoid a race condition
1591 * seen in the presence of one thread doing SMC and another
1592 * thread doing COW.
1593 */
1594 ptep_clear_flush(vma, address, page_table);
1595 set_pte_at(mm, address, page_table, entry);
1555 update_mmu_cache(vma, address, entry); 1596 update_mmu_cache(vma, address, entry);
1556 lru_cache_add_active(new_page); 1597 lru_cache_add_active(new_page);
1557 page_add_new_anon_rmap(new_page, vma, address); 1598 page_add_new_anon_rmap(new_page, vma, address);
@@ -1566,6 +1607,10 @@ gotten:
1566 page_cache_release(old_page); 1607 page_cache_release(old_page);
1567unlock: 1608unlock:
1568 pte_unmap_unlock(page_table, ptl); 1609 pte_unmap_unlock(page_table, ptl);
1610 if (dirty_page) {
1611 set_page_dirty_balance(dirty_page);
1612 put_page(dirty_page);
1613 }
1569 return ret; 1614 return ret;
1570oom: 1615oom:
1571 if (old_page) 1616 if (old_page)
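The dirty_page bookkeeping in do_wp_page() is what gives a filesystem its first-write notification: the write fault runs ->page_mkwrite before the pte becomes writable, and set_page_dirty_balance() throttles the writer once the fault completes. A hypothetical handler, assuming the (vma, page) signature of this era and with all myfs_* names illustrative:

/* hypothetical filesystem hook */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
        /* e.g. reserve space or start a transaction here; a negative
         * return makes do_wp_page() treat the page as unwritable */
        return 0;
}

static struct vm_operations_struct myfs_vm_ops = {
        .nopage         = filemap_nopage,
        .page_mkwrite   = myfs_page_mkwrite,
};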
@@ -1785,9 +1830,10 @@ void unmap_mapping_range(struct address_space *mapping,
1785} 1830}
1786EXPORT_SYMBOL(unmap_mapping_range); 1831EXPORT_SYMBOL(unmap_mapping_range);
1787 1832
1788/* 1833/**
1789 * Handle all mappings that got truncated by a "truncate()" 1834 * vmtruncate - unmap mappings "freed" by truncate() syscall
1790 * system call. 1835 * @inode: inode of the file used
1836 * @offset: file offset to start truncating
1791 * 1837 *
1792 * NOTE! We have to be ready to update the memory sharing 1838 * NOTE! We have to be ready to update the memory sharing
1793 * between the file and the memory map for a potential last 1839 * between the file and the memory map for a potential last
@@ -1856,11 +1902,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1856} 1902}
1857EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ 1903EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
1858 1904
1859/* 1905/**
1906 * swapin_readahead - swap in pages in hope we need them soon
1907 * @entry: swap entry of this memory
1908 * @addr: address to start
1909 * @vma: user vma this addresses belong to
1910 *
1860 * Primitive swap readahead code. We simply read an aligned block of 1911 * Primitive swap readahead code. We simply read an aligned block of
1861 * (1 << page_cluster) entries in the swap area. This method is chosen 1912 * (1 << page_cluster) entries in the swap area. This method is chosen
1862 * because it doesn't cost us any seek time. We also make sure to queue 1913 * because it doesn't cost us any seek time. We also make sure to queue
1863 * the 'original' request together with the readahead ones... 1914 * the 'original' request together with the readahead ones...
1864 * 1915 *
1865 * This has been extended to use the NUMA policies from the mm triggering 1916 * This has been extended to use the NUMA policies from the mm triggering
1866 * the readahead. 1917 * the readahead.
@@ -2098,6 +2149,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2098 unsigned int sequence = 0; 2149 unsigned int sequence = 0;
2099 int ret = VM_FAULT_MINOR; 2150 int ret = VM_FAULT_MINOR;
2100 int anon = 0; 2151 int anon = 0;
2152 struct page *dirty_page = NULL;
2101 2153
2102 pte_unmap(page_table); 2154 pte_unmap(page_table);
2103 BUG_ON(vma->vm_flags & VM_PFNMAP); 2155 BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2244,10 @@ retry:
2192 } else { 2244 } else {
2193 inc_mm_counter(mm, file_rss); 2245 inc_mm_counter(mm, file_rss);
2194 page_add_file_rmap(new_page); 2246 page_add_file_rmap(new_page);
2247 if (write_access) {
2248 dirty_page = new_page;
2249 get_page(dirty_page);
2250 }
2195 } 2251 }
2196 } else { 2252 } else {
2197 /* One of our sibling threads was faster, back out. */ 2253 /* One of our sibling threads was faster, back out. */
@@ -2204,6 +2260,10 @@ retry:
2204 lazy_mmu_prot_update(entry); 2260 lazy_mmu_prot_update(entry);
2205unlock: 2261unlock:
2206 pte_unmap_unlock(page_table, ptl); 2262 pte_unmap_unlock(page_table, ptl);
2263 if (dirty_page) {
2264 set_page_dirty_balance(dirty_page);
2265 put_page(dirty_page);
2266 }
2207 return ret; 2267 return ret;
2208oom: 2268oom:
2209 page_cache_release(new_page); 2269 page_cache_release(new_page);
@@ -2211,6 +2271,54 @@ oom:
2211} 2271}
2212 2272
2213/* 2273/*
2274 * do_no_pfn() tries to create a new page mapping for a page without
2275 * a struct page backing it
2276 *
2277 * As this is called only for pages that do not currently exist, we
2278 * do not need to flush old virtual caches or the TLB.
2279 *
2280 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2281 * but allow concurrent faults), and pte mapped but not yet locked.
2282 * We return with mmap_sem still held, but pte unmapped and unlocked.
2283 *
2284 * It is expected that the ->nopfn handler always returns the same pfn
2285 * for a given virtual mapping.
2286 *
2287 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2288 */
2289static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2290 unsigned long address, pte_t *page_table, pmd_t *pmd,
2291 int write_access)
2292{
2293 spinlock_t *ptl;
2294 pte_t entry;
2295 unsigned long pfn;
2296 int ret = VM_FAULT_MINOR;
2297
2298 pte_unmap(page_table);
2299 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2300 BUG_ON(is_cow_mapping(vma->vm_flags));
2301
2302 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2303 if (pfn == NOPFN_OOM)
2304 return VM_FAULT_OOM;
2305 if (pfn == NOPFN_SIGBUS)
2306 return VM_FAULT_SIGBUS;
2307
2308 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2309
2310 /* Only go through if we didn't race with anybody else... */
2311 if (pte_none(*page_table)) {
2312 entry = pfn_pte(pfn, vma->vm_page_prot);
2313 if (write_access)
2314 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2315 set_pte_at(mm, address, page_table, entry);
2316 }
2317 pte_unmap_unlock(page_table, ptl);
2318 return ret;
2319}
2320
2321/*
2214 * Fault of a previously existing named mapping. Repopulate the pte 2322 * Fault of a previously existing named mapping. Repopulate the pte
2215 * from the encoded file_pte if possible. This enables swappable 2323 * from the encoded file_pte if possible. This enables swappable
2216 * nonlinear vmas. 2324 * nonlinear vmas.
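do_no_pfn() gives drivers that map raw PFN ranges (no struct page) a per-fault hook. A hypothetical ->nopfn implementation consistent with the calls above, where MYDRV_APERTURE_BASE and MYDRV_APERTURE_SIZE are assumed device constants:

/* hypothetical fault handler for a VM_PFNMAP vma */
static unsigned long mydrv_nopfn(struct vm_area_struct *vma,
                                 unsigned long address)
{
        unsigned long offset = address - vma->vm_start;

        if (offset >= MYDRV_APERTURE_SIZE)
                return NOPFN_SIGBUS;

        return (MYDRV_APERTURE_BASE >> PAGE_SHIFT) + (offset >> PAGE_SHIFT);
}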
@@ -2272,11 +2380,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2272 old_entry = entry = *pte; 2380 old_entry = entry = *pte;
2273 if (!pte_present(entry)) { 2381 if (!pte_present(entry)) {
2274 if (pte_none(entry)) { 2382 if (pte_none(entry)) {
2275 if (!vma->vm_ops || !vma->vm_ops->nopage) 2383 if (vma->vm_ops) {
2276 return do_anonymous_page(mm, vma, address, 2384 if (vma->vm_ops->nopage)
2277 pte, pmd, write_access); 2385 return do_no_page(mm, vma, address,
2278 return do_no_page(mm, vma, address, 2386 pte, pmd,
2279 pte, pmd, write_access); 2387 write_access);
2388 if (unlikely(vma->vm_ops->nopfn))
2389 return do_no_pfn(mm, vma, address, pte,
2390 pmd, write_access);
2391 }
2392 return do_anonymous_page(mm, vma, address,
2393 pte, pmd, write_access);
2280 } 2394 }
2281 if (pte_file(entry)) 2395 if (pte_file(entry))
2282 return do_file_page(mm, vma, address, 2396 return do_file_page(mm, vma, address,
@@ -2505,3 +2619,56 @@ int in_gate_area_no_task(unsigned long addr)
2505} 2619}
2506 2620
2507#endif /* __HAVE_ARCH_GATE_AREA */ 2621#endif /* __HAVE_ARCH_GATE_AREA */
2622
2623/*
2624 * Access another process' address space.
2625 * Source/target buffer must be in kernel space;
2626 * do not walk the page table directly, use get_user_pages.
2627 */
2628int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2629{
2630 struct mm_struct *mm;
2631 struct vm_area_struct *vma;
2632 struct page *page;
2633 void *old_buf = buf;
2634
2635 mm = get_task_mm(tsk);
2636 if (!mm)
2637 return 0;
2638
2639 down_read(&mm->mmap_sem);
2640 /* ignore errors, just check how much was successfully transferred */
2641 while (len) {
2642 int bytes, ret, offset;
2643 void *maddr;
2644
2645 ret = get_user_pages(tsk, mm, addr, 1,
2646 write, 1, &page, &vma);
2647 if (ret <= 0)
2648 break;
2649
2650 bytes = len;
2651 offset = addr & (PAGE_SIZE-1);
2652 if (bytes > PAGE_SIZE-offset)
2653 bytes = PAGE_SIZE-offset;
2654
2655 maddr = kmap(page);
2656 if (write) {
2657 copy_to_user_page(vma, page, addr,
2658 maddr + offset, buf, bytes);
2659 set_page_dirty_lock(page);
2660 } else {
2661 copy_from_user_page(vma, page, addr,
2662 buf, maddr + offset, bytes);
2663 }
2664 kunmap(page);
2665 page_cache_release(page);
2666 len -= bytes;
2667 buf += bytes;
2668 addr += bytes;
2669 }
2670 up_read(&mm->mmap_sem);
2671 mmput(mm);
2672
2673 return buf - old_buf;
2674}
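access_process_vm() is the helper that ptrace-style callers use to read or write another task's memory without walking its page tables themselves. A short usage sketch, with tsk and addr assumed to be in scope:

/* sketch: read 16 bytes from task 'tsk' at user address 'addr' */
char buf[16];
int copied = access_process_vm(tsk, addr, buf, sizeof(buf), 0);
if (copied != sizeof(buf)) {
        /* range was partly unmapped; only 'copied' bytes are valid */
}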
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c37319542b..fd678a662e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/writeback.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/sysctl.h> 18#include <linux/sysctl.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -21,11 +22,41 @@
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
23#include <linux/ioport.h> 24#include <linux/ioport.h>
25#include <linux/cpuset.h>
24 26
25#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
26 28
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 29/* add this memory to iomem resource */
28 unsigned long size); 30static struct resource *register_memory_resource(u64 start, u64 size)
31{
32 struct resource *res;
33 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
34 BUG_ON(!res);
35
36 res->name = "System RAM";
37 res->start = start;
38 res->end = start + size - 1;
39 res->flags = IORESOURCE_MEM;
40 if (request_resource(&iomem_resource, res) < 0) {
41 printk("System RAM resource %llx - %llx cannot be added\n",
42 (unsigned long long)res->start, (unsigned long long)res->end);
43 kfree(res);
44 res = NULL;
45 }
46 return res;
47}
48
49static void release_memory_resource(struct resource *res)
50{
51 if (!res)
52 return;
53 release_resource(res);
54 kfree(res);
55 return;
56}
57
58
59#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
29static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) 60static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{ 61{
31 struct pglist_data *pgdat = zone->zone_pgdat; 62 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -45,8 +76,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
45 return 0; 76 return 0;
46} 77}
47 78
48extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
49 int nr_pages);
50static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 79static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
51{ 80{
52 int nr_pages = PAGES_PER_SECTION; 81 int nr_pages = PAGES_PER_SECTION;
@@ -191,8 +220,10 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
191 if (need_zonelists_rebuild) 220 if (need_zonelists_rebuild)
192 build_all_zonelists(); 221 build_all_zonelists();
193 vm_total_pages = nr_free_pagecache_pages(); 222 vm_total_pages = nr_free_pagecache_pages();
223 writeback_set_ratelimit();
194 return 0; 224 return 0;
195} 225}
226#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
196 227
197static pg_data_t *hotadd_new_pgdat(int nid, u64 start) 228static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
198{ 229{
@@ -222,36 +253,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
222 return; 253 return;
223} 254}
224 255
225/* add this memory to iomem resource */
226static struct resource *register_memory_resource(u64 start, u64 size)
227{
228 struct resource *res;
229 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
230 BUG_ON(!res);
231
232 res->name = "System RAM";
233 res->start = start;
234 res->end = start + size - 1;
235 res->flags = IORESOURCE_MEM;
236 if (request_resource(&iomem_resource, res) < 0) {
237 printk("System RAM resource %llx - %llx cannot be added\n",
238 (unsigned long long)res->start, (unsigned long long)res->end);
239 kfree(res);
240 res = NULL;
241 }
242 return res;
243}
244
245static void release_memory_resource(struct resource *res)
246{
247 if (!res)
248 return;
249 release_resource(res);
250 kfree(res);
251 return;
252}
253
254
255 256
256int add_memory(int nid, u64 start, u64 size) 257int add_memory(int nid, u64 start, u64 size)
257{ 258{
@@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size)
283 /* we online node here. we can't roll back from here. */ 284 /* we online node here. we can't roll back from here. */
284 node_set_online(nid); 285 node_set_online(nid);
285 286
287 cpuset_track_online_nodes();
288
286 if (new_pgdat) { 289 if (new_pgdat) {
287 ret = register_one_node(nid); 290 ret = register_one_node(nid);
288 /* 291 /*
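After this reshuffle add_memory() registers the range as a "System RAM" resource, creates the node if needed, and leaves onlining of the pages to a later step (typically driven from user space via the per-memory-block state file in sysfs). A hedged sketch of a caller, with all values illustrative:

/* sketch: hot-add 128MB to node 0 */
u64 start = 0x40000000ULL;
u64 size  = 128ULL << 20;
int err   = add_memory(0, start, size);

if (err)
        printk(KERN_ERR "memory hot-add failed: %d\n", err);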
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a9963ceddd..25788b1b7f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache;
105 105
106/* Highest zone. A specific allocation for a zone below that is not 106/* Highest zone. A specific allocation for a zone below that is not
107 policied. */ 107 policied. */
108int policy_zone = ZONE_DMA; 108enum zone_type policy_zone = ZONE_DMA;
109 109
110struct mempolicy default_policy = { 110struct mempolicy default_policy = {
111 .refcnt = ATOMIC_INIT(1), /* never free it */ 111 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
137static struct zonelist *bind_zonelist(nodemask_t *nodes) 137static struct zonelist *bind_zonelist(nodemask_t *nodes)
138{ 138{
139 struct zonelist *zl; 139 struct zonelist *zl;
140 int num, max, nd, k; 140 int num, max, nd;
141 enum zone_type k;
141 142
142 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 143 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
143 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 144 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
@@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
148 lower zones etc. Avoid empty zones because the memory allocator 149 lower zones etc. Avoid empty zones because the memory allocator
149 doesn't like them. If you implement node hot removal you 150 doesn't like them. If you implement node hot removal you
150 have to fix that. */ 151 have to fix that. */
151 for (k = policy_zone; k >= 0; k--) { 152 k = policy_zone;
153 while (1) {
152 for_each_node_mask(nd, *nodes) { 154 for_each_node_mask(nd, *nodes) {
153 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 155 struct zone *z = &NODE_DATA(nd)->node_zones[k];
154 if (z->present_pages > 0) 156 if (z->present_pages > 0)
155 zl->zones[num++] = z; 157 zl->zones[num++] = z;
156 } 158 }
159 if (k == 0)
160 break;
161 k--;
157 } 162 }
158 zl->zones[num] = NULL; 163 zl->zones[num] = NULL;
159 return zl; 164 return zl;
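The loop restructuring above matters because k is now an enum zone_type, which the compiler may treat as unsigned; with an unsigned k the old countdown condition can never become false. An illustration of the pattern being avoided:

enum zone_type k;

for (k = policy_zone; k >= 0; k--)      /* if k is unsigned, always true */
        ;                               /* wraps past 0 instead of stopping */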
@@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
482 switch (p->policy) { 487 switch (p->policy) {
483 case MPOL_BIND: 488 case MPOL_BIND:
484 for (i = 0; p->v.zonelist->zones[i]; i++) 489 for (i = 0; p->v.zonelist->zones[i]; i++)
485 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, 490 node_set(zone_to_nid(p->v.zonelist->zones[i]),
486 *nodes); 491 *nodes);
487 break; 492 break;
488 case MPOL_DEFAULT: 493 case MPOL_DEFAULT:
@@ -1131,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1131 */ 1136 */
1132unsigned slab_node(struct mempolicy *policy) 1137unsigned slab_node(struct mempolicy *policy)
1133{ 1138{
1134 switch (policy->policy) { 1139 int pol = policy ? policy->policy : MPOL_DEFAULT;
1140
1141 switch (pol) {
1135 case MPOL_INTERLEAVE: 1142 case MPOL_INTERLEAVE:
1136 return interleave_nodes(policy); 1143 return interleave_nodes(policy);
1137 1144
@@ -1140,7 +1147,7 @@ unsigned slab_node(struct mempolicy *policy)
1140 * Follow bind policy behavior and start allocation at the 1147 * Follow bind policy behavior and start allocation at the
1141 * first node. 1148 * first node.
1142 */ 1149 */
1143 return policy->v.zonelist->zones[0]->zone_pgdat->node_id; 1150 return zone_to_nid(policy->v.zonelist->zones[0]);
1144 1151
1145 case MPOL_PREFERRED: 1152 case MPOL_PREFERRED:
1146 if (policy->v.preferred_node >= 0) 1153 if (policy->v.preferred_node >= 0)
@@ -1285,7 +1292,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1285 1292
1286 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1293 if ((gfp & __GFP_WAIT) && !in_interrupt())
1287 cpuset_update_task_memory_state(); 1294 cpuset_update_task_memory_state();
1288 if (!pol || in_interrupt()) 1295 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1289 pol = &default_policy; 1296 pol = &default_policy;
1290 if (pol->policy == MPOL_INTERLEAVE) 1297 if (pol->policy == MPOL_INTERLEAVE)
1291 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1298 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
@@ -1317,12 +1324,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1317 atomic_set(&new->refcnt, 1); 1324 atomic_set(&new->refcnt, 1);
1318 if (new->policy == MPOL_BIND) { 1325 if (new->policy == MPOL_BIND) {
1319 int sz = ksize(old->v.zonelist); 1326 int sz = ksize(old->v.zonelist);
1320 new->v.zonelist = kmalloc(sz, SLAB_KERNEL); 1327 new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
1321 if (!new->v.zonelist) { 1328 if (!new->v.zonelist) {
1322 kmem_cache_free(policy_cache, new); 1329 kmem_cache_free(policy_cache, new);
1323 return ERR_PTR(-ENOMEM); 1330 return ERR_PTR(-ENOMEM);
1324 } 1331 }
1325 memcpy(new->v.zonelist, old->v.zonelist, sz);
1326 } 1332 }
1327 return new; 1333 return new;
1328} 1334}
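kmemdup() folds the kmalloc()+memcpy() pair into a single call; its behaviour is equivalent to the sketch below (helper name illustrative):

/* sketch of the equivalence __mpol_copy now relies on */
static void *kmemdup_sketch(const void *src, size_t len, gfp_t gfp)
{
        void *p = kmalloc(len, gfp);

        if (p)
                memcpy(p, src, len);
        return p;
}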
@@ -1644,7 +1650,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1644 1650
1645 nodes_clear(nodes); 1651 nodes_clear(nodes);
1646 for (z = pol->v.zonelist->zones; *z; z++) 1652 for (z = pol->v.zonelist->zones; *z; z++)
1647 node_set((*z)->zone_pgdat->node_id, nodes); 1653 node_set(zone_to_nid(*z), nodes);
1648 nodes_remap(tmp, nodes, *mpolmask, *newmask); 1654 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1649 nodes = tmp; 1655 nodes = tmp;
1650 1656
diff --git a/mm/migrate.c b/mm/migrate.c
index 3f1e0c2c94..ba2453f948 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping,
409} 409}
410EXPORT_SYMBOL(migrate_page); 410EXPORT_SYMBOL(migrate_page);
411 411
412#ifdef CONFIG_BLOCK
412/* 413/*
413 * Migration function for pages with buffers. This function can only be used 414 * Migration function for pages with buffers. This function can only be used
414 * if the underlying filesystem guarantees that no other references to "page" 415 * if the underlying filesystem guarantees that no other references to "page"
@@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping,
466 return 0; 467 return 0;
467} 468}
468EXPORT_SYMBOL(buffer_migrate_page); 469EXPORT_SYMBOL(buffer_migrate_page);
470#endif
469 471
470/* 472/*
471 * Writeback a page to clean the dirty state 473 * Writeback a page to clean the dirty state
@@ -525,7 +527,7 @@ static int fallback_migrate_page(struct address_space *mapping,
525 * Buffers may be managed in a filesystem specific way. 527 * Buffers may be managed in a filesystem specific way.
526 * We must have no buffers or drop them. 528 * We must have no buffers or drop them.
527 */ 529 */
528 if (page_has_buffers(page) && 530 if (PagePrivate(page) &&
529 !try_to_release_page(page, GFP_KERNEL)) 531 !try_to_release_page(page, GFP_KERNEL))
530 return -EAGAIN; 532 return -EAGAIN;
531 533
@@ -741,7 +743,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
741 743
742 *result = &pm->status; 744 *result = &pm->status;
743 745
744 return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); 746 return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
745} 747}
746 748
747/* 749/*
diff --git a/mm/mmap.c b/mm/mmap.c
index e66a0b524a..eea8eefd51 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -64,6 +64,13 @@ pgprot_t protection_map[16] = {
64 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 64 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
65}; 65};
66 66
67pgprot_t vm_get_page_prot(unsigned long vm_flags)
68{
69 return protection_map[vm_flags &
70 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
71}
72EXPORT_SYMBOL(vm_get_page_prot);
73
67int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 74int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
68int sysctl_overcommit_ratio = 50; /* default is 50% */ 75int sysctl_overcommit_ratio = 50; /* default is 50% */
69int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 76int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
@@ -109,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
109 * which are reclaimable, under pressure. The dentry 116 * which are reclaimable, under pressure. The dentry
110 * cache and most inode caches should fall into this 117 * cache and most inode caches should fall into this
111 */ 118 */
112 free += atomic_read(&slab_reclaim_pages); 119 free += global_page_state(NR_SLAB_RECLAIMABLE);
113 120
114 /* 121 /*
115 * Leave the last 3% for root 122 * Leave the last 3% for root
@@ -1098,12 +1105,6 @@ munmap_back:
1098 goto free_vma; 1105 goto free_vma;
1099 } 1106 }
1100 1107
1101 /* Don't make the VMA automatically writable if it's shared, but the
1102 * backer wishes to know when pages are first written to */
1103 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1104 vma->vm_page_prot =
1105 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1106
1107 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 1108 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1108 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 1109 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1109 * that memory reservation must be checked; but that reservation 1110 * that memory reservation must be checked; but that reservation
@@ -1121,6 +1122,10 @@ munmap_back:
1121 pgoff = vma->vm_pgoff; 1122 pgoff = vma->vm_pgoff;
1122 vm_flags = vma->vm_flags; 1123 vm_flags = vma->vm_flags;
1123 1124
1125 if (vma_wants_writenotify(vma))
1126 vma->vm_page_prot =
1127 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1128
1124 if (!file || !vma_merge(mm, prev, addr, vma->vm_end, 1129 if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
1125 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1130 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1126 file = vma->vm_file; 1131 file = vma->vm_file;
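vm_get_page_prot(), exported above, gives drivers the canonical flags-to-protection lookup instead of indexing protection_map by hand. A hedged sketch of a driver mmap method using it, where mydrv_mmap and MYDRV_PFN are assumptions:

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_flags |= VM_IO | VM_RESERVED;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        return remap_pfn_range(vma, vma->vm_start, MYDRV_PFN,
                               vma->vm_end - vma->vm_start,
                               vma->vm_page_prot);
}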
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff..3b8f3c0c63 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,12 +27,14 @@
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
30 unsigned long addr, unsigned long end, pgprot_t newprot) 30 unsigned long addr, unsigned long end, pgprot_t newprot,
31 int dirty_accountable)
31{ 32{
32 pte_t *pte, oldpte; 33 pte_t *pte, oldpte;
33 spinlock_t *ptl; 34 spinlock_t *ptl;
34 35
35 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 36 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
37 arch_enter_lazy_mmu_mode();
36 do { 38 do {
37 oldpte = *pte; 39 oldpte = *pte;
38 if (pte_present(oldpte)) { 40 if (pte_present(oldpte)) {
@@ -42,7 +44,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
42 * bits by wiping the pte and then setting the new pte 44 * bits by wiping the pte and then setting the new pte
43 * into place. 45 * into place.
44 */ 46 */
45 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); 47 ptent = ptep_get_and_clear(mm, addr, pte);
48 ptent = pte_modify(ptent, newprot);
49 /*
50 * Avoid taking write faults for pages we know to be
51 * dirty.
52 */
53 if (dirty_accountable && pte_dirty(ptent))
54 ptent = pte_mkwrite(ptent);
46 set_pte_at(mm, addr, pte, ptent); 55 set_pte_at(mm, addr, pte, ptent);
47 lazy_mmu_prot_update(ptent); 56 lazy_mmu_prot_update(ptent);
48#ifdef CONFIG_MIGRATION 57#ifdef CONFIG_MIGRATION
@@ -62,11 +71,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
62 } 71 }
63 72
64 } while (pte++, addr += PAGE_SIZE, addr != end); 73 } while (pte++, addr += PAGE_SIZE, addr != end);
74 arch_leave_lazy_mmu_mode();
65 pte_unmap_unlock(pte - 1, ptl); 75 pte_unmap_unlock(pte - 1, ptl);
66} 76}
67 77
68static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 78static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
69 unsigned long addr, unsigned long end, pgprot_t newprot) 79 unsigned long addr, unsigned long end, pgprot_t newprot,
80 int dirty_accountable)
70{ 81{
71 pmd_t *pmd; 82 pmd_t *pmd;
72 unsigned long next; 83 unsigned long next;
@@ -76,12 +87,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
76 next = pmd_addr_end(addr, end); 87 next = pmd_addr_end(addr, end);
77 if (pmd_none_or_clear_bad(pmd)) 88 if (pmd_none_or_clear_bad(pmd))
78 continue; 89 continue;
79 change_pte_range(mm, pmd, addr, next, newprot); 90 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
80 } while (pmd++, addr = next, addr != end); 91 } while (pmd++, addr = next, addr != end);
81} 92}
82 93
83static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 94static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
84 unsigned long addr, unsigned long end, pgprot_t newprot) 95 unsigned long addr, unsigned long end, pgprot_t newprot,
96 int dirty_accountable)
85{ 97{
86 pud_t *pud; 98 pud_t *pud;
87 unsigned long next; 99 unsigned long next;
@@ -91,12 +103,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
91 next = pud_addr_end(addr, end); 103 next = pud_addr_end(addr, end);
92 if (pud_none_or_clear_bad(pud)) 104 if (pud_none_or_clear_bad(pud))
93 continue; 105 continue;
94 change_pmd_range(mm, pud, addr, next, newprot); 106 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
95 } while (pud++, addr = next, addr != end); 107 } while (pud++, addr = next, addr != end);
96} 108}
97 109
98static void change_protection(struct vm_area_struct *vma, 110static void change_protection(struct vm_area_struct *vma,
99 unsigned long addr, unsigned long end, pgprot_t newprot) 111 unsigned long addr, unsigned long end, pgprot_t newprot,
112 int dirty_accountable)
100{ 113{
101 struct mm_struct *mm = vma->vm_mm; 114 struct mm_struct *mm = vma->vm_mm;
102 pgd_t *pgd; 115 pgd_t *pgd;
@@ -110,7 +123,7 @@ static void change_protection(struct vm_area_struct *vma,
110 next = pgd_addr_end(addr, end); 123 next = pgd_addr_end(addr, end);
111 if (pgd_none_or_clear_bad(pgd)) 124 if (pgd_none_or_clear_bad(pgd))
112 continue; 125 continue;
113 change_pud_range(mm, pgd, addr, next, newprot); 126 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
114 } while (pgd++, addr = next, addr != end); 127 } while (pgd++, addr = next, addr != end);
115 flush_tlb_range(vma, start, end); 128 flush_tlb_range(vma, start, end);
116} 129}
@@ -123,10 +136,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
123 unsigned long oldflags = vma->vm_flags; 136 unsigned long oldflags = vma->vm_flags;
124 long nrpages = (end - start) >> PAGE_SHIFT; 137 long nrpages = (end - start) >> PAGE_SHIFT;
125 unsigned long charged = 0; 138 unsigned long charged = 0;
126 unsigned int mask;
127 pgprot_t newprot;
128 pgoff_t pgoff; 139 pgoff_t pgoff;
129 int error; 140 int error;
141 int dirty_accountable = 0;
130 142
131 if (newflags == oldflags) { 143 if (newflags == oldflags) {
132 *pprev = vma; 144 *pprev = vma;
@@ -176,24 +188,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
176 } 188 }
177 189
178success: 190success:
179 /* Don't make the VMA automatically writable if it's shared, but the
180 * backer wishes to know when pages are first written to */
181 mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
182 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
183 mask &= ~VM_SHARED;
184
185 newprot = protection_map[newflags & mask];
186
187 /* 191 /*
188 * vm_flags and vm_page_prot are protected by the mmap_sem 192 * vm_flags and vm_page_prot are protected by the mmap_sem
189 * held in write mode. 193 * held in write mode.
190 */ 194 */
191 vma->vm_flags = newflags; 195 vma->vm_flags = newflags;
192 vma->vm_page_prot = newprot; 196 vma->vm_page_prot = protection_map[newflags &
197 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
198 if (vma_wants_writenotify(vma)) {
199 vma->vm_page_prot = protection_map[newflags &
200 (VM_READ|VM_WRITE|VM_EXEC)];
201 dirty_accountable = 1;
202 }
203
193 if (is_vm_hugetlb_page(vma)) 204 if (is_vm_hugetlb_page(vma))
194 hugetlb_change_protection(vma, start, end, newprot); 205 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
195 else 206 else
196 change_protection(vma, start, end, newprot); 207 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
197 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 208 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
198 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 209 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
199 return 0; 210 return 0;
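The dirty_accountable logic hinges on indexing protection_map without VM_SHARED: that selects the private, write-protected protection, so the first write to each clean page faults into the accounting path, while ptes already known dirty are handed write access up front. Illustration only:

/* same vm_flags, two different hardware protections */
pgprot_t writable    = protection_map[VM_READ|VM_WRITE|VM_SHARED]; /* __S011 */
pgprot_t writenotify = protection_map[VM_READ|VM_WRITE];           /* __P011 */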
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c15cf3373..9c769fa29f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
98 new_ptl = pte_lockptr(mm, new_pmd); 98 new_ptl = pte_lockptr(mm, new_pmd);
99 if (new_ptl != old_ptl) 99 if (new_ptl != old_ptl)
100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
101 arch_enter_lazy_mmu_mode();
101 102
102 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, 103 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
103 new_pte++, new_addr += PAGE_SIZE) { 104 new_pte++, new_addr += PAGE_SIZE) {
@@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
109 set_pte_at(mm, new_addr, new_pte, pte); 110 set_pte_at(mm, new_addr, new_pte, pte);
110 } 111 }
111 112
113 arch_leave_lazy_mmu_mode();
112 if (new_ptl != old_ptl) 114 if (new_ptl != old_ptl)
113 spin_unlock(new_ptl); 115 spin_unlock(new_ptl);
114 pte_unmap_nested(new_pte - 1); 116 pte_unmap_nested(new_pte - 1);
diff --git a/mm/msync.c b/mm/msync.c
index d083544df2..358d73cf7b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -7,149 +7,33 @@
7/* 7/*
8 * The msync() system call. 8 * The msync() system call.
9 */ 9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h>
12#include <linux/fs.h> 10#include <linux/fs.h>
13#include <linux/mm.h> 11#include <linux/mm.h>
14#include <linux/mman.h> 12#include <linux/mman.h>
15#include <linux/hugetlb.h>
16#include <linux/writeback.h>
17#include <linux/file.h> 13#include <linux/file.h>
18#include <linux/syscalls.h> 14#include <linux/syscalls.h>
19 15
20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
22
23static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
24 unsigned long addr, unsigned long end)
25{
26 pte_t *pte;
27 spinlock_t *ptl;
28 int progress = 0;
29 unsigned long ret = 0;
30
31again:
32 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
33 do {
34 struct page *page;
35
36 if (progress >= 64) {
37 progress = 0;
38 if (need_resched() || need_lockbreak(ptl))
39 break;
40 }
41 progress++;
42 if (!pte_present(*pte))
43 continue;
44 if (!pte_maybe_dirty(*pte))
45 continue;
46 page = vm_normal_page(vma, addr, *pte);
47 if (!page)
48 continue;
49 if (ptep_clear_flush_dirty(vma, addr, pte) ||
50 page_test_and_clear_dirty(page))
51 ret += set_page_dirty(page);
52 progress += 3;
53 } while (pte++, addr += PAGE_SIZE, addr != end);
54 pte_unmap_unlock(pte - 1, ptl);
55 cond_resched();
56 if (addr != end)
57 goto again;
58 return ret;
59}
60
61static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
62 pud_t *pud, unsigned long addr, unsigned long end)
63{
64 pmd_t *pmd;
65 unsigned long next;
66 unsigned long ret = 0;
67
68 pmd = pmd_offset(pud, addr);
69 do {
70 next = pmd_addr_end(addr, end);
71 if (pmd_none_or_clear_bad(pmd))
72 continue;
73 ret += msync_pte_range(vma, pmd, addr, next);
74 } while (pmd++, addr = next, addr != end);
75 return ret;
76}
77
78static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
79 pgd_t *pgd, unsigned long addr, unsigned long end)
80{
81 pud_t *pud;
82 unsigned long next;
83 unsigned long ret = 0;
84
85 pud = pud_offset(pgd, addr);
86 do {
87 next = pud_addr_end(addr, end);
88 if (pud_none_or_clear_bad(pud))
89 continue;
90 ret += msync_pmd_range(vma, pud, addr, next);
91 } while (pud++, addr = next, addr != end);
92 return ret;
93}
94
95static unsigned long msync_page_range(struct vm_area_struct *vma,
96 unsigned long addr, unsigned long end)
97{
98 pgd_t *pgd;
99 unsigned long next;
100 unsigned long ret = 0;
101
102 /* For hugepages we can't go walking the page table normally,
103 * but that's ok, hugetlbfs is memory based, so we don't need
104 * to do anything more on an msync().
105 */
106 if (vma->vm_flags & VM_HUGETLB)
107 return 0;
108
109 BUG_ON(addr >= end);
110 pgd = pgd_offset(vma->vm_mm, addr);
111 flush_cache_range(vma, addr, end);
112 do {
113 next = pgd_addr_end(addr, end);
114 if (pgd_none_or_clear_bad(pgd))
115 continue;
116 ret += msync_pud_range(vma, pgd, addr, next);
117 } while (pgd++, addr = next, addr != end);
118 return ret;
119}
120
121/* 16/*
122 * MS_SYNC syncs the entire file - including mappings. 17 * MS_SYNC syncs the entire file - including mappings.
123 * 18 *
124 * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just 19 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
125 marks the relevant pages dirty. The application may now run fsync() to 20 * Nor does it mark the relevant pages dirty (it used to, up to 2.6.17).
21 * Now it doesn't do anything, since dirty pages are properly tracked.
22 *
23 * The application may now run fsync() to
126 * write out the dirty pages and wait on the writeout and check the result. 24 * write out the dirty pages and wait on the writeout and check the result.
127 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start 25 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
128 * async writeout immediately. 26 * async writeout immediately.
129 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to 27 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
130 * applications. 28 * applications.
131 */ 29 */
132static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
133 unsigned long end, int flags,
134 unsigned long *nr_pages_dirtied)
135{
136 struct file *file = vma->vm_file;
137
138 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
139 return -EBUSY;
140
141 if (file && (vma->vm_flags & VM_SHARED))
142 *nr_pages_dirtied = msync_page_range(vma, addr, end);
143 return 0;
144}
145
146asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 30asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
147{ 31{
148 unsigned long end; 32 unsigned long end;
33 struct mm_struct *mm = current->mm;
149 struct vm_area_struct *vma; 34 struct vm_area_struct *vma;
150 int unmapped_error = 0; 35 int unmapped_error = 0;
151 int error = -EINVAL; 36 int error = -EINVAL;
152 int done = 0;
153 37
154 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 38 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
155 goto out; 39 goto out;
@@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
169 * If the interval [start,end) covers some unmapped address ranges, 53 * If the interval [start,end) covers some unmapped address ranges,
170 * just ignore them, but return -ENOMEM at the end. 54 * just ignore them, but return -ENOMEM at the end.
171 */ 55 */
172 down_read(&current->mm->mmap_sem); 56 down_read(&mm->mmap_sem);
173 vma = find_vma(current->mm, start); 57 vma = find_vma(mm, start);
174 if (!vma) { 58 for (;;) {
175 error = -ENOMEM;
176 goto out_unlock;
177 }
178 do {
179 unsigned long nr_pages_dirtied = 0;
180 struct file *file; 59 struct file *file;
181 60
61 /* Still start < end. */
62 error = -ENOMEM;
63 if (!vma)
64 goto out_unlock;
182 /* Here start < vma->vm_end. */ 65 /* Here start < vma->vm_end. */
183 if (start < vma->vm_start) { 66 if (start < vma->vm_start) {
184 unmapped_error = -ENOMEM;
185 start = vma->vm_start; 67 start = vma->vm_start;
68 if (start >= end)
69 goto out_unlock;
70 unmapped_error = -ENOMEM;
186 } 71 }
187 /* Here vma->vm_start <= start < vma->vm_end. */ 72 /* Here vma->vm_start <= start < vma->vm_end. */
188 if (end <= vma->vm_end) { 73 if ((flags & MS_INVALIDATE) &&
189 if (start < end) { 74 (vma->vm_flags & VM_LOCKED)) {
190 error = msync_interval(vma, start, end, flags, 75 error = -EBUSY;
191 &nr_pages_dirtied); 76 goto out_unlock;
192 if (error)
193 goto out_unlock;
194 }
195 error = unmapped_error;
196 done = 1;
197 } else {
198 /* Here vma->vm_start <= start < vma->vm_end < end. */
199 error = msync_interval(vma, start, vma->vm_end, flags,
200 &nr_pages_dirtied);
201 if (error)
202 goto out_unlock;
203 } 77 }
204 file = vma->vm_file; 78 file = vma->vm_file;
205 start = vma->vm_end; 79 start = vma->vm_end;
206 if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { 80 if ((flags & MS_SYNC) && file &&
207 get_file(file);
208 up_read(&current->mm->mmap_sem);
209 balance_dirty_pages_ratelimited_nr(file->f_mapping,
210 nr_pages_dirtied);
211 fput(file);
212 down_read(&current->mm->mmap_sem);
213 vma = find_vma(current->mm, start);
214 } else if ((flags & MS_SYNC) && file &&
215 (vma->vm_flags & VM_SHARED)) { 81 (vma->vm_flags & VM_SHARED)) {
216 get_file(file); 82 get_file(file);
217 up_read(&current->mm->mmap_sem); 83 up_read(&mm->mmap_sem);
218 error = do_fsync(file, 0); 84 error = do_fsync(file, 0);
219 fput(file); 85 fput(file);
220 down_read(&current->mm->mmap_sem); 86 if (error || start >= end)
221 if (error) 87 goto out;
222 goto out_unlock; 88 down_read(&mm->mmap_sem);
223 vma = find_vma(current->mm, start); 89 vma = find_vma(mm, start);
224 } else { 90 } else {
91 if (start >= end) {
92 error = 0;
93 goto out_unlock;
94 }
225 vma = vma->vm_next; 95 vma = vma->vm_next;
226 } 96 }
227 } while (vma && !done); 97 }
228out_unlock: 98out_unlock:
229 up_read(&current->mm->mmap_sem); 99 up_read(&mm->mmap_sem);
230out: 100out:
231 return error; 101 return error ? : unmapped_error;
232} 102}
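Seen from user space, the behaviour described in the new comment block reduces to the sketch below (error handling elided, function names illustrative):

#include <sys/mman.h>

/* with dirty accounting in place, MS_ASYNC only validates the range;
 * actual writeback is requested with MS_SYNC (or fsync on the fd) */
void flush_mapping_async(void *addr, size_t len)
{
        msync(addr, len, MS_ASYNC);     /* no longer starts or marks any I/O */
}

void flush_mapping_sync(void *addr, size_t len)
{
        msync(addr, len, MS_SYNC);      /* writes back dirty pages and waits */
}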
diff --git a/mm/nommu.c b/mm/nommu.c
index c576df71e3..365019599d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
122} 122}
123 123
124/* 124/*
125 * The nommu dodgy version :-) 125 * get a list of pages in an address range belonging to the specified process
126 * and indicate the VMA that covers each page
127 * - this is potentially dodgy as we may end up incrementing the page count of a
128 * slab page or a secondary page from a compound page
129 * - don't permit access to VMAs that don't support it, such as I/O mappings
126 */ 130 */
127int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 131int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int len, int write, int force, 132 unsigned long start, int len, int write, int force,
129 struct page **pages, struct vm_area_struct **vmas) 133 struct page **pages, struct vm_area_struct **vmas)
130{ 134{
135 struct vm_area_struct *vma;
136 unsigned long vm_flags;
131 int i; 137 int i;
132 static struct vm_area_struct dummy_vma; 138
139 /* calculate required read or write permissions.
140 * - if 'force' is set, we only require the "MAY" flags.
141 */
142 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
143 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
133 144
134 for (i = 0; i < len; i++) { 145 for (i = 0; i < len; i++) {
146 vma = find_vma(mm, start);
147 if (!vma)
148 goto finish_or_fault;
149
150 /* protect what we can, including chardevs */
151 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
152 !(vm_flags & vma->vm_flags))
153 goto finish_or_fault;
154
135 if (pages) { 155 if (pages) {
136 pages[i] = virt_to_page(start); 156 pages[i] = virt_to_page(start);
137 if (pages[i]) 157 if (pages[i])
138 page_cache_get(pages[i]); 158 page_cache_get(pages[i]);
139 } 159 }
140 if (vmas) 160 if (vmas)
141 vmas[i] = &dummy_vma; 161 vmas[i] = vma;
142 start += PAGE_SIZE; 162 start += PAGE_SIZE;
143 } 163 }
144 return(i); 164
165 return i;
166
167finish_or_fault:
168 return i ? : -EFAULT;
145} 169}
146 170
147EXPORT_SYMBOL(get_user_pages); 171EXPORT_SYMBOL(get_user_pages);
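The vm_flags mask computed at the top of the new get_user_pages() boils down to a four-entry table; spelled out for reference (illustration only):

/*
 *   write force   flag that must be set in vma->vm_flags
 *     0     0     VM_READ
 *     0     1     VM_MAYREAD
 *     1     0     VM_WRITE
 *     1     1     VM_MAYWRITE
 *
 * i.e. 'force' relaxes the check from the current protection to the
 * widest protection the mapping could ever be granted.
 */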
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
286} 310}
287#endif /* DEBUG */ 311#endif /* DEBUG */
288 312
313/*
314 * add a VMA into a process's mm_struct in the appropriate place in the list
315 * - should be called with mm->mmap_sem held writelocked
316 */
317static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
318{
319 struct vm_list_struct **ppv;
320
321 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
322 if ((*ppv)->vma->vm_start > vml->vma->vm_start)
323 break;
324
325 vml->next = *ppv;
326 *ppv = vml;
327}
328
329/*
330 * look up the first VMA in which addr resides, NULL if none
331 * - should be called with mm->mmap_sem at least held readlocked
332 */
333struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
334{
335 struct vm_list_struct *loop, *vml;
336
337 /* search the vm_start ordered list */
338 vml = NULL;
339 for (loop = mm->context.vmlist; loop; loop = loop->next) {
340 if (loop->vma->vm_start > addr)
341 break;
342 vml = loop;
343 }
344
345 if (vml && vml->vma->vm_end > addr)
346 return vml->vma;
347
348 return NULL;
349}
350EXPORT_SYMBOL(find_vma);
351
352/*
353 * find a VMA
354 * - we don't extend stack VMAs under NOMMU conditions
355 */
356struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
357{
358 return find_vma(mm, addr);
359}
360
361/*
362 * look up the first VMA that exactly matches addr
363 * - should be called with mm->mmap_sem at least held readlocked
364 */
365static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
366 unsigned long addr)
367{
368 struct vm_list_struct *vml;
369
370 /* search the vm_start ordered list */
371 for (vml = mm->context.vmlist; vml; vml = vml->next) {
372 if (vml->vma->vm_start == addr)
373 return vml->vma;
374 if (vml->vma->vm_start > addr)
375 break;
376 }
377
378 return NULL;
379}
380
381/*
382 * find a VMA in the global tree
383 */
289static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 384static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
290{ 385{
291 struct vm_area_struct *vma; 386 struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
305 return NULL; 400 return NULL;
306} 401}
307 402
403/*
404 * add a VMA in the global tree
405 */
308static void add_nommu_vma(struct vm_area_struct *vma) 406static void add_nommu_vma(struct vm_area_struct *vma)
309{ 407{
310 struct vm_area_struct *pvma; 408 struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
351 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 449 rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
352} 450}
353 451
452/*
453 * delete a VMA from the global list
454 */
354static void delete_nommu_vma(struct vm_area_struct *vma) 455static void delete_nommu_vma(struct vm_area_struct *vma)
355{ 456{
356 struct address_space *mapping; 457 struct address_space *mapping;
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
828 realalloc += kobjsize(vml); 929 realalloc += kobjsize(vml);
829 askedalloc += sizeof(*vml); 930 askedalloc += sizeof(*vml);
830 931
831 vml->next = current->mm->context.vmlist; 932 add_vma_to_mm(current->mm, vml);
832 current->mm->context.vmlist = vml;
833 933
834 up_write(&nommu_vma_sem); 934 up_write(&nommu_vma_sem);
835 935
@@ -848,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file,
848 up_write(&nommu_vma_sem); 948 up_write(&nommu_vma_sem);
849 kfree(vml); 949 kfree(vml);
850 if (vma) { 950 if (vma) {
851 fput(vma->vm_file); 951 if (vma->vm_file)
952 fput(vma->vm_file);
852 kfree(vma); 953 kfree(vma);
853 } 954 }
854 return ret; 955 return ret;
@@ -908,6 +1009,11 @@ static void put_vma(struct vm_area_struct *vma)
908 } 1009 }
909} 1010}
910 1011
1012/*
1013 * release a mapping
1014 * - under NOMMU conditions the parameters must exactly match the mapping
1015 * to be removed
1016 */
911int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1017int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
912{ 1018{
913 struct vm_list_struct *vml, **parent; 1019 struct vm_list_struct *vml, **parent;
@@ -917,10 +1023,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
917 printk("do_munmap:\n"); 1023 printk("do_munmap:\n");
918#endif 1024#endif
919 1025
920 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) 1026 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
1027 if ((*parent)->vma->vm_start > addr)
1028 break;
921 if ((*parent)->vma->vm_start == addr && 1029 if ((*parent)->vma->vm_start == addr &&
922 ((len == 0) || ((*parent)->vma->vm_end == end))) 1030 ((len == 0) || ((*parent)->vma->vm_end == end)))
923 goto found; 1031 goto found;
1032 }
924 1033
925 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1034 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
926 current->pid, current->comm, (void *) addr); 1035 current->pid, current->comm, (void *) addr);
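Seen from userspace, the exact-match requirement in do_munmap() means a nommu program must release a mapping in one piece at the address mmap() handed back. A minimal userspace sketch of that constraint (not part of this patch):

#define _DEFAULT_SOURCE		/* for MAP_ANONYMOUS on glibc */
#include <sys/mman.h>

int main(void)
{
	void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* munmap((char *)p + 4096, 4096) would match no mapping and take
	 * the "munmap of non-mmaped memory" warning path above on nommu;
	 * the region has to be released in one piece */
	return munmap(p, 8192);
}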
@@ -946,7 +1055,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
946 return 0; 1055 return 0;
947} 1056}
948 1057
949/* Release all mmaps. */ 1058asmlinkage long sys_munmap(unsigned long addr, size_t len)
1059{
1060 int ret;
1061 struct mm_struct *mm = current->mm;
1062
1063 down_write(&mm->mmap_sem);
1064 ret = do_munmap(mm, addr, len);
1065 up_write(&mm->mmap_sem);
1066 return ret;
1067}
1068
1069/*
1070 * Release all mappings
1071 */
950void exit_mmap(struct mm_struct * mm) 1072void exit_mmap(struct mm_struct * mm)
951{ 1073{
952 struct vm_list_struct *tmp; 1074 struct vm_list_struct *tmp;
@@ -973,37 +1095,26 @@ void exit_mmap(struct mm_struct * mm)
973 } 1095 }
974} 1096}
975 1097
976asmlinkage long sys_munmap(unsigned long addr, size_t len)
977{
978 int ret;
979 struct mm_struct *mm = current->mm;
980
981 down_write(&mm->mmap_sem);
982 ret = do_munmap(mm, addr, len);
983 up_write(&mm->mmap_sem);
984 return ret;
985}
986
987unsigned long do_brk(unsigned long addr, unsigned long len) 1098unsigned long do_brk(unsigned long addr, unsigned long len)
988{ 1099{
989 return -ENOMEM; 1100 return -ENOMEM;
990} 1101}
991 1102
992/* 1103/*
993 * Expand (or shrink) an existing mapping, potentially moving it at the 1104 * expand (or shrink) an existing mapping, potentially moving it at the same
994 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1105 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
995 * 1106 *
996 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 1107 * under NOMMU conditions, we only permit changing a mapping's size, and only
997 * This option implies MREMAP_MAYMOVE. 1108 * as long as it stays within the hole allocated by the kmalloc() call in
1109 * do_mmap_pgoff() and the block is not shareable
998 * 1110 *
999 * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the 1111 * MREMAP_FIXED is not supported under NOMMU conditions
1000 * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
1001 */ 1112 */
1002unsigned long do_mremap(unsigned long addr, 1113unsigned long do_mremap(unsigned long addr,
1003 unsigned long old_len, unsigned long new_len, 1114 unsigned long old_len, unsigned long new_len,
1004 unsigned long flags, unsigned long new_addr) 1115 unsigned long flags, unsigned long new_addr)
1005{ 1116{
1006 struct vm_list_struct *vml = NULL; 1117 struct vm_area_struct *vma;
1007 1118
1008 /* insanity checks first */ 1119 /* insanity checks first */
1009 if (new_len == 0) 1120 if (new_len == 0)
@@ -1012,58 +1123,46 @@ unsigned long do_mremap(unsigned long addr,
1012 if (flags & MREMAP_FIXED && new_addr != addr) 1123 if (flags & MREMAP_FIXED && new_addr != addr)
1013 return (unsigned long) -EINVAL; 1124 return (unsigned long) -EINVAL;
1014 1125
1015 for (vml = current->mm->context.vmlist; vml; vml = vml->next) 1126 vma = find_vma_exact(current->mm, addr);
1016 if (vml->vma->vm_start == addr) 1127 if (!vma)
1017 goto found; 1128 return (unsigned long) -EINVAL;
1018
1019 return (unsigned long) -EINVAL;
1020 1129
1021 found: 1130 if (vma->vm_end != vma->vm_start + old_len)
1022 if (vml->vma->vm_end != vml->vma->vm_start + old_len)
1023 return (unsigned long) -EFAULT; 1131 return (unsigned long) -EFAULT;
1024 1132
1025 if (vml->vma->vm_flags & VM_MAYSHARE) 1133 if (vma->vm_flags & VM_MAYSHARE)
1026 return (unsigned long) -EPERM; 1134 return (unsigned long) -EPERM;
1027 1135
1028 if (new_len > kobjsize((void *) addr)) 1136 if (new_len > kobjsize((void *) addr))
1029 return (unsigned long) -ENOMEM; 1137 return (unsigned long) -ENOMEM;
1030 1138
1031 /* all checks complete - do it */ 1139 /* all checks complete - do it */
1032 vml->vma->vm_end = vml->vma->vm_start + new_len; 1140 vma->vm_end = vma->vm_start + new_len;
1033 1141
1034 askedalloc -= old_len; 1142 askedalloc -= old_len;
1035 askedalloc += new_len; 1143 askedalloc += new_len;
1036 1144
1037 return vml->vma->vm_start; 1145 return vma->vm_start;
1038} 1146}
1039 1147
1040/* 1148asmlinkage unsigned long sys_mremap(unsigned long addr,
1041 * Look up the first VMA which satisfies addr < vm_end, NULL if none 1149 unsigned long old_len, unsigned long new_len,
1042 */ 1150 unsigned long flags, unsigned long new_addr)
1043struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1044{ 1151{
1045 struct vm_list_struct *vml; 1152 unsigned long ret;
1046
1047 for (vml = mm->context.vmlist; vml; vml = vml->next)
1048 if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
1049 return vml->vma;
1050 1153
1051 return NULL; 1154 down_write(&current->mm->mmap_sem);
1155 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1156 up_write(&current->mm->mmap_sem);
1157 return ret;
1052} 1158}
1053 1159
1054EXPORT_SYMBOL(find_vma);
1055
1056struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1160struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1057 unsigned int foll_flags) 1161 unsigned int foll_flags)
1058{ 1162{
1059 return NULL; 1163 return NULL;
1060} 1164}
1061 1165
1062struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1063{
1064 return NULL;
1065}
1066
1067int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1166int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1068 unsigned long to, unsigned long size, pgprot_t prot) 1167 unsigned long to, unsigned long size, pgprot_t prot)
1069{ 1168{
@@ -1133,7 +1232,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
1133 * which are reclaimable, under pressure. The dentry 1232 * which are reclaimable, under pressure. The dentry
1134 * cache and most inode caches should fall into this 1233 * cache and most inode caches should fall into this
1135 */ 1234 */
1136 free += atomic_read(&slab_reclaim_pages); 1235 free += global_page_state(NR_SLAB_RECLAIMABLE);
1137 1236
1138 /* 1237 /*
1139 * Leave the last 3% for root 1238 * Leave the last 3% for root
@@ -1206,3 +1305,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1206 BUG(); 1305 BUG();
1207 return NULL; 1306 return NULL;
1208} 1307}
1308
1309/*
1310 * Access another process' address space.
1311 * - source/target buffer must be kernel space
1312 */
1313int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1314{
1315 struct vm_area_struct *vma;
1316 struct mm_struct *mm;
1317
1318 if (addr + len < addr)
1319 return 0;
1320
1321 mm = get_task_mm(tsk);
1322 if (!mm)
1323 return 0;
1324
1325 down_read(&mm->mmap_sem);
1326
1327 /* the access must start within one of the target process's mappings */
1328 vma = find_vma(mm, addr);
1329 if (vma) {
1330 /* don't overrun this mapping */
1331 if (addr + len >= vma->vm_end)
1332 len = vma->vm_end - addr;
1333
1334 /* only read or write mappings where it is permitted */
1335 if (write && vma->vm_flags & VM_MAYWRITE)
1336 len -= copy_to_user((void *) addr, buf, len);
1337 else if (!write && vma->vm_flags & VM_MAYREAD)
1338 len -= copy_from_user(buf, (void *) addr, len);
1339 else
1340 len = 0;
1341 } else {
1342 len = 0;
1343 }
1344
1345 up_read(&mm->mmap_sem);
1346 mmput(mm);
1347 return len;
1348}
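The new access_process_vm() gives nommu the same contract MMU kernels offer to ptrace-style callers: it returns the number of bytes actually copied, and a partial or failed copy simply shows up as a short count. A hedged sketch of a kernel-side caller (peek_word() is a hypothetical helper, not something this patch adds; headers as assumed):

#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>

static int peek_word(struct task_struct *child, unsigned long addr,
		     unsigned long *val)
{
	int copied;

	/* pull sizeof(long) bytes out of the child's address space */
	copied = access_process_vm(child, addr, val, sizeof(*val), 0);

	return copied == sizeof(*val) ? 0 : -EIO;
}

An address outside any VMA, or a VMA without VM_MAYREAD, is reported as a short return, which the caller turns into -EIO here.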
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b9af136e5c..20f41b082e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -21,6 +21,8 @@
21#include <linux/timex.h> 21#include <linux/timex.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/module.h>
25#include <linux/notifier.h>
24 26
25int sysctl_panic_on_oom; 27int sysctl_panic_on_oom;
26/* #define DEBUG */ 28/* #define DEBUG */
@@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 } 60 }
59 61
60 /* 62 /*
63 * swapoff can easily use up all memory, so kill those first.
64 */
65 if (p->flags & PF_SWAPOFF)
66 return ULONG_MAX;
67
68 /*
61 * The memory size of the process is the basis for the badness. 69 * The memory size of the process is the basis for the badness.
62 */ 70 */
63 points = mm->total_vm; 71 points = mm->total_vm;
@@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
127 points /= 4; 135 points /= 4;
128 136
129 /* 137 /*
138 * If p's nodes don't overlap ours, it may still help to kill p
139 * because p may have allocated or otherwise mapped memory on
140 * this node before. However it will be less likely.
141 */
142 if (!cpuset_excl_nodes_overlap(p))
143 points /= 8;
144
145 /*
130 * Adjust the score by oomkilladj. 146 * Adjust the score by oomkilladj.
131 */ 147 */
132 if (p->oomkilladj) { 148 if (p->oomkilladj) {
@@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
161 177
162 for (z = zonelist->zones; *z; z++) 178 for (z = zonelist->zones; *z; z++)
163 if (cpuset_zone_allowed(*z, gfp_mask)) 179 if (cpuset_zone_allowed(*z, gfp_mask))
164 node_clear((*z)->zone_pgdat->node_id, 180 node_clear(zone_to_nid(*z), nodes);
165 nodes);
166 else 181 else
167 return CONSTRAINT_CPUSET; 182 return CONSTRAINT_CPUSET;
168 183
@@ -189,27 +204,49 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
189 do_posix_clock_monotonic_gettime(&uptime); 204 do_posix_clock_monotonic_gettime(&uptime);
190 do_each_thread(g, p) { 205 do_each_thread(g, p) {
191 unsigned long points; 206 unsigned long points;
192 int releasing;
193 207
194 /* skip the init task with pid == 1 */ 208 /*
195 if (p->pid == 1) 209 * skip kernel threads and tasks which have already released
196 continue; 210 * their mm.
197 if (p->oomkilladj == OOM_DISABLE) 211 */
212 if (!p->mm)
198 continue; 213 continue;
199 /* If p's nodes don't overlap ours, it won't help to kill p. */ 214 /* skip the init task */
200 if (!cpuset_excl_nodes_overlap(p)) 215 if (is_init(p))
201 continue; 216 continue;
202 217
203 /* 218 /*
219 * This task already has access to memory reserves and is
220 * being killed. Don't allow any other task access to the
221 * memory reserve.
222 *
223 * Note: this may have a chance of deadlock if it gets
224 * blocked waiting for another task which itself is waiting
225 * for memory. Is there a better alternative?
226 */
227 if (test_tsk_thread_flag(p, TIF_MEMDIE))
228 return ERR_PTR(-1UL);
229
230 /*
204 * This is in the process of releasing memory so wait for it 231 * This is in the process of releasing memory so wait for it
205 * to finish before killing some other task by mistake. 232 * to finish before killing some other task by mistake.
233 *
234 * However, if p is the current task, we allow the 'kill' to
235 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
236 * which will allow it to gain access to memory reserves in
237 * the process of exiting and releasing its resources.
238 * Otherwise we could get an easy OOM deadlock.
206 */ 239 */
207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 240 if (p->flags & PF_EXITING) {
208 p->flags & PF_EXITING; 241 if (p != current)
209 if (releasing && !(p->flags & PF_DEAD)) 242 return ERR_PTR(-1UL);
210 return ERR_PTR(-1UL); 243
211 if (p->flags & PF_SWAPOFF) 244 chosen = p;
212 return p; 245 *ppoints = ULONG_MAX;
246 }
247
248 if (p->oomkilladj == OOM_DISABLE)
249 continue;
213 250
214 points = badness(p, uptime.tv_sec); 251 points = badness(p, uptime.tv_sec);
215 if (points > *ppoints || !chosen) { 252 if (points > *ppoints || !chosen) {
@@ -217,32 +254,33 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
217 *ppoints = points; 254 *ppoints = points;
218 } 255 }
219 } while_each_thread(g, p); 256 } while_each_thread(g, p);
257
220 return chosen; 258 return chosen;
221} 259}
222 260
223/** 261/**
224 * We must be careful though to never send SIGKILL a process with 262 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
225 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that 263 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
226 * we select a process with CAP_SYS_RAW_IO set). 264 * set.
227 */ 265 */
228static void __oom_kill_task(struct task_struct *p, const char *message) 266static void __oom_kill_task(struct task_struct *p, const char *message)
229{ 267{
230 if (p->pid == 1) { 268 if (is_init(p)) {
231 WARN_ON(1); 269 WARN_ON(1);
232 printk(KERN_WARNING "tried to kill init!\n"); 270 printk(KERN_WARNING "tried to kill init!\n");
233 return; 271 return;
234 } 272 }
235 273
236 task_lock(p); 274 if (!p->mm) {
237 if (!p->mm || p->mm == &init_mm) {
238 WARN_ON(1); 275 WARN_ON(1);
239 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 276 printk(KERN_WARNING "tried to kill an mm-less task!\n");
240 task_unlock(p);
241 return; 277 return;
242 } 278 }
243 task_unlock(p); 279
244 printk(KERN_ERR "%s: Killed process %d (%s).\n", 280 if (message) {
281 printk(KERN_ERR "%s: Killed process %d (%s).\n",
245 message, p->pid, p->comm); 282 message, p->pid, p->comm);
283 }
246 284
247 /* 285 /*
248 * We give our sacrificial lamb high priority and access to 286 * We give our sacrificial lamb high priority and access to
@@ -271,7 +309,7 @@ static int oom_kill_task(struct task_struct *p, const char *message)
271 * However, this is of no concern to us. 309 * However, this is of no concern to us.
272 */ 310 */
273 311
274 if (mm == NULL || mm == &init_mm) 312 if (mm == NULL)
275 return 1; 313 return 1;
276 314
277 __oom_kill_task(p, message); 315 __oom_kill_task(p, message);
@@ -293,8 +331,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
293 struct task_struct *c; 331 struct task_struct *c;
294 struct list_head *tsk; 332 struct list_head *tsk;
295 333
296 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " 334 /*
297 "children.\n", p->pid, p->comm, points); 335 * If the task is already exiting, don't alarm the sysadmin or kill
336 * its children or threads, just set TIF_MEMDIE so it can die quickly
337 */
338 if (p->flags & PF_EXITING) {
339 __oom_kill_task(p, NULL);
340 return 0;
341 }
342
343 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
344 " and children.\n", p->pid, p->comm, points);
298 /* Try to kill a child first */ 345 /* Try to kill a child first */
299 list_for_each(tsk, &p->children) { 346 list_for_each(tsk, &p->children) {
300 c = list_entry(tsk, struct task_struct, sibling); 347 c = list_entry(tsk, struct task_struct, sibling);
@@ -306,6 +353,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306 return oom_kill_task(p, message); 353 return oom_kill_task(p, message);
307} 354}
308 355
356static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
357
358int register_oom_notifier(struct notifier_block *nb)
359{
360 return blocking_notifier_chain_register(&oom_notify_list, nb);
361}
362EXPORT_SYMBOL_GPL(register_oom_notifier);
363
364int unregister_oom_notifier(struct notifier_block *nb)
365{
366 return blocking_notifier_chain_unregister(&oom_notify_list, nb);
367}
368EXPORT_SYMBOL_GPL(unregister_oom_notifier);
369
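register_oom_notifier() gives other subsystems a chance to hand memory back before out_of_memory() (below) picks a victim: each callback receives a pointer to an unsigned long, adds the number of pages it managed to free, and the kill is skipped if the total is non-zero. A minimal client sketch, assuming a hypothetical myballoon_shrink() that returns the pages released (headers abbreviated):

#include <linux/notifier.h>
#include <linux/init.h>

static int myballoon_oom_notify(struct notifier_block *nb,
				unsigned long unused, void *parm)
{
	unsigned long *freed = parm;

	/* give memory back and report the number of pages released */
	*freed += myballoon_shrink();	/* hypothetical helper */
	return NOTIFY_OK;
}

static struct notifier_block myballoon_oom_nb = {
	.notifier_call = myballoon_oom_notify,
};

static int __init myballoon_init(void)
{
	return register_oom_notifier(&myballoon_oom_nb);
}

The matching unregister_oom_notifier(&myballoon_oom_nb) belongs in the client's exit path.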
309/** 370/**
310 * out_of_memory - kill the "best" process when we run out of memory 371 * out_of_memory - kill the "best" process when we run out of memory
311 * 372 *
@@ -318,10 +379,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
318{ 379{
319 struct task_struct *p; 380 struct task_struct *p;
320 unsigned long points = 0; 381 unsigned long points = 0;
382 unsigned long freed = 0;
383
384 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
385 if (freed > 0)
386 /* Got some memory back in the last second. */
387 return;
321 388
322 if (printk_ratelimit()) { 389 if (printk_ratelimit()) {
323 printk("oom-killer: gfp_mask=0x%x, order=%d\n", 390 printk(KERN_WARNING "%s invoked oom-killer: "
324 gfp_mask, order); 391 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
392 current->comm, gfp_mask, order, current->oomkilladj);
325 dump_stack(); 393 dump_stack();
326 show_mem(); 394 show_mem();
327 } 395 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e630188ccc..c0d4ce144d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,12 +23,15 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/mpage.h> 25#include <linux/mpage.h>
26#include <linux/rmap.h>
26#include <linux/percpu.h> 27#include <linux/percpu.h>
27#include <linux/notifier.h> 28#include <linux/notifier.h>
28#include <linux/smp.h> 29#include <linux/smp.h>
29#include <linux/sysctl.h> 30#include <linux/sysctl.h>
30#include <linux/cpu.h> 31#include <linux/cpu.h>
31#include <linux/syscalls.h> 32#include <linux/syscalls.h>
33#include <linux/buffer_head.h>
34#include <linux/pagevec.h>
32 35
33/* 36/*
34 * The maximum number of pages to writeout in a single bdflush/kupdate 37 * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -45,7 +48,6 @@
45 */ 48 */
46static long ratelimit_pages = 32; 49static long ratelimit_pages = 32;
47 50
48static long total_pages; /* The total number of pages in the machine. */
49static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ 51static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
50 52
51/* 53/*
@@ -125,7 +127,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
125 int unmapped_ratio; 127 int unmapped_ratio;
126 long background; 128 long background;
127 long dirty; 129 long dirty;
128 unsigned long available_memory = total_pages; 130 unsigned long available_memory = vm_total_pages;
129 struct task_struct *tsk; 131 struct task_struct *tsk;
130 132
131#ifdef CONFIG_HIGHMEM 133#ifdef CONFIG_HIGHMEM
@@ -140,7 +142,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
140 142
141 unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + 143 unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
142 global_page_state(NR_ANON_PAGES)) * 100) / 144 global_page_state(NR_ANON_PAGES)) * 100) /
143 total_pages; 145 vm_total_pages;
144 146
145 dirty_ratio = vm_dirty_ratio; 147 dirty_ratio = vm_dirty_ratio;
146 if (dirty_ratio > unmapped_ratio / 2) 148 if (dirty_ratio > unmapped_ratio / 2)
@@ -243,6 +245,16 @@ static void balance_dirty_pages(struct address_space *mapping)
243 pdflush_operation(background_writeout, 0); 245 pdflush_operation(background_writeout, 0);
244} 246}
245 247
248void set_page_dirty_balance(struct page *page)
249{
250 if (set_page_dirty(page)) {
251 struct address_space *mapping = page_mapping(page);
252
253 if (mapping)
254 balance_dirty_pages_ratelimited(mapping);
255 }
256}
257
246/** 258/**
247 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 259 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
248 * @mapping: address_space which was dirtied 260 * @mapping: address_space which was dirtied
@@ -491,9 +503,9 @@ void laptop_sync_completion(void)
491 * will write six megabyte chunks, max. 503 * will write six megabyte chunks, max.
492 */ 504 */
493 505
494static void set_ratelimit(void) 506void writeback_set_ratelimit(void)
495{ 507{
496 ratelimit_pages = total_pages / (num_online_cpus() * 32); 508 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
497 if (ratelimit_pages < 16) 509 if (ratelimit_pages < 16)
498 ratelimit_pages = 16; 510 ratelimit_pages = 16;
499 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) 511 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
@@ -503,7 +515,7 @@ static void set_ratelimit(void)
503static int __cpuinit 515static int __cpuinit
504ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 516ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
505{ 517{
506 set_ratelimit(); 518 writeback_set_ratelimit();
507 return 0; 519 return 0;
508} 520}
509 521
@@ -522,9 +534,7 @@ void __init page_writeback_init(void)
522 long buffer_pages = nr_free_buffer_pages(); 534 long buffer_pages = nr_free_buffer_pages();
523 long correction; 535 long correction;
524 536
525 total_pages = nr_free_pagecache_pages(); 537 correction = (100 * 4 * buffer_pages) / vm_total_pages;
526
527 correction = (100 * 4 * buffer_pages) / total_pages;
528 538
529 if (correction < 100) { 539 if (correction < 100) {
530 dirty_background_ratio *= correction; 540 dirty_background_ratio *= correction;
@@ -538,10 +548,143 @@ void __init page_writeback_init(void)
538 vm_dirty_ratio = 1; 548 vm_dirty_ratio = 1;
539 } 549 }
540 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 550 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
541 set_ratelimit(); 551 writeback_set_ratelimit();
542 register_cpu_notifier(&ratelimit_nb); 552 register_cpu_notifier(&ratelimit_nb);
543} 553}
544 554
555/**
556 * generic_writepages - walk the list of dirty pages of the given
557 * address space and writepage() all of them.
558 *
559 * @mapping: address space structure to write
560 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
561 *
562 * This is a library function, which implements the writepages()
563 * address_space_operation.
564 *
565 * If a page is already under I/O, generic_writepages() skips it, even
566 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
567 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
568 * and msync() need to guarantee that all the data which was dirty at the time
569 * the call was made get new I/O started against them. If wbc->sync_mode is
570 * WB_SYNC_ALL then we were called for data integrity and we must wait for
571 * existing IO to complete.
572 *
573 * Derived from mpage_writepages() - if you fix this you should check that
574 * also!
575 */
576int generic_writepages(struct address_space *mapping,
577 struct writeback_control *wbc)
578{
579 struct backing_dev_info *bdi = mapping->backing_dev_info;
580 int ret = 0;
581 int done = 0;
582 int (*writepage)(struct page *page, struct writeback_control *wbc);
583 struct pagevec pvec;
584 int nr_pages;
585 pgoff_t index;
586 pgoff_t end; /* Inclusive */
587 int scanned = 0;
588 int range_whole = 0;
589
590 if (wbc->nonblocking && bdi_write_congested(bdi)) {
591 wbc->encountered_congestion = 1;
592 return 0;
593 }
594
595 writepage = mapping->a_ops->writepage;
596
597 /* deal with chardevs and other special files */
598 if (!writepage)
599 return 0;
600
601 pagevec_init(&pvec, 0);
602 if (wbc->range_cyclic) {
603 index = mapping->writeback_index; /* Start from prev offset */
604 end = -1;
605 } else {
606 index = wbc->range_start >> PAGE_CACHE_SHIFT;
607 end = wbc->range_end >> PAGE_CACHE_SHIFT;
608 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
609 range_whole = 1;
610 scanned = 1;
611 }
612retry:
613 while (!done && (index <= end) &&
614 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
615 PAGECACHE_TAG_DIRTY,
616 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
617 unsigned i;
618
619 scanned = 1;
620 for (i = 0; i < nr_pages; i++) {
621 struct page *page = pvec.pages[i];
622
623 /*
624 * At this point we hold neither mapping->tree_lock nor
625 * lock on the page itself: the page may be truncated or
626 * invalidated (changing page->mapping to NULL), or even
627 * swizzled back from swapper_space to tmpfs file
628 * mapping
629 */
630 lock_page(page);
631
632 if (unlikely(page->mapping != mapping)) {
633 unlock_page(page);
634 continue;
635 }
636
637 if (!wbc->range_cyclic && page->index > end) {
638 done = 1;
639 unlock_page(page);
640 continue;
641 }
642
643 if (wbc->sync_mode != WB_SYNC_NONE)
644 wait_on_page_writeback(page);
645
646 if (PageWriteback(page) ||
647 !clear_page_dirty_for_io(page)) {
648 unlock_page(page);
649 continue;
650 }
651
652 ret = (*writepage)(page, wbc);
653 if (ret) {
654 if (ret == -ENOSPC)
655 set_bit(AS_ENOSPC, &mapping->flags);
656 else
657 set_bit(AS_EIO, &mapping->flags);
658 }
659
660 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
661 unlock_page(page);
662 if (ret || (--(wbc->nr_to_write) <= 0))
663 done = 1;
664 if (wbc->nonblocking && bdi_write_congested(bdi)) {
665 wbc->encountered_congestion = 1;
666 done = 1;
667 }
668 }
669 pagevec_release(&pvec);
670 cond_resched();
671 }
672 if (!scanned && !done) {
673 /*
674 * We hit the last page and there is more work to be done: wrap
675 * back to the start of the file
676 */
677 scanned = 1;
678 index = 0;
679 goto retry;
680 }
681 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
682 mapping->writeback_index = index;
683 return ret;
684}
685
686EXPORT_SYMBOL(generic_writepages);
687
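Exporting generic_writepages() lets a filesystem name it explicitly, or wrap it, as its ->writepages() operation instead of relying on the do_writepages() fallback just below. A hedged sketch of that wiring for a hypothetical filesystem (the myfs_* names are made up; headers as assumed):

#include <linux/fs.h>
#include <linux/writeback.h>

/* assumed to be implemented elsewhere in the filesystem */
static int myfs_writepage(struct page *page, struct writeback_control *wbc);

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	/* walk the dirty pages through ->writepage() via the generic
	 * helper, then add any filesystem-specific follow-up here */
	return generic_writepages(mapping, wbc);
}

static struct address_space_operations myfs_aops = {
	.writepage	= myfs_writepage,
	.writepages	= myfs_writepages,
};

Leaving .writepages NULL gets the same generic behaviour through do_writepages(); the export only matters once a filesystem wants to add work around the generic walk.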
545int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 688int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
546{ 689{
547 int ret; 690 int ret;
@@ -550,7 +693,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
550 return 0; 693 return 0;
551 wbc->for_writepages = 1; 694 wbc->for_writepages = 1;
552 if (mapping->a_ops->writepages) 695 if (mapping->a_ops->writepages)
553 ret = mapping->a_ops->writepages(mapping, wbc); 696 ret = mapping->a_ops->writepages(mapping, wbc);
554 else 697 else
555 ret = generic_writepages(mapping, wbc); 698 ret = generic_writepages(mapping, wbc);
556 wbc->for_writepages = 0; 699 wbc->for_writepages = 0;
@@ -664,9 +807,11 @@ int fastcall set_page_dirty(struct page *page)
664 807
665 if (likely(mapping)) { 808 if (likely(mapping)) {
666 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 809 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
667 if (spd) 810#ifdef CONFIG_BLOCK
668 return (*spd)(page); 811 if (!spd)
669 return __set_page_dirty_buffers(page); 812 spd = __set_page_dirty_buffers;
813#endif
814 return (*spd)(page);
670 } 815 }
671 if (!PageDirty(page)) { 816 if (!PageDirty(page)) {
672 if (!TestSetPageDirty(page)) 817 if (!TestSetPageDirty(page))
@@ -690,7 +835,7 @@ int set_page_dirty_lock(struct page *page)
690{ 835{
691 int ret; 836 int ret;
692 837
693 lock_page(page); 838 lock_page_nosync(page);
694 ret = set_page_dirty(page); 839 ret = set_page_dirty(page);
695 unlock_page(page); 840 unlock_page(page);
696 return ret; 841 return ret;
@@ -712,9 +857,15 @@ int test_clear_page_dirty(struct page *page)
712 radix_tree_tag_clear(&mapping->page_tree, 857 radix_tree_tag_clear(&mapping->page_tree,
713 page_index(page), 858 page_index(page),
714 PAGECACHE_TAG_DIRTY); 859 PAGECACHE_TAG_DIRTY);
715 if (mapping_cap_account_dirty(mapping))
716 __dec_zone_page_state(page, NR_FILE_DIRTY);
717 write_unlock_irqrestore(&mapping->tree_lock, flags); 860 write_unlock_irqrestore(&mapping->tree_lock, flags);
861 /*
862 * We can continue to use `mapping' here because the
863 * page is locked, which pins the address_space
864 */
865 if (mapping_cap_account_dirty(mapping)) {
866 page_mkclean(page);
867 dec_zone_page_state(page, NR_FILE_DIRTY);
868 }
718 return 1; 869 return 1;
719 } 870 }
720 write_unlock_irqrestore(&mapping->tree_lock, flags); 871 write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -744,8 +895,10 @@ int clear_page_dirty_for_io(struct page *page)
744 895
745 if (mapping) { 896 if (mapping) {
746 if (TestClearPageDirty(page)) { 897 if (TestClearPageDirty(page)) {
747 if (mapping_cap_account_dirty(mapping)) 898 if (mapping_cap_account_dirty(mapping)) {
899 page_mkclean(page);
748 dec_zone_page_state(page, NR_FILE_DIRTY); 900 dec_zone_page_state(page, NR_FILE_DIRTY);
901 }
749 return 1; 902 return 1;
750 } 903 }
751 return 0; 904 return 0;
@@ -803,6 +956,15 @@ int test_set_page_writeback(struct page *page)
803EXPORT_SYMBOL(test_set_page_writeback); 956EXPORT_SYMBOL(test_set_page_writeback);
804 957
805/* 958/*
959 * Wakes up tasks that are being throttled due to writeback congestion
960 */
961void writeback_congestion_end(void)
962{
963 blk_congestion_end(WRITE);
964}
965EXPORT_SYMBOL(writeback_congestion_end);
966
967/*
806 * Return true if any of the pages in the mapping are marked with the 968
807 * passed tag. 969 * passed tag.
808 */ 970 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f5375b..4f59d90b81 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
37#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
38#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h>
41#include <linux/pfn.h>
40 42
41#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
42#include <asm/div64.h> 44#include <asm/div64.h>
@@ -51,7 +53,6 @@ EXPORT_SYMBOL(node_online_map);
51nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 53nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
52EXPORT_SYMBOL(node_possible_map); 54EXPORT_SYMBOL(node_possible_map);
53unsigned long totalram_pages __read_mostly; 55unsigned long totalram_pages __read_mostly;
54unsigned long totalhigh_pages __read_mostly;
55unsigned long totalreserve_pages __read_mostly; 56unsigned long totalreserve_pages __read_mostly;
56long nr_swap_pages; 57long nr_swap_pages;
57int percpu_pagelist_fraction; 58int percpu_pagelist_fraction;
@@ -69,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order);
69 * TBD: should special case ZONE_DMA32 machines here - in those we normally 70 * TBD: should special case ZONE_DMA32 machines here - in those we normally
70 * don't need any ZONE_NORMAL reservation 71 * don't need any ZONE_NORMAL reservation
71 */ 72 */
72int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; 73int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
74 256,
75#ifdef CONFIG_ZONE_DMA32
76 256,
77#endif
78#ifdef CONFIG_HIGHMEM
79 32
80#endif
81};
73 82
74EXPORT_SYMBOL(totalram_pages); 83EXPORT_SYMBOL(totalram_pages);
75 84
@@ -80,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages);
80struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 89struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
81EXPORT_SYMBOL(zone_table); 90EXPORT_SYMBOL(zone_table);
82 91
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 92static char *zone_names[MAX_NR_ZONES] = {
93 "DMA",
94#ifdef CONFIG_ZONE_DMA32
95 "DMA32",
96#endif
97 "Normal",
98#ifdef CONFIG_HIGHMEM
99 "HighMem"
100#endif
101};
102
84int min_free_kbytes = 1024; 103int min_free_kbytes = 1024;
85 104
86unsigned long __meminitdata nr_kernel_pages; 105unsigned long __meminitdata nr_kernel_pages;
87unsigned long __meminitdata nr_all_pages; 106unsigned long __meminitdata nr_all_pages;
107static unsigned long __initdata dma_reserve;
108
109#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
110 /*
111 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
112 * ranges of memory (RAM) that may be registered with add_active_range().
113 * Ranges passed to add_active_range() will be merged if possible
114 * so the number of times add_active_range() can be called is
115 * related to the number of nodes and the number of holes
116 */
117 #ifdef CONFIG_MAX_ACTIVE_REGIONS
118 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
119 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
120 #else
121 #if MAX_NUMNODES >= 32
122 /* If there can be many nodes, allow up to 50 holes per node */
123 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
124 #else
125 /* By default, allow up to 256 distinct regions */
126 #define MAX_ACTIVE_REGIONS 256
127 #endif
128 #endif
129
130 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
131 int __initdata nr_nodemap_entries;
132 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
133 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
134#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
135 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
136 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
137#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
138#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
88 139
89#ifdef CONFIG_DEBUG_VM 140#ifdef CONFIG_DEBUG_VM
90static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 141static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -127,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page)
127 178
128 return 0; 179 return 0;
129} 180}
130
131#else 181#else
132static inline int bad_range(struct zone *zone, struct page *page) 182static inline int bad_range(struct zone *zone, struct page *page)
133{ 183{
@@ -218,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
218{ 268{
219 int i; 269 int i;
220 270
221 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 271 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
222 /* 272 /*
223 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 273 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
224 * and __GFP_HIGHMEM from hard or soft interrupt context. 274 * and __GFP_HIGHMEM from hard or soft interrupt context.
225 */ 275 */
226 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 276 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
227 for (i = 0; i < (1 << order); i++) 277 for (i = 0; i < (1 << order); i++)
228 clear_highpage(page + i); 278 clear_highpage(page + i);
229} 279}
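The BUG_ON() to VM_BUG_ON() conversions here and in the hunks below trade unconditional checks for ones that are only compiled in with CONFIG_DEBUG_VM, keeping them off production fast paths. The macro these calls expand to is roughly the following (as assumed from include/linux/mm.h of this era; check the tree for the authoritative definition):

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) BUG_ON(cond)
#else
#define VM_BUG_ON(cond) do { } while (0)
#endif

With CONFIG_DEBUG_VM=n the condition is not evaluated at all, which is why only side-effect-free checks are converted.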
@@ -347,8 +397,8 @@ static inline void __free_one_page(struct page *page,
347 397
348 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 398 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
349 399
350 BUG_ON(page_idx & (order_size - 1)); 400 VM_BUG_ON(page_idx & (order_size - 1));
351 BUG_ON(bad_range(zone, page)); 401 VM_BUG_ON(bad_range(zone, page));
352 402
353 zone->free_pages += order_size; 403 zone->free_pages += order_size;
354 while (order < MAX_ORDER-1) { 404 while (order < MAX_ORDER-1) {
@@ -421,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count,
421 while (count--) { 471 while (count--) {
422 struct page *page; 472 struct page *page;
423 473
424 BUG_ON(list_empty(list)); 474 VM_BUG_ON(list_empty(list));
425 page = list_entry(list->prev, struct page, lru); 475 page = list_entry(list->prev, struct page, lru);
426 /* have to delete it as __free_one_page list manipulates */ 476 /* have to delete it as __free_one_page list manipulates */
427 list_del(&page->lru); 477 list_del(&page->lru);
@@ -432,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count,
432 482
433static void free_one_page(struct zone *zone, struct page *page, int order) 483static void free_one_page(struct zone *zone, struct page *page, int order)
434{ 484{
435 LIST_HEAD(list); 485 spin_lock(&zone->lock);
436 list_add(&page->lru, &list); 486 zone->all_unreclaimable = 0;
437 free_pages_bulk(zone, 1, &list, order); 487 zone->pages_scanned = 0;
488 __free_one_page(page, zone ,order);
489 spin_unlock(&zone->lock);
438} 490}
439 491
440static void __free_pages_ok(struct page *page, unsigned int order) 492static void __free_pages_ok(struct page *page, unsigned int order)
@@ -512,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page,
512 area--; 564 area--;
513 high--; 565 high--;
514 size >>= 1; 566 size >>= 1;
515 BUG_ON(bad_range(zone, &page[size])); 567 VM_BUG_ON(bad_range(zone, &page[size]));
516 list_add(&page[size].lru, &area->free_list); 568 list_add(&page[size].lru, &area->free_list);
517 area->nr_free++; 569 area->nr_free++;
518 set_page_order(&page[size], high); 570 set_page_order(&page[size], high);
@@ -615,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
615#ifdef CONFIG_NUMA 667#ifdef CONFIG_NUMA
616/* 668/*
617 * Called from the slab reaper to drain pagesets on a particular node that 669 * Called from the slab reaper to drain pagesets on a particular node that
618 * belong to the currently executing processor. 670 * belongs to the currently executing processor.
619 * Note that this function must be called with the thread pinned to 671 * Note that this function must be called with the thread pinned to
620 * a single processor. 672 * a single processor.
621 */ 673 */
622void drain_node_pages(int nodeid) 674void drain_node_pages(int nodeid)
623{ 675{
624 int i, z; 676 int i;
677 enum zone_type z;
625 unsigned long flags; 678 unsigned long flags;
626 679
627 for (z = 0; z < MAX_NR_ZONES; z++) { 680 for (z = 0; z < MAX_NR_ZONES; z++) {
628 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 681 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
629 struct per_cpu_pageset *pset; 682 struct per_cpu_pageset *pset;
630 683
684 if (!populated_zone(zone))
685 continue;
686
631 pset = zone_pcp(zone, smp_processor_id()); 687 pset = zone_pcp(zone, smp_processor_id());
632 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 688 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
633 struct per_cpu_pages *pcp; 689 struct per_cpu_pages *pcp;
@@ -672,7 +728,8 @@ static void __drain_pages(unsigned int cpu)
672 728
673void mark_free_pages(struct zone *zone) 729void mark_free_pages(struct zone *zone)
674{ 730{
675 unsigned long zone_pfn, flags; 731 unsigned long pfn, max_zone_pfn;
732 unsigned long flags;
676 int order; 733 int order;
677 struct list_head *curr; 734 struct list_head *curr;
678 735
@@ -680,18 +737,25 @@ void mark_free_pages(struct zone *zone)
680 return; 737 return;
681 738
682 spin_lock_irqsave(&zone->lock, flags); 739 spin_lock_irqsave(&zone->lock, flags);
683 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 740
684 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 741 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
742 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
743 if (pfn_valid(pfn)) {
744 struct page *page = pfn_to_page(pfn);
745
746 if (!PageNosave(page))
747 ClearPageNosaveFree(page);
748 }
685 749
686 for (order = MAX_ORDER - 1; order >= 0; --order) 750 for (order = MAX_ORDER - 1; order >= 0; --order)
687 list_for_each(curr, &zone->free_area[order].free_list) { 751 list_for_each(curr, &zone->free_area[order].free_list) {
688 unsigned long start_pfn, i; 752 unsigned long i;
689 753
690 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 754 pfn = page_to_pfn(list_entry(curr, struct page, lru));
755 for (i = 0; i < (1UL << order); i++)
756 SetPageNosaveFree(pfn_to_page(pfn + i));
757 }
691 758
692 for (i=0; i < (1<<order); i++)
693 SetPageNosaveFree(pfn_to_page(start_pfn+i));
694 }
695 spin_unlock_irqrestore(&zone->lock, flags); 759 spin_unlock_irqrestore(&zone->lock, flags);
696} 760}
697 761
@@ -761,8 +825,8 @@ void split_page(struct page *page, unsigned int order)
761{ 825{
762 int i; 826 int i;
763 827
764 BUG_ON(PageCompound(page)); 828 VM_BUG_ON(PageCompound(page));
765 BUG_ON(!page_count(page)); 829 VM_BUG_ON(!page_count(page));
766 for (i = 1; i < (1 << order); i++) 830 for (i = 1; i < (1 << order); i++)
767 set_page_refcounted(page + i); 831 set_page_refcounted(page + i);
768} 832}
@@ -809,7 +873,7 @@ again:
809 local_irq_restore(flags); 873 local_irq_restore(flags);
810 put_cpu(); 874 put_cpu();
811 875
812 BUG_ON(bad_range(zone, page)); 876 VM_BUG_ON(bad_range(zone, page));
813 if (prep_new_page(page, order, gfp_flags)) 877 if (prep_new_page(page, order, gfp_flags))
814 goto again; 878 goto again;
815 return page; 879 return page;
@@ -870,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
870 struct zone **z = zonelist->zones; 934 struct zone **z = zonelist->zones;
871 struct page *page = NULL; 935 struct page *page = NULL;
872 int classzone_idx = zone_idx(*z); 936 int classzone_idx = zone_idx(*z);
937 struct zone *zone;
873 938
874 /* 939 /*
875 * Go through the zonelist once, looking for a zone with enough free. 940 * Go through the zonelist once, looking for a zone with enough free.
876 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 941 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
877 */ 942 */
878 do { 943 do {
944 zone = *z;
945 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
946 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
947 break;
879 if ((alloc_flags & ALLOC_CPUSET) && 948 if ((alloc_flags & ALLOC_CPUSET) &&
880 !cpuset_zone_allowed(*z, gfp_mask)) 949 !cpuset_zone_allowed(zone, gfp_mask))
881 continue; 950 continue;
882 951
883 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 952 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
884 unsigned long mark; 953 unsigned long mark;
885 if (alloc_flags & ALLOC_WMARK_MIN) 954 if (alloc_flags & ALLOC_WMARK_MIN)
886 mark = (*z)->pages_min; 955 mark = zone->pages_min;
887 else if (alloc_flags & ALLOC_WMARK_LOW) 956 else if (alloc_flags & ALLOC_WMARK_LOW)
888 mark = (*z)->pages_low; 957 mark = zone->pages_low;
889 else 958 else
890 mark = (*z)->pages_high; 959 mark = zone->pages_high;
891 if (!zone_watermark_ok(*z, order, mark, 960 if (!zone_watermark_ok(zone , order, mark,
892 classzone_idx, alloc_flags)) 961 classzone_idx, alloc_flags))
893 if (!zone_reclaim_mode || 962 if (!zone_reclaim_mode ||
894 !zone_reclaim(*z, gfp_mask, order)) 963 !zone_reclaim(zone, gfp_mask, order))
895 continue; 964 continue;
896 } 965 }
897 966
898 page = buffered_rmqueue(zonelist, *z, order, gfp_mask); 967 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
899 if (page) { 968 if (page) {
900 break; 969 break;
901 } 970 }
@@ -1083,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1083 * get_zeroed_page() returns a 32-bit address, which cannot represent 1152 * get_zeroed_page() returns a 32-bit address, which cannot represent
1084 * a highmem page 1153 * a highmem page
1085 */ 1154 */
1086 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1155 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1087 1156
1088 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1157 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1089 if (page) 1158 if (page)
@@ -1116,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages);
1116fastcall void free_pages(unsigned long addr, unsigned int order) 1185fastcall void free_pages(unsigned long addr, unsigned int order)
1117{ 1186{
1118 if (addr != 0) { 1187 if (addr != 0) {
1119 BUG_ON(!virt_addr_valid((void *)addr)); 1188 VM_BUG_ON(!virt_addr_valid((void *)addr));
1120 __free_pages(virt_to_page((void *)addr), order); 1189 __free_pages(virt_to_page((void *)addr), order);
1121 } 1190 }
1122} 1191}
@@ -1142,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages);
1142#ifdef CONFIG_NUMA 1211#ifdef CONFIG_NUMA
1143unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1212unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1144{ 1213{
1145 unsigned int i, sum = 0; 1214 unsigned int sum = 0;
1215 enum zone_type i;
1146 1216
1147 for (i = 0; i < MAX_NR_ZONES; i++) 1217 for (i = 0; i < MAX_NR_ZONES; i++)
1148 sum += pgdat->node_zones[i].free_pages; 1218 sum += pgdat->node_zones[i].free_pages;
@@ -1187,27 +1257,11 @@ unsigned int nr_free_pagecache_pages(void)
1187 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1257 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1188} 1258}
1189 1259
1190#ifdef CONFIG_HIGHMEM 1260static inline void show_node(struct zone *zone)
1191unsigned int nr_free_highpages (void)
1192{ 1261{
1193 pg_data_t *pgdat; 1262 if (NUMA_BUILD)
1194 unsigned int pages = 0; 1263 printk("Node %ld ", zone_to_nid(zone));
1195
1196 for_each_online_pgdat(pgdat)
1197 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1198
1199 return pages;
1200} 1264}
1201#endif
1202
1203#ifdef CONFIG_NUMA
1204static void show_node(struct zone *zone)
1205{
1206 printk("Node %d ", zone->zone_pgdat->node_id);
1207}
1208#else
1209#define show_node(zone) do { } while (0)
1210#endif
1211 1265
1212void si_meminfo(struct sysinfo *val) 1266void si_meminfo(struct sysinfo *val)
1213{ 1267{
@@ -1215,13 +1269,8 @@ void si_meminfo(struct sysinfo *val)
1215 val->sharedram = 0; 1269 val->sharedram = 0;
1216 val->freeram = nr_free_pages(); 1270 val->freeram = nr_free_pages();
1217 val->bufferram = nr_blockdev_pages(); 1271 val->bufferram = nr_blockdev_pages();
1218#ifdef CONFIG_HIGHMEM
1219 val->totalhigh = totalhigh_pages; 1272 val->totalhigh = totalhigh_pages;
1220 val->freehigh = nr_free_highpages(); 1273 val->freehigh = nr_free_highpages();
1221#else
1222 val->totalhigh = 0;
1223 val->freehigh = 0;
1224#endif
1225 val->mem_unit = PAGE_SIZE; 1274 val->mem_unit = PAGE_SIZE;
1226} 1275}
1227 1276
@@ -1234,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1234 1283
1235 val->totalram = pgdat->node_present_pages; 1284 val->totalram = pgdat->node_present_pages;
1236 val->freeram = nr_free_pages_pgdat(pgdat); 1285 val->freeram = nr_free_pages_pgdat(pgdat);
1286#ifdef CONFIG_HIGHMEM
1237 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1287 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1238 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1288 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1289#else
1290 val->totalhigh = 0;
1291 val->freehigh = 0;
1292#endif
1239 val->mem_unit = PAGE_SIZE; 1293 val->mem_unit = PAGE_SIZE;
1240} 1294}
1241#endif 1295#endif
@@ -1249,43 +1303,35 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1249 */ 1303 */
1250void show_free_areas(void) 1304void show_free_areas(void)
1251{ 1305{
1252 int cpu, temperature; 1306 int cpu;
1253 unsigned long active; 1307 unsigned long active;
1254 unsigned long inactive; 1308 unsigned long inactive;
1255 unsigned long free; 1309 unsigned long free;
1256 struct zone *zone; 1310 struct zone *zone;
1257 1311
1258 for_each_zone(zone) { 1312 for_each_zone(zone) {
1259 show_node(zone); 1313 if (!populated_zone(zone))
1260 printk("%s per-cpu:", zone->name);
1261
1262 if (!populated_zone(zone)) {
1263 printk(" empty\n");
1264 continue; 1314 continue;
1265 } else 1315
1266 printk("\n"); 1316 show_node(zone);
1317 printk("%s per-cpu:\n", zone->name);
1267 1318
1268 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1269 struct per_cpu_pageset *pageset; 1320 struct per_cpu_pageset *pageset;
1270 1321
1271 pageset = zone_pcp(zone, cpu); 1322 pageset = zone_pcp(zone, cpu);
1272 1323
1273 for (temperature = 0; temperature < 2; temperature++) 1324 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1274 printk("cpu %d %s: high %d, batch %d used:%d\n", 1325 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1275 cpu, 1326 cpu, pageset->pcp[0].high,
1276 temperature ? "cold" : "hot", 1327 pageset->pcp[0].batch, pageset->pcp[0].count,
1277 pageset->pcp[temperature].high, 1328 pageset->pcp[1].high, pageset->pcp[1].batch,
1278 pageset->pcp[temperature].batch, 1329 pageset->pcp[1].count);
1279 pageset->pcp[temperature].count);
1280 } 1330 }
1281 } 1331 }
1282 1332
1283 get_zone_counts(&active, &inactive, &free); 1333 get_zone_counts(&active, &inactive, &free);
1284 1334
1285 printk("Free pages: %11ukB (%ukB HighMem)\n",
1286 K(nr_free_pages()),
1287 K(nr_free_highpages()));
1288
1289 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1335 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1290 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1336 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1291 active, 1337 active,
@@ -1294,13 +1340,17 @@ void show_free_areas(void)
1294 global_page_state(NR_WRITEBACK), 1340 global_page_state(NR_WRITEBACK),
1295 global_page_state(NR_UNSTABLE_NFS), 1341 global_page_state(NR_UNSTABLE_NFS),
1296 nr_free_pages(), 1342 nr_free_pages(),
1297 global_page_state(NR_SLAB), 1343 global_page_state(NR_SLAB_RECLAIMABLE) +
1344 global_page_state(NR_SLAB_UNRECLAIMABLE),
1298 global_page_state(NR_FILE_MAPPED), 1345 global_page_state(NR_FILE_MAPPED),
1299 global_page_state(NR_PAGETABLE)); 1346 global_page_state(NR_PAGETABLE));
1300 1347
1301 for_each_zone(zone) { 1348 for_each_zone(zone) {
1302 int i; 1349 int i;
1303 1350
1351 if (!populated_zone(zone))
1352 continue;
1353
1304 show_node(zone); 1354 show_node(zone);
1305 printk("%s" 1355 printk("%s"
1306 " free:%lukB" 1356 " free:%lukB"
@@ -1333,12 +1383,11 @@ void show_free_areas(void)
1333 for_each_zone(zone) { 1383 for_each_zone(zone) {
1334 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1384 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1335 1385
1386 if (!populated_zone(zone))
1387 continue;
1388
1336 show_node(zone); 1389 show_node(zone);
1337 printk("%s: ", zone->name); 1390 printk("%s: ", zone->name);
1338 if (!populated_zone(zone)) {
1339 printk("empty\n");
1340 continue;
1341 }
1342 1391
1343 spin_lock_irqsave(&zone->lock, flags); 1392 spin_lock_irqsave(&zone->lock, flags);
1344 for (order = 0; order < MAX_ORDER; order++) { 1393 for (order = 0; order < MAX_ORDER; order++) {
@@ -1360,39 +1409,25 @@ void show_free_areas(void)
1360 * Add all populated zones of a node to the zonelist. 1409 * Add all populated zones of a node to the zonelist.
1361 */ 1410 */
1362static int __meminit build_zonelists_node(pg_data_t *pgdat, 1411static int __meminit build_zonelists_node(pg_data_t *pgdat,
1363 struct zonelist *zonelist, int nr_zones, int zone_type) 1412 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1364{ 1413{
1365 struct zone *zone; 1414 struct zone *zone;
1366 1415
1367 BUG_ON(zone_type > ZONE_HIGHMEM); 1416 BUG_ON(zone_type >= MAX_NR_ZONES);
1417 zone_type++;
1368 1418
1369 do { 1419 do {
1420 zone_type--;
1370 zone = pgdat->node_zones + zone_type; 1421 zone = pgdat->node_zones + zone_type;
1371 if (populated_zone(zone)) { 1422 if (populated_zone(zone)) {
1372#ifndef CONFIG_HIGHMEM
1373 BUG_ON(zone_type > ZONE_NORMAL);
1374#endif
1375 zonelist->zones[nr_zones++] = zone; 1423 zonelist->zones[nr_zones++] = zone;
1376 check_highest_zone(zone_type); 1424 check_highest_zone(zone_type);
1377 } 1425 }
1378 zone_type--;
1379 1426
1380 } while (zone_type >= 0); 1427 } while (zone_type);
1381 return nr_zones; 1428 return nr_zones;
1382} 1429}
1383 1430
1384static inline int highest_zone(int zone_bits)
1385{
1386 int res = ZONE_NORMAL;
1387 if (zone_bits & (__force int)__GFP_HIGHMEM)
1388 res = ZONE_HIGHMEM;
1389 if (zone_bits & (__force int)__GFP_DMA32)
1390 res = ZONE_DMA32;
1391 if (zone_bits & (__force int)__GFP_DMA)
1392 res = ZONE_DMA;
1393 return res;
1394}
1395
1396#ifdef CONFIG_NUMA 1431#ifdef CONFIG_NUMA
1397#define MAX_NODE_LOAD (num_online_nodes()) 1432#define MAX_NODE_LOAD (num_online_nodes())
1398static int __meminitdata node_load[MAX_NUMNODES]; 1433static int __meminitdata node_load[MAX_NUMNODES];
@@ -1458,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1458 1493
1459static void __meminit build_zonelists(pg_data_t *pgdat) 1494static void __meminit build_zonelists(pg_data_t *pgdat)
1460{ 1495{
1461 int i, j, k, node, local_node; 1496 int j, node, local_node;
1497 enum zone_type i;
1462 int prev_node, load; 1498 int prev_node, load;
1463 struct zonelist *zonelist; 1499 struct zonelist *zonelist;
1464 nodemask_t used_mask; 1500 nodemask_t used_mask;
1465 1501
1466 /* initialize zonelists */ 1502 /* initialize zonelists */
1467 for (i = 0; i < GFP_ZONETYPES; i++) { 1503 for (i = 0; i < MAX_NR_ZONES; i++) {
1468 zonelist = pgdat->node_zonelists + i; 1504 zonelist = pgdat->node_zonelists + i;
1469 zonelist->zones[0] = NULL; 1505 zonelist->zones[0] = NULL;
1470 } 1506 }
@@ -1494,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1494 node_load[node] += load; 1530 node_load[node] += load;
1495 prev_node = node; 1531 prev_node = node;
1496 load--; 1532 load--;
1497 for (i = 0; i < GFP_ZONETYPES; i++) { 1533 for (i = 0; i < MAX_NR_ZONES; i++) {
1498 zonelist = pgdat->node_zonelists + i; 1534 zonelist = pgdat->node_zonelists + i;
1499 for (j = 0; zonelist->zones[j] != NULL; j++); 1535 for (j = 0; zonelist->zones[j] != NULL; j++);
1500 1536
1501 k = highest_zone(i); 1537 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1502
1503 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1504 zonelist->zones[j] = NULL; 1538 zonelist->zones[j] = NULL;
1505 } 1539 }
1506 } 1540 }
@@ -1510,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1510 1544
1511static void __meminit build_zonelists(pg_data_t *pgdat) 1545static void __meminit build_zonelists(pg_data_t *pgdat)
1512{ 1546{
1513 int i, j, k, node, local_node; 1547 int node, local_node;
1548 enum zone_type i,j;
1514 1549
1515 local_node = pgdat->node_id; 1550 local_node = pgdat->node_id;
1516 for (i = 0; i < GFP_ZONETYPES; i++) { 1551 for (i = 0; i < MAX_NR_ZONES; i++) {
1517 struct zonelist *zonelist; 1552 struct zonelist *zonelist;
1518 1553
1519 zonelist = pgdat->node_zonelists + i; 1554 zonelist = pgdat->node_zonelists + i;
1520 1555
1521 j = 0; 1556 j = build_zonelists_node(pgdat, zonelist, 0, i);
1522 k = highest_zone(i);
1523 j = build_zonelists_node(pgdat, zonelist, j, k);
1524 /* 1557 /*
1525 * Now we build the zonelist so that it contains the zones 1558 * Now we build the zonelist so that it contains the zones
1526 * of all the other nodes. 1559 * of all the other nodes.
@@ -1532,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1532 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1565 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1533 if (!node_online(node)) 1566 if (!node_online(node))
1534 continue; 1567 continue;
1535 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1568 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1536 } 1569 }
1537 for (node = 0; node < local_node; node++) { 1570 for (node = 0; node < local_node; node++) {
1538 if (!node_online(node)) 1571 if (!node_online(node))
1539 continue; 1572 continue;
1540 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1573 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1541 } 1574 }
1542 1575
 1543 	zonelist->zones[j] = NULL; 1576 	zonelist->zones[j] = NULL;
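
With GFP_ZONETYPES and the highest_zone() translation gone, build_zonelists() above indexes its fallback lists directly by zone type: for each type it appends the local node's zones from that type downward, then the remaining nodes' zones in wrap-around node order, and terminates the list with NULL. A standalone sketch of that ordering (the names, node count and encoding are invented for illustration, and the populated-zone checks done by build_zonelists_node() are omitted):

#include <stdio.h>

#define MAX_NR_ZONES 3          /* DMA, Normal, HighMem in this sketch */
#define NR_NODES     2

static const char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

/* Append node `nid`'s zones from `type` downward, like build_zonelists_node(). */
static int add_node_zones(int nid, int type, int *list, int j)
{
    int z;

    for (z = type; z >= 0; z--)
        list[j++] = nid * MAX_NR_ZONES + z;     /* encode (node, zone) */
    return j;
}

int main(void)
{
    int zonelist[NR_NODES * MAX_NR_ZONES + 1];
    int local_node = 0, type, node, j;

    /* One fallback list per zone type: local node first, then the rest. */
    for (type = 0; type < MAX_NR_ZONES; type++) {
        j = add_node_zones(local_node, type, zonelist, 0);
        for (node = local_node + 1; node < NR_NODES; node++)
            j = add_node_zones(node, type, zonelist, j);
        for (node = 0; node < local_node; node++)
            j = add_node_zones(node, type, zonelist, j);
        zonelist[j] = -1;                       /* list terminator */

        printf("%s allocations fall back to:", zone_names[type]);
        for (j = 0; zonelist[j] != -1; j++)
            printf(" node%d/%s", zonelist[j] / MAX_NR_ZONES,
                   zone_names[zonelist[j] % MAX_NR_ZONES]);
        printf("\n");
    }
    return 0;
}
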
@@ -1558,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
1558void __meminit build_all_zonelists(void) 1591void __meminit build_all_zonelists(void)
1559{ 1592{
1560 if (system_state == SYSTEM_BOOTING) { 1593 if (system_state == SYSTEM_BOOTING) {
1561 __build_all_zonelists(0); 1594 __build_all_zonelists(NULL);
1562 cpuset_init_current_mems_allowed(); 1595 cpuset_init_current_mems_allowed();
1563 } else { 1596 } else {
 1564 		/* we have to stop all cpus to guarantee there is no user 1597 		/* we have to stop all cpus to guarantee there is no user
@@ -1639,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
1639 1672
1640#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1673#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1641 1674
1642static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1643 unsigned long *zones_size, unsigned long *zholes_size)
1644{
1645 unsigned long realtotalpages, totalpages = 0;
1646 int i;
1647
1648 for (i = 0; i < MAX_NR_ZONES; i++)
1649 totalpages += zones_size[i];
1650 pgdat->node_spanned_pages = totalpages;
1651
1652 realtotalpages = totalpages;
1653 if (zholes_size)
1654 for (i = 0; i < MAX_NR_ZONES; i++)
1655 realtotalpages -= zholes_size[i];
1656 pgdat->node_present_pages = realtotalpages;
1657 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1658}
1659
1660
1661/* 1675/*
1662 * Initially all pages are reserved - free ones are freed 1676 * Initially all pages are reserved - free ones are freed
1663 * up by free_all_bootmem() once the early boot process is 1677 * up by free_all_bootmem() once the early boot process is
@@ -1698,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1698} 1712}
1699 1713
1700#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1714#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1701void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1715void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1702 unsigned long size) 1716 unsigned long pfn, unsigned long size)
1703{ 1717{
1704 unsigned long snum = pfn_to_section_nr(pfn); 1718 unsigned long snum = pfn_to_section_nr(pfn);
1705 unsigned long end = pfn_to_section_nr(pfn + size); 1719 unsigned long end = pfn_to_section_nr(pfn + size);
@@ -1815,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
1815 1829
1816 for_each_zone(zone) { 1830 for_each_zone(zone) {
1817 1831
1832 if (!populated_zone(zone))
1833 continue;
1834
1818 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 1835 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1819 GFP_KERNEL, cpu_to_node(cpu)); 1836 GFP_KERNEL, cpu_to_node(cpu));
1820 if (!zone_pcp(zone, cpu)) 1837 if (!zone_pcp(zone, cpu))
@@ -1845,8 +1862,10 @@ static inline void free_zone_pagesets(int cpu)
1845 for_each_zone(zone) { 1862 for_each_zone(zone) {
1846 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1863 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1847 1864
1865 /* Free per_cpu_pageset if it is slab allocated */
1866 if (pset != &boot_pageset[cpu])
1867 kfree(pset);
1848 zone_pcp(zone, cpu) = NULL; 1868 zone_pcp(zone, cpu) = NULL;
1849 kfree(pset);
1850 } 1869 }
1851} 1870}
1852 1871
@@ -1972,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1972 return 0; 1991 return 0;
1973} 1992}
1974 1993
1994#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
1995/*
1996 * Basic iterator support. Return the first range of PFNs for a node
1997 * Note: nid == MAX_NUMNODES returns first region regardless of node
1998 */
1999static int __init first_active_region_index_in_nid(int nid)
2000{
2001 int i;
2002
2003 for (i = 0; i < nr_nodemap_entries; i++)
2004 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2005 return i;
2006
2007 return -1;
2008}
2009
2010/*
2011 * Basic iterator support. Return the next active range of PFNs for a node
2012 * Note: nid == MAX_NUMNODES returns next region regardless of node
2013 */
2014static int __init next_active_region_index_in_nid(int index, int nid)
2015{
2016 for (index = index + 1; index < nr_nodemap_entries; index++)
2017 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2018 return index;
2019
2020 return -1;
2021}
2022
2023#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2024/*
2025 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2026 * Architectures may implement their own version but if add_active_range()
2027 * was used and there are no special requirements, this is a convenient
2028 * alternative
2029 */
2030int __init early_pfn_to_nid(unsigned long pfn)
2031{
2032 int i;
2033
2034 for (i = 0; i < nr_nodemap_entries; i++) {
2035 unsigned long start_pfn = early_node_map[i].start_pfn;
2036 unsigned long end_pfn = early_node_map[i].end_pfn;
2037
2038 if (start_pfn <= pfn && pfn < end_pfn)
2039 return early_node_map[i].nid;
2040 }
2041
2042 return 0;
2043}
2044#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2045
2046/* Basic iterator support to walk early_node_map[] */
2047#define for_each_active_range_index_in_nid(i, nid) \
2048 for (i = first_active_region_index_in_nid(nid); i != -1; \
2049 i = next_active_region_index_in_nid(i, nid))
2050
2051/**
2052 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2053 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
2054 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2055 *
2056 * If an architecture guarantees that all ranges registered with
2057 * add_active_ranges() contain no holes and may be freed, this
2058 * function may be used instead of calling free_bootmem() manually.
2059 */
2060void __init free_bootmem_with_active_regions(int nid,
2061 unsigned long max_low_pfn)
2062{
2063 int i;
2064
2065 for_each_active_range_index_in_nid(i, nid) {
2066 unsigned long size_pages = 0;
2067 unsigned long end_pfn = early_node_map[i].end_pfn;
2068
2069 if (early_node_map[i].start_pfn >= max_low_pfn)
2070 continue;
2071
2072 if (end_pfn > max_low_pfn)
2073 end_pfn = max_low_pfn;
2074
2075 size_pages = end_pfn - early_node_map[i].start_pfn;
2076 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2077 PFN_PHYS(early_node_map[i].start_pfn),
2078 size_pages << PAGE_SHIFT);
2079 }
2080}
2081
2082/**
2083 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2084 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
2085 *
2086 * If an architecture guarantees that all ranges registered with
2087 * add_active_ranges() contain no holes and may be freed, this
2088 * function may be used instead of calling memory_present() manually.
2089 */
2090void __init sparse_memory_present_with_active_regions(int nid)
2091{
2092 int i;
2093
2094 for_each_active_range_index_in_nid(i, nid)
2095 memory_present(early_node_map[i].nid,
2096 early_node_map[i].start_pfn,
2097 early_node_map[i].end_pfn);
2098}
2099
2100/**
2101 * push_node_boundaries - Push node boundaries to at least the requested boundary
2102 * @nid: The nid of the node to push the boundary for
2103 * @start_pfn: The start pfn of the node
2104 * @end_pfn: The end pfn of the node
2105 *
2106 * In reserve-based hot-add, mem_map is allocated up front but stays unused until hotadd
2107 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2108 * be hotplugged even though no physical memory exists. This function allows
2109 * an arch to push out the node boundaries so that enough mem_map is
2110 * allocated for later use.
2111 */
2112#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2113void __init push_node_boundaries(unsigned int nid,
2114 unsigned long start_pfn, unsigned long end_pfn)
2115{
2116 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2117 nid, start_pfn, end_pfn);
2118
2119 /* Initialise the boundary for this node if necessary */
2120 if (node_boundary_end_pfn[nid] == 0)
2121 node_boundary_start_pfn[nid] = -1UL;
2122
2123 /* Update the boundaries */
2124 if (node_boundary_start_pfn[nid] > start_pfn)
2125 node_boundary_start_pfn[nid] = start_pfn;
2126 if (node_boundary_end_pfn[nid] < end_pfn)
2127 node_boundary_end_pfn[nid] = end_pfn;
2128}
2129
2130/* If necessary, push the node boundary out for reserve hotadd */
2131static void __init account_node_boundary(unsigned int nid,
2132 unsigned long *start_pfn, unsigned long *end_pfn)
2133{
2134 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2135 nid, *start_pfn, *end_pfn);
2136
2137 /* Return if boundary information has not been provided */
2138 if (node_boundary_end_pfn[nid] == 0)
2139 return;
2140
2141 /* Check the boundaries and update if necessary */
2142 if (node_boundary_start_pfn[nid] < *start_pfn)
2143 *start_pfn = node_boundary_start_pfn[nid];
2144 if (node_boundary_end_pfn[nid] > *end_pfn)
2145 *end_pfn = node_boundary_end_pfn[nid];
2146}
2147#else
2148void __init push_node_boundaries(unsigned int nid,
2149 unsigned long start_pfn, unsigned long end_pfn) {}
2150
2151static void __init account_node_boundary(unsigned int nid,
2152 unsigned long *start_pfn, unsigned long *end_pfn) {}
2153#endif
2154
2155
2156/**
2157 * get_pfn_range_for_nid - Return the start and end page frames for a node
2158 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
2159 * @start_pfn: Passed by reference. On return, it will have the node start_pfn
2160 * @end_pfn: Passed by reference. On return, it will have the node end_pfn
2161 *
2162 * It returns the start and end page frame of a node based on information
2163 * provided by an arch calling add_active_range(). If called for a node
2164 * with no available memory, a warning is printed and the start and end
2165 * PFNs will be 0
2166 */
2167void __init get_pfn_range_for_nid(unsigned int nid,
2168 unsigned long *start_pfn, unsigned long *end_pfn)
2169{
2170 int i;
2171 *start_pfn = -1UL;
2172 *end_pfn = 0;
2173
2174 for_each_active_range_index_in_nid(i, nid) {
2175 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2176 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2177 }
2178
2179 if (*start_pfn == -1UL) {
2180 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2181 *start_pfn = 0;
2182 }
2183
2184 /* Push the node boundaries out if requested */
2185 account_node_boundary(nid, start_pfn, end_pfn);
2186}
2187
2188/*
2189 * Return the number of pages a zone spans in a node, including holes
2190 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2191 */
2192unsigned long __init zone_spanned_pages_in_node(int nid,
2193 unsigned long zone_type,
2194 unsigned long *ignored)
2195{
2196 unsigned long node_start_pfn, node_end_pfn;
2197 unsigned long zone_start_pfn, zone_end_pfn;
2198
2199 /* Get the start and end of the node and zone */
2200 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2201 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2202 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2203
2204 /* Check that this node has pages within the zone's required range */
2205 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2206 return 0;
2207
2208 /* Move the zone boundaries inside the node if necessary */
2209 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2210 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2211
2212 /* Return the spanned pages */
2213 return zone_end_pfn - zone_start_pfn;
2214}
2215
2216/*
2217 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2218 * then all holes in the requested range will be accounted for
2219 */
2220unsigned long __init __absent_pages_in_range(int nid,
2221 unsigned long range_start_pfn,
2222 unsigned long range_end_pfn)
2223{
2224 int i = 0;
2225 unsigned long prev_end_pfn = 0, hole_pages = 0;
2226 unsigned long start_pfn;
2227
2228 /* Find the end_pfn of the first active range of pfns in the node */
2229 i = first_active_region_index_in_nid(nid);
2230 if (i == -1)
2231 return 0;
2232
2233 /* Account for ranges before physical memory on this node */
2234 if (early_node_map[i].start_pfn > range_start_pfn)
2235 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2236
2237 prev_end_pfn = early_node_map[i].start_pfn;
2238
2239 /* Find all holes for the zone within the node */
2240 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2241
2242 /* No need to continue if prev_end_pfn is outside the zone */
2243 if (prev_end_pfn >= range_end_pfn)
2244 break;
2245
2246 /* Make sure the end of the zone is not within the hole */
2247 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2248 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2249
2250 		/* Update the hole size count and move on */
2251 if (start_pfn > range_start_pfn) {
2252 BUG_ON(prev_end_pfn > start_pfn);
2253 hole_pages += start_pfn - prev_end_pfn;
2254 }
2255 prev_end_pfn = early_node_map[i].end_pfn;
2256 }
2257
2258 /* Account for ranges past physical memory on this node */
2259 if (range_end_pfn > prev_end_pfn)
2260 hole_pages = range_end_pfn -
2261 max(range_start_pfn, prev_end_pfn);
2262
2263 return hole_pages;
2264}
2265
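__absent_pages_in_range() above walks the node's active ranges in ascending PFN order and sums every gap that falls inside [range_start_pfn, range_end_pfn). A minimal userspace model of that bookkeeping, using invented ranges (the per-node filtering and the kernel's exact clamping details are simplified):

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* Count PFNs in [range_start, range_end) not covered by any active range.
 * The ranges must be sorted by start_pfn, as sort_node_map() guarantees. */
static unsigned long absent_pages(const struct range *map, int n,
                                  unsigned long range_start,
                                  unsigned long range_end)
{
    unsigned long prev_end, holes = 0;
    int i;

    if (n == 0)
        return 0;

    /* Hole before the first active range, if any. */
    if (map[0].start_pfn > range_start)
        holes = map[0].start_pfn - range_start;
    prev_end = map[0].start_pfn;

    for (i = 0; i < n; i++) {
        unsigned long start;

        if (prev_end >= range_end)
            break;
        /* Clamp both ends to the requested range before measuring the gap. */
        start = map[i].start_pfn < range_end ? map[i].start_pfn : range_end;
        if (prev_end < range_start)
            prev_end = range_start;
        if (start > prev_end)
            holes += start - prev_end;
        prev_end = map[i].end_pfn;
    }

    /* Hole after the last active range, if any. */
    if (range_end > prev_end)
        holes += range_end - (prev_end > range_start ? prev_end : range_start);

    return holes;
}

int main(void)
{
    /* Two banks of memory with a gap: [0x100, 0x200) and [0x300, 0x400). */
    struct range map[] = { { 0x100, 0x200 }, { 0x300, 0x400 } };

    printf("holes in [0x000, 0x400): 0x%lx pages\n",
           absent_pages(map, 2, 0x000, 0x400));     /* 0x100 + 0x100 = 0x200 */
    printf("holes in [0x180, 0x380): 0x%lx pages\n",
           absent_pages(map, 2, 0x180, 0x380));     /* just the 0x100 gap */
    return 0;
}
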
2266/**
2267 * absent_pages_in_range - Return number of page frames in holes within a range
2268 * @start_pfn: The start PFN to start searching for holes
2269 * @end_pfn: The end PFN to stop searching for holes
2270 *
2271 * It returns the number of page frames in memory holes within a range
2272 */
2273unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2274 unsigned long end_pfn)
2275{
2276 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2277}
2278
2279/* Return the number of page frames in holes in a zone on a node */
2280unsigned long __init zone_absent_pages_in_node(int nid,
2281 unsigned long zone_type,
2282 unsigned long *ignored)
2283{
2284 unsigned long node_start_pfn, node_end_pfn;
2285 unsigned long zone_start_pfn, zone_end_pfn;
2286
2287 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2288 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2289 node_start_pfn);
2290 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2291 node_end_pfn);
2292
2293 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2294}
2295
2296/* Return the zone index a PFN is in */
2297int memmap_zone_idx(struct page *lmem_map)
2298{
2299 int i;
2300 unsigned long phys_addr = virt_to_phys(lmem_map);
2301 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2302
2303 for (i = 0; i < MAX_NR_ZONES; i++)
2304 if (pfn < arch_zone_highest_possible_pfn[i])
2305 break;
2306
2307 return i;
2308}
2309#else
2310static inline unsigned long zone_spanned_pages_in_node(int nid,
2311 unsigned long zone_type,
2312 unsigned long *zones_size)
2313{
2314 return zones_size[zone_type];
2315}
2316
2317static inline unsigned long zone_absent_pages_in_node(int nid,
2318 unsigned long zone_type,
2319 unsigned long *zholes_size)
2320{
2321 if (!zholes_size)
2322 return 0;
2323
2324 return zholes_size[zone_type];
2325}
2326
2327static inline int memmap_zone_idx(struct page *lmem_map)
2328{
2329 return MAX_NR_ZONES;
2330}
2331#endif
2332
2333static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2334 unsigned long *zones_size, unsigned long *zholes_size)
2335{
2336 unsigned long realtotalpages, totalpages = 0;
2337 enum zone_type i;
2338
2339 for (i = 0; i < MAX_NR_ZONES; i++)
2340 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2341 zones_size);
2342 pgdat->node_spanned_pages = totalpages;
2343
2344 realtotalpages = totalpages;
2345 for (i = 0; i < MAX_NR_ZONES; i++)
2346 realtotalpages -=
2347 zone_absent_pages_in_node(pgdat->node_id, i,
2348 zholes_size);
2349 pgdat->node_present_pages = realtotalpages;
2350 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2351 realtotalpages);
2352}
2353
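A quick worked example of the new accounting (all figures invented): a node whose DMA zone spans 4096 pages with no holes and whose Normal zone spans 126976 pages containing a 2048-page hole gets node_spanned_pages = 4096 + 126976 = 131072 and node_present_pages = 131072 - 2048 = 129024, and 129024 is the "totalpages" figure printed for that node at boot.
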
1975/* 2354/*
1976 * Set up the zone data structures: 2355 * Set up the zone data structures:
1977 * - mark all pages reserved 2356 * - mark all pages reserved
@@ -1981,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1981static void __meminit free_area_init_core(struct pglist_data *pgdat, 2360static void __meminit free_area_init_core(struct pglist_data *pgdat,
1982 unsigned long *zones_size, unsigned long *zholes_size) 2361 unsigned long *zones_size, unsigned long *zholes_size)
1983{ 2362{
1984 unsigned long j; 2363 enum zone_type j;
1985 int nid = pgdat->node_id; 2364 int nid = pgdat->node_id;
1986 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2365 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1987 int ret; 2366 int ret;
@@ -1993,21 +2372,46 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
1993 2372
1994 for (j = 0; j < MAX_NR_ZONES; j++) { 2373 for (j = 0; j < MAX_NR_ZONES; j++) {
1995 struct zone *zone = pgdat->node_zones + j; 2374 struct zone *zone = pgdat->node_zones + j;
1996 unsigned long size, realsize; 2375 unsigned long size, realsize, memmap_pages;
1997 2376
1998 realsize = size = zones_size[j]; 2377 size = zone_spanned_pages_in_node(nid, j, zones_size);
1999 if (zholes_size) 2378 realsize = size - zone_absent_pages_in_node(nid, j,
2000 realsize -= zholes_size[j]; 2379 zholes_size);
2001 2380
2002 if (j < ZONE_HIGHMEM) 2381 /*
2382 * Adjust realsize so that it accounts for how much memory
2383 * is used by this zone for memmap. This affects the watermark
2384 * and per-cpu initialisations
2385 */
2386 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2387 if (realsize >= memmap_pages) {
2388 realsize -= memmap_pages;
2389 printk(KERN_DEBUG
2390 " %s zone: %lu pages used for memmap\n",
2391 zone_names[j], memmap_pages);
2392 } else
2393 printk(KERN_WARNING
2394 " %s zone: %lu pages exceeds realsize %lu\n",
2395 zone_names[j], memmap_pages, realsize);
2396
2397 /* Account for reserved DMA pages */
2398 if (j == ZONE_DMA && realsize > dma_reserve) {
2399 realsize -= dma_reserve;
2400 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2401 dma_reserve);
2402 }
2403
2404 if (!is_highmem_idx(j))
2003 nr_kernel_pages += realsize; 2405 nr_kernel_pages += realsize;
2004 nr_all_pages += realsize; 2406 nr_all_pages += realsize;
2005 2407
2006 zone->spanned_pages = size; 2408 zone->spanned_pages = size;
2007 zone->present_pages = realsize; 2409 zone->present_pages = realsize;
2008#ifdef CONFIG_NUMA 2410#ifdef CONFIG_NUMA
2009 zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) 2411 zone->node = nid;
2412 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2010 / 100; 2413 / 100;
2414 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2011#endif 2415#endif
2012 zone->name = zone_names[j]; 2416 zone->name = zone_names[j];
2013 spin_lock_init(&zone->lock); 2417 spin_lock_init(&zone->lock);
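
As a worked example of the memmap adjustment above (figures assumed, since sizeof(struct page) depends on the configuration): with 4 KiB pages and a 32-byte struct page, a zone spanning 262144 pages (1 GiB) computes memmap_pages = (262144 * 32) >> 12 = 2048, so realsize shrinks by 2048 pages (8 MiB) before the watermarks and per-cpu batch sizes are derived from it; on top of that, ZONE_DMA also subtracts whatever was registered through set_dma_reserve().
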
@@ -2067,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2067 /* 2471 /*
2068 * With no DISCONTIG, the global mem_map is just set as node 0's 2472 * With no DISCONTIG, the global mem_map is just set as node 0's
2069 */ 2473 */
2070 if (pgdat == NODE_DATA(0)) 2474 if (pgdat == NODE_DATA(0)) {
2071 mem_map = NODE_DATA(0)->node_mem_map; 2475 mem_map = NODE_DATA(0)->node_mem_map;
2476#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2477 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2478 mem_map -= pgdat->node_start_pfn;
2479#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2480 }
2072#endif 2481#endif
2073#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2482#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2074} 2483}
@@ -2079,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2079{ 2488{
2080 pgdat->node_id = nid; 2489 pgdat->node_id = nid;
2081 pgdat->node_start_pfn = node_start_pfn; 2490 pgdat->node_start_pfn = node_start_pfn;
2082 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2491 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2083 2492
2084 alloc_node_mem_map(pgdat); 2493 alloc_node_mem_map(pgdat);
2085 2494
2086 free_area_init_core(pgdat, zones_size, zholes_size); 2495 free_area_init_core(pgdat, zones_size, zholes_size);
2087} 2496}
2088 2497
2498#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2499/**
2500 * add_active_range - Register a range of PFNs backed by physical memory
2501 * @nid: The node ID the range resides on
2502 * @start_pfn: The start PFN of the available physical memory
2503 * @end_pfn: The end PFN of the available physical memory
2504 *
2505 * These ranges are stored in an early_node_map[] and later used by
2506 * free_area_init_nodes() to calculate zone sizes and holes. If the
2507 * range spans a memory hole, it is up to the architecture to ensure
2508 * the memory is not freed by the bootmem allocator. If possible
2509 * the range being registered will be merged with existing ranges.
2510 */
2511void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2512 unsigned long end_pfn)
2513{
2514 int i;
2515
2516 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2517 "%d entries of %d used\n",
2518 nid, start_pfn, end_pfn,
2519 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2520
2521 /* Merge with existing active regions if possible */
2522 for (i = 0; i < nr_nodemap_entries; i++) {
2523 if (early_node_map[i].nid != nid)
2524 continue;
2525
2526 /* Skip if an existing region covers this new one */
2527 if (start_pfn >= early_node_map[i].start_pfn &&
2528 end_pfn <= early_node_map[i].end_pfn)
2529 return;
2530
2531 /* Merge forward if suitable */
2532 if (start_pfn <= early_node_map[i].end_pfn &&
2533 end_pfn > early_node_map[i].end_pfn) {
2534 early_node_map[i].end_pfn = end_pfn;
2535 return;
2536 }
2537
2538 /* Merge backward if suitable */
2539 if (start_pfn < early_node_map[i].end_pfn &&
2540 end_pfn >= early_node_map[i].start_pfn) {
2541 early_node_map[i].start_pfn = start_pfn;
2542 return;
2543 }
2544 }
2545
2546 /* Check that early_node_map is large enough */
2547 if (i >= MAX_ACTIVE_REGIONS) {
2548 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2549 MAX_ACTIVE_REGIONS);
2550 return;
2551 }
2552
2553 early_node_map[i].nid = nid;
2554 early_node_map[i].start_pfn = start_pfn;
2555 early_node_map[i].end_pfn = end_pfn;
2556 nr_nodemap_entries = i + 1;
2557}
2558
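The merge logic above tries to coalesce a new range with an existing early_node_map[] entry for the same node before consuming a new slot: a fully covered range is dropped, and an overlapping range extends an existing entry forward or backward. A standalone model of that decision (node IDs, the capacity check and the debug printk are left out):

#include <stdio.h>

struct range { unsigned long start, end; };

static struct range map[16];
static int nr_entries;

/* Simplified model of the merge rules in add_active_range(). */
static void add_range(unsigned long start, unsigned long end)
{
    int i;

    for (i = 0; i < nr_entries; i++) {
        /* Already fully covered by an existing range. */
        if (start >= map[i].start && end <= map[i].end)
            return;
        /* Extends an existing range forward. */
        if (start <= map[i].end && end > map[i].end) {
            map[i].end = end;
            return;
        }
        /* Extends an existing range backward. */
        if (start < map[i].end && end >= map[i].start) {
            map[i].start = start;
            return;
        }
    }
    map[nr_entries].start = start;
    map[nr_entries].end = end;
    nr_entries++;
}

int main(void)
{
    int i;

    add_range(0x100, 0x200);
    add_range(0x180, 0x280);    /* merges forward -> [0x100, 0x280) */
    add_range(0x120, 0x160);    /* already covered, dropped */
    add_range(0x400, 0x500);    /* new entry */

    for (i = 0; i < nr_entries; i++)
        printf("range %d: [0x%lx, 0x%lx)\n", i, map[i].start, map[i].end);
    return 0;
}
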
2559/**
2560 * shrink_active_range - Shrink an existing registered range of PFNs
2561 * @nid: The node id the range is on that should be shrunk
2562 * @old_end_pfn: The old end PFN of the range
2563 * @new_end_pfn: The new end PFN of the range
2564 *
2565 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
2566 * The map is kept at the end physical page range that has already been
2567 * registered with add_active_range(). This function allows an arch to shrink
2568 * an existing registered range.
2569 */
2570void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2571 unsigned long new_end_pfn)
2572{
2573 int i;
2574
2575 /* Find the old active region end and shrink */
2576 for_each_active_range_index_in_nid(i, nid)
2577 if (early_node_map[i].end_pfn == old_end_pfn) {
2578 early_node_map[i].end_pfn = new_end_pfn;
2579 break;
2580 }
2581}
2582
2583/**
2584 * remove_all_active_ranges - Remove all currently registered regions
2585 * During discovery, it may be found that a table like SRAT is invalid
2586 * and an alternative discovery method must be used. This function removes
2587 * all currently registered regions.
2588 */
2589void __init remove_all_active_ranges()
2590{
2591 memset(early_node_map, 0, sizeof(early_node_map));
2592 nr_nodemap_entries = 0;
2593#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2594 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2595 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2596#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2597}
2598
2599/* Compare two active node_active_regions */
2600static int __init cmp_node_active_region(const void *a, const void *b)
2601{
2602 struct node_active_region *arange = (struct node_active_region *)a;
2603 struct node_active_region *brange = (struct node_active_region *)b;
2604
2605 /* Done this way to avoid overflows */
2606 if (arange->start_pfn > brange->start_pfn)
2607 return 1;
2608 if (arange->start_pfn < brange->start_pfn)
2609 return -1;
2610
2611 return 0;
2612}
2613
2614/* sort the node_map by start_pfn */
2615static void __init sort_node_map(void)
2616{
2617 sort(early_node_map, (size_t)nr_nodemap_entries,
2618 sizeof(struct node_active_region),
2619 cmp_node_active_region, NULL);
2620}
2621
2622/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2623unsigned long __init find_min_pfn_for_node(unsigned long nid)
2624{
2625 int i;
2626
2627 /* Assuming a sorted map, the first range found has the starting pfn */
2628 for_each_active_range_index_in_nid(i, nid)
2629 return early_node_map[i].start_pfn;
2630
2631 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2632 return 0;
2633}
2634
2635/**
2636 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2637 *
2638 * It returns the minimum PFN based on information provided via
2639 * add_active_range()
2640 */
2641unsigned long __init find_min_pfn_with_active_regions(void)
2642{
2643 return find_min_pfn_for_node(MAX_NUMNODES);
2644}
2645
2646/**
2647 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2648 *
2649 * It returns the maximum PFN based on information provided via
2650 * add_active_range()
2651 */
2652unsigned long __init find_max_pfn_with_active_regions(void)
2653{
2654 int i;
2655 unsigned long max_pfn = 0;
2656
2657 for (i = 0; i < nr_nodemap_entries; i++)
2658 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2659
2660 return max_pfn;
2661}
2662
2663/**
2664 * free_area_init_nodes - Initialise all pg_data_t and zone data
2665 * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
2666 * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
2667 * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
2668 * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
2669 *
2670 * This will call free_area_init_node() for each active node in the system.
2671 * Using the page ranges provided by add_active_range(), the size of each
2672 * zone in each node and their holes is calculated. If the maximum PFN
2673 * between two adjacent zones matches, the higher zone is assumed to be empty.
2674 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2675 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2676 * starts where the previous one ended. For example, ZONE_DMA32 starts
2677 * at arch_max_dma_pfn.
2678 */
2679void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2680{
2681 unsigned long nid;
2682 enum zone_type i;
2683
2684 /* Record where the zone boundaries are */
2685 memset(arch_zone_lowest_possible_pfn, 0,
2686 sizeof(arch_zone_lowest_possible_pfn));
2687 memset(arch_zone_highest_possible_pfn, 0,
2688 sizeof(arch_zone_highest_possible_pfn));
2689 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2690 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2691 for (i = 1; i < MAX_NR_ZONES; i++) {
2692 arch_zone_lowest_possible_pfn[i] =
2693 arch_zone_highest_possible_pfn[i-1];
2694 arch_zone_highest_possible_pfn[i] =
2695 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2696 }
2697
2698 /* Regions in the early_node_map can be in any order */
2699 sort_node_map();
2700
2701 /* Print out the zone ranges */
2702 printk("Zone PFN ranges:\n");
2703 for (i = 0; i < MAX_NR_ZONES; i++)
2704 printk(" %-8s %8lu -> %8lu\n",
2705 zone_names[i],
2706 arch_zone_lowest_possible_pfn[i],
2707 arch_zone_highest_possible_pfn[i]);
2708
2709 /* Print out the early_node_map[] */
2710 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2711 for (i = 0; i < nr_nodemap_entries; i++)
2712 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2713 early_node_map[i].start_pfn,
2714 early_node_map[i].end_pfn);
2715
2716 /* Initialise every node */
2717 for_each_online_node(nid) {
2718 pg_data_t *pgdat = NODE_DATA(nid);
2719 free_area_init_node(nid, pgdat, NULL,
2720 find_min_pfn_for_node(nid), NULL);
2721 }
2722}
2723#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2724
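free_area_init_nodes() derives contiguous zone intervals from the per-zone maximum PFNs the architecture passes in: each zone starts where the previous one ended, and a zone whose maximum does not rise above its own start stays empty. A standalone sketch of that derivation with invented PFNs (a 64-bit-style layout in which ZONE_HIGHMEM ends up empty):

#include <stdio.h>

#define MAX_NR_ZONES 4

static const char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };

int main(void)
{
    /* Invented example input, as an arch would hand to free_area_init_nodes(). */
    unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0x1000, 0x100000, 0x440000, 0x440000 };
    unsigned long lowest[MAX_NR_ZONES], highest[MAX_NR_ZONES];
    unsigned long min_pfn = 0x10;   /* lowest PFN registered via add_active_range() */
    int i;

    lowest[0] = min_pfn;
    highest[0] = max_zone_pfn[0];
    for (i = 1; i < MAX_NR_ZONES; i++) {
        lowest[i] = highest[i - 1];                 /* zone starts where the last ended */
        highest[i] = max_zone_pfn[i] > lowest[i] ? max_zone_pfn[i] : lowest[i];
    }

    printf("Zone PFN ranges:\n");
    for (i = 0; i < MAX_NR_ZONES; i++)
        printf(" %-8s %8lx -> %8lx%s\n", zone_names[i], lowest[i], highest[i],
               lowest[i] == highest[i] ? " (empty)" : "");
    return 0;
}
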
2725/**
2726 * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
2727 * @new_dma_reserve - The number of pages to mark reserved
2728 *
2729 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2730 * In the DMA zone, a significant percentage may be consumed by kernel image
2731 * and other unfreeable allocations which can skew the watermarks badly. This
2732 * function may optionally be used to account for unfreeable pages in
2733 * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
2734 */
2735void __init set_dma_reserve(unsigned long new_dma_reserve)
2736{
2737 dma_reserve = new_dma_reserve;
2738}
2739
2089#ifndef CONFIG_NEED_MULTIPLE_NODES 2740#ifndef CONFIG_NEED_MULTIPLE_NODES
2090static bootmem_data_t contig_bootmem_data; 2741static bootmem_data_t contig_bootmem_data;
2091struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2742struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -2129,7 +2780,7 @@ static void calculate_totalreserve_pages(void)
2129{ 2780{
2130 struct pglist_data *pgdat; 2781 struct pglist_data *pgdat;
2131 unsigned long reserve_pages = 0; 2782 unsigned long reserve_pages = 0;
2132 int i, j; 2783 enum zone_type i, j;
2133 2784
2134 for_each_online_pgdat(pgdat) { 2785 for_each_online_pgdat(pgdat) {
2135 for (i = 0; i < MAX_NR_ZONES; i++) { 2786 for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -2162,7 +2813,7 @@ static void calculate_totalreserve_pages(void)
2162static void setup_per_zone_lowmem_reserve(void) 2813static void setup_per_zone_lowmem_reserve(void)
2163{ 2814{
2164 struct pglist_data *pgdat; 2815 struct pglist_data *pgdat;
2165 int j, idx; 2816 enum zone_type j, idx;
2166 2817
2167 for_each_online_pgdat(pgdat) { 2818 for_each_online_pgdat(pgdat) {
2168 for (j = 0; j < MAX_NR_ZONES; j++) { 2819 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -2171,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void)
2171 2822
2172 zone->lowmem_reserve[j] = 0; 2823 zone->lowmem_reserve[j] = 0;
2173 2824
2174 for (idx = j-1; idx >= 0; idx--) { 2825 idx = j;
2826 while (idx) {
2175 struct zone *lower_zone; 2827 struct zone *lower_zone;
2176 2828
2829 idx--;
2830
2177 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2831 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2178 sysctl_lowmem_reserve_ratio[idx] = 1; 2832 sysctl_lowmem_reserve_ratio[idx] = 1;
2179 2833
@@ -2314,10 +2968,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
2314 return rc; 2968 return rc;
2315 2969
2316 for_each_zone(zone) 2970 for_each_zone(zone)
2317 zone->min_unmapped_ratio = (zone->present_pages * 2971 zone->min_unmapped_pages = (zone->present_pages *
2318 sysctl_min_unmapped_ratio) / 100; 2972 sysctl_min_unmapped_ratio) / 100;
2319 return 0; 2973 return 0;
2320} 2974}
2975
2976int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
2977 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2978{
2979 struct zone *zone;
2980 int rc;
2981
2982 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2983 if (rc)
2984 return rc;
2985
2986 for_each_zone(zone)
2987 zone->min_slab_pages = (zone->present_pages *
2988 sysctl_min_slab_ratio) / 100;
2989 return 0;
2990}
2321#endif 2991#endif
2322 2992
2323/* 2993/*
@@ -2363,7 +3033,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2363 return 0; 3033 return 0;
2364} 3034}
2365 3035
2366__initdata int hashdist = HASHDIST_DEFAULT; 3036int hashdist = HASHDIST_DEFAULT;
2367 3037
2368#ifdef CONFIG_NUMA 3038#ifdef CONFIG_NUMA
2369static int __init set_hashdist(char *str) 3039static int __init set_hashdist(char *str)
diff --git a/mm/page_io.c b/mm/page_io.c
index 88029948d0..d4840ecbf8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
52 if (bio->bi_size) 52 if (bio->bi_size)
53 return 1; 53 return 1;
54 54
55 if (!uptodate) 55 if (!uptodate) {
56 SetPageError(page); 56 SetPageError(page);
57 /*
58 * We failed to write the page out to swap-space.
59 * Re-dirty the page in order to avoid it being reclaimed.
60 * Also print a dire warning that things will go BAD (tm)
61 * very quickly.
62 *
63 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
64 */
65 set_page_dirty(page);
66 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
67 imajor(bio->bi_bdev->bd_inode),
68 iminor(bio->bi_bdev->bd_inode),
69 (unsigned long long)bio->bi_sector);
70 ClearPageReclaim(page);
71 }
57 end_page_writeback(page); 72 end_page_writeback(page);
58 bio_put(bio); 73 bio_put(bio);
59 return 0; 74 return 0;
60} 75}
61 76
62static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) 77int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
63{ 78{
64 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 79 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
65 struct page *page = bio->bi_io_vec[0].bv_page; 80 struct page *page = bio->bi_io_vec[0].bv_page;
@@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
70 if (!uptodate) { 85 if (!uptodate) {
71 SetPageError(page); 86 SetPageError(page);
72 ClearPageUptodate(page); 87 ClearPageUptodate(page);
88 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
89 imajor(bio->bi_bdev->bd_inode),
90 iminor(bio->bi_bdev->bd_inode),
91 (unsigned long long)bio->bi_sector);
73 } else { 92 } else {
74 SetPageUptodate(page); 93 SetPageUptodate(page);
75 } 94 }
@@ -137,10 +156,12 @@ out:
137 * We use end_swap_bio_read() even for writes, because it happens to do what 156 * We use end_swap_bio_read() even for writes, because it happens to do what
138 * we want. 157 * we want.
139 */ 158 */
140int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) 159int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
160 struct bio **bio_chain)
141{ 161{
142 struct bio *bio; 162 struct bio *bio;
143 int ret = 0; 163 int ret = 0;
164 int bio_rw;
144 165
145 lock_page(page); 166 lock_page(page);
146 167
@@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
151 goto out; 172 goto out;
152 } 173 }
153 174
154 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 175 bio_rw = rw;
155 wait_on_page_locked(page); 176 if (!bio_chain)
156 177 bio_rw |= (1 << BIO_RW_SYNC);
157 if (!PageUptodate(page) || PageError(page)) 178 if (bio_chain)
158 ret = -EIO; 179 bio_get(bio);
180 submit_bio(bio_rw, bio);
181 if (bio_chain == NULL) {
182 wait_on_page_locked(page);
183
184 if (!PageUptodate(page) || PageError(page))
185 ret = -EIO;
186 }
187 if (bio_chain) {
188 bio->bi_private = *bio_chain;
189 *bio_chain = bio;
190 }
159out: 191out:
160 return ret; 192 return ret;
161} 193}
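The new bio_chain argument gives rw_swap_page_sync() a second mode: with a NULL chain it behaves as before (synchronous, BIO_RW_SYNC, wait and check the page), while with a non-NULL chain it submits asynchronously, takes an extra reference on the bio and threads it onto the caller's list through bi_private so the caller can wait for and release the whole batch later. A toy userspace model of that chain-then-drain convention (not kernel code; the structures and completion handling are simulated):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a submitted bio: chained through `private`, the way
 * rw_swap_page_sync() threads bios through bio->bi_private. */
struct toy_bio {
    int             sector;
    struct toy_bio  *private;   /* next bio in the caller's chain */
};

/* Submit one request; if `chain` is non-NULL, queue asynchronously and
 * push the request onto the chain instead of waiting for it. */
static int submit(int sector, struct toy_bio **chain)
{
    struct toy_bio *bio = malloc(sizeof(*bio));

    if (!bio)
        return -1;
    bio->sector = sector;
    if (!chain) {
        /* Synchronous path: "wait" and release immediately. */
        printf("sector %d written synchronously\n", bio->sector);
        free(bio);
        return 0;
    }
    bio->private = *chain;
    *chain = bio;
    return 0;
}

/* Later, the caller drains the whole chain in one place. */
static void drain(struct toy_bio *chain)
{
    while (chain) {
        struct toy_bio *bio = chain;

        chain = bio->private;
        printf("sector %d completed, releasing bio\n", bio->sector);
        free(bio);
    }
}

int main(void)
{
    struct toy_bio *chain = NULL;
    int s;

    submit(1, NULL);            /* old synchronous behaviour */
    for (s = 10; s < 13; s++)   /* new batched behaviour */
        submit(s, &chain);
    drain(chain);
    return 0;
}
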
diff --git a/mm/rmap.c b/mm/rmap.c
index 40158b5972..e2155d791d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
434 return referenced; 434 return referenced;
435} 435}
436 436
437static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
438{
439 struct mm_struct *mm = vma->vm_mm;
440 unsigned long address;
441 pte_t *pte, entry;
442 spinlock_t *ptl;
443 int ret = 0;
444
445 address = vma_address(page, vma);
446 if (address == -EFAULT)
447 goto out;
448
449 pte = page_check_address(page, mm, address, &ptl);
450 if (!pte)
451 goto out;
452
453 if (!pte_dirty(*pte) && !pte_write(*pte))
454 goto unlock;
455
456 entry = ptep_get_and_clear(mm, address, pte);
457 entry = pte_mkclean(entry);
458 entry = pte_wrprotect(entry);
459 ptep_establish(vma, address, pte, entry);
460 lazy_mmu_prot_update(entry);
461 ret = 1;
462
463unlock:
464 pte_unmap_unlock(pte, ptl);
465out:
466 return ret;
467}
468
469static int page_mkclean_file(struct address_space *mapping, struct page *page)
470{
471 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
472 struct vm_area_struct *vma;
473 struct prio_tree_iter iter;
474 int ret = 0;
475
476 BUG_ON(PageAnon(page));
477
478 spin_lock(&mapping->i_mmap_lock);
479 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
480 if (vma->vm_flags & VM_SHARED)
481 ret += page_mkclean_one(page, vma);
482 }
483 spin_unlock(&mapping->i_mmap_lock);
484 return ret;
485}
486
487int page_mkclean(struct page *page)
488{
489 int ret = 0;
490
491 BUG_ON(!PageLocked(page));
492
493 if (page_mapped(page)) {
494 struct address_space *mapping = page_mapping(page);
495 if (mapping)
496 ret = page_mkclean_file(mapping, page);
497 }
498
499 return ret;
500}
501
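page_mkclean_one() boils down to one pte transformation: if the mapping is dirty or writable, fetch and clear the pte, drop the dirty bit, write-protect it, reinstall it, and report that a mapping was cleaned; page_mkclean_file() then sums that over every shared vma. A toy model of that bookkeeping (the flag values are invented, and the locking and TLB details are omitted):

#include <stdio.h>

#define PTE_DIRTY   0x1
#define PTE_WRITE   0x2

/* Toy model of the pte transformation in page_mkclean_one():
 * clear the dirty bit and write-protect the mapping in one step. */
static int mkclean_one(unsigned int *pte)
{
    unsigned int entry = *pte;

    if (!(entry & PTE_DIRTY) && !(entry & PTE_WRITE))
        return 0;               /* already clean and read-only */

    entry &= ~PTE_DIRTY;        /* pte_mkclean() */
    entry &= ~PTE_WRITE;        /* pte_wrprotect() */
    *pte = entry;
    return 1;                   /* one mapping was cleaned */
}

int main(void)
{
    unsigned int ptes[] = { PTE_DIRTY | PTE_WRITE, PTE_WRITE, 0 };
    int i, cleaned = 0;

    /* page_mkclean_file() sums the per-vma results the same way. */
    for (i = 0; i < 3; i++)
        cleaned += mkclean_one(&ptes[i]);
    printf("%d of 3 shared mappings cleaned\n", cleaned);
    return 0;
}
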
437/** 502/**
438 * page_set_anon_rmap - setup new anonymous rmap 503 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to 504 * @page: the page to add the mapping to
diff --git a/mm/shmem.c b/mm/shmem.c
index db21c51531..bb8ca7ef70 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -26,6 +26,8 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/xattr.h>
30#include <linux/generic_acl.h>
29#include <linux/mm.h> 31#include <linux/mm.h>
30#include <linux/mman.h> 32#include <linux/mman.h>
31#include <linux/file.h> 33#include <linux/file.h>
@@ -45,6 +47,7 @@
45#include <linux/namei.h> 47#include <linux/namei.h>
46#include <linux/ctype.h> 48#include <linux/ctype.h>
47#include <linux/migrate.h> 49#include <linux/migrate.h>
50#include <linux/highmem.h>
48 51
49#include <asm/uaccess.h> 52#include <asm/uaccess.h>
50#include <asm/div64.h> 53#include <asm/div64.h>
@@ -176,6 +179,7 @@ static const struct address_space_operations shmem_aops;
176static struct file_operations shmem_file_operations; 179static struct file_operations shmem_file_operations;
177static struct inode_operations shmem_inode_operations; 180static struct inode_operations shmem_inode_operations;
178static struct inode_operations shmem_dir_inode_operations; 181static struct inode_operations shmem_dir_inode_operations;
182static struct inode_operations shmem_special_inode_operations;
179static struct vm_operations_struct shmem_vm_ops; 183static struct vm_operations_struct shmem_vm_ops;
180 184
181static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 185static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
@@ -636,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
636 struct page *page = NULL; 640 struct page *page = NULL;
637 int error; 641 int error;
638 642
639 if (attr->ia_valid & ATTR_SIZE) { 643 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
640 if (attr->ia_size < inode->i_size) { 644 if (attr->ia_size < inode->i_size) {
641 /* 645 /*
642 * If truncating down to a partial page, then 646 * If truncating down to a partial page, then
@@ -669,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
669 error = inode_change_ok(inode, attr); 673 error = inode_change_ok(inode, attr);
670 if (!error) 674 if (!error)
671 error = inode_setattr(inode, attr); 675 error = inode_setattr(inode, attr);
676#ifdef CONFIG_TMPFS_POSIX_ACL
677 if (!error && (attr->ia_valid & ATTR_MODE))
678 error = generic_acl_chmod(inode, &shmem_acl_ops);
679#endif
672 if (page) 680 if (page)
673 page_cache_release(page); 681 page_cache_release(page);
674 return error; 682 return error;
@@ -1350,7 +1358,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1350 inode->i_mode = mode; 1358 inode->i_mode = mode;
1351 inode->i_uid = current->fsuid; 1359 inode->i_uid = current->fsuid;
1352 inode->i_gid = current->fsgid; 1360 inode->i_gid = current->fsgid;
1353 inode->i_blksize = PAGE_CACHE_SIZE;
1354 inode->i_blocks = 0; 1361 inode->i_blocks = 0;
1355 inode->i_mapping->a_ops = &shmem_aops; 1362 inode->i_mapping->a_ops = &shmem_aops;
1356 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1363 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1362,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1362 1369
1363 switch (mode & S_IFMT) { 1370 switch (mode & S_IFMT) {
1364 default: 1371 default:
1372 inode->i_op = &shmem_special_inode_operations;
1365 init_special_inode(inode, mode, dev); 1373 init_special_inode(inode, mode, dev);
1366 break; 1374 break;
1367 case S_IFREG: 1375 case S_IFREG:
@@ -1371,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1371 &sbinfo->policy_nodes); 1379 &sbinfo->policy_nodes);
1372 break; 1380 break;
1373 case S_IFDIR: 1381 case S_IFDIR:
1374 inode->i_nlink++; 1382 inc_nlink(inode);
1375 /* Some things misbehave if size == 0 on a directory */ 1383 /* Some things misbehave if size == 0 on a directory */
1376 inode->i_size = 2 * BOGO_DIRENT_SIZE; 1384 inode->i_size = 2 * BOGO_DIRENT_SIZE;
1377 inode->i_op = &shmem_dir_inode_operations; 1385 inode->i_op = &shmem_dir_inode_operations;
@@ -1682,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1682 iput(inode); 1690 iput(inode);
1683 return error; 1691 return error;
1684 } 1692 }
1685 error = 0; 1693 }
1694 error = shmem_acl_init(inode, dir);
1695 if (error) {
1696 iput(inode);
1697 return error;
1686 } 1698 }
1687 if (dir->i_mode & S_ISGID) { 1699 if (dir->i_mode & S_ISGID) {
1688 inode->i_gid = dir->i_gid; 1700 inode->i_gid = dir->i_gid;
@@ -1703,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1703 1715
1704 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 1716 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1705 return error; 1717 return error;
1706 dir->i_nlink++; 1718 inc_nlink(dir);
1707 return 0; 1719 return 0;
1708} 1720}
1709 1721
@@ -1738,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1738 1750
1739 dir->i_size += BOGO_DIRENT_SIZE; 1751 dir->i_size += BOGO_DIRENT_SIZE;
1740 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1752 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1741 inode->i_nlink++; 1753 inc_nlink(inode);
1742 atomic_inc(&inode->i_count); /* New dentry reference */ 1754 atomic_inc(&inode->i_count); /* New dentry reference */
1743 dget(dentry); /* Extra pinning count for the created dentry */ 1755 dget(dentry); /* Extra pinning count for the created dentry */
1744 d_instantiate(dentry, inode); 1756 d_instantiate(dentry, inode);
@@ -1760,7 +1772,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1760 1772
1761 dir->i_size -= BOGO_DIRENT_SIZE; 1773 dir->i_size -= BOGO_DIRENT_SIZE;
1762 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1774 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1763 inode->i_nlink--; 1775 drop_nlink(inode);
1764 dput(dentry); /* Undo the count from "create" - this does all the work */ 1776 dput(dentry); /* Undo the count from "create" - this does all the work */
1765 return 0; 1777 return 0;
1766} 1778}
@@ -1770,8 +1782,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1770 if (!simple_empty(dentry)) 1782 if (!simple_empty(dentry))
1771 return -ENOTEMPTY; 1783 return -ENOTEMPTY;
1772 1784
1773 dentry->d_inode->i_nlink--; 1785 drop_nlink(dentry->d_inode);
1774 dir->i_nlink--; 1786 drop_nlink(dir);
1775 return shmem_unlink(dir, dentry); 1787 return shmem_unlink(dir, dentry);
1776} 1788}
1777 1789
@@ -1792,10 +1804,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
1792 if (new_dentry->d_inode) { 1804 if (new_dentry->d_inode) {
1793 (void) shmem_unlink(new_dir, new_dentry); 1805 (void) shmem_unlink(new_dir, new_dentry);
1794 if (they_are_dirs) 1806 if (they_are_dirs)
1795 old_dir->i_nlink--; 1807 drop_nlink(old_dir);
1796 } else if (they_are_dirs) { 1808 } else if (they_are_dirs) {
1797 old_dir->i_nlink--; 1809 drop_nlink(old_dir);
1798 new_dir->i_nlink++; 1810 inc_nlink(new_dir);
1799 } 1811 }
1800 1812
1801 old_dir->i_size -= BOGO_DIRENT_SIZE; 1813 old_dir->i_size -= BOGO_DIRENT_SIZE;
@@ -1897,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = {
1897 .put_link = shmem_put_link, 1909 .put_link = shmem_put_link,
1898}; 1910};
1899 1911
1912#ifdef CONFIG_TMPFS_POSIX_ACL
1913/**
1914 * Superblocks without xattr inode operations will get security.* xattr
1915 * support from the VFS "for free". As soon as we have any other xattrs
1916 * like ACLs, we also need to implement the security.* handlers at
1917 * filesystem level, though.
1918 */
1919
1920static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1921 size_t list_len, const char *name,
1922 size_t name_len)
1923{
1924 return security_inode_listsecurity(inode, list, list_len);
1925}
1926
1927static int shmem_xattr_security_get(struct inode *inode, const char *name,
1928 void *buffer, size_t size)
1929{
1930 if (strcmp(name, "") == 0)
1931 return -EINVAL;
1932 return security_inode_getsecurity(inode, name, buffer, size,
1933 -EOPNOTSUPP);
1934}
1935
1936static int shmem_xattr_security_set(struct inode *inode, const char *name,
1937 const void *value, size_t size, int flags)
1938{
1939 if (strcmp(name, "") == 0)
1940 return -EINVAL;
1941 return security_inode_setsecurity(inode, name, value, size, flags);
1942}
1943
1944struct xattr_handler shmem_xattr_security_handler = {
1945 .prefix = XATTR_SECURITY_PREFIX,
1946 .list = shmem_xattr_security_list,
1947 .get = shmem_xattr_security_get,
1948 .set = shmem_xattr_security_set,
1949};
1950
1951static struct xattr_handler *shmem_xattr_handlers[] = {
1952 &shmem_xattr_acl_access_handler,
1953 &shmem_xattr_acl_default_handler,
1954 &shmem_xattr_security_handler,
1955 NULL
1956};
1957#endif
1958
1900static int shmem_parse_options(char *options, int *mode, uid_t *uid, 1959static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1901 gid_t *gid, unsigned long *blocks, unsigned long *inodes, 1960 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1902 int *policy, nodemask_t *policy_nodes) 1961 int *policy, nodemask_t *policy_nodes)
@@ -2094,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb,
2094 sb->s_magic = TMPFS_MAGIC; 2153 sb->s_magic = TMPFS_MAGIC;
2095 sb->s_op = &shmem_ops; 2154 sb->s_op = &shmem_ops;
2096 sb->s_time_gran = 1; 2155 sb->s_time_gran = 1;
2156#ifdef CONFIG_TMPFS_POSIX_ACL
2157 sb->s_xattr = shmem_xattr_handlers;
2158 sb->s_flags |= MS_POSIXACL;
2159#endif
2097 2160
2098 inode = shmem_get_inode(sb, S_IFDIR | mode, 0); 2161 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2099 if (!inode) 2162 if (!inode)
@@ -2130,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode)
2130 /* only struct inode is valid if it's an inline symlink */ 2193 /* only struct inode is valid if it's an inline symlink */
2131 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2194 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2132 } 2195 }
2196 shmem_acl_destroy_inode(inode);
2133 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2197 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2134} 2198}
2135 2199
@@ -2141,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep,
2141 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 2205 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2142 SLAB_CTOR_CONSTRUCTOR) { 2206 SLAB_CTOR_CONSTRUCTOR) {
2143 inode_init_once(&p->vfs_inode); 2207 inode_init_once(&p->vfs_inode);
2208#ifdef CONFIG_TMPFS_POSIX_ACL
2209 p->i_acl = NULL;
2210 p->i_default_acl = NULL;
2211#endif
2144 } 2212 }
2145} 2213}
2146 2214
@@ -2156,8 +2224,7 @@ static int init_inodecache(void)
2156 2224
2157static void destroy_inodecache(void) 2225static void destroy_inodecache(void)
2158{ 2226{
2159 if (kmem_cache_destroy(shmem_inode_cachep)) 2227 kmem_cache_destroy(shmem_inode_cachep);
2160 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2161} 2228}
2162 2229
2163static const struct address_space_operations shmem_aops = { 2230static const struct address_space_operations shmem_aops = {
@@ -2185,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = {
2185 .truncate = shmem_truncate, 2252 .truncate = shmem_truncate,
2186 .setattr = shmem_notify_change, 2253 .setattr = shmem_notify_change,
2187 .truncate_range = shmem_truncate_range, 2254 .truncate_range = shmem_truncate_range,
2255#ifdef CONFIG_TMPFS_POSIX_ACL
2256 .setxattr = generic_setxattr,
2257 .getxattr = generic_getxattr,
2258 .listxattr = generic_listxattr,
2259 .removexattr = generic_removexattr,
2260 .permission = shmem_permission,
2261#endif
2262
2188}; 2263};
2189 2264
2190static struct inode_operations shmem_dir_inode_operations = { 2265static struct inode_operations shmem_dir_inode_operations = {
@@ -2199,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = {
2199 .mknod = shmem_mknod, 2274 .mknod = shmem_mknod,
2200 .rename = shmem_rename, 2275 .rename = shmem_rename,
2201#endif 2276#endif
2277#ifdef CONFIG_TMPFS_POSIX_ACL
2278 .setattr = shmem_notify_change,
2279 .setxattr = generic_setxattr,
2280 .getxattr = generic_getxattr,
2281 .listxattr = generic_listxattr,
2282 .removexattr = generic_removexattr,
2283 .permission = shmem_permission,
2284#endif
2285};
2286
2287static struct inode_operations shmem_special_inode_operations = {
2288#ifdef CONFIG_TMPFS_POSIX_ACL
2289 .setattr = shmem_notify_change,
2290 .setxattr = generic_setxattr,
2291 .getxattr = generic_getxattr,
2292 .listxattr = generic_listxattr,
2293 .removexattr = generic_removexattr,
2294 .permission = shmem_permission,
2295#endif
2202}; 2296};
2203 2297
2204static struct super_operations shmem_ops = { 2298static struct super_operations shmem_ops = {
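
With MS_POSIXACL set on the superblock and the generic xattr operations wired into the tmpfs inode_operations, ACLs on tmpfs files become reachable through the ordinary userspace interfaces. A small check of that path, assuming a tmpfs mount at /dev/shm, the libacl headers, and linking with -lacl (the path, uid and ACL text are only examples):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/acl.h>

int main(void)
{
    const char *path = "/dev/shm/acl-test";    /* any file on a tmpfs mount */
    acl_t acl;
    char *text;
    int fd;

    fd = open(path, O_CREAT | O_RDWR, 0600);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }
    close(fd);

    /* Grant uid 1000 read/write in addition to the owner; a mask entry
     * is required whenever named users are present. */
    acl = acl_from_text("u::rw-,g::r--,o::---,u:1000:rw-,m::rw-");
    if (!acl || acl_set_file(path, ACL_TYPE_ACCESS, acl) != 0) {
        perror("acl_set_file");
        return EXIT_FAILURE;
    }
    acl_free(acl);

    /* Read the ACL back through the same xattr machinery. */
    acl = acl_get_file(path, ACL_TYPE_ACCESS);
    if (!acl) {
        perror("acl_get_file");
        return EXIT_FAILURE;
    }
    text = acl_to_text(acl, NULL);
    printf("%s", text);
    acl_free(text);
    acl_free(acl);
    return EXIT_SUCCESS;
}

Without CONFIG_TMPFS_POSIX_ACL the acl_set_file() call is expected to fail with EOPNOTSUPP.
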
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
new file mode 100644
index 0000000000..c946bf4687
--- /dev/null
+++ b/mm/shmem_acl.c
@@ -0,0 +1,197 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = SHMEM_I(inode)->i_acl;
49 SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = SHMEM_I(inode)->i_default_acl;
54 SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Initialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
159 *
160 * This is done before destroying the actual inode.
161 */
162
163void
164shmem_acl_destroy_inode(struct inode *inode)
165{
166 if (SHMEM_I(inode)->i_acl)
167 posix_acl_release(SHMEM_I(inode)->i_acl);
168 SHMEM_I(inode)->i_acl = NULL;
169 if (SHMEM_I(inode)->i_default_acl)
170 posix_acl_release(SHMEM_I(inode)->i_default_acl);
171 SHMEM_I(inode)->i_default_acl = NULL;
172}
173
174/**
175 * shmem_check_acl - check_acl() callback for generic_permission()
176 */
177static int
178shmem_check_acl(struct inode *inode, int mask)
179{
180 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
181
182 if (acl) {
183 int error = posix_acl_permission(inode, acl, mask);
184 posix_acl_release(acl);
185 return error;
186 }
187 return -EAGAIN;
188}
189
190/**
191 * shmem_permission - permission() inode operation
192 */
193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
195{
196 return generic_permission(inode, mask, shmem_check_acl);
197}
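
shmem_permission() plugs shmem_check_acl() into generic_permission(): when an access ACL exists it decides the request, and -EAGAIN tells the generic code to fall back to the ordinary mode bits. A simplified standalone model of that two-step decision (the mode/ACL encodings and structures are invented; the real mode-bit check also distinguishes owner, group and other classes):

#include <stdio.h>
#include <errno.h>

#define MAY_EXEC    0x1
#define MAY_WRITE   0x2
#define MAY_READ    0x4

struct toy_inode {
    int mode_bits;      /* a single rwx class is enough for this sketch */
    int has_acl;
    int acl_bits;       /* what the ACL would grant */
};

/* Stand-in for the check_acl() callback: -EAGAIN means "no ACL, use mode bits". */
static int check_acl(const struct toy_inode *inode, int mask)
{
    if (!inode->has_acl)
        return -EAGAIN;
    return (inode->acl_bits & mask) == mask ? 0 : -EACCES;
}

/* Stand-in for generic_permission(): try the ACL first, then mode bits. */
static int permission(const struct toy_inode *inode, int mask)
{
    int ret = check_acl(inode, mask);

    if (ret != -EAGAIN)
        return ret;
    return (inode->mode_bits & mask) == mask ? 0 : -EACCES;
}

int main(void)
{
    struct toy_inode plain = { MAY_READ, 0, 0 };
    struct toy_inode with_acl = { MAY_READ, 1, MAY_READ | MAY_WRITE };

    printf("plain write: %d\n", permission(&plain, MAY_WRITE));     /* -EACCES */
    printf("acl write:   %d\n", permission(&with_acl, MAY_WRITE));  /* 0 */
    return 0;
}
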
diff --git a/mm/slab.c b/mm/slab.c
index 21ba060357..3dbd6f4e74 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree); 313 struct kmem_list3 *l3, int tofree);
314static void free_block(struct kmem_cache *cachep, void **objpp, int len, 314static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node); 315 int node);
316static void enable_cpucache(struct kmem_cache *cachep); 316static int enable_cpucache(struct kmem_cache *cachep);
317static void cache_reap(void *unused); 317static void cache_reap(void *unused);
318 318
319/* 319/*
@@ -674,6 +674,8 @@ static struct kmem_cache cache_cache = {
674#endif 674#endif
675}; 675};
676 676
677#define BAD_ALIEN_MAGIC 0x01020304ul
678
677#ifdef CONFIG_LOCKDEP 679#ifdef CONFIG_LOCKDEP
678 680
679/* 681/*
@@ -682,42 +684,58 @@ static struct kmem_cache cache_cache = {
682 * The locking for this is tricky in that it nests within the locks 684 * The locking for this is tricky in that it nests within the locks
683 * of all other slabs in a few places; to deal with this special 685 * of all other slabs in a few places; to deal with this special
684 * locking we put on-slab caches into a separate lock-class. 686 * locking we put on-slab caches into a separate lock-class.
687 *
688 * We set lock class for alien array caches which are up during init.
 689 * The lock annotation will be lost if all cpus of a node go down and
 690 * then come back up during hotplug.
685 */ 691 */
686static struct lock_class_key on_slab_key; 692static struct lock_class_key on_slab_l3_key;
693static struct lock_class_key on_slab_alc_key;
694
695static inline void init_lock_keys(void)
687 696
688static inline void init_lock_keys(struct cache_sizes *s)
689{ 697{
690 int q; 698 int q;
691 699 struct cache_sizes *s = malloc_sizes;
692 for (q = 0; q < MAX_NUMNODES; q++) { 700
693 if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) 701 while (s->cs_size != ULONG_MAX) {
694 continue; 702 for_each_node(q) {
695 lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, 703 struct array_cache **alc;
696 &on_slab_key); 704 int r;
705 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
706 if (!l3 || OFF_SLAB(s->cs_cachep))
707 continue;
708 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
709 alc = l3->alien;
710 /*
711 * FIXME: This check for BAD_ALIEN_MAGIC
712 * should go away when common slab code is taught to
713 * work even without alien caches.
 714 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
 715 * for alloc_alien_cache.
716 */
717 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
718 continue;
719 for_each_node(r) {
720 if (alc[r])
721 lockdep_set_class(&alc[r]->lock,
722 &on_slab_alc_key);
723 }
724 }
725 s++;
697 } 726 }
698} 727}
699
700#else 728#else
701static inline void init_lock_keys(struct cache_sizes *s) 729static inline void init_lock_keys(void)
702{ 730{
703} 731}
704#endif 732#endif
705 733
706
707
708/* Guard access to the cache-chain. */ 734/* Guard access to the cache-chain. */
709static DEFINE_MUTEX(cache_chain_mutex); 735static DEFINE_MUTEX(cache_chain_mutex);
710static struct list_head cache_chain; 736static struct list_head cache_chain;
711 737
712/* 738/*
713 * vm_enough_memory() looks at this to determine how many slab-allocated pages
714 * are possibly freeable under pressure
715 *
716 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
717 */
718atomic_t slab_reclaim_pages;
719
720/*
721 * chicken and egg problem: delay the per-cpu array allocation 739 * chicken and egg problem: delay the per-cpu array allocation
722 * until the general caches are up. 740 * until the general caches are up.
723 */ 741 */
@@ -768,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
768 return csizep->cs_cachep; 786 return csizep->cs_cachep;
769} 787}
770 788
771struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 789static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
772{ 790{
773 return __find_general_cachep(size, gfpflags); 791 return __find_general_cachep(size, gfpflags);
774} 792}
775EXPORT_SYMBOL(kmem_find_general_cachep);
776 793
777static size_t slab_mgmt_size(size_t nr_objs, size_t align) 794static size_t slab_mgmt_size(size_t nr_objs, size_t align)
778{ 795{
@@ -955,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
955 return nr; 972 return nr;
956} 973}
957 974
958#ifdef CONFIG_NUMA 975#ifndef CONFIG_NUMA
976
977#define drain_alien_cache(cachep, alien) do { } while (0)
978#define reap_alien(cachep, l3) do { } while (0)
979
980static inline struct array_cache **alloc_alien_cache(int node, int limit)
981{
982 return (struct array_cache **)BAD_ALIEN_MAGIC;
983}
984
985static inline void free_alien_cache(struct array_cache **ac_ptr)
986{
987}
988
989static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
990{
991 return 0;
992}
993
994static inline void *alternate_node_alloc(struct kmem_cache *cachep,
995 gfp_t flags)
996{
997 return NULL;
998}
999
1000static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1001 gfp_t flags, int nodeid)
1002{
1003 return NULL;
1004}
1005
1006#else /* CONFIG_NUMA */
1007
959static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1008static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
960static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1009static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
961 1010
@@ -1084,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1084 } 1133 }
1085 return 1; 1134 return 1;
1086} 1135}
1087
1088#else
1089
1090#define drain_alien_cache(cachep, alien) do { } while (0)
1091#define reap_alien(cachep, l3) do { } while (0)
1092
1093static inline struct array_cache **alloc_alien_cache(int node, int limit)
1094{
1095 return (struct array_cache **) 0x01020304ul;
1096}
1097
1098static inline void free_alien_cache(struct array_cache **ac_ptr)
1099{
1100}
1101
1102static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1103{
1104 return 0;
1105}
1106
1107#endif 1136#endif
1108 1137
1109static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1138static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1422,7 +1451,6 @@ void __init kmem_cache_init(void)
1422 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1451 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1423 NULL, NULL); 1452 NULL, NULL);
1424 } 1453 }
1425 init_lock_keys(sizes);
1426 1454
1427 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1455 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1428 sizes->cs_size, 1456 sizes->cs_size,
@@ -1491,10 +1519,15 @@ void __init kmem_cache_init(void)
1491 struct kmem_cache *cachep; 1519 struct kmem_cache *cachep;
1492 mutex_lock(&cache_chain_mutex); 1520 mutex_lock(&cache_chain_mutex);
1493 list_for_each_entry(cachep, &cache_chain, next) 1521 list_for_each_entry(cachep, &cache_chain, next)
1494 enable_cpucache(cachep); 1522 if (enable_cpucache(cachep))
1523 BUG();
1495 mutex_unlock(&cache_chain_mutex); 1524 mutex_unlock(&cache_chain_mutex);
1496 } 1525 }
1497 1526
1527 /* Annotate slab for lockdep -- annotate the malloc caches */
1528 init_lock_keys();
1529
1530
1498 /* Done! */ 1531 /* Done! */
1499 g_cpucache_up = FULL; 1532 g_cpucache_up = FULL;
1500 1533
@@ -1543,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1543 */ 1576 */
1544 flags |= __GFP_COMP; 1577 flags |= __GFP_COMP;
1545#endif 1578#endif
1546 flags |= cachep->gfpflags; 1579
1580 /*
1581 * Under NUMA we want memory on the indicated node. We will handle
1582 * the needed fallback ourselves since we want to serve from our
1583 * per node object lists first for other nodes.
1584 */
1585 flags |= cachep->gfpflags | GFP_THISNODE;
1547 1586
1548 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1587 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1549 if (!page) 1588 if (!page)
@@ -1551,8 +1590,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1551 1590
1552 nr_pages = (1 << cachep->gfporder); 1591 nr_pages = (1 << cachep->gfporder);
1553 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1592 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1554 atomic_add(nr_pages, &slab_reclaim_pages); 1593 add_zone_page_state(page_zone(page),
1555 add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); 1594 NR_SLAB_RECLAIMABLE, nr_pages);
1595 else
1596 add_zone_page_state(page_zone(page),
1597 NR_SLAB_UNRECLAIMABLE, nr_pages);
1556 for (i = 0; i < nr_pages; i++) 1598 for (i = 0; i < nr_pages; i++)
1557 __SetPageSlab(page + i); 1599 __SetPageSlab(page + i);
1558 return page_address(page); 1600 return page_address(page);
@@ -1567,7 +1609,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1567 struct page *page = virt_to_page(addr); 1609 struct page *page = virt_to_page(addr);
1568 const unsigned long nr_freed = i; 1610 const unsigned long nr_freed = i;
1569 1611
1570 sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); 1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1613 sub_zone_page_state(page_zone(page),
1614 NR_SLAB_RECLAIMABLE, nr_freed);
1615 else
1616 sub_zone_page_state(page_zone(page),
1617 NR_SLAB_UNRECLAIMABLE, nr_freed);
1571 while (i--) { 1618 while (i--) {
1572 BUG_ON(!PageSlab(page)); 1619 BUG_ON(!PageSlab(page));
1573 __ClearPageSlab(page); 1620 __ClearPageSlab(page);
@@ -1576,8 +1623,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1576 if (current->reclaim_state) 1623 if (current->reclaim_state)
1577 current->reclaim_state->reclaimed_slab += nr_freed; 1624 current->reclaim_state->reclaimed_slab += nr_freed;
1578 free_pages((unsigned long)addr, cachep->gfporder); 1625 free_pages((unsigned long)addr, cachep->gfporder);
1579 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1580 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1581} 1626}
1582 1627
1583static void kmem_rcu_free(struct rcu_head *head) 1628static void kmem_rcu_free(struct rcu_head *head)
@@ -1638,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1638static void dump_line(char *data, int offset, int limit) 1683static void dump_line(char *data, int offset, int limit)
1639{ 1684{
1640 int i; 1685 int i;
1686 unsigned char error = 0;
1687 int bad_count = 0;
1688
1641 printk(KERN_ERR "%03x:", offset); 1689 printk(KERN_ERR "%03x:", offset);
1642 for (i = 0; i < limit; i++) 1690 for (i = 0; i < limit; i++) {
1691 if (data[offset + i] != POISON_FREE) {
1692 error = data[offset + i];
1693 bad_count++;
1694 }
1643 printk(" %02x", (unsigned char)data[offset + i]); 1695 printk(" %02x", (unsigned char)data[offset + i]);
1696 }
1644 printk("\n"); 1697 printk("\n");
1698
1699 if (bad_count == 1) {
1700 error ^= POISON_FREE;
1701 if (!(error & (error - 1))) {
1702 printk(KERN_ERR "Single bit error detected. Probably "
1703 "bad RAM.\n");
1704#ifdef CONFIG_X86
1705 printk(KERN_ERR "Run memtest86+ or a similar memory "
1706 "test tool.\n");
1707#else
1708 printk(KERN_ERR "Run a memory test tool.\n");
1709#endif
1710 }
1711 }
1645} 1712}
1646#endif 1713#endif
1647 1714
@@ -1834,6 +1901,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1834 } 1901 }
1835} 1902}
1836 1903
1904static void __kmem_cache_destroy(struct kmem_cache *cachep)
1905{
1906 int i;
1907 struct kmem_list3 *l3;
1908
1909 for_each_online_cpu(i)
1910 kfree(cachep->array[i]);
1911
1912 /* NUMA: free the list3 structures */
1913 for_each_online_node(i) {
1914 l3 = cachep->nodelists[i];
1915 if (l3) {
1916 kfree(l3->shared);
1917 free_alien_cache(l3->alien);
1918 kfree(l3);
1919 }
1920 }
1921 kmem_cache_free(&cache_cache, cachep);
1922}
1923
1924
1837/** 1925/**
1838 * calculate_slab_order - calculate size (page order) of slabs 1926 * calculate_slab_order - calculate size (page order) of slabs
1839 * @cachep: pointer to the cache that is being created 1927 * @cachep: pointer to the cache that is being created
@@ -1904,12 +1992,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1904 return left_over; 1992 return left_over;
1905} 1993}
1906 1994
1907static void setup_cpu_cache(struct kmem_cache *cachep) 1995static int setup_cpu_cache(struct kmem_cache *cachep)
1908{ 1996{
1909 if (g_cpucache_up == FULL) { 1997 if (g_cpucache_up == FULL)
1910 enable_cpucache(cachep); 1998 return enable_cpucache(cachep);
1911 return; 1999
1912 }
1913 if (g_cpucache_up == NONE) { 2000 if (g_cpucache_up == NONE) {
1914 /* 2001 /*
1915 * Note: the first kmem_cache_create must create the cache 2002 * Note: the first kmem_cache_create must create the cache
@@ -1956,6 +2043,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
1956 cpu_cache_get(cachep)->touched = 0; 2043 cpu_cache_get(cachep)->touched = 0;
1957 cachep->batchcount = 1; 2044 cachep->batchcount = 1;
1958 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2045 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2046 return 0;
1959} 2047}
1960 2048
1961/** 2049/**
@@ -2097,6 +2185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2097 } else { 2185 } else {
2098 ralign = BYTES_PER_WORD; 2186 ralign = BYTES_PER_WORD;
2099 } 2187 }
2188
2189 /*
2190 * Redzoning and user store require word alignment. Note this will be
2191 * overridden by architecture or caller mandated alignment if either
2192 * is greater than BYTES_PER_WORD.
2193 */
2194 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2195 ralign = BYTES_PER_WORD;
2196
2100 /* 2) arch mandated alignment: disables debug if necessary */ 2197 /* 2) arch mandated alignment: disables debug if necessary */
2101 if (ralign < ARCH_SLAB_MINALIGN) { 2198 if (ralign < ARCH_SLAB_MINALIGN) {
2102 ralign = ARCH_SLAB_MINALIGN; 2199 ralign = ARCH_SLAB_MINALIGN;
@@ -2110,8 +2207,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2110 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2207 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2111 } 2208 }
2112 /* 2209 /*
2113 * 4) Store it. Note that the debug code below can reduce 2210 * 4) Store it.
2114 * the alignment to BYTES_PER_WORD.
2115 */ 2211 */
2116 align = ralign; 2212 align = ralign;
2117 2213
@@ -2123,20 +2219,19 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2123#if DEBUG 2219#if DEBUG
2124 cachep->obj_size = size; 2220 cachep->obj_size = size;
2125 2221
2222 /*
2223 * Both debugging options require word-alignment which is calculated
2224 * into align above.
2225 */
2126 if (flags & SLAB_RED_ZONE) { 2226 if (flags & SLAB_RED_ZONE) {
2127 /* redzoning only works with word aligned caches */
2128 align = BYTES_PER_WORD;
2129
2130 /* add space for red zone words */ 2227 /* add space for red zone words */
2131 cachep->obj_offset += BYTES_PER_WORD; 2228 cachep->obj_offset += BYTES_PER_WORD;
2132 size += 2 * BYTES_PER_WORD; 2229 size += 2 * BYTES_PER_WORD;
2133 } 2230 }
2134 if (flags & SLAB_STORE_USER) { 2231 if (flags & SLAB_STORE_USER) {
2135 /* user store requires word alignment and 2232 /* user store requires one word storage behind the end of
2136 * one word storage behind the end of the real 2233 * the real object.
2137 * object.
2138 */ 2234 */
2139 align = BYTES_PER_WORD;
2140 size += BYTES_PER_WORD; 2235 size += BYTES_PER_WORD;
2141 } 2236 }
2142#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2237#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
@@ -2200,14 +2295,26 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2200 cachep->gfpflags |= GFP_DMA; 2295 cachep->gfpflags |= GFP_DMA;
2201 cachep->buffer_size = size; 2296 cachep->buffer_size = size;
2202 2297
2203 if (flags & CFLGS_OFF_SLAB) 2298 if (flags & CFLGS_OFF_SLAB) {
2204 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2299 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2300 /*
2301 * This is a possibility for one of the malloc_sizes caches.
2302 * But since we go off slab only for object size greater than
2303 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2304 * this should not happen at all.
2305 * But leave a BUG_ON for some lucky dude.
2306 */
2307 BUG_ON(!cachep->slabp_cache);
2308 }
2205 cachep->ctor = ctor; 2309 cachep->ctor = ctor;
2206 cachep->dtor = dtor; 2310 cachep->dtor = dtor;
2207 cachep->name = name; 2311 cachep->name = name;
2208 2312
2209 2313 if (setup_cpu_cache(cachep)) {
2210 setup_cpu_cache(cachep); 2314 __kmem_cache_destroy(cachep);
2315 cachep = NULL;
2316 goto oops;
2317 }
2211 2318
2212 /* cache setup completed, link it into the list */ 2319 /* cache setup completed, link it into the list */
2213 list_add(&cachep->next, &cache_chain); 2320 list_add(&cachep->next, &cache_chain);
@@ -2375,7 +2482,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2375 * @cachep: the cache to destroy 2482 * @cachep: the cache to destroy
2376 * 2483 *
2377 * Remove a struct kmem_cache object from the slab cache. 2484 * Remove a struct kmem_cache object from the slab cache.
2378 * Returns 0 on success.
2379 * 2485 *
2380 * It is expected this function will be called by a module when it is 2486 * It is expected this function will be called by a module when it is
2381 * unloaded. This will remove the cache completely, and avoid a duplicate 2487 * unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2387,11 +2493,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2387 * The caller must guarantee that no one will allocate memory from the cache 2493 * The caller must guarantee that no one will allocate memory from the cache
2388 * during the kmem_cache_destroy(). 2494 * during the kmem_cache_destroy().
2389 */ 2495 */
2390int kmem_cache_destroy(struct kmem_cache *cachep) 2496void kmem_cache_destroy(struct kmem_cache *cachep)
2391{ 2497{
2392 int i;
2393 struct kmem_list3 *l3;
2394
2395 BUG_ON(!cachep || in_interrupt()); 2498 BUG_ON(!cachep || in_interrupt());
2396 2499
2397 /* Don't let CPUs to come and go */ 2500 /* Don't let CPUs to come and go */
@@ -2411,31 +2514,28 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2411 list_add(&cachep->next, &cache_chain); 2514 list_add(&cachep->next, &cache_chain);
2412 mutex_unlock(&cache_chain_mutex); 2515 mutex_unlock(&cache_chain_mutex);
2413 unlock_cpu_hotplug(); 2516 unlock_cpu_hotplug();
2414 return 1; 2517 return;
2415 } 2518 }
2416 2519
2417 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2520 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2418 synchronize_rcu(); 2521 synchronize_rcu();
2419 2522
2420 for_each_online_cpu(i) 2523 __kmem_cache_destroy(cachep);
2421 kfree(cachep->array[i]);
2422
2423 /* NUMA: free the list3 structures */
2424 for_each_online_node(i) {
2425 l3 = cachep->nodelists[i];
2426 if (l3) {
2427 kfree(l3->shared);
2428 free_alien_cache(l3->alien);
2429 kfree(l3);
2430 }
2431 }
2432 kmem_cache_free(&cache_cache, cachep);
2433 unlock_cpu_hotplug(); 2524 unlock_cpu_hotplug();
2434 return 0;
2435} 2525}
2436EXPORT_SYMBOL(kmem_cache_destroy); 2526EXPORT_SYMBOL(kmem_cache_destroy);
2437 2527
2438/* Get the memory for a slab management obj. */ 2528/*
2529 * Get the memory for a slab management obj.
2530 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2531 * always come from malloc_sizes caches. The slab descriptor cannot
2532 * come from the same cache which is getting created because,
2533 * when we are searching for an appropriate cache for these
2534 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2535 * If we are creating a malloc_sizes cache here it would not be visible to
2536 * kmem_find_general_cachep till the initialization is complete.
2537 * Hence we cannot have slabp_cache same as the original cache.
2538 */
2439static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2539static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2440 int colour_off, gfp_t local_flags, 2540 int colour_off, gfp_t local_flags,
2441 int nodeid) 2541 int nodeid)
@@ -2968,14 +3068,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2968 void *objp; 3068 void *objp;
2969 struct array_cache *ac; 3069 struct array_cache *ac;
2970 3070
2971#ifdef CONFIG_NUMA
2972 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2973 objp = alternate_node_alloc(cachep, flags);
2974 if (objp != NULL)
2975 return objp;
2976 }
2977#endif
2978
2979 check_irq_off(); 3071 check_irq_off();
2980 ac = cpu_cache_get(cachep); 3072 ac = cpu_cache_get(cachep);
2981 if (likely(ac->avail)) { 3073 if (likely(ac->avail)) {
@@ -2993,12 +3085,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2993 gfp_t flags, void *caller) 3085 gfp_t flags, void *caller)
2994{ 3086{
2995 unsigned long save_flags; 3087 unsigned long save_flags;
2996 void *objp; 3088 void *objp = NULL;
2997 3089
2998 cache_alloc_debugcheck_before(cachep, flags); 3090 cache_alloc_debugcheck_before(cachep, flags);
2999 3091
3000 local_irq_save(save_flags); 3092 local_irq_save(save_flags);
3001 objp = ____cache_alloc(cachep, flags); 3093
3094 if (unlikely(NUMA_BUILD &&
3095 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3096 objp = alternate_node_alloc(cachep, flags);
3097
3098 if (!objp)
3099 objp = ____cache_alloc(cachep, flags);
3100 /*
3101 * We may just have run out of memory on the local node.
3102 * __cache_alloc_node() knows how to locate memory on other nodes
3103 */
3104 if (NUMA_BUILD && !objp)
3105 objp = __cache_alloc_node(cachep, flags, numa_node_id());
3002 local_irq_restore(save_flags); 3106 local_irq_restore(save_flags);
3003 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3107 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3004 caller); 3108 caller);
@@ -3017,7 +3121,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3017{ 3121{
3018 int nid_alloc, nid_here; 3122 int nid_alloc, nid_here;
3019 3123
3020 if (in_interrupt()) 3124 if (in_interrupt() || (flags & __GFP_THISNODE))
3021 return NULL; 3125 return NULL;
3022 nid_alloc = nid_here = numa_node_id(); 3126 nid_alloc = nid_here = numa_node_id();
3023 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3127 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3030,6 +3134,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3030} 3134}
3031 3135
3032/* 3136/*
3137 * Fallback function if there was no memory available and no objects on a
3138 * certain node and we are allowed to fall back. We mimic the behavior of
3139 * the page allocator. We fall back according to a zonelist determined by
3140 * the policy layer while obeying cpuset constraints.
3141 */
3142void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3143{
3144 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3145 ->node_zonelists[gfp_zone(flags)];
3146 struct zone **z;
3147 void *obj = NULL;
3148
3149 for (z = zonelist->zones; *z && !obj; z++)
3150 if (zone_idx(*z) <= ZONE_NORMAL &&
3151 cpuset_zone_allowed(*z, flags))
3152 obj = __cache_alloc_node(cache,
3153 flags | __GFP_THISNODE,
3154 zone_to_nid(*z));
3155 return obj;
3156}
3157
3158/*
3033 * An interface to enable slab creation on nodeid 3159 * An interface to enable slab creation on nodeid
3034 */ 3160 */
3035static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3161static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3082,11 +3208,15 @@ retry:
3082must_grow: 3208must_grow:
3083 spin_unlock(&l3->list_lock); 3209 spin_unlock(&l3->list_lock);
3084 x = cache_grow(cachep, flags, nodeid); 3210 x = cache_grow(cachep, flags, nodeid);
3211 if (x)
3212 goto retry;
3085 3213
3086 if (!x) 3214 if (!(flags & __GFP_THISNODE))
3087 return NULL; 3215 /* Unable to grow the cache. Fall back to other nodes. */
3216 return fallback_alloc(cachep, flags);
3217
3218 return NULL;
3088 3219
3089 goto retry;
3090done: 3220done:
3091 return obj; 3221 return obj;
3092} 3222}
@@ -3119,6 +3249,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3119 if (slabp->inuse == 0) { 3249 if (slabp->inuse == 0) {
3120 if (l3->free_objects > l3->free_limit) { 3250 if (l3->free_objects > l3->free_limit) {
3121 l3->free_objects -= cachep->num; 3251 l3->free_objects -= cachep->num;
3252 /* No need to drop any previously held
3253 * lock here, even if we have an off-slab slab
3254 * descriptor it is guaranteed to come from
3255 * a different cache; refer to comments before
3256 * alloc_slabmgmt.
3257 */
3122 slab_destroy(cachep, slabp); 3258 slab_destroy(cachep, slabp);
3123 } else { 3259 } else {
3124 list_add(&slabp->list, &l3->slabs_free); 3260 list_add(&slabp->list, &l3->slabs_free);
@@ -3317,7 +3453,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3317} 3453}
3318EXPORT_SYMBOL(kmem_cache_alloc_node); 3454EXPORT_SYMBOL(kmem_cache_alloc_node);
3319 3455
3320void *kmalloc_node(size_t size, gfp_t flags, int node) 3456void *__kmalloc_node(size_t size, gfp_t flags, int node)
3321{ 3457{
3322 struct kmem_cache *cachep; 3458 struct kmem_cache *cachep;
3323 3459
@@ -3326,7 +3462,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node)
3326 return NULL; 3462 return NULL;
3327 return kmem_cache_alloc_node(cachep, flags, node); 3463 return kmem_cache_alloc_node(cachep, flags, node);
3328} 3464}
3329EXPORT_SYMBOL(kmalloc_node); 3465EXPORT_SYMBOL(__kmalloc_node);
3330#endif 3466#endif
3331 3467
3332/** 3468/**
@@ -3370,55 +3506,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3370EXPORT_SYMBOL(__kmalloc_track_caller); 3506EXPORT_SYMBOL(__kmalloc_track_caller);
3371#endif 3507#endif
3372 3508
3373#ifdef CONFIG_SMP
3374/**
3375 * __alloc_percpu - allocate one copy of the object for every present
3376 * cpu in the system, zeroing them.
3377 * Objects should be dereferenced using the per_cpu_ptr macro only.
3378 *
3379 * @size: how many bytes of memory are required.
3380 */
3381void *__alloc_percpu(size_t size)
3382{
3383 int i;
3384 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3385
3386 if (!pdata)
3387 return NULL;
3388
3389 /*
3390 * Cannot use for_each_online_cpu since a cpu may come online
3391 * and we have no way of figuring out how to fix the array
3392 * that we have allocated then....
3393 */
3394 for_each_possible_cpu(i) {
3395 int node = cpu_to_node(i);
3396
3397 if (node_online(node))
3398 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3399 else
3400 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3401
3402 if (!pdata->ptrs[i])
3403 goto unwind_oom;
3404 memset(pdata->ptrs[i], 0, size);
3405 }
3406
3407 /* Catch derefs w/o wrappers */
3408 return (void *)(~(unsigned long)pdata);
3409
3410unwind_oom:
3411 while (--i >= 0) {
3412 if (!cpu_possible(i))
3413 continue;
3414 kfree(pdata->ptrs[i]);
3415 }
3416 kfree(pdata);
3417 return NULL;
3418}
3419EXPORT_SYMBOL(__alloc_percpu);
3420#endif
3421
3422/** 3509/**
3423 * kmem_cache_free - Deallocate an object 3510 * kmem_cache_free - Deallocate an object
3424 * @cachep: The cache the allocation was from. 3511 * @cachep: The cache the allocation was from.
@@ -3464,29 +3551,6 @@ void kfree(const void *objp)
3464} 3551}
3465EXPORT_SYMBOL(kfree); 3552EXPORT_SYMBOL(kfree);
3466 3553
3467#ifdef CONFIG_SMP
3468/**
3469 * free_percpu - free previously allocated percpu memory
3470 * @objp: pointer returned by alloc_percpu.
3471 *
3472 * Don't free memory not originally allocated by alloc_percpu()
3473 * The complemented objp is to check for that.
3474 */
3475void free_percpu(const void *objp)
3476{
3477 int i;
3478 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3479
3480 /*
3481 * We allocate for all cpus so we cannot use for online cpu here.
3482 */
3483 for_each_possible_cpu(i)
3484 kfree(p->ptrs[i]);
3485 kfree(p);
3486}
3487EXPORT_SYMBOL(free_percpu);
3488#endif
3489
3490unsigned int kmem_cache_size(struct kmem_cache *cachep) 3554unsigned int kmem_cache_size(struct kmem_cache *cachep)
3491{ 3555{
3492 return obj_size(cachep); 3556 return obj_size(cachep);
@@ -3603,22 +3667,26 @@ static void do_ccupdate_local(void *info)
3603static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3667static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3604 int batchcount, int shared) 3668 int batchcount, int shared)
3605{ 3669{
3606 struct ccupdate_struct new; 3670 struct ccupdate_struct *new;
3607 int i, err; 3671 int i;
3672
3673 new = kzalloc(sizeof(*new), GFP_KERNEL);
3674 if (!new)
3675 return -ENOMEM;
3608 3676
3609 memset(&new.new, 0, sizeof(new.new));
3610 for_each_online_cpu(i) { 3677 for_each_online_cpu(i) {
3611 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, 3678 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3612 batchcount); 3679 batchcount);
3613 if (!new.new[i]) { 3680 if (!new->new[i]) {
3614 for (i--; i >= 0; i--) 3681 for (i--; i >= 0; i--)
3615 kfree(new.new[i]); 3682 kfree(new->new[i]);
3683 kfree(new);
3616 return -ENOMEM; 3684 return -ENOMEM;
3617 } 3685 }
3618 } 3686 }
3619 new.cachep = cachep; 3687 new->cachep = cachep;
3620 3688
3621 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); 3689 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3622 3690
3623 check_irq_on(); 3691 check_irq_on();
3624 cachep->batchcount = batchcount; 3692 cachep->batchcount = batchcount;
@@ -3626,7 +3694,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3626 cachep->shared = shared; 3694 cachep->shared = shared;
3627 3695
3628 for_each_online_cpu(i) { 3696 for_each_online_cpu(i) {
3629 struct array_cache *ccold = new.new[i]; 3697 struct array_cache *ccold = new->new[i];
3630 if (!ccold) 3698 if (!ccold)
3631 continue; 3699 continue;
3632 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3700 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
@@ -3634,18 +3702,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3634 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3702 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3635 kfree(ccold); 3703 kfree(ccold);
3636 } 3704 }
3637 3705 kfree(new);
3638 err = alloc_kmemlist(cachep); 3706 return alloc_kmemlist(cachep);
3639 if (err) {
3640 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3641 cachep->name, -err);
3642 BUG();
3643 }
3644 return 0;
3645} 3707}
3646 3708
3647/* Called with cache_chain_mutex held always */ 3709/* Called with cache_chain_mutex held always */
3648static void enable_cpucache(struct kmem_cache *cachep) 3710static int enable_cpucache(struct kmem_cache *cachep)
3649{ 3711{
3650 int err; 3712 int err;
3651 int limit, shared; 3713 int limit, shared;
@@ -3697,6 +3759,7 @@ static void enable_cpucache(struct kmem_cache *cachep)
3697 if (err) 3759 if (err)
3698 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3760 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3699 cachep->name, -err); 3761 cachep->name, -err);
3762 return err;
3700} 3763}
3701 3764
3702/* 3765/*
@@ -4157,6 +4220,7 @@ static int leaks_show(struct seq_file *m, void *p)
4157 show_symbol(m, n[2*i+2]); 4220 show_symbol(m, n[2*i+2]);
4158 seq_putc(m, '\n'); 4221 seq_putc(m, '\n');
4159 } 4222 }
4223
4160 return 0; 4224 return 0;
4161} 4225}
4162 4226
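Two interface changes in this slab.c hunk matter to callers: kmem_cache_create() can now fail cleanly (setup_cpu_cache() returns an error and the partially built cache is torn down via __kmem_cache_destroy()), and kmem_cache_destroy() no longer returns a status. A hedged sketch of the resulting calling convention; the cache name and object type are invented for illustration:

#include <linux/slab.h>
#include <linux/errno.h>

struct foo { int a, b; };
static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, 0, NULL, NULL);
	if (!foo_cachep)		/* creation may now fail and return NULL */
		return -ENOMEM;
	return 0;
}

static void foo_cache_exit(void)
{
	/* void now: the caller must ensure no allocations are in flight */
	kmem_cache_destroy(foo_cachep);
}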
diff --git a/mm/slob.c b/mm/slob.c
index 7b52b20b96..542394184a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
270} 270}
271EXPORT_SYMBOL(kmem_cache_create); 271EXPORT_SYMBOL(kmem_cache_create);
272 272
273int kmem_cache_destroy(struct kmem_cache *c) 273void kmem_cache_destroy(struct kmem_cache *c)
274{ 274{
275 slob_free(c, sizeof(struct kmem_cache)); 275 slob_free(c, sizeof(struct kmem_cache));
276 return 0;
277} 276}
278EXPORT_SYMBOL(kmem_cache_destroy); 277EXPORT_SYMBOL(kmem_cache_destroy);
279 278
@@ -339,52 +338,3 @@ void kmem_cache_init(void)
339 338
340 mod_timer(&slob_timer, jiffies + HZ); 339 mod_timer(&slob_timer, jiffies + HZ);
341} 340}
342
343atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
344EXPORT_SYMBOL(slab_reclaim_pages);
345
346#ifdef CONFIG_SMP
347
348void *__alloc_percpu(size_t size)
349{
350 int i;
351 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
352
353 if (!pdata)
354 return NULL;
355
356 for_each_possible_cpu(i) {
357 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
358 if (!pdata->ptrs[i])
359 goto unwind_oom;
360 memset(pdata->ptrs[i], 0, size);
361 }
362
363 /* Catch derefs w/o wrappers */
364 return (void *) (~(unsigned long) pdata);
365
366unwind_oom:
367 while (--i >= 0) {
368 if (!cpu_possible(i))
369 continue;
370 kfree(pdata->ptrs[i]);
371 }
372 kfree(pdata);
373 return NULL;
374}
375EXPORT_SYMBOL(__alloc_percpu);
376
377void
378free_percpu(const void *objp)
379{
380 int i;
381 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
382
383 for_each_possible_cpu(i)
384 kfree(p->ptrs[i]);
385
386 kfree(p);
387}
388EXPORT_SYMBOL(free_percpu);
389
390#endif
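The __alloc_percpu()/free_percpu() copies deleted from slob here (and from slab.c above) are consolidated into the generic mm/allocpercpu.c implementation introduced alongside this change, so both allocators share one version. From a caller's point of view the per-CPU API is unchanged; a brief usage sketch with an invented stats structure:

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/errno.h>

struct my_stats { unsigned long hits; };
static struct my_stats *stats;

static int stats_init(void)
{
	stats = alloc_percpu(struct my_stats);	/* one zeroed copy per possible CPU */
	return stats ? 0 : -ENOMEM;
}

static void stats_bump(void)
{
	per_cpu_ptr(stats, get_cpu())->hits++;	/* pin to the current CPU */
	put_cpu();
}

static void stats_exit(void)
{
	free_percpu(stats);
}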
diff --git a/mm/swap.c b/mm/swap.c
index 687686a61f..2e0e871f54 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,25 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37/*
38 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking.
40 */
41static void fastcall __page_cache_release(struct page *page)
42{
43 if (PageLRU(page)) {
44 unsigned long flags;
45 struct zone *zone = page_zone(page);
46
47 spin_lock_irqsave(&zone->lru_lock, flags);
48 VM_BUG_ON(!PageLRU(page));
49 __ClearPageLRU(page);
50 del_page_from_lru(zone, page);
51 spin_unlock_irqrestore(&zone->lru_lock, flags);
52 }
53 free_hot_page(page);
54}
55
37static void put_compound_page(struct page *page) 56static void put_compound_page(struct page *page)
38{ 57{
39 page = (struct page *)page_private(page); 58 page = (struct page *)page_private(page);
@@ -223,26 +242,6 @@ int lru_add_drain_all(void)
223#endif 242#endif
224 243
225/* 244/*
226 * This path almost never happens for VM activity - pages are normally
227 * freed via pagevecs. But it gets used by networking.
228 */
229void fastcall __page_cache_release(struct page *page)
230{
231 if (PageLRU(page)) {
232 unsigned long flags;
233 struct zone *zone = page_zone(page);
234
235 spin_lock_irqsave(&zone->lru_lock, flags);
236 BUG_ON(!PageLRU(page));
237 __ClearPageLRU(page);
238 del_page_from_lru(zone, page);
239 spin_unlock_irqrestore(&zone->lru_lock, flags);
240 }
241 free_hot_page(page);
242}
243EXPORT_SYMBOL(__page_cache_release);
244
245/*
246 * Batched page_cache_release(). Decrement the reference count on all the 245 * Batched page_cache_release(). Decrement the reference count on all the
247 * passed pages. If it fell to zero then remove the page from the LRU and 246 * passed pages. If it fell to zero then remove the page from the LRU and
248 * free it. 247 * free it.
@@ -284,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold)
284 zone = pagezone; 283 zone = pagezone;
285 spin_lock_irq(&zone->lru_lock); 284 spin_lock_irq(&zone->lru_lock);
286 } 285 }
287 BUG_ON(!PageLRU(page)); 286 VM_BUG_ON(!PageLRU(page));
288 __ClearPageLRU(page); 287 __ClearPageLRU(page);
289 del_page_from_lru(zone, page); 288 del_page_from_lru(zone, page);
290 } 289 }
@@ -337,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
337 for (i = 0; i < pagevec_count(pvec); i++) { 336 for (i = 0; i < pagevec_count(pvec); i++) {
338 struct page *page = pvec->pages[i]; 337 struct page *page = pvec->pages[i];
339 338
340 BUG_ON(PageLRU(page)); 339 VM_BUG_ON(PageLRU(page));
341 if (put_page_testzero(page)) 340 if (put_page_testzero(page))
342 pagevec_add(&pages_to_free, page); 341 pagevec_add(&pages_to_free, page);
343 } 342 }
@@ -364,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
364 zone = pagezone; 363 zone = pagezone;
365 spin_lock_irq(&zone->lru_lock); 364 spin_lock_irq(&zone->lru_lock);
366 } 365 }
367 BUG_ON(PageLRU(page)); 366 VM_BUG_ON(PageLRU(page));
368 SetPageLRU(page); 367 SetPageLRU(page);
369 add_page_to_inactive_list(zone, page); 368 add_page_to_inactive_list(zone, page);
370 } 369 }
@@ -391,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
391 zone = pagezone; 390 zone = pagezone;
392 spin_lock_irq(&zone->lru_lock); 391 spin_lock_irq(&zone->lru_lock);
393 } 392 }
394 BUG_ON(PageLRU(page)); 393 VM_BUG_ON(PageLRU(page));
395 SetPageLRU(page); 394 SetPageLRU(page);
396 BUG_ON(PageActive(page)); 395 VM_BUG_ON(PageActive(page));
397 SetPageActive(page); 396 SetPageActive(page);
398 add_page_to_active_list(zone, page); 397 add_page_to_active_list(zone, page);
399 } 398 }
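The swap.c hunk folds __page_cache_release() back into this file as a static helper and downgrades the LRU sanity checks from BUG_ON to VM_BUG_ON, so they cost nothing unless CONFIG_DEBUG_VM is enabled. Roughly, the macro amounts to the following sketch (the real definition lives in include/linux/mm.h):

/* Debug-only assertion for hot MM paths; compiled out in production builds. */
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)	BUG_ON(cond)
#else
#define VM_BUG_ON(cond)	do { } while (0)
#endif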
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1f5ec7837..a15def63f2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type)
1723 */ 1723 */
1724int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1724int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1725{ 1725{
1726 int ret = 0, i = 1 << page_cluster; 1726 int our_page_cluster = page_cluster;
1727 int ret = 0, i = 1 << our_page_cluster;
1727 unsigned long toff; 1728 unsigned long toff;
1728 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1729 struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
1729 1730
1730 if (!page_cluster) /* no readahead */ 1731 if (!our_page_cluster) /* no readahead */
1731 return 0; 1732 return 0;
1732 toff = (swp_offset(entry) >> page_cluster) << page_cluster; 1733 toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
1733 if (!toff) /* first page is swap header */ 1734 if (!toff) /* first page is swap header */
1734 toff++, i--; 1735 toff++, i--;
1735 *offset = toff; 1736 *offset = toff;
diff --git a/mm/truncate.c b/mm/truncate.c
index c6ab55ec68..f4edbc179d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/pagevec.h> 15#include <linux/pagevec.h>
@@ -16,6 +17,32 @@
16 do_invalidatepage */ 17 do_invalidatepage */
17 18
18 19
20/**
21 * do_invalidatepage - invalidate part or all of a page
22 * @page: the page which is affected
23 * @offset: the index of the truncation point
24 *
25 * do_invalidatepage() is called when all or part of the page has become
26 * invalidated by a truncate operation.
27 *
28 * do_invalidatepage() does not have to release all buffers, but it must
29 * ensure that no dirty buffer is left outside @offset and that no I/O
30 * is underway against any of the blocks which are outside the truncation
31 * point. Because the caller is about to free (and possibly reuse) those
32 * blocks on-disk.
33 */
34void do_invalidatepage(struct page *page, unsigned long offset)
35{
36 void (*invalidatepage)(struct page *, unsigned long);
37 invalidatepage = page->mapping->a_ops->invalidatepage;
38#ifdef CONFIG_BLOCK
39 if (!invalidatepage)
40 invalidatepage = block_invalidatepage;
41#endif
42 if (invalidatepage)
43 (*invalidatepage)(page, offset);
44}
45
19static inline void truncate_partial_page(struct page *page, unsigned partial) 46static inline void truncate_partial_page(struct page *page, unsigned partial)
20{ 47{
21 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 48 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
@@ -52,36 +79,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
52/* 79/*
53 * This is for invalidate_inode_pages(). That function can be called at 80 * This is for invalidate_inode_pages(). That function can be called at
54 * any time, and is not supposed to throw away dirty pages. But pages can 81 * any time, and is not supposed to throw away dirty pages. But pages can
55 * be marked dirty at any time too. So we re-check the dirtiness inside 82 * be marked dirty at any time too, so use remove_mapping which safely
56 * ->tree_lock. That provides exclusion against the __set_page_dirty 83 * discards clean, unused pages.
57 * functions.
58 * 84 *
59 * Returns non-zero if the page was successfully invalidated. 85 * Returns non-zero if the page was successfully invalidated.
60 */ 86 */
61static int 87static int
62invalidate_complete_page(struct address_space *mapping, struct page *page) 88invalidate_complete_page(struct address_space *mapping, struct page *page)
63{ 89{
90 int ret;
91
64 if (page->mapping != mapping) 92 if (page->mapping != mapping)
65 return 0; 93 return 0;
66 94
67 if (PagePrivate(page) && !try_to_release_page(page, 0)) 95 if (PagePrivate(page) && !try_to_release_page(page, 0))
68 return 0; 96 return 0;
69 97
70 write_lock_irq(&mapping->tree_lock); 98 ret = remove_mapping(mapping, page);
71 if (PageDirty(page))
72 goto failed;
73 if (page_count(page) != 2) /* caller's ref + pagecache ref */
74 goto failed;
75
76 BUG_ON(PagePrivate(page));
77 __remove_from_page_cache(page);
78 write_unlock_irq(&mapping->tree_lock);
79 ClearPageUptodate(page); 99 ClearPageUptodate(page);
80 page_cache_release(page); /* pagecache ref */ 100
81 return 1; 101 return ret;
82failed:
83 write_unlock_irq(&mapping->tree_lock);
84 return 0;
85} 102}
86 103
87/** 104/**
@@ -270,9 +287,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping)
270{ 287{
271 return invalidate_mapping_pages(mapping, 0, ~0UL); 288 return invalidate_mapping_pages(mapping, 0, ~0UL);
272} 289}
273
274EXPORT_SYMBOL(invalidate_inode_pages); 290EXPORT_SYMBOL(invalidate_inode_pages);
275 291
292/*
293 * This is like invalidate_complete_page(), except it ignores the page's
294 * refcount. We do this because invalidate_inode_pages2() needs stronger
295 * invalidation guarantees, and cannot afford to leave pages behind because
296 * shrink_list() has a temp ref on them, or because they're transiently sitting
297 * in the lru_cache_add() pagevecs.
298 */
299static int
300invalidate_complete_page2(struct address_space *mapping, struct page *page)
301{
302 if (page->mapping != mapping)
303 return 0;
304
305 if (PagePrivate(page) && !try_to_release_page(page, 0))
306 return 0;
307
308 write_lock_irq(&mapping->tree_lock);
309 if (PageDirty(page))
310 goto failed;
311
312 BUG_ON(PagePrivate(page));
313 __remove_from_page_cache(page);
314 write_unlock_irq(&mapping->tree_lock);
315 ClearPageUptodate(page);
316 page_cache_release(page); /* pagecache ref */
317 return 1;
318failed:
319 write_unlock_irq(&mapping->tree_lock);
320 return 0;
321}
322
276/** 323/**
277 * invalidate_inode_pages2_range - remove range of pages from an address_space 324 * invalidate_inode_pages2_range - remove range of pages from an address_space
278 * @mapping: the address_space 325 * @mapping: the address_space
@@ -339,7 +386,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
339 } 386 }
340 } 387 }
341 was_dirty = test_clear_page_dirty(page); 388 was_dirty = test_clear_page_dirty(page);
342 if (!invalidate_complete_page(mapping, page)) { 389 if (!invalidate_complete_page2(mapping, page)) {
343 if (was_dirty) 390 if (was_dirty)
344 set_page_dirty(page); 391 set_page_dirty(page);
345 ret = -EIO; 392 ret = -EIO;
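With do_invalidatepage() now living in truncate.c, a filesystem that keeps private per-page state supplies its own ->invalidatepage in its address_space_operations; only when none is set (and the kernel has CONFIG_BLOCK) does block_invalidatepage() run. A minimal sketch of such a hook, with myfs_* as placeholder names rather than anything from this patch:

#include <linux/fs.h>
#include <linux/mm.h>

static void myfs_invalidatepage(struct page *page, unsigned long offset)
{
	/*
	 * Drop or trim any private state attached to the page beyond
	 * 'offset'; the caller is about to free those on-disk blocks.
	 */
}

static struct address_space_operations myfs_aops = {
	.invalidatepage	= myfs_invalidatepage,
	/* .readpage, .writepage, ... elided */
};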
diff --git a/mm/util.c b/mm/util.c
index 7368479220..e14fa84ef3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -40,6 +40,24 @@ char *kstrdup(const char *s, gfp_t gfp)
40} 40}
41EXPORT_SYMBOL(kstrdup); 41EXPORT_SYMBOL(kstrdup);
42 42
43/**
44 * kmemdup - duplicate region of memory
45 *
46 * @src: memory region to duplicate
47 * @len: memory region length
48 * @gfp: GFP mask to use
49 */
50void *kmemdup(const void *src, size_t len, gfp_t gfp)
51{
52 void *p;
53
54 p = ____kmalloc(len, gfp);
55 if (p)
56 memcpy(p, src, len);
57 return p;
58}
59EXPORT_SYMBOL(kmemdup);
60
43/* 61/*
44 * strndup_user - duplicate an existing string from user space 62 * strndup_user - duplicate an existing string from user space
45 * 63 *
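kmemdup(), added to util.c above, mirrors kstrdup() for arbitrary buffers: it allocates len bytes under the given GFP mask, copies src into them, and returns NULL on allocation failure; the caller releases the copy with kfree(). A short usage sketch (the blob structure is invented):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

struct blob { size_t len; void *data; };

static int blob_copy(struct blob *dst, const void *src, size_t len)
{
	dst->data = kmemdup(src, len, GFP_KERNEL);
	if (!dst->data)
		return -ENOMEM;
	dst->len = len;
	return 0;		/* dst->data is later released with kfree() */
}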
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 266162d2ba..1ac191ce56 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,9 @@
24DEFINE_RWLOCK(vmlist_lock); 24DEFINE_RWLOCK(vmlist_lock);
25struct vm_struct *vmlist; 25struct vm_struct *vmlist;
26 26
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28 int node);
29
27static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
28{ 31{
29 pte_t *pte; 32 pte_t *pte;
@@ -238,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
238 241
239/** 242/**
240 * get_vm_area - reserve a contiguous kernel virtual area 243 * get_vm_area - reserve a contiguous kernel virtual area
241 *
242 * @size: size of the area 244 * @size: size of the area
243 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 245 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
244 * 246 *
@@ -270,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr)
270} 272}
271 273
272/* Caller must hold vmlist_lock */ 274/* Caller must hold vmlist_lock */
273struct vm_struct *__remove_vm_area(void *addr) 275static struct vm_struct *__remove_vm_area(void *addr)
274{ 276{
275 struct vm_struct **p, *tmp; 277 struct vm_struct **p, *tmp;
276 278
@@ -293,7 +295,6 @@ found:
293 295
294/** 296/**
295 * remove_vm_area - find and remove a contiguous kernel virtual area 297 * remove_vm_area - find and remove a contiguous kernel virtual area
296 *
297 * @addr: base address 298 * @addr: base address
298 * 299 *
299 * Search for the kernel VM area starting at @addr, and remove it. 300 * Search for the kernel VM area starting at @addr, and remove it.
@@ -352,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages)
352 353
353/** 354/**
354 * vfree - release memory allocated by vmalloc() 355 * vfree - release memory allocated by vmalloc()
355 *
356 * @addr: memory base address 356 * @addr: memory base address
357 * 357 *
358 * Free the virtually contiguous memory area starting at @addr, as 358 * Free the virtually contiguous memory area starting at @addr, as
@@ -370,7 +370,6 @@ EXPORT_SYMBOL(vfree);
370 370
371/** 371/**
372 * vunmap - release virtual mapping obtained by vmap() 372 * vunmap - release virtual mapping obtained by vmap()
373 *
374 * @addr: memory base address 373 * @addr: memory base address
375 * 374 *
376 * Free the virtually contiguous memory area starting at @addr, 375 * Free the virtually contiguous memory area starting at @addr,
@@ -387,7 +386,6 @@ EXPORT_SYMBOL(vunmap);
387 386
388/** 387/**
389 * vmap - map an array of pages into virtually contiguous space 388 * vmap - map an array of pages into virtually contiguous space
390 *
391 * @pages: array of page pointers 389 * @pages: array of page pointers
392 * @count: number of pages to map 390 * @count: number of pages to map
393 * @flags: vm_area->flags 391 * @flags: vm_area->flags
@@ -468,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
468 466
469/** 467/**
470 * __vmalloc_node - allocate virtually contiguous memory 468 * __vmalloc_node - allocate virtually contiguous memory
471 *
472 * @size: allocation size 469 * @size: allocation size
473 * @gfp_mask: flags for the page level allocator 470 * @gfp_mask: flags for the page level allocator
474 * @prot: protection mask for the allocated pages 471 * @prot: protection mask for the allocated pages
@@ -478,8 +475,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
478 * allocator with @gfp_mask flags. Map them into contiguous 475 * allocator with @gfp_mask flags. Map them into contiguous
479 * kernel virtual space, using a pagetable protection of @prot. 476 * kernel virtual space, using a pagetable protection of @prot.
480 */ 477 */
481void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 478static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
482 int node) 479 int node)
483{ 480{
484 struct vm_struct *area; 481 struct vm_struct *area;
485 482
@@ -493,7 +490,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
493 490
494 return __vmalloc_area_node(area, gfp_mask, prot, node); 491 return __vmalloc_area_node(area, gfp_mask, prot, node);
495} 492}
496EXPORT_SYMBOL(__vmalloc_node);
497 493
498void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 494void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
499{ 495{
@@ -503,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc);
503 499
504/** 500/**
505 * vmalloc - allocate virtually contiguous memory 501 * vmalloc - allocate virtually contiguous memory
506 *
507 * @size: allocation size 502 * @size: allocation size
508 *
509 * Allocate enough pages to cover @size from the page level 503 * Allocate enough pages to cover @size from the page level
510 * allocator and map them into contiguous kernel virtual space. 504 * allocator and map them into contiguous kernel virtual space.
511 * 505 *
@@ -519,11 +513,11 @@ void *vmalloc(unsigned long size)
519EXPORT_SYMBOL(vmalloc); 513EXPORT_SYMBOL(vmalloc);
520 514
521/** 515/**
522 * vmalloc_user - allocate virtually contiguous memory which has 516 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
523 * been zeroed so it can be mapped to userspace without 517 * @size: allocation size
524 * leaking data.
525 * 518 *
526 * @size: allocation size 519 * The resulting memory area is zeroed so it can be mapped to userspace
520 * without leaking data.
527 */ 521 */
528void *vmalloc_user(unsigned long size) 522void *vmalloc_user(unsigned long size)
529{ 523{
@@ -542,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user);
542 536
543/** 537/**
544 * vmalloc_node - allocate memory on a specific node 538 * vmalloc_node - allocate memory on a specific node
545 *
546 * @size: allocation size 539 * @size: allocation size
547 * @node: numa node 540 * @node: numa node
548 * 541 *
@@ -564,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node);
564 557
565/** 558/**
566 * vmalloc_exec - allocate virtually contiguous, executable memory 559 * vmalloc_exec - allocate virtually contiguous, executable memory
567 *
568 * @size: allocation size 560 * @size: allocation size
569 * 561 *
570 * Kernel-internal function to allocate enough pages to cover @size 562 * Kernel-internal function to allocate enough pages to cover @size
@@ -582,7 +574,6 @@ void *vmalloc_exec(unsigned long size)
582 574
583/** 575/**
584 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 576 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
585 *
586 * @size: allocation size 577 * @size: allocation size
587 * 578 *
588 * Allocate enough 32bit PA addressable pages to cover @size from the 579 * Allocate enough 32bit PA addressable pages to cover @size from the
@@ -595,11 +586,11 @@ void *vmalloc_32(unsigned long size)
595EXPORT_SYMBOL(vmalloc_32); 586EXPORT_SYMBOL(vmalloc_32);
596 587
597/** 588/**
598 * vmalloc_32_user - allocate virtually contiguous memory (32bit 589 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
599 * addressable) which is zeroed so it can be
600 * mapped to userspace without leaking data.
601 *
602 * @size: allocation size 590 * @size: allocation size
591 *
592 * The resulting memory area is 32bit addressable and zeroed so it can be
593 * mapped to userspace without leaking data.
603 */ 594 */
604void *vmalloc_32_user(unsigned long size) 595void *vmalloc_32_user(unsigned long size)
605{ 596{
@@ -693,7 +684,6 @@ finished:
693 684
694/** 685/**
695 * remap_vmalloc_range - map vmalloc pages to userspace 686 * remap_vmalloc_range - map vmalloc pages to userspace
696 *
697 * @vma: vma to cover (map full range of vma) 687 * @vma: vma to cover (map full range of vma)
698 * @addr: vmalloc memory 688 * @addr: vmalloc memory
699 * @pgoff: number of pages into addr before first page to map 689 * @pgoff: number of pages into addr before first page to map
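The vmalloc.c cleanups also rewrite the vmalloc_user()/vmalloc_32_user() kerneldoc: these variants return zeroed memory precisely so it can be exposed to userspace, typically by pairing them with remap_vmalloc_range() (documented just above) in an mmap handler. A sketch of that pairing, with mydev_* as invented names:

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/errno.h>

static void *mydev_buf;			/* shared buffer, allocated once */

static int mydev_alloc_buf(unsigned long size)
{
	mydev_buf = vmalloc_user(size);	/* zeroed, safe to hand to userspace */
	return mydev_buf ? 0 : -ENOMEM;
}

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Map the whole buffer starting at page offset 0 of the vma. */
	return remap_vmalloc_range(vma, mydev_buf, 0);
}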
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d0225..eca70310ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmstat.h>
22#include <linux/file.h> 23#include <linux/file.h>
23#include <linux/writeback.h> 24#include <linux/writeback.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
@@ -62,6 +63,8 @@ struct scan_control {
62 int swap_cluster_max; 63 int swap_cluster_max;
63 64
64 int swappiness; 65 int swappiness;
66
67 int all_unreclaimable;
65}; 68};
66 69
67/* 70/*
@@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
368 /* synchronous write or broken a_ops? */ 371 /* synchronous write or broken a_ops? */
369 ClearPageReclaim(page); 372 ClearPageReclaim(page);
370 } 373 }
371 374 inc_zone_page_state(page, NR_VMSCAN_WRITE);
372 return PAGE_SUCCESS; 375 return PAGE_SUCCESS;
373 } 376 }
374 377
@@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
377 380
378int remove_mapping(struct address_space *mapping, struct page *page) 381int remove_mapping(struct address_space *mapping, struct page *page)
379{ 382{
380 if (!mapping) 383 BUG_ON(!PageLocked(page));
381 return 0; /* truncate got there first */ 384 BUG_ON(mapping != page_mapping(page));
382 385
383 write_lock_irq(&mapping->tree_lock); 386 write_lock_irq(&mapping->tree_lock);
384
385 /* 387 /*
386 * The non-racy check for busy page. It is critical to check 388 * The non racy check for a busy page.
387 * PageDirty _after_ making sure that the page is freeable and 389 *
388 * not in use by anybody. (pagecache + us == 2) 390 * Must be careful with the order of the tests. When someone has
391 * a ref to the page, it may be possible that they dirty it then
392 * drop the reference. So if PageDirty is tested before page_count
393 * here, then the following race may occur:
394 *
395 * get_user_pages(&page);
396 * [user mapping goes away]
397 * write_to(page);
398 * !PageDirty(page) [good]
399 * SetPageDirty(page);
400 * put_page(page);
401 * !page_count(page) [good, discard it]
402 *
403 * [oops, our write_to data is lost]
404 *
405 * Reversing the order of the tests ensures such a situation cannot
406 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
407 * load is not satisfied before that of page->_count.
408 *
409 * Note that if SetPageDirty is always performed via set_page_dirty,
410 * and thus under tree_lock, then this ordering is not required.
389 */ 411 */
390 if (unlikely(page_count(page) != 2)) 412 if (unlikely(page_count(page) != 2))
391 goto cannot_free; 413 goto cannot_free;
@@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
440 if (TestSetPageLocked(page)) 462 if (TestSetPageLocked(page))
441 goto keep; 463 goto keep;
442 464
443 BUG_ON(PageActive(page)); 465 VM_BUG_ON(PageActive(page));
444 466
445 sc->nr_scanned++; 467 sc->nr_scanned++;
446 468
@@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
547 goto free_it; 569 goto free_it;
548 } 570 }
549 571
550 if (!remove_mapping(mapping, page)) 572 if (!mapping || !remove_mapping(mapping, page))
551 goto keep_locked; 573 goto keep_locked;
552 574
553free_it: 575free_it:
@@ -564,7 +586,7 @@ keep_locked:
564 unlock_page(page); 586 unlock_page(page);
565keep: 587keep:
566 list_add(&page->lru, &ret_pages); 588 list_add(&page->lru, &ret_pages);
567 BUG_ON(PageLRU(page)); 589 VM_BUG_ON(PageLRU(page));
568 } 590 }
569 list_splice(&ret_pages, page_list); 591 list_splice(&ret_pages, page_list);
570 if (pagevec_count(&freed_pvec)) 592 if (pagevec_count(&freed_pvec))
@@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
603 page = lru_to_page(src); 625 page = lru_to_page(src);
604 prefetchw_prev_lru_page(page, src, flags); 626 prefetchw_prev_lru_page(page, src, flags);
605 627
606 BUG_ON(!PageLRU(page)); 628 VM_BUG_ON(!PageLRU(page));
607 629
608 list_del(&page->lru); 630 list_del(&page->lru);
609 target = src; 631 target = src;
@@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
674 */ 696 */
675 while (!list_empty(&page_list)) { 697 while (!list_empty(&page_list)) {
676 page = lru_to_page(&page_list); 698 page = lru_to_page(&page_list);
677 BUG_ON(PageLRU(page)); 699 VM_BUG_ON(PageLRU(page));
678 SetPageLRU(page); 700 SetPageLRU(page);
679 list_del(&page->lru); 701 list_del(&page->lru);
680 if (PageActive(page)) 702 if (PageActive(page))
@@ -695,6 +717,11 @@ done:
695 return nr_reclaimed; 717 return nr_reclaimed;
696} 718}
697 719
720static inline int zone_is_near_oom(struct zone *zone)
721{
722 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
723}
724
698/* 725/*
699 * This moves pages from the active list to the inactive list. 726 * This moves pages from the active list to the inactive list.
700 * 727 *
@@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
730 long distress; 757 long distress;
731 long swap_tendency; 758 long swap_tendency;
732 759
760 if (zone_is_near_oom(zone))
761 goto force_reclaim_mapped;
762
733 /* 763 /*
734 * `distress' is a measure of how much trouble we're having 764 * `distress' is a measure of how much trouble we're having
735 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 765 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
@@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
765 * memory onto the inactive list. 795 * memory onto the inactive list.
766 */ 796 */
767 if (swap_tendency >= 100) 797 if (swap_tendency >= 100)
798force_reclaim_mapped:
768 reclaim_mapped = 1; 799 reclaim_mapped = 1;
769 } 800 }
770 801
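/*
 * Illustrative sketch, not part of the patch: the control flow that
 * shrink_active_list() ends up with once the force_reclaim_mapped label is
 * added. The distress/mapped_ratio/swap_tendency arithmetic is elided; only
 * the shape of the decision is shown.
 */
	if (zone_is_near_oom(zone))
		goto force_reclaim_mapped;	/* skip the heuristics entirely */

	/* ... compute distress, mapped_ratio and swap_tendency ... */

	if (swap_tendency >= 100)
force_reclaim_mapped:
		reclaim_mapped = 1;		/* both paths land here */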
@@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
797 while (!list_empty(&l_inactive)) { 828 while (!list_empty(&l_inactive)) {
798 page = lru_to_page(&l_inactive); 829 page = lru_to_page(&l_inactive);
799 prefetchw_prev_lru_page(page, &l_inactive, flags); 830 prefetchw_prev_lru_page(page, &l_inactive, flags);
800 BUG_ON(PageLRU(page)); 831 VM_BUG_ON(PageLRU(page));
801 SetPageLRU(page); 832 SetPageLRU(page);
802 BUG_ON(!PageActive(page)); 833 VM_BUG_ON(!PageActive(page));
803 ClearPageActive(page); 834 ClearPageActive(page);
804 835
805 list_move(&page->lru, &zone->inactive_list); 836 list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
827 while (!list_empty(&l_active)) { 858 while (!list_empty(&l_active)) {
828 page = lru_to_page(&l_active); 859 page = lru_to_page(&l_active);
829 prefetchw_prev_lru_page(page, &l_active, flags); 860 prefetchw_prev_lru_page(page, &l_active, flags);
830 BUG_ON(PageLRU(page)); 861 VM_BUG_ON(PageLRU(page));
831 SetPageLRU(page); 862 SetPageLRU(page);
832 BUG_ON(!PageActive(page)); 863 VM_BUG_ON(!PageActive(page));
833 list_move(&page->lru, &zone->active_list); 864 list_move(&page->lru, &zone->active_list);
834 pgmoved++; 865 pgmoved++;
835 if (!pagevec_add(&pvec, page)) { 866 if (!pagevec_add(&pvec, page)) {
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
925 unsigned long nr_reclaimed = 0; 956 unsigned long nr_reclaimed = 0;
926 int i; 957 int i;
927 958
959 sc->all_unreclaimable = 1;
928 for (i = 0; zones[i] != NULL; i++) { 960 for (i = 0; zones[i] != NULL; i++) {
929 struct zone *zone = zones[i]; 961 struct zone *zone = zones[i];
930 962
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
941 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 973 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
942 continue; /* Let kswapd poll it */ 974 continue; /* Let kswapd poll it */
943 975
976 sc->all_unreclaimable = 0;
977
944 nr_reclaimed += shrink_zone(priority, zone, sc); 978 nr_reclaimed += shrink_zone(priority, zone, sc);
945 } 979 }
946 return nr_reclaimed; 980 return nr_reclaimed;
@@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1021 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1055 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1022 blk_congestion_wait(WRITE, HZ/10); 1056 blk_congestion_wait(WRITE, HZ/10);
1023 } 1057 }
1058 /* top priority shrink_caches still had more to do? don't OOM, then */
1059 if (!sc.all_unreclaimable)
1060 ret = 1;
1024out: 1061out:
1025 for (i = 0; zones[i] != 0; i++) { 1062 for (i = 0; zones[i] != 0; i++) {
1026 struct zone *zone = zones[i]; 1063 struct zone *zone = zones[i];
@@ -1153,7 +1190,7 @@ scan:
1153 if (zone->all_unreclaimable) 1190 if (zone->all_unreclaimable)
1154 continue; 1191 continue;
1155 if (nr_slab == 0 && zone->pages_scanned >= 1192 if (nr_slab == 0 && zone->pages_scanned >=
1156 (zone->nr_active + zone->nr_inactive) * 4) 1193 (zone->nr_active + zone->nr_inactive) * 6)
1157 zone->all_unreclaimable = 1; 1194 zone->all_unreclaimable = 1;
1158 /* 1195 /*
1159 * If we've done a decent amount of scanning and 1196 * If we've done a decent amount of scanning and
@@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1361 for_each_zone(zone) 1398 for_each_zone(zone)
1362 lru_pages += zone->nr_active + zone->nr_inactive; 1399 lru_pages += zone->nr_active + zone->nr_inactive;
1363 1400
1364 nr_slab = global_page_state(NR_SLAB); 1401 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1365 /* If slab caches are huge, it's better to hit them first */ 1402 /* If slab caches are huge, it's better to hit them first */
1366 while (nr_slab >= lru_pages) { 1403 while (nr_slab >= lru_pages) {
1367 reclaim_state.reclaimed_slab = 0; 1404 reclaim_state.reclaimed_slab = 0;
@@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly;
1510#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1547#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1511#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1548#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1512#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1549#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1513#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1514 1550
1515/* 1551/*
1516 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1552 * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly;
1526int sysctl_min_unmapped_ratio = 1; 1562int sysctl_min_unmapped_ratio = 1;
1527 1563
1528/* 1564/*
1565 * If the number of slab pages in a zone grows beyond this percentage then
1566 * slab reclaim needs to occur.
1567 */
1568int sysctl_min_slab_ratio = 5;
1569
1570/*
1529 * Try to free up some pages from this zone through reclaim. 1571 * Try to free up some pages from this zone through reclaim.
1530 */ 1572 */
1531static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1573static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
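/*
 * Illustrative sketch, not part of this hunk: how the two percentages above
 * are presumably turned into the per-zone page counts
 * (zone->min_unmapped_pages, zone->min_slab_pages) that __zone_reclaim()
 * compares against. The conversion lives outside the lines shown here, so
 * treat the exact expressions as assumptions.
 */
	zone->min_unmapped_pages = (zone->present_pages *
				    sysctl_min_unmapped_ratio) / 100;
	zone->min_slab_pages = (zone->present_pages *
				sysctl_min_slab_ratio) / 100;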
@@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1544 .gfp_mask = gfp_mask, 1586 .gfp_mask = gfp_mask,
1545 .swappiness = vm_swappiness, 1587 .swappiness = vm_swappiness,
1546 }; 1588 };
1589 unsigned long slab_reclaimable;
1547 1590
1548 disable_swap_token(); 1591 disable_swap_token();
1549 cond_resched(); 1592 cond_resched();
@@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1556 reclaim_state.reclaimed_slab = 0; 1599 reclaim_state.reclaimed_slab = 0;
1557 p->reclaim_state = &reclaim_state; 1600 p->reclaim_state = &reclaim_state;
1558 1601
1559 /* 1602 if (zone_page_state(zone, NR_FILE_PAGES) -
1560 * Free memory by calling shrink zone with increasing priorities 1603 zone_page_state(zone, NR_FILE_MAPPED) >
1561 * until we have enough memory freed. 1604 zone->min_unmapped_pages) {
1562 */ 1605 /*
1563 priority = ZONE_RECLAIM_PRIORITY; 1606 * Free memory by calling shrink zone with increasing
1564 do { 1607 * priorities until we have enough memory freed.
1565 nr_reclaimed += shrink_zone(priority, zone, &sc); 1608 */
1566 priority--; 1609 priority = ZONE_RECLAIM_PRIORITY;
1567 } while (priority >= 0 && nr_reclaimed < nr_pages); 1610 do {
1611 nr_reclaimed += shrink_zone(priority, zone, &sc);
1612 priority--;
1613 } while (priority >= 0 && nr_reclaimed < nr_pages);
1614 }
1568 1615
1569 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { 1616 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1617 if (slab_reclaimable > zone->min_slab_pages) {
1570 /* 1618 /*
1571 * shrink_slab() does not currently allow us to determine how 1619 * shrink_slab() does not currently allow us to determine how
1572 * many pages were freed in this zone. So we just shake the slab 1620 * many pages were freed in this zone. So we take the current
1573 * a bit and then go off node for this particular allocation 1621 * number of slab pages and shake the slab until it is reduced
1574 * despite possibly having freed enough memory to allocate in 1622 * by the same nr_pages that we used for reclaiming unmapped
1575 * this zone. If we freed local memory then the next 1623 * pages.
1576 * allocations will be local again.
1577 * 1624 *
1578 * shrink_slab will free memory on all zones and may take 1625 * Note that shrink_slab will free memory on all zones and may
1579 * a long time. 1626 * take a long time.
1627 */
1628 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
1629 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
1630 slab_reclaimable - nr_pages)
1631 ;
1632
1633 /*
1634 * Update nr_reclaimed by the number of slab pages we
1635 * reclaimed from this zone.
1580 */ 1636 */
1581 shrink_slab(sc.nr_scanned, gfp_mask, order); 1637 nr_reclaimed += slab_reclaimable -
1638 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1582 } 1639 }
1583 1640
1584 p->reclaim_state = NULL; 1641 p->reclaim_state = NULL;
@@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1592 int node_id; 1649 int node_id;
1593 1650
1594 /* 1651 /*
1595 * Zone reclaim reclaims unmapped file backed pages. 1652 * Zone reclaim reclaims unmapped file backed pages and
1653 * slab pages if we are over the defined limits.
1596 * 1654 *
1597 * A small portion of unmapped file backed pages is needed for 1655 * A small portion of unmapped file backed pages is needed for
1598 * file I/O otherwise pages read by file I/O will be immediately 1656 * file I/O otherwise pages read by file I/O will be immediately
@@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1601 * unmapped file backed pages. 1659 * unmapped file backed pages.
1602 */ 1660 */
1603 if (zone_page_state(zone, NR_FILE_PAGES) - 1661 if (zone_page_state(zone, NR_FILE_PAGES) -
1604 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) 1662 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
1663 && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
1664 <= zone->min_slab_pages)
1605 return 0; 1665 return 0;
1606 1666
1607 /* 1667 /*
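/*
 * Illustrative restatement, not part of the patch: the bail-out test above,
 * written out without the diff interleaving. Zone reclaim only proceeds when
 * at least one of the two limits is exceeded; if both the unmapped page
 * cache and the reclaimable slab are under their floors, give up early.
 */
	if (zone_page_state(zone, NR_FILE_PAGES) -
	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages &&
	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
		return 0;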
@@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1621 * over remote processors and spread off node memory allocations 1681 * over remote processors and spread off node memory allocations
1622 * as wide as possible. 1682 * as wide as possible.
1623 */ 1683 */
1624 node_id = zone->zone_pgdat->node_id; 1684 node_id = zone_to_nid(zone);
1625 mask = node_to_cpumask(node_id); 1685 mask = node_to_cpumask(node_id);
1626 if (!cpus_empty(mask) && node_id != numa_node_id()) 1686 if (!cpus_empty(mask) && node_id != numa_node_id())
1627 return 0; 1687 return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c1b5f4106b..a2b6a9f96e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,6 +321,9 @@ void refresh_cpu_vm_stats(int cpu)
321 for_each_zone(zone) { 321 for_each_zone(zone) {
322 struct per_cpu_pageset *pcp; 322 struct per_cpu_pageset *pcp;
323 323
324 if (!populated_zone(zone))
325 continue;
326
324 pcp = zone_pcp(zone, cpu); 327 pcp = zone_pcp(zone, cpu);
325 328
326 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 329 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -368,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
368 __inc_zone_state(z, NUMA_MISS); 371 __inc_zone_state(z, NUMA_MISS);
369 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); 372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
370 } 373 }
371 if (z->zone_pgdat == NODE_DATA(numa_node_id())) 374 if (z->node == numa_node_id())
372 __inc_zone_state(z, NUMA_LOCAL); 375 __inc_zone_state(z, NUMA_LOCAL);
373 else 376 else
374 __inc_zone_state(z, NUMA_OTHER); 377 __inc_zone_state(z, NUMA_OTHER);
@@ -435,17 +438,34 @@ struct seq_operations fragmentation_op = {
435 .show = frag_show, 438 .show = frag_show,
436}; 439};
437 440
441#ifdef CONFIG_ZONE_DMA32
442#define TEXT_FOR_DMA32(xx) xx "_dma32",
443#else
444#define TEXT_FOR_DMA32(xx)
445#endif
446
447#ifdef CONFIG_HIGHMEM
448#define TEXT_FOR_HIGHMEM(xx) xx "_high",
449#else
450#define TEXT_FOR_HIGHMEM(xx)
451#endif
452
453#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
454 TEXT_FOR_HIGHMEM(xx)
455
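/*
 * Illustrative note, not part of the patch: with CONFIG_ZONE_DMA32 and
 * CONFIG_HIGHMEM both enabled, TEXTS_FOR_ZONES("pgalloc") expands to
 *
 *	"pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_high",
 *
 * i.e. exactly the hand-written per-zone name list that the hunks below
 * replace. Configurations without DMA32 or highmem simply drop the
 * corresponding entries.
 */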
438static char *vmstat_text[] = { 456static char *vmstat_text[] = {
439 /* Zoned VM counters */ 457 /* Zoned VM counters */
440 "nr_anon_pages", 458 "nr_anon_pages",
441 "nr_mapped", 459 "nr_mapped",
442 "nr_file_pages", 460 "nr_file_pages",
443 "nr_slab", 461 "nr_slab_reclaimable",
462 "nr_slab_unreclaimable",
444 "nr_page_table_pages", 463 "nr_page_table_pages",
445 "nr_dirty", 464 "nr_dirty",
446 "nr_writeback", 465 "nr_writeback",
447 "nr_unstable", 466 "nr_unstable",
448 "nr_bounce", 467 "nr_bounce",
468 "nr_vmscan_write",
449 469
450#ifdef CONFIG_NUMA 470#ifdef CONFIG_NUMA
451 "numa_hit", 471 "numa_hit",
@@ -462,10 +482,7 @@ static char *vmstat_text[] = {
462 "pswpin", 482 "pswpin",
463 "pswpout", 483 "pswpout",
464 484
465 "pgalloc_dma", 485 TEXTS_FOR_ZONES("pgalloc")
466 "pgalloc_dma32",
467 "pgalloc_normal",
468 "pgalloc_high",
469 486
470 "pgfree", 487 "pgfree",
471 "pgactivate", 488 "pgactivate",
@@ -474,25 +491,10 @@ static char *vmstat_text[] = {
474 "pgfault", 491 "pgfault",
475 "pgmajfault", 492 "pgmajfault",
476 493
477 "pgrefill_dma", 494 TEXTS_FOR_ZONES("pgrefill")
478 "pgrefill_dma32", 495 TEXTS_FOR_ZONES("pgsteal")
479 "pgrefill_normal", 496 TEXTS_FOR_ZONES("pgscan_kswapd")
480 "pgrefill_high", 497 TEXTS_FOR_ZONES("pgscan_direct")
481
482 "pgsteal_dma",
483 "pgsteal_dma32",
484 "pgsteal_normal",
485 "pgsteal_high",
486
487 "pgscan_kswapd_dma",
488 "pgscan_kswapd_dma32",
489 "pgscan_kswapd_normal",
490 "pgscan_kswapd_high",
491
492 "pgscan_direct_dma",
493 "pgscan_direct_dma32",
494 "pgscan_direct_normal",
495 "pgscan_direct_high",
496 498
497 "pginodesteal", 499 "pginodesteal",
498 "slabs_scanned", 500 "slabs_scanned",