Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              9
-rw-r--r--  mm/Makefile             5
-rw-r--r--  mm/allocpercpu.c      129
-rw-r--r--  mm/bootmem.c          206
-rw-r--r--  mm/fadvise.c           15
-rw-r--r--  mm/filemap.c          105
-rw-r--r--  mm/filemap.h           30
-rw-r--r--  mm/filemap_xip.c        2
-rw-r--r--  mm/fremap.c             4
-rw-r--r--  mm/highmem.c           19
-rw-r--r--  mm/hugetlb.c           10
-rw-r--r--  mm/internal.h           4
-rw-r--r--  mm/memory.c           221
-rw-r--r--  mm/memory_hotplug.c   164
-rw-r--r--  mm/mempolicy.c         51
-rw-r--r--  mm/mempool.c            9
-rw-r--r--  mm/migrate.c           32
-rw-r--r--  mm/mmap.c              38
-rw-r--r--  mm/mmzone.c             7
-rw-r--r--  mm/mprotect.c          51
-rw-r--r--  mm/mremap.c             2
-rw-r--r--  mm/msync.c            196
-rw-r--r--  mm/nommu.c            251
-rw-r--r--  mm/oom_kill.c         134
-rw-r--r--  mm/page-writeback.c   146
-rw-r--r--  mm/page_alloc.c      1418
-rw-r--r--  mm/page_io.c           52
-rw-r--r--  mm/pdflush.c           15
-rw-r--r--  mm/readahead.c         20
-rw-r--r--  mm/rmap.c              81
-rw-r--r--  mm/shmem.c            120
-rw-r--r--  mm/shmem_acl.c        197
-rw-r--r--  mm/slab.c             620
-rw-r--r--  mm/slob.c              53
-rw-r--r--  mm/sparse.c             3
-rw-r--r--  mm/swap.c              76
-rw-r--r--  mm/swap_state.c         8
-rw-r--r--  mm/swapfile.c          11
-rw-r--r--  mm/tiny-shmem.c         4
-rw-r--r--  mm/truncate.c          22
-rw-r--r--  mm/vmalloc.c           47
-rw-r--r--  mm/vmscan.c           250
-rw-r--r--  mm/vmstat.c           706
43 files changed, 3826 insertions(+), 1717 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 66e65ab39426..8f5b45615f7b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,7 +115,8 @@ config SPARSEMEM_EXTREME
115# eventually, we can have this option just 'select SPARSEMEM' 115# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG 116config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add" 117 bool "Allow for memory hot-add"
118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND 118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
119 depends on (IA64 || X86 || PPC64)
119 120
120comment "Memory hotplug is currently incompatible with Software Suspend" 121comment "Memory hotplug is currently incompatible with Software Suspend"
121 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND 122 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
@@ -145,3 +146,9 @@ config MIGRATION
145 while the virtual addresses are not changed. This is useful for 146 while the virtual addresses are not changed. This is useful for
146 example on NUMA systems to put pages nearer to the processors accessing 147 example on NUMA systems to put pages nearer to the processors accessing
147 the page. 148 the page.
149
150config RESOURCES_64BIT
151 bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
152 default 64BIT
153 help
154 This option allows memory and IO resources to be 64 bit.
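
The RESOURCES_64BIT option introduced above controls the width of the kernel's resource_size_t type. A rough sketch of the assumed effect (the actual typedef lives in a header outside this diff):

/* Sketch only: assumed effect of CONFIG_RESOURCES_64BIT on resource_size_t */
#ifdef CONFIG_RESOURCES_64BIT
typedef u64 resource_size_t;        /* memory and IO resources may exceed 4GB */
#else
typedef u32 resource_size_t;        /* 32 bit resources on legacy configurations */
#endif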
diff --git a/mm/Makefile b/mm/Makefile
index 0b8f73f2ed16..6200c6d6afd2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,17 +10,18 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o $(mmu-y) 13 prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
17obj-$(CONFIG_NUMA) += mempolicy.o 17obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 21obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o 22obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 23obj-$(CONFIG_SLAB) += slab.o
23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 24obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
24obj-$(CONFIG_FS_XIP) += filemap_xip.o 25obj-$(CONFIG_FS_XIP) += filemap_xip.o
25obj-$(CONFIG_MIGRATION) += migrate.o 26obj-$(CONFIG_MIGRATION) += migrate.o
26 27obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 000000000000..eaa9abeea536
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,129 @@
1/*
2 * linux/mm/allocpercpu.c
3 *
4 * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
5 */
6#include <linux/mm.h>
7#include <linux/module.h>
8
9/**
10 * percpu_depopulate - depopulate per-cpu data for given cpu
11 * @__pdata: per-cpu data to depopulate
12 * @cpu: depopulate per-cpu data for this cpu
13 *
14 * Depopulating per-cpu data for a cpu going offline would be a typical
15 * use case. You need to register a cpu hotplug handler for that purpose.
16 */
17void percpu_depopulate(void *__pdata, int cpu)
18{
19 struct percpu_data *pdata = __percpu_disguise(__pdata);
20 if (pdata->ptrs[cpu]) {
21 kfree(pdata->ptrs[cpu]);
22 pdata->ptrs[cpu] = NULL;
23 }
24}
25EXPORT_SYMBOL_GPL(percpu_depopulate);
26
27/**
28 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
29 * @__pdata: per-cpu data to depopulate
30 * @mask: depopulate per-cpu data for cpu's selected through mask bits
31 */
32void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
33{
34 int cpu;
35 for_each_cpu_mask(cpu, *mask)
36 percpu_depopulate(__pdata, cpu);
37}
38EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
39
40/**
41 * percpu_populate - populate per-cpu data for given cpu
42 * @__pdata: per-cpu data to populate further
43 * @size: size of per-cpu object
44 * @gfp: may sleep or not etc.
45 * @cpu: populate per-data for this cpu
46 *
47 * Populating per-cpu data for a cpu coming online would be a typical
48 * use case. You need to register a cpu hotplug handler for that purpose.
49 * Per-cpu object is populated with zeroed buffer.
50 */
51void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
52{
53 struct percpu_data *pdata = __percpu_disguise(__pdata);
54 int node = cpu_to_node(cpu);
55
56 BUG_ON(pdata->ptrs[cpu]);
57 if (node_online(node)) {
58 /* FIXME: kzalloc_node(size, gfp, node) */
59 pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
60 if (pdata->ptrs[cpu])
61 memset(pdata->ptrs[cpu], 0, size);
62 } else
63 pdata->ptrs[cpu] = kzalloc(size, gfp);
64 return pdata->ptrs[cpu];
65}
66EXPORT_SYMBOL_GPL(percpu_populate);
67
68/**
69 * percpu_populate_mask - populate per-cpu data for more cpu's
70 * @__pdata: per-cpu data to populate further
71 * @size: size of per-cpu object
72 * @gfp: may sleep or not etc.
73 * @mask: populate per-cpu data for cpu's selected through mask bits
74 *
75 * Per-cpu objects are populated with zeroed buffers.
76 */
77int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
78 cpumask_t *mask)
79{
80 cpumask_t populated = CPU_MASK_NONE;
81 int cpu;
82
83 for_each_cpu_mask(cpu, *mask)
84 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
85 __percpu_depopulate_mask(__pdata, &populated);
86 return -ENOMEM;
87 } else
88 cpu_set(cpu, populated);
89 return 0;
90}
91EXPORT_SYMBOL_GPL(__percpu_populate_mask);
92
93/**
94 * percpu_alloc_mask - initial setup of per-cpu data
95 * @size: size of per-cpu object
96 * @gfp: may sleep or not etc.
97 * @mask: populate per-data for cpu's selected through mask bits
98 *
99 * Populating per-cpu data for all online cpu's would be a typical use case,
100 * which is simplified by the percpu_alloc() wrapper.
101 * Per-cpu objects are populated with zeroed buffers.
102 */
103void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
104{
105 void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
106 void *__pdata = __percpu_disguise(pdata);
107
108 if (unlikely(!pdata))
109 return NULL;
110 if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
111 return __pdata;
112 kfree(pdata);
113 return NULL;
114}
115EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
116
117/**
118 * percpu_free - final cleanup of per-cpu data
119 * @__pdata: object to clean up
120 *
121 * We simply clean up any per-cpu object left. No need for the client to
122 * track and specify through a bis mask which per-cpu objects are to free.
123 */
124void percpu_free(void *__pdata)
125{
126 __percpu_depopulate_mask(__pdata, &cpu_possible_map);
127 kfree(__percpu_disguise(__pdata));
128}
129EXPORT_SYMBOL_GPL(percpu_free);
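
The allocpercpu.c interface above is designed to pair with CPU hotplug callbacks: allocate once, then populate and depopulate buffers as CPUs come and go. A minimal usage sketch follows; it is hypothetical driver code, not part of this commit (struct my_stats and the my_* functions are invented, and individual elements would be reached with per_cpu_ptr()):

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

struct my_stats {
        unsigned long events;
};

static struct my_stats *my_stats;       /* disguised pointer from the allocator */

static int my_stats_init(void)
{
        /* Allocate and populate zeroed buffers for the currently online CPUs. */
        my_stats = __percpu_alloc_mask(sizeof(*my_stats), GFP_KERNEL,
                                       &cpu_online_map);
        return my_stats ? 0 : -ENOMEM;
}

static void my_cpu_came_online(int cpu)
{
        /* A CPU_UP_PREPARE hotplug handler would populate the newcomer. */
        if (!percpu_populate(my_stats, sizeof(*my_stats), GFP_KERNEL, cpu))
                printk(KERN_WARNING "my_stats: no memory for cpu %d\n", cpu);
}

static void my_cpu_went_offline(int cpu)
{
        /* ...and a CPU_DEAD handler releases that CPU's buffer again. */
        percpu_depopulate(my_stats, cpu);
}

static void my_stats_exit(void)
{
        /* percpu_free() drops any remaining per-cpu buffers plus the descriptor. */
        percpu_free(my_stats);
}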
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d213feded10d..d53112fcb404 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -8,17 +8,15 @@
8 * free memory collector. It's used to deal with reserved 8 * free memory collector. It's used to deal with reserved
9 * system memory and memory holes as well. 9 * system memory and memory holes as well.
10 */ 10 */
11
12#include <linux/mm.h>
13#include <linux/kernel_stat.h>
14#include <linux/swap.h>
15#include <linux/interrupt.h>
16#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h>
17#include <linux/bootmem.h> 13#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/module.h> 14#include <linux/module.h>
20#include <asm/dma.h> 15
16#include <asm/bug.h>
21#include <asm/io.h> 17#include <asm/io.h>
18#include <asm/processor.h>
19
22#include "internal.h" 20#include "internal.h"
23 21
24/* 22/*
@@ -29,9 +27,7 @@ unsigned long max_low_pfn;
29unsigned long min_low_pfn; 27unsigned long min_low_pfn;
30unsigned long max_pfn; 28unsigned long max_pfn;
31 29
32EXPORT_SYMBOL(max_pfn); /* This is exported so 30EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
33 * dma_get_required_mask(), which uses
34 * it, can be an inline function */
35 31
36static LIST_HEAD(bdata_list); 32static LIST_HEAD(bdata_list);
37#ifdef CONFIG_CRASH_DUMP 33#ifdef CONFIG_CRASH_DUMP
@@ -43,7 +39,7 @@ unsigned long saved_max_pfn;
43#endif 39#endif
44 40
45/* return the number of _pages_ that will be allocated for the boot bitmap */ 41/* return the number of _pages_ that will be allocated for the boot bitmap */
46unsigned long __init bootmem_bootmap_pages (unsigned long pages) 42unsigned long __init bootmem_bootmap_pages(unsigned long pages)
47{ 43{
48 unsigned long mapsize; 44 unsigned long mapsize;
49 45
@@ -53,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
53 49
54 return mapsize; 50 return mapsize;
55} 51}
52
56/* 53/*
57 * link bdata in order 54 * link bdata in order
58 */ 55 */
59static void link_bootmem(bootmem_data_t *bdata) 56static void __init link_bootmem(bootmem_data_t *bdata)
60{ 57{
61 bootmem_data_t *ent; 58 bootmem_data_t *ent;
59
62 if (list_empty(&bdata_list)) { 60 if (list_empty(&bdata_list)) {
63 list_add(&bdata->list, &bdata_list); 61 list_add(&bdata->list, &bdata_list);
64 return; 62 return;
@@ -71,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata)
71 } 69 }
72 } 70 }
73 list_add_tail(&bdata->list, &bdata_list); 71 list_add_tail(&bdata->list, &bdata_list);
74 return;
75} 72}
76 73
74/*
75 * Given an initialised bdata, it returns the size of the boot bitmap
76 */
77static unsigned long __init get_mapsize(bootmem_data_t *bdata)
78{
79 unsigned long mapsize;
80 unsigned long start = PFN_DOWN(bdata->node_boot_start);
81 unsigned long end = bdata->node_low_pfn;
82
83 mapsize = ((end - start) + 7) / 8;
84 return ALIGN(mapsize, sizeof(long));
85}
77 86
78/* 87/*
79 * Called once to set up the allocator itself. 88 * Called once to set up the allocator itself.
80 */ 89 */
81static unsigned long __init init_bootmem_core (pg_data_t *pgdat, 90static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
82 unsigned long mapstart, unsigned long start, unsigned long end) 91 unsigned long mapstart, unsigned long start, unsigned long end)
83{ 92{
84 bootmem_data_t *bdata = pgdat->bdata; 93 bootmem_data_t *bdata = pgdat->bdata;
85 unsigned long mapsize = ((end - start)+7)/8; 94 unsigned long mapsize;
86 95
87 mapsize = ALIGN(mapsize, sizeof(long)); 96 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
88 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 97 bdata->node_boot_start = PFN_PHYS(start);
89 bdata->node_boot_start = (start << PAGE_SHIFT);
90 bdata->node_low_pfn = end; 98 bdata->node_low_pfn = end;
91 link_bootmem(bdata); 99 link_bootmem(bdata);
92 100
@@ -94,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
94 * Initially all pages are reserved - setup_arch() has to 102 * Initially all pages are reserved - setup_arch() has to
95 * register free RAM areas explicitly. 103 * register free RAM areas explicitly.
96 */ 104 */
105 mapsize = get_mapsize(bdata);
97 memset(bdata->node_bootmem_map, 0xff, mapsize); 106 memset(bdata->node_bootmem_map, 0xff, mapsize);
98 107
99 return mapsize; 108 return mapsize;
@@ -104,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
104 * might be used for boot-time allocations - or it might get added 113 * might be used for boot-time allocations - or it might get added
105 * to the free page pool later on. 114 * to the free page pool later on.
106 */ 115 */
107static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) 116static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
117 unsigned long size)
108{ 118{
119 unsigned long sidx, eidx;
109 unsigned long i; 120 unsigned long i;
121
110 /* 122 /*
111 * round up, partially reserved pages are considered 123 * round up, partially reserved pages are considered
112 * fully reserved. 124 * fully reserved.
113 */ 125 */
114 unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
115 unsigned long eidx = (addr + size - bdata->node_boot_start +
116 PAGE_SIZE-1)/PAGE_SIZE;
117 unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
118
119 BUG_ON(!size); 126 BUG_ON(!size);
120 BUG_ON(sidx >= eidx); 127 BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
121 BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); 128 BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
122 BUG_ON(end > bdata->node_low_pfn); 129
130 sidx = PFN_DOWN(addr - bdata->node_boot_start);
131 eidx = PFN_UP(addr + size - bdata->node_boot_start);
123 132
124 for (i = sidx; i < eidx; i++) 133 for (i = sidx; i < eidx; i++)
125 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 134 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
@@ -129,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
129 } 138 }
130} 139}
131 140
132static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) 141static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
142 unsigned long size)
133{ 143{
144 unsigned long sidx, eidx;
134 unsigned long i; 145 unsigned long i;
135 unsigned long start; 146
136 /* 147 /*
137 * round down end of usable mem, partially free pages are 148 * round down end of usable mem, partially free pages are
138 * considered reserved. 149 * considered reserved.
139 */ 150 */
140 unsigned long sidx;
141 unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
142 unsigned long end = (addr + size)/PAGE_SIZE;
143
144 BUG_ON(!size); 151 BUG_ON(!size);
145 BUG_ON(end > bdata->node_low_pfn); 152 BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
146 153
147 if (addr < bdata->last_success) 154 if (addr < bdata->last_success)
148 bdata->last_success = addr; 155 bdata->last_success = addr;
@@ -150,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
150 /* 157 /*
151 * Round up the beginning of the address. 158 * Round up the beginning of the address.
152 */ 159 */
153 start = (addr + PAGE_SIZE-1) / PAGE_SIZE; 160 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
154 sidx = start - (bdata->node_boot_start/PAGE_SIZE); 161 eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
155 162
156 for (i = sidx; i < eidx; i++) { 163 for (i = sidx; i < eidx; i++) {
157 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 164 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
@@ -177,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
177 unsigned long align, unsigned long goal, unsigned long limit) 184 unsigned long align, unsigned long goal, unsigned long limit)
178{ 185{
179 unsigned long offset, remaining_size, areasize, preferred; 186 unsigned long offset, remaining_size, areasize, preferred;
180 unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn; 187 unsigned long i, start = 0, incr, eidx, end_pfn;
181 void *ret; 188 void *ret;
182 189
183 if(!size) { 190 if (!size) {
184 printk("__alloc_bootmem_core(): zero-sized request\n"); 191 printk("__alloc_bootmem_core(): zero-sized request\n");
185 BUG(); 192 BUG();
186 } 193 }
@@ -189,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
189 if (limit && bdata->node_boot_start >= limit) 196 if (limit && bdata->node_boot_start >= limit)
190 return NULL; 197 return NULL;
191 198
192 limit >>=PAGE_SHIFT; 199 end_pfn = bdata->node_low_pfn;
200 limit = PFN_DOWN(limit);
193 if (limit && end_pfn > limit) 201 if (limit && end_pfn > limit)
194 end_pfn = limit; 202 end_pfn = limit;
195 203
196 eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 204 eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
197 offset = 0; 205 offset = 0;
198 if (align && 206 if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
199 (bdata->node_boot_start & (align - 1UL)) != 0) 207 offset = align - (bdata->node_boot_start & (align - 1UL));
200 offset = (align - (bdata->node_boot_start & (align - 1UL))); 208 offset = PFN_DOWN(offset);
201 offset >>= PAGE_SHIFT;
202 209
203 /* 210 /*
204 * We try to allocate bootmem pages above 'goal' 211 * We try to allocate bootmem pages above 'goal'
205 * first, then we try to allocate lower pages. 212 * first, then we try to allocate lower pages.
206 */ 213 */
207 if (goal && (goal >= bdata->node_boot_start) && 214 if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
208 ((goal >> PAGE_SHIFT) < end_pfn)) {
209 preferred = goal - bdata->node_boot_start; 215 preferred = goal - bdata->node_boot_start;
210 216
211 if (bdata->last_success >= preferred) 217 if (bdata->last_success >= preferred)
@@ -214,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
214 } else 220 } else
215 preferred = 0; 221 preferred = 0;
216 222
217 preferred = ALIGN(preferred, align) >> PAGE_SHIFT; 223 preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
218 preferred += offset; 224 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
219 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
220 incr = align >> PAGE_SHIFT ? : 1; 225 incr = align >> PAGE_SHIFT ? : 1;
221 226
222restart_scan: 227restart_scan:
@@ -231,7 +236,7 @@ restart_scan:
231 for (j = i + 1; j < i + areasize; ++j) { 236 for (j = i + 1; j < i + areasize; ++j) {
232 if (j >= eidx) 237 if (j >= eidx)
233 goto fail_block; 238 goto fail_block;
234 if (test_bit (j, bdata->node_bootmem_map)) 239 if (test_bit(j, bdata->node_bootmem_map))
235 goto fail_block; 240 goto fail_block;
236 } 241 }
237 start = i; 242 start = i;
@@ -247,7 +252,7 @@ restart_scan:
247 return NULL; 252 return NULL;
248 253
249found: 254found:
250 bdata->last_success = start << PAGE_SHIFT; 255 bdata->last_success = PFN_PHYS(start);
251 BUG_ON(start >= eidx); 256 BUG_ON(start >= eidx);
252 257
253 /* 258 /*
@@ -259,19 +264,21 @@ found:
259 bdata->last_offset && bdata->last_pos+1 == start) { 264 bdata->last_offset && bdata->last_pos+1 == start) {
260 offset = ALIGN(bdata->last_offset, align); 265 offset = ALIGN(bdata->last_offset, align);
261 BUG_ON(offset > PAGE_SIZE); 266 BUG_ON(offset > PAGE_SIZE);
262 remaining_size = PAGE_SIZE-offset; 267 remaining_size = PAGE_SIZE - offset;
263 if (size < remaining_size) { 268 if (size < remaining_size) {
264 areasize = 0; 269 areasize = 0;
265 /* last_pos unchanged */ 270 /* last_pos unchanged */
266 bdata->last_offset = offset+size; 271 bdata->last_offset = offset + size;
267 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 272 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
268 bdata->node_boot_start); 273 offset +
274 bdata->node_boot_start);
269 } else { 275 } else {
270 remaining_size = size - remaining_size; 276 remaining_size = size - remaining_size;
271 areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; 277 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
272 ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + 278 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
273 bdata->node_boot_start); 279 offset +
274 bdata->last_pos = start+areasize-1; 280 bdata->node_boot_start);
281 bdata->last_pos = start + areasize - 1;
275 bdata->last_offset = remaining_size; 282 bdata->last_offset = remaining_size;
276 } 283 }
277 bdata->last_offset &= ~PAGE_MASK; 284 bdata->last_offset &= ~PAGE_MASK;
@@ -284,7 +291,7 @@ found:
284 /* 291 /*
285 * Reserve the area now: 292 * Reserve the area now:
286 */ 293 */
287 for (i = start; i < start+areasize; i++) 294 for (i = start; i < start + areasize; i++)
288 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) 295 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
289 BUG(); 296 BUG();
290 memset(ret, 0, size); 297 memset(ret, 0, size);
@@ -305,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
305 312
306 count = 0; 313 count = 0;
307 /* first extant page of the node */ 314 /* first extant page of the node */
308 pfn = bdata->node_boot_start >> PAGE_SHIFT; 315 pfn = PFN_DOWN(bdata->node_boot_start);
309 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 316 idx = bdata->node_low_pfn - pfn;
310 map = bdata->node_bootmem_map; 317 map = bdata->node_bootmem_map;
311 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ 318 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
312 if (bdata->node_boot_start == 0 || 319 if (bdata->node_boot_start == 0 ||
@@ -335,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
335 } 342 }
336 } 343 }
337 } else { 344 } else {
338 i+=BITS_PER_LONG; 345 i += BITS_PER_LONG;
339 } 346 }
340 pfn += BITS_PER_LONG; 347 pfn += BITS_PER_LONG;
341 } 348 }
@@ -347,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
347 */ 354 */
348 page = virt_to_page(bdata->node_bootmem_map); 355 page = virt_to_page(bdata->node_bootmem_map);
349 count = 0; 356 count = 0;
350 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 357 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
351 count++; 358 for (i = 0; i < idx; i++, page++) {
352 __free_pages_bootmem(page, 0); 359 __free_pages_bootmem(page, 0);
360 count++;
353 } 361 }
354 total += count; 362 total += count;
355 bdata->node_bootmem_map = NULL; 363 bdata->node_bootmem_map = NULL;
@@ -357,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
357 return total; 365 return total;
358} 366}
359 367
360unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) 368unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
369 unsigned long startpfn, unsigned long endpfn)
361{ 370{
362 return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); 371 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
363} 372}
364 373
365void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) 374void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size)
366{ 376{
367 reserve_bootmem_core(pgdat->bdata, physaddr, size); 377 reserve_bootmem_core(pgdat->bdata, physaddr, size);
368} 378}
369 379
370void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) 380void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
381 unsigned long size)
371{ 382{
372 free_bootmem_core(pgdat->bdata, physaddr, size); 383 free_bootmem_core(pgdat->bdata, physaddr, size);
373} 384}
374 385
375unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) 386unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
376{ 387{
377 return(free_all_bootmem_core(pgdat)); 388 return free_all_bootmem_core(pgdat);
378} 389}
379 390
380unsigned long __init init_bootmem (unsigned long start, unsigned long pages) 391unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
381{ 392{
382 max_low_pfn = pages; 393 max_low_pfn = pages;
383 min_low_pfn = start; 394 min_low_pfn = start;
384 return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); 395 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
385} 396}
386 397
387#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 398#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
388void __init reserve_bootmem (unsigned long addr, unsigned long size) 399void __init reserve_bootmem(unsigned long addr, unsigned long size)
389{ 400{
390 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); 401 reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
391} 402}
392#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 403#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
393 404
394void __init free_bootmem (unsigned long addr, unsigned long size) 405void __init free_bootmem(unsigned long addr, unsigned long size)
395{ 406{
396 free_bootmem_core(NODE_DATA(0)->bdata, addr, size); 407 free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
397} 408}
398 409
399unsigned long __init free_all_bootmem (void) 410unsigned long __init free_all_bootmem(void)
400{ 411{
401 return(free_all_bootmem_core(NODE_DATA(0))); 412 return free_all_bootmem_core(NODE_DATA(0));
402} 413}
403 414
404void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) 415void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
416 unsigned long goal)
405{ 417{
406 bootmem_data_t *bdata; 418 bootmem_data_t *bdata;
407 void *ptr; 419 void *ptr;
408 420
409 list_for_each_entry(bdata, &bdata_list, list) 421 list_for_each_entry(bdata, &bdata_list, list) {
410 if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) 422 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
411 return(ptr); 423 if (ptr)
424 return ptr;
425 }
412 return NULL; 426 return NULL;
413} 427}
414 428
415void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) 429void * __init __alloc_bootmem(unsigned long size, unsigned long align,
430 unsigned long goal)
416{ 431{
417 void *mem = __alloc_bootmem_nopanic(size,align,goal); 432 void *mem = __alloc_bootmem_nopanic(size,align,goal);
433
418 if (mem) 434 if (mem)
419 return mem; 435 return mem;
420 /* 436 /*
@@ -426,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned
426} 442}
427 443
428 444
429void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, 445void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
430 unsigned long goal) 446 unsigned long align, unsigned long goal)
431{ 447{
432 void *ptr; 448 void *ptr;
433 449
434 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 450 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
435 if (ptr) 451 if (ptr)
436 return (ptr); 452 return ptr;
437 453
438 return __alloc_bootmem(size, align, goal); 454 return __alloc_bootmem(size, align, goal);
439} 455}
440 456
441#define LOW32LIMIT 0xffffffff 457#ifndef ARCH_LOW_ADDRESS_LIMIT
458#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
459#endif
442 460
443void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) 461void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
462 unsigned long goal)
444{ 463{
445 bootmem_data_t *bdata; 464 bootmem_data_t *bdata;
446 void *ptr; 465 void *ptr;
447 466
448 list_for_each_entry(bdata, &bdata_list, list) 467 list_for_each_entry(bdata, &bdata_list, list) {
449 if ((ptr = __alloc_bootmem_core(bdata, size, 468 ptr = __alloc_bootmem_core(bdata, size, align, goal,
450 align, goal, LOW32LIMIT))) 469 ARCH_LOW_ADDRESS_LIMIT);
451 return(ptr); 470 if (ptr)
471 return ptr;
472 }
452 473
453 /* 474 /*
454 * Whoops, we cannot satisfy the allocation request. 475 * Whoops, we cannot satisfy the allocation request.
@@ -461,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig
461void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 482void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
462 unsigned long align, unsigned long goal) 483 unsigned long align, unsigned long goal)
463{ 484{
464 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); 485 return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
486 ARCH_LOW_ADDRESS_LIMIT);
465} 487}
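
Most of the bootmem.c conversion above replaces open-coded PAGE_SHIFT arithmetic with the PFN_* helpers pulled in through the new <linux/pfn.h> include. Their use in the hunks implies roughly the following definitions (a sketch; the real macros live outside this diff):

/* Assumed <linux/pfn.h> helpers used throughout the new bootmem.c code */
#define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)   /* round address up to a frame */
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)                     /* round address down to a frame */
#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)                     /* frame number back to an address */

With 4KB pages, for example, PFN_DOWN(0x1234) is 1 while PFN_UP(0x1234) is 2, which matches the rounding the old open-coded expressions performed.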
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0a03357a1f8e..168c78a121bb 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -23,18 +23,6 @@
23/* 23/*
24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
25 * deactivate the pages and clear PG_Referenced. 25 * deactivate the pages and clear PG_Referenced.
26 *
27 * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
28 * offsets `offset' and `offset+len' inclusive. Any pages which are currently
29 * under writeout are skipped, whether or not they are dirty.
30 *
31 * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
32 * offsets `offset' and `offset+len'.
33 *
34 * By combining these two operations the application may do several things:
35 *
36 * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
37 *
38 */ 26 */
39asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 27asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
40{ 28{
@@ -85,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
85 file->f_ra.ra_pages = bdi->ra_pages * 2; 73 file->f_ra.ra_pages = bdi->ra_pages * 2;
86 break; 74 break;
87 case POSIX_FADV_WILLNEED: 75 case POSIX_FADV_WILLNEED:
88 case POSIX_FADV_NOREUSE:
89 if (!mapping->a_ops->readpage) { 76 if (!mapping->a_ops->readpage) {
90 ret = -EINVAL; 77 ret = -EINVAL;
91 break; 78 break;
@@ -106,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
106 if (ret > 0) 93 if (ret > 0)
107 ret = 0; 94 ret = 0;
108 break; 95 break;
96 case POSIX_FADV_NOREUSE:
97 break;
109 case POSIX_FADV_DONTNEED: 98 case POSIX_FADV_DONTNEED:
110 if (!bdi_write_congested(mapping->backing_dev_info)) 99 if (!bdi_write_congested(mapping->backing_dev_info))
111 filemap_flush(mapping); 100 filemap_flush(mapping);
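
For reference, the behavioural change above is visible from userspace through posix_fadvise(): POSIX_FADV_WILLNEED still starts readahead, while POSIX_FADV_NOREUSE is now accepted but intentionally does nothing. A small illustration (userspace code, not part of this commit; the file name is made up):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/var/log/syslog", O_RDONLY);     /* hypothetical file */

        if (fd < 0)
                return 1;
        /* Hint that the file will be read soon: the kernel starts readahead. */
        posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
        /* After this patch this advice is a no-op rather than a second WILLNEED. */
        posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
        close(fd);
        return 0;
}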
diff --git a/mm/filemap.c b/mm/filemap.c
index 807a463fd5ed..3277f3b23524 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,6 @@
9 * most "normal" filesystems (but you don't /have/ to use this: 9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12#include <linux/config.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
@@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page)
120 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
121 page->mapping = NULL; 120 page->mapping = NULL;
122 mapping->nrpages--; 121 mapping->nrpages--;
123 pagecache_acct(-1); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
124} 123}
125 124
126void remove_from_page_cache(struct page *page) 125void remove_from_page_cache(struct page *page)
@@ -449,7 +448,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
449 page->mapping = mapping; 448 page->mapping = mapping;
450 page->index = offset; 449 page->index = offset;
451 mapping->nrpages++; 450 mapping->nrpages++;
452 pagecache_acct(1); 451 __inc_zone_page_state(page, NR_FILE_PAGES);
453 } 452 }
454 write_unlock_irq(&mapping->tree_lock); 453 write_unlock_irq(&mapping->tree_lock);
455 radix_tree_preload_end(); 454 radix_tree_preload_end();
@@ -489,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x)
489EXPORT_SYMBOL(page_cache_alloc_cold); 488EXPORT_SYMBOL(page_cache_alloc_cold);
490#endif 489#endif
491 490
491static int __sleep_on_page_lock(void *word)
492{
493 io_schedule();
494 return 0;
495}
496
492/* 497/*
493 * In order to wait for pages to become available there must be 498 * In order to wait for pages to become available there must be
494 * waitqueues associated with pages. By using a hash table of 499 * waitqueues associated with pages. By using a hash table of
@@ -578,13 +583,24 @@ void fastcall __lock_page(struct page *page)
578} 583}
579EXPORT_SYMBOL(__lock_page); 584EXPORT_SYMBOL(__lock_page);
580 585
586/*
587 * Variant of lock_page that does not require the caller to hold a reference
588 * on the page's mapping.
589 */
590void fastcall __lock_page_nosync(struct page *page)
591{
592 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
593 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
594 TASK_UNINTERRUPTIBLE);
595}
596
581/** 597/**
582 * find_get_page - find and get a page reference 598 * find_get_page - find and get a page reference
583 * @mapping: the address_space to search 599 * @mapping: the address_space to search
584 * @offset: the page index 600 * @offset: the page index
585 * 601 *
586 * A rather lightweight function, finding and getting a reference to a 602 * Is there a pagecache struct page at the given (mapping, offset) tuple?
587 * hashed page atomically. 603 * If yes, increment its refcount and return it; if no, return NULL.
588 */ 604 */
589struct page * find_get_page(struct address_space *mapping, unsigned long offset) 605struct page * find_get_page(struct address_space *mapping, unsigned long offset)
590{ 606{
@@ -828,6 +844,30 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
828} 844}
829EXPORT_SYMBOL(grab_cache_page_nowait); 845EXPORT_SYMBOL(grab_cache_page_nowait);
830 846
847/*
848 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
849 * a _large_ part of the i/o request. Imagine the worst scenario:
850 *
851 * ---R__________________________________________B__________
852 * ^ reading here ^ bad block(assume 4k)
853 *
854 * read(R) => miss => readahead(R...B) => media error => frustrating retries
855 * => failing the whole request => read(R) => read(R+1) =>
856 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
857 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
858 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
859 *
860 * It is going insane. Fix it by quickly scaling down the readahead size.
861 */
862static void shrink_readahead_size_eio(struct file *filp,
863 struct file_ra_state *ra)
864{
865 if (!ra->ra_pages)
866 return;
867
868 ra->ra_pages /= 4;
869}
870
831/** 871/**
832 * do_generic_mapping_read - generic file read routine 872 * do_generic_mapping_read - generic file read routine
833 * @mapping: address_space to be read 873 * @mapping: address_space to be read
@@ -947,7 +987,7 @@ page_not_up_to_date:
947 /* Get exclusive access to the page ... */ 987 /* Get exclusive access to the page ... */
948 lock_page(page); 988 lock_page(page);
949 989
950 /* Did it get unhashed before we got the lock? */ 990 /* Did it get truncated before we got the lock? */
951 if (!page->mapping) { 991 if (!page->mapping) {
952 unlock_page(page); 992 unlock_page(page);
953 page_cache_release(page); 993 page_cache_release(page);
@@ -985,6 +1025,7 @@ readpage:
985 } 1025 }
986 unlock_page(page); 1026 unlock_page(page);
987 error = -EIO; 1027 error = -EIO;
1028 shrink_readahead_size_eio(filp, &ra);
988 goto readpage_error; 1029 goto readpage_error;
989 } 1030 }
990 unlock_page(page); 1031 unlock_page(page);
@@ -1389,7 +1430,7 @@ retry_find:
1389 */ 1430 */
1390 if (!did_readaround) { 1431 if (!did_readaround) {
1391 majmin = VM_FAULT_MAJOR; 1432 majmin = VM_FAULT_MAJOR;
1392 inc_page_state(pgmajfault); 1433 count_vm_event(PGMAJFAULT);
1393 } 1434 }
1394 did_readaround = 1; 1435 did_readaround = 1;
1395 ra_pages = max_sane_readahead(file->f_ra.ra_pages); 1436 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
@@ -1430,7 +1471,7 @@ outside_data_content:
1430 * accessible.. 1471 * accessible..
1431 */ 1472 */
1432 if (area->vm_mm == current->mm) 1473 if (area->vm_mm == current->mm)
1433 return NULL; 1474 return NOPAGE_SIGBUS;
1434 /* Fall through to the non-read-ahead case */ 1475 /* Fall through to the non-read-ahead case */
1435no_cached_page: 1476no_cached_page:
1436 /* 1477 /*
@@ -1455,12 +1496,12 @@ no_cached_page:
1455 */ 1496 */
1456 if (error == -ENOMEM) 1497 if (error == -ENOMEM)
1457 return NOPAGE_OOM; 1498 return NOPAGE_OOM;
1458 return NULL; 1499 return NOPAGE_SIGBUS;
1459 1500
1460page_not_uptodate: 1501page_not_uptodate:
1461 if (!did_readaround) { 1502 if (!did_readaround) {
1462 majmin = VM_FAULT_MAJOR; 1503 majmin = VM_FAULT_MAJOR;
1463 inc_page_state(pgmajfault); 1504 count_vm_event(PGMAJFAULT);
1464 } 1505 }
1465 lock_page(page); 1506 lock_page(page);
1466 1507
@@ -1522,8 +1563,9 @@ page_not_uptodate:
1522 * Things didn't work out. Return zero to tell the 1563 * Things didn't work out. Return zero to tell the
1523 * mm layer so, possibly freeing the page cache page first. 1564 * mm layer so, possibly freeing the page cache page first.
1524 */ 1565 */
1566 shrink_readahead_size_eio(file, ra);
1525 page_cache_release(page); 1567 page_cache_release(page);
1526 return NULL; 1568 return NOPAGE_SIGBUS;
1527} 1569}
1528EXPORT_SYMBOL(filemap_nopage); 1570EXPORT_SYMBOL(filemap_nopage);
1529 1571
@@ -1585,7 +1627,7 @@ no_cached_page:
1585page_not_uptodate: 1627page_not_uptodate:
1586 lock_page(page); 1628 lock_page(page);
1587 1629
1588 /* Did it get unhashed while we waited for it? */ 1630 /* Did it get truncated while we waited for it? */
1589 if (!page->mapping) { 1631 if (!page->mapping) {
1590 unlock_page(page); 1632 unlock_page(page);
1591 goto err; 1633 goto err;
@@ -1892,7 +1934,7 @@ int remove_suid(struct dentry *dentry)
1892EXPORT_SYMBOL(remove_suid); 1934EXPORT_SYMBOL(remove_suid);
1893 1935
1894size_t 1936size_t
1895__filemap_copy_from_user_iovec(char *vaddr, 1937__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1896 const struct iovec *iov, size_t base, size_t bytes) 1938 const struct iovec *iov, size_t base, size_t bytes)
1897{ 1939{
1898 size_t copied = 0, left = 0; 1940 size_t copied = 0, left = 0;
@@ -1908,12 +1950,8 @@ __filemap_copy_from_user_iovec(char *vaddr,
1908 vaddr += copy; 1950 vaddr += copy;
1909 iov++; 1951 iov++;
1910 1952
1911 if (unlikely(left)) { 1953 if (unlikely(left))
1912 /* zero the rest of the target like __copy_from_user */
1913 if (bytes)
1914 memset(vaddr, 0, bytes);
1915 break; 1954 break;
1916 }
1917 } 1955 }
1918 return copied - left; 1956 return copied - left;
1919} 1957}
@@ -2045,7 +2083,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2045{ 2083{
2046 struct file *file = iocb->ki_filp; 2084 struct file *file = iocb->ki_filp;
2047 struct address_space * mapping = file->f_mapping; 2085 struct address_space * mapping = file->f_mapping;
2048 struct address_space_operations *a_ops = mapping->a_ops; 2086 const struct address_space_operations *a_ops = mapping->a_ops;
2049 struct inode *inode = mapping->host; 2087 struct inode *inode = mapping->host;
2050 long status = 0; 2088 long status = 0;
2051 struct page *page; 2089 struct page *page;
@@ -2071,14 +2109,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2071 do { 2109 do {
2072 unsigned long index; 2110 unsigned long index;
2073 unsigned long offset; 2111 unsigned long offset;
2074 unsigned long maxlen;
2075 size_t copied; 2112 size_t copied;
2076 2113
2077 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 2114 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2078 index = pos >> PAGE_CACHE_SHIFT; 2115 index = pos >> PAGE_CACHE_SHIFT;
2079 bytes = PAGE_CACHE_SIZE - offset; 2116 bytes = PAGE_CACHE_SIZE - offset;
2080 if (bytes > count) 2117
2081 bytes = count; 2118 /* Limit the size of the copy to the caller's write size */
2119 bytes = min(bytes, count);
2120
2121 /*
2122 * Limit the size of the copy to that of the current segment,
2123 * because fault_in_pages_readable() doesn't know how to walk
2124 * segments.
2125 */
2126 bytes = min(bytes, cur_iov->iov_len - iov_base);
2082 2127
2083 /* 2128 /*
2084 * Bring in the user page that we will copy from _first_. 2129 * Bring in the user page that we will copy from _first_.
@@ -2086,10 +2131,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2086 * same page as we're writing to, without it being marked 2131 * same page as we're writing to, without it being marked
2087 * up-to-date. 2132 * up-to-date.
2088 */ 2133 */
2089 maxlen = cur_iov->iov_len - iov_base; 2134 fault_in_pages_readable(buf, bytes);
2090 if (maxlen > bytes)
2091 maxlen = bytes;
2092 fault_in_pages_readable(buf, maxlen);
2093 2135
2094 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2136 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
2095 if (!page) { 2137 if (!page) {
@@ -2097,6 +2139,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2097 break; 2139 break;
2098 } 2140 }
2099 2141
2142 if (unlikely(bytes == 0)) {
2143 status = 0;
2144 copied = 0;
2145 goto zero_length_segment;
2146 }
2147
2100 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2148 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2101 if (unlikely(status)) { 2149 if (unlikely(status)) {
2102 loff_t isize = i_size_read(inode); 2150 loff_t isize = i_size_read(inode);
@@ -2126,7 +2174,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2126 page_cache_release(page); 2174 page_cache_release(page);
2127 continue; 2175 continue;
2128 } 2176 }
2129 if (likely(copied > 0)) { 2177zero_length_segment:
2178 if (likely(copied >= 0)) {
2130 if (!status) 2179 if (!status)
2131 status = copied; 2180 status = copied;
2132 2181
@@ -2191,7 +2240,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2191 unsigned long nr_segs, loff_t *ppos) 2240 unsigned long nr_segs, loff_t *ppos)
2192{ 2241{
2193 struct file *file = iocb->ki_filp; 2242 struct file *file = iocb->ki_filp;
2194 struct address_space * mapping = file->f_mapping; 2243 const struct address_space * mapping = file->f_mapping;
2195 size_t ocount; /* original count */ 2244 size_t ocount; /* original count */
2196 size_t count; /* after file limit checks */ 2245 size_t count; /* after file limit checks */
2197 struct inode *inode = mapping->host; 2246 struct inode *inode = mapping->host;
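
A small worked example of the shrink_readahead_size_eio() behaviour introduced above, i.e. how the readahead window decays across repeated media errors (illustration only, not part of the commit):

/* Repeated media errors starting from an initial window of 128 pages. */
unsigned long ra_pages = 128;
int i;

for (i = 0; i < 6; i++) {
        if (!ra_pages)
                break;          /* shrink_readahead_size_eio() returns early at zero */
        ra_pages /= 4;          /* 128 -> 32 -> 8 -> 2 -> 0 */
}
/* Once the window reaches zero the read path falls back to single-page reads. */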
diff --git a/mm/filemap.h b/mm/filemap.h
index 5683cde22055..3f2a343c6015 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -16,15 +16,23 @@
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17 17
18size_t 18size_t
19__filemap_copy_from_user_iovec(char *vaddr, 19__filemap_copy_from_user_iovec_inatomic(char *vaddr,
20 const struct iovec *iov, 20 const struct iovec *iov,
21 size_t base, 21 size_t base,
22 size_t bytes); 22 size_t bytes);
23 23
24/* 24/*
25 * Copy as much as we can into the page and return the number of bytes which 25 * Copy as much as we can into the page and return the number of bytes which
26 * were sucessfully copied. If a fault is encountered then clear the page 26 * were sucessfully copied. If a fault is encountered then clear the page
27 * out to (offset+bytes) and return the number of bytes which were copied. 27 * out to (offset+bytes) and return the number of bytes which were copied.
28 *
29 * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
30 * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
31 * and if the following non-atomic copy succeeds, then there is a small window
32 * where the target page contains neither the data before the write, nor the
33 * data after the write (it contains zero). A read at this time will see
34 * data that is inconsistent with any ordering of the read and the write.
35 * (This has been detected in practice).
28 */ 36 */
29static inline size_t 37static inline size_t
30filemap_copy_from_user(struct page *page, unsigned long offset, 38filemap_copy_from_user(struct page *page, unsigned long offset,
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
60 size_t copied; 68 size_t copied;
61 69
62 kaddr = kmap_atomic(page, KM_USER0); 70 kaddr = kmap_atomic(page, KM_USER0);
63 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, 71 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
64 base, bytes); 72 base, bytes);
65 kunmap_atomic(kaddr, KM_USER0); 73 kunmap_atomic(kaddr, KM_USER0);
66 if (copied != bytes) { 74 if (copied != bytes) {
67 kaddr = kmap(page); 75 kaddr = kmap(page);
68 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, 76 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
69 base, bytes); 77 base, bytes);
78 if (bytes - copied)
79 memset(kaddr + offset + copied, 0, bytes - copied);
70 kunmap(page); 80 kunmap(page);
71 } 81 }
72 return copied; 82 return copied;
@@ -78,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
78 const struct iovec *iov = *iovp; 88 const struct iovec *iov = *iovp;
79 size_t base = *basep; 89 size_t base = *basep;
80 90
81 while (bytes) { 91 do {
82 int copy = min(bytes, iov->iov_len - base); 92 int copy = min(bytes, iov->iov_len - base);
83 93
84 bytes -= copy; 94 bytes -= copy;
@@ -87,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
87 iov++; 97 iov++;
88 base = 0; 98 base = 0;
89 } 99 }
90 } 100 } while (bytes);
91 *iovp = iov; 101 *iovp = iov;
92 *basep = base; 102 *basep = base;
93} 103}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e5918..b4fd0d7c9bfb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
273 size_t count, loff_t pos, loff_t *ppos) 273 size_t count, loff_t pos, loff_t *ppos)
274{ 274{
275 struct address_space * mapping = filp->f_mapping; 275 struct address_space * mapping = filp->f_mapping;
276 struct address_space_operations *a_ops = mapping->a_ops; 276 const struct address_space_operations *a_ops = mapping->a_ops;
277 struct inode *inode = mapping->host; 277 struct inode *inode = mapping->host;
278 long status = 0; 278 long status = 0;
279 struct page *page; 279 struct page *page;
diff --git a/mm/fremap.c b/mm/fremap.c
index 21b7d0cbc98c..aa30618ec6b2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
79 inc_mm_counter(mm, file_rss); 79 inc_mm_counter(mm, file_rss);
80 80
81 flush_icache_page(vma, page); 81 flush_icache_page(vma, page);
82 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 82 pte_val = mk_pte(page, prot);
83 set_pte_at(mm, addr, pte, pte_val);
83 page_add_file_rmap(page); 84 page_add_file_rmap(page);
84 pte_val = *pte;
85 update_mmu_cache(vma, addr, pte_val); 85 update_mmu_cache(vma, addr, pte_val);
86 lazy_mmu_prot_update(pte_val); 86 lazy_mmu_prot_update(pte_val);
87 err = 0; 87 err = 0;
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b274fdf9d08..ee5519b176ee 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,6 +46,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
46 */ 46 */
47#ifdef CONFIG_HIGHMEM 47#ifdef CONFIG_HIGHMEM
48 48
49unsigned long totalhigh_pages __read_mostly;
50
51unsigned int nr_free_highpages (void)
52{
53 pg_data_t *pgdat;
54 unsigned int pages = 0;
55
56 for_each_online_pgdat(pgdat)
57 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
58
59 return pages;
60}
61
49static int pkmap_count[LAST_PKMAP]; 62static int pkmap_count[LAST_PKMAP];
50static unsigned int last_pkmap_nr; 63static unsigned int last_pkmap_nr;
51static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); 64static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -315,8 +328,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
315 if (bvec->bv_page == org_vec->bv_page) 328 if (bvec->bv_page == org_vec->bv_page)
316 continue; 329 continue;
317 330
318 mempool_free(bvec->bv_page, pool); 331 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
319 dec_page_state(nr_bounce); 332 mempool_free(bvec->bv_page, pool);
320 } 333 }
321 334
322 bio_endio(bio_orig, bio_orig->bi_size, err); 335 bio_endio(bio_orig, bio_orig->bi_size, err);
@@ -397,7 +410,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
397 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 410 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
398 to->bv_len = from->bv_len; 411 to->bv_len = from->bv_len;
399 to->bv_offset = from->bv_offset; 412 to->bv_offset = from->bv_offset;
400 inc_page_state(nr_bounce); 413 inc_zone_page_state(to->bv_page, NR_BOUNCE);
401 414
402 if (rw == WRITE) { 415 if (rw == WRITE) {
403 char *vto, *vfrom; 416 char *vto, *vfrom;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df499973255f..7c7d03dbf73d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
72 struct zone **z; 72 struct zone **z;
73 73
74 for (z = zonelist->zones; *z; z++) { 74 for (z = zonelist->zones; *z; z++) {
75 nid = (*z)->zone_pgdat->node_id; 75 nid = zone_to_nid(*z);
76 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && 76 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
77 !list_empty(&hugepage_freelists[nid])) 77 !list_empty(&hugepage_freelists[nid]))
78 break; 78 break;
@@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page)
177{ 177{
178 int i; 178 int i;
179 nr_huge_pages--; 179 nr_huge_pages--;
180 nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; 180 nr_huge_pages_node[page_to_nid(page)]--;
181 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 181 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
182 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 182 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
183 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 183 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page)
191#ifdef CONFIG_HIGHMEM 191#ifdef CONFIG_HIGHMEM
192static void try_to_free_low(unsigned long count) 192static void try_to_free_low(unsigned long count)
193{ 193{
194 int i, nid; 194 int i;
195
195 for (i = 0; i < MAX_NUMNODES; ++i) { 196 for (i = 0; i < MAX_NUMNODES; ++i) {
196 struct page *page, *next; 197 struct page *page, *next;
197 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 198 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
@@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count)
199 continue; 200 continue;
200 list_del(&page->lru); 201 list_del(&page->lru);
201 update_and_free_page(page); 202 update_and_free_page(page);
202 nid = page_zone(page)->zone_pgdat->node_id;
203 free_huge_pages--; 203 free_huge_pages--;
204 free_huge_pages_node[nid]--; 204 free_huge_pages_node[page_to_nid(page)]--;
205 if (count >= nr_huge_pages) 205 if (count >= nr_huge_pages)
206 return; 206 return;
207 } 207 }
diff --git a/mm/internal.h b/mm/internal.h
index d20e3cc4aef0..d527b80b292f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v)
24 */ 24 */
25static inline void set_page_refcounted(struct page *page) 25static inline void set_page_refcounted(struct page *page)
26{ 26{
27 BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); 27 VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
28 BUG_ON(atomic_read(&page->_count)); 28 VM_BUG_ON(atomic_read(&page->_count));
29 set_page_count(page, 1); 29 set_page_count(page, 1);
30} 30}
31 31
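
The internal.h hunk above relaxes the checks from unconditional BUG_ON to VM_BUG_ON, which only expands to a real check when CONFIG_DEBUG_VM is enabled; roughly (assumed, the macro itself is defined in a header outside this diff):

/* Assumed semantics of VM_BUG_ON */
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)         BUG_ON(cond)
#else
#define VM_BUG_ON(cond)         do { } while (0)
#endif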
diff --git a/mm/memory.c b/mm/memory.c
index 247b5c312b9b..160f5b503ead 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,9 @@
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/rmap.h> 48#include <linux/rmap.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/delayacct.h>
50#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h>
51 53
52#include <asm/pgalloc.h> 54#include <asm/pgalloc.h>
53#include <asm/uaccess.h> 55#include <asm/uaccess.h>
@@ -126,7 +128,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
126 pmd_clear(pmd); 128 pmd_clear(pmd);
127 pte_lock_deinit(page); 129 pte_lock_deinit(page);
128 pte_free_tlb(tlb, page); 130 pte_free_tlb(tlb, page);
129 dec_page_state(nr_page_table_pages); 131 dec_zone_page_state(page, NR_PAGETABLE);
130 tlb->mm->nr_ptes--; 132 tlb->mm->nr_ptes--;
131} 133}
132 134
@@ -311,7 +313,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
311 pte_free(new); 313 pte_free(new);
312 } else { 314 } else {
313 mm->nr_ptes++; 315 mm->nr_ptes++;
314 inc_page_state(nr_page_table_pages); 316 inc_zone_page_state(new, NR_PAGETABLE);
315 pmd_populate(mm, pmd, new); 317 pmd_populate(mm, pmd, new);
316 } 318 }
317 spin_unlock(&mm->page_table_lock); 319 spin_unlock(&mm->page_table_lock);
@@ -503,7 +505,7 @@ again:
503 return -ENOMEM; 505 return -ENOMEM;
504 src_pte = pte_offset_map_nested(src_pmd, addr); 506 src_pte = pte_offset_map_nested(src_pmd, addr);
505 src_ptl = pte_lockptr(src_mm, src_pmd); 507 src_ptl = pte_lockptr(src_mm, src_pmd);
506 spin_lock(src_ptl); 508 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
507 509
508 do { 510 do {
509 /* 511 /*
@@ -1225,7 +1227,12 @@ out:
1225 return retval; 1227 return retval;
1226} 1228}
1227 1229
1228/* 1230/**
1231 * vm_insert_page - insert single page into user vma
1232 * @vma: user vma to map to
1233 * @addr: target user address of this page
1234 * @page: source kernel page
1235 *
1229 * This allows drivers to insert individual pages they've allocated 1236 * This allows drivers to insert individual pages they've allocated
1230 * into a user vma. 1237 * into a user vma.
1231 * 1238 *
@@ -1317,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1317 return 0; 1324 return 0;
1318} 1325}
1319 1326
1320/* Note: this is only safe if the mm semaphore is held when called. */ 1327/**
1328 * remap_pfn_range - remap kernel memory to userspace
1329 * @vma: user vma to map to
1330 * @addr: target user address to start at
1331 * @pfn: physical address of kernel memory
1332 * @size: size of map area
1333 * @prot: page protection flags for this mapping
1334 *
1335 * Note: this is only safe if the mm semaphore is held when called.
1336 */
1321int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1337int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1322 unsigned long pfn, unsigned long size, pgprot_t prot) 1338 unsigned long pfn, unsigned long size, pgprot_t prot)
1323{ 1339{
@@ -1457,14 +1473,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1457{ 1473{
1458 struct page *old_page, *new_page; 1474 struct page *old_page, *new_page;
1459 pte_t entry; 1475 pte_t entry;
1460 int reuse, ret = VM_FAULT_MINOR; 1476 int reuse = 0, ret = VM_FAULT_MINOR;
1477 struct page *dirty_page = NULL;
1461 1478
1462 old_page = vm_normal_page(vma, address, orig_pte); 1479 old_page = vm_normal_page(vma, address, orig_pte);
1463 if (!old_page) 1480 if (!old_page)
1464 goto gotten; 1481 goto gotten;
1465 1482
1466 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == 1483 /*
1467 (VM_SHARED|VM_WRITE))) { 1484 * Take out anonymous pages first, anonymous shared vmas are
1485 * not dirty accountable.
1486 */
1487 if (PageAnon(old_page)) {
1488 if (!TestSetPageLocked(old_page)) {
1489 reuse = can_share_swap_page(old_page);
1490 unlock_page(old_page);
1491 }
1492 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1493 (VM_WRITE|VM_SHARED))) {
1494 /*
1495 * Only catch write-faults on shared writable pages,
1496 * read-only shared pages can get COWed by
1497 * get_user_pages(.write=1, .force=1).
1498 */
1468 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1499 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1469 /* 1500 /*
1470 * Notify the address space that the page is about to 1501 * Notify the address space that the page is about to
@@ -1493,13 +1524,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1493 if (!pte_same(*page_table, orig_pte)) 1524 if (!pte_same(*page_table, orig_pte))
1494 goto unlock; 1525 goto unlock;
1495 } 1526 }
1496 1527 dirty_page = old_page;
1528 get_page(dirty_page);
1497 reuse = 1; 1529 reuse = 1;
1498 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1499 reuse = can_share_swap_page(old_page);
1500 unlock_page(old_page);
1501 } else {
1502 reuse = 0;
1503 } 1530 }
1504 1531
1505 if (reuse) { 1532 if (reuse) {
@@ -1549,9 +1576,16 @@ gotten:
1549 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1576 flush_cache_page(vma, address, pte_pfn(orig_pte));
1550 entry = mk_pte(new_page, vma->vm_page_prot); 1577 entry = mk_pte(new_page, vma->vm_page_prot);
1551 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1578 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1552 ptep_establish(vma, address, page_table, entry);
1553 update_mmu_cache(vma, address, entry);
1554 lazy_mmu_prot_update(entry); 1579 lazy_mmu_prot_update(entry);
1580 /*
1581 * Clear the pte entry and flush it first, before updating the
1582 * pte with the new entry. This will avoid a race condition
1583 * seen in the presence of one thread doing SMC and another
1584 * thread doing COW.
1585 */
1586 ptep_clear_flush(vma, address, page_table);
1587 set_pte_at(mm, address, page_table, entry);
1588 update_mmu_cache(vma, address, entry);
1555 lru_cache_add_active(new_page); 1589 lru_cache_add_active(new_page);
1556 page_add_new_anon_rmap(new_page, vma, address); 1590 page_add_new_anon_rmap(new_page, vma, address);
1557 1591
@@ -1565,6 +1599,10 @@ gotten:
1565 page_cache_release(old_page); 1599 page_cache_release(old_page);
1566unlock: 1600unlock:
1567 pte_unmap_unlock(page_table, ptl); 1601 pte_unmap_unlock(page_table, ptl);
1602 if (dirty_page) {
1603 set_page_dirty_balance(dirty_page);
1604 put_page(dirty_page);
1605 }
1568 return ret; 1606 return ret;
1569oom: 1607oom:
1570 if (old_page) 1608 if (old_page)
@@ -1784,9 +1822,10 @@ void unmap_mapping_range(struct address_space *mapping,
1784} 1822}
1785EXPORT_SYMBOL(unmap_mapping_range); 1823EXPORT_SYMBOL(unmap_mapping_range);
1786 1824
1787/* 1825/**
1788 * Handle all mappings that got truncated by a "truncate()" 1826 * vmtruncate - unmap mappings "freed" by truncate() syscall
1789 * system call. 1827 * @inode: inode of the file used
1828 * @offset: file offset to start truncating
1790 * 1829 *
1791 * NOTE! We have to be ready to update the memory sharing 1830 * NOTE! We have to be ready to update the memory sharing
1792 * between the file and the memory map for a potential last 1831 * between the file and the memory map for a potential last
@@ -1853,13 +1892,18 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1853 1892
1854 return 0; 1893 return 0;
1855} 1894}
1856EXPORT_SYMBOL(vmtruncate_range); 1895EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
1857 1896
1858/* 1897/**
1898 * swapin_readahead - swap in pages in hope we need them soon
1899 * @entry: swap entry of this memory
1900 * @addr: address to start
 1901 * @vma: user vma these addresses belong to
1902 *
1859 * Primitive swap readahead code. We simply read an aligned block of 1903 * Primitive swap readahead code. We simply read an aligned block of
1860 * (1 << page_cluster) entries in the swap area. This method is chosen 1904 * (1 << page_cluster) entries in the swap area. This method is chosen
1861 * because it doesn't cost us any seek time. We also make sure to queue 1905 * because it doesn't cost us any seek time. We also make sure to queue
1862 * the 'original' request together with the readahead ones... 1906 * the 'original' request together with the readahead ones...
1863 * 1907 *
1864 * This has been extended to use the NUMA policies from the mm triggering 1908 * This has been extended to use the NUMA policies from the mm triggering
1865 * the readahead. 1909 * the readahead.
@@ -1934,6 +1978,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1934 migration_entry_wait(mm, pmd, address); 1978 migration_entry_wait(mm, pmd, address);
1935 goto out; 1979 goto out;
1936 } 1980 }
1981 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
1937 page = lookup_swap_cache(entry); 1982 page = lookup_swap_cache(entry);
1938 if (!page) { 1983 if (!page) {
1939 swapin_readahead(entry, address, vma); 1984 swapin_readahead(entry, address, vma);
@@ -1946,15 +1991,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1946 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 1991 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1947 if (likely(pte_same(*page_table, orig_pte))) 1992 if (likely(pte_same(*page_table, orig_pte)))
1948 ret = VM_FAULT_OOM; 1993 ret = VM_FAULT_OOM;
1994 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
1949 goto unlock; 1995 goto unlock;
1950 } 1996 }
1951 1997
1952 /* Had to read the page from swap area: Major fault */ 1998 /* Had to read the page from swap area: Major fault */
1953 ret = VM_FAULT_MAJOR; 1999 ret = VM_FAULT_MAJOR;
1954 inc_page_state(pgmajfault); 2000 count_vm_event(PGMAJFAULT);
1955 grab_swap_token(); 2001 grab_swap_token();
1956 } 2002 }
1957 2003
2004 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
1958 mark_page_accessed(page); 2005 mark_page_accessed(page);
1959 lock_page(page); 2006 lock_page(page);
1960 2007
@@ -2094,6 +2141,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2094 unsigned int sequence = 0; 2141 unsigned int sequence = 0;
2095 int ret = VM_FAULT_MINOR; 2142 int ret = VM_FAULT_MINOR;
2096 int anon = 0; 2143 int anon = 0;
2144 struct page *dirty_page = NULL;
2097 2145
2098 pte_unmap(page_table); 2146 pte_unmap(page_table);
2099 BUG_ON(vma->vm_flags & VM_PFNMAP); 2147 BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2188,6 +2236,10 @@ retry:
2188 } else { 2236 } else {
2189 inc_mm_counter(mm, file_rss); 2237 inc_mm_counter(mm, file_rss);
2190 page_add_file_rmap(new_page); 2238 page_add_file_rmap(new_page);
2239 if (write_access) {
2240 dirty_page = new_page;
2241 get_page(dirty_page);
2242 }
2191 } 2243 }
2192 } else { 2244 } else {
2193 /* One of our sibling threads was faster, back out. */ 2245 /* One of our sibling threads was faster, back out. */
@@ -2200,6 +2252,10 @@ retry:
2200 lazy_mmu_prot_update(entry); 2252 lazy_mmu_prot_update(entry);
2201unlock: 2253unlock:
2202 pte_unmap_unlock(page_table, ptl); 2254 pte_unmap_unlock(page_table, ptl);
2255 if (dirty_page) {
2256 set_page_dirty_balance(dirty_page);
2257 put_page(dirty_page);
2258 }
2203 return ret; 2259 return ret;
2204oom: 2260oom:
2205 page_cache_release(new_page); 2261 page_cache_release(new_page);
@@ -2207,6 +2263,54 @@ oom:
2207} 2263}
2208 2264
2209/* 2265/*
2266 * do_no_pfn() tries to create a new page mapping for a page without
 2267 * a struct page backing it
2268 *
2269 * As this is called only for pages that do not currently exist, we
2270 * do not need to flush old virtual caches or the TLB.
2271 *
2272 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2273 * but allow concurrent faults), and pte mapped but not yet locked.
2274 * We return with mmap_sem still held, but pte unmapped and unlocked.
2275 *
2276 * It is expected that the ->nopfn handler always returns the same pfn
2277 * for a given virtual mapping.
2278 *
2279 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2280 */
2281static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2282 unsigned long address, pte_t *page_table, pmd_t *pmd,
2283 int write_access)
2284{
2285 spinlock_t *ptl;
2286 pte_t entry;
2287 unsigned long pfn;
2288 int ret = VM_FAULT_MINOR;
2289
2290 pte_unmap(page_table);
2291 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2292 BUG_ON(is_cow_mapping(vma->vm_flags));
2293
2294 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2295 if (pfn == NOPFN_OOM)
2296 return VM_FAULT_OOM;
2297 if (pfn == NOPFN_SIGBUS)
2298 return VM_FAULT_SIGBUS;
2299
2300 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2301
2302 /* Only go through if we didn't race with anybody else... */
2303 if (pte_none(*page_table)) {
2304 entry = pfn_pte(pfn, vma->vm_page_prot);
2305 if (write_access)
2306 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2307 set_pte_at(mm, address, page_table, entry);
2308 }
2309 pte_unmap_unlock(page_table, ptl);
2310 return ret;
2311}
2312
2313/*
2210 * Fault of a previously existing named mapping. Repopulate the pte 2314 * Fault of a previously existing named mapping. Repopulate the pte
2211 * from the encoded file_pte if possible. This enables swappable 2315 * from the encoded file_pte if possible. This enables swappable
2212 * nonlinear vmas. 2316 * nonlinear vmas.
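[Editor's sketch] As a reader's aid, here is a minimal sketch of the ->nopfn handler contract that the do_no_pfn() path added above depends on. The device name, base pfn and page count are hypothetical placeholders, not part of this patch; the driver's mmap() is assumed to have set VM_PFNMAP.

/* Hypothetical driver ->nopfn handler: must return the same pfn for a given
 * address on every call, or NOPFN_SIGBUS/NOPFN_OOM on error. */
static unsigned long mydev_nopfn(struct vm_area_struct *vma,
                                 unsigned long address)
{
        unsigned long pgoff = vma->vm_pgoff +
                              ((address - vma->vm_start) >> PAGE_SHIFT);

        if (pgoff >= mydev_nr_pages)            /* hypothetical bound */
                return NOPFN_SIGBUS;
        return mydev_base_pfn + pgoff;          /* hypothetical base pfn */
}

static struct vm_operations_struct mydev_vm_ops = {
        .nopfn  = mydev_nopfn,
};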
@@ -2268,11 +2372,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2268 old_entry = entry = *pte; 2372 old_entry = entry = *pte;
2269 if (!pte_present(entry)) { 2373 if (!pte_present(entry)) {
2270 if (pte_none(entry)) { 2374 if (pte_none(entry)) {
2271 if (!vma->vm_ops || !vma->vm_ops->nopage) 2375 if (vma->vm_ops) {
2272 return do_anonymous_page(mm, vma, address, 2376 if (vma->vm_ops->nopage)
2273 pte, pmd, write_access); 2377 return do_no_page(mm, vma, address,
2274 return do_no_page(mm, vma, address, 2378 pte, pmd,
2275 pte, pmd, write_access); 2379 write_access);
2380 if (unlikely(vma->vm_ops->nopfn))
2381 return do_no_pfn(mm, vma, address, pte,
2382 pmd, write_access);
2383 }
2384 return do_anonymous_page(mm, vma, address,
2385 pte, pmd, write_access);
2276 } 2386 }
2277 if (pte_file(entry)) 2387 if (pte_file(entry))
2278 return do_file_page(mm, vma, address, 2388 return do_file_page(mm, vma, address,
@@ -2324,7 +2434,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2324 2434
2325 __set_current_state(TASK_RUNNING); 2435 __set_current_state(TASK_RUNNING);
2326 2436
2327 inc_page_state(pgfault); 2437 count_vm_event(PGFAULT);
2328 2438
2329 if (unlikely(is_vm_hugetlb_page(vma))) 2439 if (unlikely(is_vm_hugetlb_page(vma)))
2330 return hugetlb_fault(mm, vma, address, write_access); 2440 return hugetlb_fault(mm, vma, address, write_access);
@@ -2501,3 +2611,56 @@ int in_gate_area_no_task(unsigned long addr)
2501} 2611}
2502 2612
2503#endif /* __HAVE_ARCH_GATE_AREA */ 2613#endif /* __HAVE_ARCH_GATE_AREA */
2614
2615/*
2616 * Access another process' address space.
 2617 * Source/target buffer must be in kernel space;
 2618 * do not walk the page tables directly, use get_user_pages()
2619 */
2620int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2621{
2622 struct mm_struct *mm;
2623 struct vm_area_struct *vma;
2624 struct page *page;
2625 void *old_buf = buf;
2626
2627 mm = get_task_mm(tsk);
2628 if (!mm)
2629 return 0;
2630
2631 down_read(&mm->mmap_sem);
 2632 /* ignore errors, just check how much was successfully transferred */
2633 while (len) {
2634 int bytes, ret, offset;
2635 void *maddr;
2636
2637 ret = get_user_pages(tsk, mm, addr, 1,
2638 write, 1, &page, &vma);
2639 if (ret <= 0)
2640 break;
2641
2642 bytes = len;
2643 offset = addr & (PAGE_SIZE-1);
2644 if (bytes > PAGE_SIZE-offset)
2645 bytes = PAGE_SIZE-offset;
2646
2647 maddr = kmap(page);
2648 if (write) {
2649 copy_to_user_page(vma, page, addr,
2650 maddr + offset, buf, bytes);
2651 set_page_dirty_lock(page);
2652 } else {
2653 copy_from_user_page(vma, page, addr,
2654 buf, maddr + offset, bytes);
2655 }
2656 kunmap(page);
2657 page_cache_release(page);
2658 len -= bytes;
2659 buf += bytes;
2660 addr += bytes;
2661 }
2662 up_read(&mm->mmap_sem);
2663 mmput(mm);
2664
2665 return buf - old_buf;
2666}
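[Editor's sketch] A short usage sketch for the access_process_vm() helper added above; obtaining the task reference is elided and the probed address is hypothetical.

/* Read a few bytes of another task's address space into a kernel buffer. */
static int peek_task_mem(struct task_struct *tsk, unsigned long addr)
{
        char buf[16];
        int copied;

        copied = access_process_vm(tsk, addr, buf, sizeof(buf), 0 /* read */);
        if (copied <= 0)
                return -EFAULT;         /* nothing was transferred */

        /* the first 'copied' bytes of buf are now valid */
        return copied;
}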
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 841a077d5aeb..2053bb165a21 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 4 * Copyright (C)
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/stddef.h> 7#include <linux/stddef.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/swap.h> 9#include <linux/swap.h>
@@ -14,6 +13,7 @@
14#include <linux/compiler.h> 13#include <linux/compiler.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/writeback.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sysctl.h> 18#include <linux/sysctl.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -21,6 +21,8 @@
21#include <linux/memory_hotplug.h> 21#include <linux/memory_hotplug.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/ioport.h>
25#include <linux/cpuset.h>
24 26
25#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
26 28
@@ -52,6 +54,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
52 int nr_pages = PAGES_PER_SECTION; 54 int nr_pages = PAGES_PER_SECTION;
53 int ret; 55 int ret;
54 56
57 if (pfn_valid(phys_start_pfn))
58 return -EEXIST;
59
55 ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); 60 ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
56 61
57 if (ret < 0) 62 if (ret < 0)
@@ -76,15 +81,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
76{ 81{
77 unsigned long i; 82 unsigned long i;
78 int err = 0; 83 int err = 0;
84 int start_sec, end_sec;
 85 /* during mem_map initialization, align the hot-added range to sections */
86 start_sec = pfn_to_section_nr(phys_start_pfn);
87 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
79 88
80 for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { 89 for (i = start_sec; i <= end_sec; i++) {
81 err = __add_section(zone, phys_start_pfn + i); 90 err = __add_section(zone, i << PFN_SECTION_SHIFT);
82 91
83 /* We want to keep adding the rest of the 92 /*
 84 * sections if the first ones already exist 93 * EEXIST is finally dealt with by the ioresource collision
 94 * check; see add_memory() => register_memory_resource().
 95 * A warning will be printed if there is a collision.
85 */ 96 */
86 if (err && (err != -EEXIST)) 97 if (err && (err != -EEXIST))
87 break; 98 break;
99 err = 0;
88 } 100 }
89 101
90 return err; 102 return err;
@@ -126,6 +138,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
126 unsigned long i; 138 unsigned long i;
127 unsigned long flags; 139 unsigned long flags;
128 unsigned long onlined_pages = 0; 140 unsigned long onlined_pages = 0;
141 struct resource res;
142 u64 section_end;
143 unsigned long start_pfn;
129 struct zone *zone; 144 struct zone *zone;
130 int need_zonelists_rebuild = 0; 145 int need_zonelists_rebuild = 0;
131 146
@@ -148,10 +163,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
148 if (!populated_zone(zone)) 163 if (!populated_zone(zone))
149 need_zonelists_rebuild = 1; 164 need_zonelists_rebuild = 1;
150 165
151 for (i = 0; i < nr_pages; i++) { 166 res.start = (u64)pfn << PAGE_SHIFT;
152 struct page *page = pfn_to_page(pfn + i); 167 res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
153 online_page(page); 168 res.flags = IORESOURCE_MEM; /* we just need system ram */
154 onlined_pages++; 169 section_end = res.end;
170
171 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
172 start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
173 nr_pages = (unsigned long)
174 ((res.end + 1 - res.start) >> PAGE_SHIFT);
175
176 if (PageReserved(pfn_to_page(start_pfn))) {
 177 /* this region's pages are not onlined yet */
178 for (i = 0; i < nr_pages; i++) {
179 struct page *page = pfn_to_page(start_pfn + i);
180 online_page(page);
181 onlined_pages++;
182 }
183 }
184
185 res.start = res.end + 1;
186 res.end = section_end;
155 } 187 }
156 zone->present_pages += onlined_pages; 188 zone->present_pages += onlined_pages;
157 zone->zone_pgdat->node_present_pages += onlined_pages; 189 zone->zone_pgdat->node_present_pages += onlined_pages;
@@ -161,5 +193,119 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
161 if (need_zonelists_rebuild) 193 if (need_zonelists_rebuild)
162 build_all_zonelists(); 194 build_all_zonelists();
163 vm_total_pages = nr_free_pagecache_pages(); 195 vm_total_pages = nr_free_pagecache_pages();
196 writeback_set_ratelimit();
164 return 0; 197 return 0;
165} 198}
199
200static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
201{
202 struct pglist_data *pgdat;
203 unsigned long zones_size[MAX_NR_ZONES] = {0};
204 unsigned long zholes_size[MAX_NR_ZONES] = {0};
205 unsigned long start_pfn = start >> PAGE_SHIFT;
206
207 pgdat = arch_alloc_nodedata(nid);
208 if (!pgdat)
209 return NULL;
210
211 arch_refresh_nodedata(nid, pgdat);
212
213 /* we can use NODE_DATA(nid) from here */
214
215 /* init node's zones as empty zones, we don't have any present pages.*/
216 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
217
218 return pgdat;
219}
220
221static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
222{
223 arch_refresh_nodedata(nid, NULL);
224 arch_free_nodedata(pgdat);
225 return;
226}
227
228/* add this memory to iomem resource */
229static struct resource *register_memory_resource(u64 start, u64 size)
230{
231 struct resource *res;
232 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
233 BUG_ON(!res);
234
235 res->name = "System RAM";
236 res->start = start;
237 res->end = start + size - 1;
238 res->flags = IORESOURCE_MEM;
239 if (request_resource(&iomem_resource, res) < 0) {
240 printk("System RAM resource %llx - %llx cannot be added\n",
241 (unsigned long long)res->start, (unsigned long long)res->end);
242 kfree(res);
243 res = NULL;
244 }
245 return res;
246}
247
248static void release_memory_resource(struct resource *res)
249{
250 if (!res)
251 return;
252 release_resource(res);
253 kfree(res);
254 return;
255}
256
257
258
259int add_memory(int nid, u64 start, u64 size)
260{
261 pg_data_t *pgdat = NULL;
262 int new_pgdat = 0;
263 struct resource *res;
264 int ret;
265
266 res = register_memory_resource(start, size);
267 if (!res)
268 return -EEXIST;
269
270 if (!node_online(nid)) {
271 pgdat = hotadd_new_pgdat(nid, start);
272 if (!pgdat)
273 return -ENOMEM;
274 new_pgdat = 1;
275 ret = kswapd_run(nid);
276 if (ret)
277 goto error;
278 }
279
280 /* call arch's memory hotadd */
281 ret = arch_add_memory(nid, start, size);
282
283 if (ret < 0)
284 goto error;
285
286 /* we online node here. we can't roll back from here. */
287 node_set_online(nid);
288
289 cpuset_track_online_nodes();
290
291 if (new_pgdat) {
292 ret = register_one_node(nid);
293 /*
 294 * If the sysfs file of the new node can't be created, cpus on the node
 295 * can't be hot-added. There is no way to roll back now,
 296 * so check it with BUG_ON() to catch that case, reluctantly.
297 */
298 BUG_ON(ret);
299 }
300
301 return ret;
302error:
303 /* rollback pgdat allocation and others */
304 if (new_pgdat)
305 rollback_node_hotadd(nid, pgdat);
306 if (res)
307 release_memory_resource(res);
308
309 return ret;
310}
311EXPORT_SYMBOL_GPL(add_memory);
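[Editor's sketch] A sketch of how a hot-add path might call the add_memory() interface exported above; the node id and physical range are hypothetical stand-ins for whatever firmware or a probe interface reports.

static int example_hotadd(void)
{
        int nid = 0;                            /* hypothetical node */
        u64 start = 0x100000000ULL;             /* hypothetical base address */
        u64 size = 128ULL << 20;                /* hypothetical 128MB range */
        int err;

        /* registers a "System RAM" resource and adds the sections */
        err = add_memory(nid, start, size);
        if (err == -EEXIST)
                return 0;                       /* range already present */

        /* the pages become usable once online_pages() runs (e.g. via sysfs) */
        return err;
}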
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ec4a1a950df9..cf18f0942553 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache;
105 105
106/* Highest zone. An specific allocation for a zone below that is not 106/* Highest zone. An specific allocation for a zone below that is not
107 policied. */ 107 policied. */
108int policy_zone = ZONE_DMA; 108enum zone_type policy_zone = ZONE_DMA;
109 109
110struct mempolicy default_policy = { 110struct mempolicy default_policy = {
111 .refcnt = ATOMIC_INIT(1), /* never free it */ 111 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
137static struct zonelist *bind_zonelist(nodemask_t *nodes) 137static struct zonelist *bind_zonelist(nodemask_t *nodes)
138{ 138{
139 struct zonelist *zl; 139 struct zonelist *zl;
140 int num, max, nd, k; 140 int num, max, nd;
141 enum zone_type k;
141 142
142 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 143 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
143 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 144 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
@@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
148 lower zones etc. Avoid empty zones because the memory allocator 149 lower zones etc. Avoid empty zones because the memory allocator
149 doesn't like them. If you implement node hot removal you 150 doesn't like them. If you implement node hot removal you
150 have to fix that. */ 151 have to fix that. */
151 for (k = policy_zone; k >= 0; k--) { 152 k = policy_zone;
153 while (1) {
152 for_each_node_mask(nd, *nodes) { 154 for_each_node_mask(nd, *nodes) {
153 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 155 struct zone *z = &NODE_DATA(nd)->node_zones[k];
154 if (z->present_pages > 0) 156 if (z->present_pages > 0)
155 zl->zones[num++] = z; 157 zl->zones[num++] = z;
156 } 158 }
159 if (k == 0)
160 break;
161 k--;
157 } 162 }
158 zl->zones[num] = NULL; 163 zl->zones[num] = NULL;
159 return zl; 164 return zl;
@@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
482 switch (p->policy) { 487 switch (p->policy) {
483 case MPOL_BIND: 488 case MPOL_BIND:
484 for (i = 0; p->v.zonelist->zones[i]; i++) 489 for (i = 0; p->v.zonelist->zones[i]; i++)
485 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, 490 node_set(zone_to_nid(p->v.zonelist->zones[i]),
486 *nodes); 491 *nodes);
487 break; 492 break;
488 case MPOL_DEFAULT: 493 case MPOL_DEFAULT:
@@ -632,6 +637,10 @@ int do_migrate_pages(struct mm_struct *mm,
632 637
633 down_read(&mm->mmap_sem); 638 down_read(&mm->mmap_sem);
634 639
640 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
641 if (err)
642 goto out;
643
635/* 644/*
636 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 645 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
637 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 646 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -691,7 +700,7 @@ int do_migrate_pages(struct mm_struct *mm,
691 if (err < 0) 700 if (err < 0)
692 break; 701 break;
693 } 702 }
694 703out:
695 up_read(&mm->mmap_sem); 704 up_read(&mm->mmap_sem);
696 if (err < 0) 705 if (err < 0)
697 return err; 706 return err;
@@ -1127,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1127 */ 1136 */
1128unsigned slab_node(struct mempolicy *policy) 1137unsigned slab_node(struct mempolicy *policy)
1129{ 1138{
1130 switch (policy->policy) { 1139 int pol = policy ? policy->policy : MPOL_DEFAULT;
1140
1141 switch (pol) {
1131 case MPOL_INTERLEAVE: 1142 case MPOL_INTERLEAVE:
1132 return interleave_nodes(policy); 1143 return interleave_nodes(policy);
1133 1144
@@ -1136,7 +1147,7 @@ unsigned slab_node(struct mempolicy *policy)
1136 * Follow bind policy behavior and start allocation at the 1147 * Follow bind policy behavior and start allocation at the
1137 * first node. 1148 * first node.
1138 */ 1149 */
1139 return policy->v.zonelist->zones[0]->zone_pgdat->node_id; 1150 return zone_to_nid(policy->v.zonelist->zones[0]);
1140 1151
1141 case MPOL_PREFERRED: 1152 case MPOL_PREFERRED:
1142 if (policy->v.preferred_node >= 0) 1153 if (policy->v.preferred_node >= 0)
@@ -1172,7 +1183,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1172 if (vma) { 1183 if (vma) {
1173 unsigned long off; 1184 unsigned long off;
1174 1185
1175 off = vma->vm_pgoff; 1186 /*
1187 * for small pages, there is no difference between
1188 * shift and PAGE_SHIFT, so the bit-shift is safe.
1189 * for huge pages, since vm_pgoff is in units of small
1190 * pages, we need to shift off the always 0 bits to get
1191 * a useful offset.
1192 */
1193 BUG_ON(shift < PAGE_SHIFT);
1194 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1176 off += (addr - vma->vm_start) >> shift; 1195 off += (addr - vma->vm_start) >> shift;
1177 return offset_il_node(pol, vma, off); 1196 return offset_il_node(pol, vma, off);
1178 } else 1197 } else
@@ -1205,10 +1224,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1205 1224
1206 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); 1225 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1207 page = __alloc_pages(gfp, order, zl); 1226 page = __alloc_pages(gfp, order, zl);
1208 if (page && page_zone(page) == zl->zones[0]) { 1227 if (page && page_zone(page) == zl->zones[0])
1209 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; 1228 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1210 put_cpu();
1211 }
1212 return page; 1229 return page;
1213} 1230}
1214 1231
@@ -1275,7 +1292,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1275 1292
1276 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1293 if ((gfp & __GFP_WAIT) && !in_interrupt())
1277 cpuset_update_task_memory_state(); 1294 cpuset_update_task_memory_state();
1278 if (!pol || in_interrupt()) 1295 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1279 pol = &default_policy; 1296 pol = &default_policy;
1280 if (pol->policy == MPOL_INTERLEAVE) 1297 if (pol->policy == MPOL_INTERLEAVE)
1281 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1298 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
@@ -1634,7 +1651,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1634 1651
1635 nodes_clear(nodes); 1652 nodes_clear(nodes);
1636 for (z = pol->v.zonelist->zones; *z; z++) 1653 for (z = pol->v.zonelist->zones; *z; z++)
1637 node_set((*z)->zone_pgdat->node_id, nodes); 1654 node_set(zone_to_nid(*z), nodes);
1638 nodes_remap(tmp, nodes, *mpolmask, *newmask); 1655 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1639 nodes = tmp; 1656 nodes = tmp;
1640 1657
@@ -1817,7 +1834,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,
1817 1834
1818int show_numa_map(struct seq_file *m, void *v) 1835int show_numa_map(struct seq_file *m, void *v)
1819{ 1836{
1820 struct task_struct *task = m->private; 1837 struct proc_maps_private *priv = m->private;
1821 struct vm_area_struct *vma = v; 1838 struct vm_area_struct *vma = v;
1822 struct numa_maps *md; 1839 struct numa_maps *md;
1823 struct file *file = vma->vm_file; 1840 struct file *file = vma->vm_file;
@@ -1833,7 +1850,7 @@ int show_numa_map(struct seq_file *m, void *v)
1833 return 0; 1850 return 0;
1834 1851
1835 mpol_to_str(buffer, sizeof(buffer), 1852 mpol_to_str(buffer, sizeof(buffer),
1836 get_vma_policy(task, vma, vma->vm_start)); 1853 get_vma_policy(priv->task, vma, vma->vm_start));
1837 1854
1838 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1855 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1839 1856
@@ -1887,7 +1904,7 @@ out:
1887 kfree(md); 1904 kfree(md);
1888 1905
1889 if (m->count < m->size) 1906 if (m->count < m->size)
1890 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 1907 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1891 return 0; 1908 return 0;
1892} 1909}
1893 1910
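[Editor's sketch] The hugepage-aware offset computation added to interleave_nid() above reads as the helper below; this is only a restatement of the hunk, assuming 'shift' is HPAGE_SHIFT for hugetlb vmas and PAGE_SHIFT otherwise.

/* Offset of 'addr' within 'vma' in units of (1 << shift); vm_pgoff is always
 * kept in units of small pages, so shift off its low bits first. */
static unsigned long il_offset(struct vm_area_struct *vma,
                               unsigned long addr, int shift)
{
        unsigned long off;

        BUG_ON(shift < PAGE_SHIFT);
        off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
        off += (addr - vma->vm_start) >> shift;
        return off;
}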
diff --git a/mm/mempool.c b/mm/mempool.c
index fe6e05289cc5..ccd8cb8cd41f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -238,8 +238,13 @@ repeat_alloc:
238 init_wait(&wait); 238 init_wait(&wait);
239 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 239 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
240 smp_mb(); 240 smp_mb();
241 if (!pool->curr_nr) 241 if (!pool->curr_nr) {
242 io_schedule(); 242 /*
243 * FIXME: this should be io_schedule(). The timeout is there
244 * as a workaround for some DM problems in 2.6.18.
245 */
246 io_schedule_timeout(5*HZ);
247 }
243 finish_wait(&pool->wait, &wait); 248 finish_wait(&pool->wait, &wait);
244 249
245 goto repeat_alloc; 250 goto repeat_alloc;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c2a71aa05cd..20a8c2687b1e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -616,15 +616,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
616 /* 616 /*
617 * Establish migration ptes or remove ptes 617 * Establish migration ptes or remove ptes
618 */ 618 */
619 if (try_to_unmap(page, 1) != SWAP_FAIL) { 619 try_to_unmap(page, 1);
620 if (!page_mapped(page)) 620 if (!page_mapped(page))
621 rc = move_to_new_page(newpage, page); 621 rc = move_to_new_page(newpage, page);
622 } else
623 /* A vma has VM_LOCKED set -> permanent failure */
624 rc = -EPERM;
625 622
626 if (rc) 623 if (rc)
627 remove_migration_ptes(page, page); 624 remove_migration_ptes(page, page);
625
628unlock: 626unlock:
629 unlock_page(page); 627 unlock_page(page);
630 628
@@ -743,7 +741,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
743 741
744 *result = &pm->status; 742 *result = &pm->status;
745 743
746 return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); 744 return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
747} 745}
748 746
749/* 747/*
@@ -976,3 +974,23 @@ out2:
976} 974}
977#endif 975#endif
978 976
977/*
 978 * Call the migration functions in the vma_ops that may prepare
 979 * memory in a vm for migration. Migration functions may perform
 980 * the migration for vmas that do not have an underlying page struct.
981 */
982int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
983 const nodemask_t *from, unsigned long flags)
984{
985 struct vm_area_struct *vma;
986 int err = 0;
987
988 for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
989 if (vma->vm_ops && vma->vm_ops->migrate) {
990 err = vma->vm_ops->migrate(vma, to, from, flags);
991 if (err)
992 break;
993 }
994 }
995 return err;
996}
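[Editor's sketch] For illustration, a sketch of the callback shape migrate_vmas() above expects from a vma's vm_ops; the driver name and its internal helper are hypothetical, and the precise meaning of the two nodemasks is left to the implementer.

/* Hypothetical ->migrate hook: prepare or move this vma's backing memory
 * between the given node masks; a non-zero return aborts do_migrate_pages(). */
static int mydev_migrate(struct vm_area_struct *vma, const nodemask_t *to,
                         const nodemask_t *from, unsigned long flags)
{
        return mydev_move_backing_store(vma, to, from); /* hypothetical */
}

static struct vm_operations_struct mydev_vm_ops = {
        .migrate = mydev_migrate,
};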
diff --git a/mm/mmap.c b/mm/mmap.c
index 6446c6134b04..eea8eefd51a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,10 @@
30#include <asm/cacheflush.h> 30#include <asm/cacheflush.h>
31#include <asm/tlb.h> 31#include <asm/tlb.h>
32 32
33#ifndef arch_mmap_check
34#define arch_mmap_check(addr, len, flags) (0)
35#endif
36
33static void unmap_region(struct mm_struct *mm, 37static void unmap_region(struct mm_struct *mm,
34 struct vm_area_struct *vma, struct vm_area_struct *prev, 38 struct vm_area_struct *vma, struct vm_area_struct *prev,
35 unsigned long start, unsigned long end); 39 unsigned long start, unsigned long end);
@@ -60,6 +64,13 @@ pgprot_t protection_map[16] = {
60 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 64 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
61}; 65};
62 66
67pgprot_t vm_get_page_prot(unsigned long vm_flags)
68{
69 return protection_map[vm_flags &
70 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
71}
72EXPORT_SYMBOL(vm_get_page_prot);
73
63int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 74int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
64int sysctl_overcommit_ratio = 50; /* default is 50% */ 75int sysctl_overcommit_ratio = 50; /* default is 50% */
65int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 76int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
@@ -96,7 +107,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
96 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 107 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
97 unsigned long n; 108 unsigned long n;
98 109
99 free = get_page_cache_size(); 110 free = global_page_state(NR_FILE_PAGES);
100 free += nr_swap_pages; 111 free += nr_swap_pages;
101 112
102 /* 113 /*
@@ -105,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
105 * which are reclaimable, under pressure. The dentry 116 * which are reclaimable, under pressure. The dentry
106 * cache and most inode caches should fall into this 117 * cache and most inode caches should fall into this
107 */ 118 */
108 free += atomic_read(&slab_reclaim_pages); 119 free += global_page_state(NR_SLAB_RECLAIMABLE);
109 120
110 /* 121 /*
111 * Leave the last 3% for root 122 * Leave the last 3% for root
@@ -913,6 +924,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
913 if (!len) 924 if (!len)
914 return -EINVAL; 925 return -EINVAL;
915 926
927 error = arch_mmap_check(addr, len, flags);
928 if (error)
929 return error;
930
916 /* Careful about overflows.. */ 931 /* Careful about overflows.. */
917 len = PAGE_ALIGN(len); 932 len = PAGE_ALIGN(len);
918 if (!len || len > TASK_SIZE) 933 if (!len || len > TASK_SIZE)
@@ -1090,12 +1105,6 @@ munmap_back:
1090 goto free_vma; 1105 goto free_vma;
1091 } 1106 }
1092 1107
1093 /* Don't make the VMA automatically writable if it's shared, but the
1094 * backer wishes to know when pages are first written to */
1095 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1096 vma->vm_page_prot =
1097 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1098
1099 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 1108 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1100 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 1109 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1101 * that memory reservation must be checked; but that reservation 1110 * that memory reservation must be checked; but that reservation
@@ -1113,6 +1122,10 @@ munmap_back:
1113 pgoff = vma->vm_pgoff; 1122 pgoff = vma->vm_pgoff;
1114 vm_flags = vma->vm_flags; 1123 vm_flags = vma->vm_flags;
1115 1124
1125 if (vma_wants_writenotify(vma))
1126 vma->vm_page_prot =
1127 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1128
1116 if (!file || !vma_merge(mm, prev, addr, vma->vm_end, 1129 if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
1117 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1130 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1118 file = vma->vm_file; 1131 file = vma->vm_file;
@@ -1859,6 +1872,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1859 unsigned long flags; 1872 unsigned long flags;
1860 struct rb_node ** rb_link, * rb_parent; 1873 struct rb_node ** rb_link, * rb_parent;
1861 pgoff_t pgoff = addr >> PAGE_SHIFT; 1874 pgoff_t pgoff = addr >> PAGE_SHIFT;
1875 int error;
1862 1876
1863 len = PAGE_ALIGN(len); 1877 len = PAGE_ALIGN(len);
1864 if (!len) 1878 if (!len)
@@ -1867,6 +1881,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1867 if ((addr + len) > TASK_SIZE || (addr + len) < addr) 1881 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
1868 return -EINVAL; 1882 return -EINVAL;
1869 1883
1884 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1885
1886 error = arch_mmap_check(addr, len, flags);
1887 if (error)
1888 return error;
1889
1870 /* 1890 /*
1871 * mlock MCL_FUTURE? 1891 * mlock MCL_FUTURE?
1872 */ 1892 */
@@ -1907,8 +1927,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1907 if (security_vm_enough_memory(len >> PAGE_SHIFT)) 1927 if (security_vm_enough_memory(len >> PAGE_SHIFT))
1908 return -ENOMEM; 1928 return -ENOMEM;
1909 1929
1910 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1911
1912 /* Can we just expand an old private anonymous mapping? */ 1930 /* Can we just expand an old private anonymous mapping? */
1913 if (vma_merge(mm, prev, addr, addr + len, flags, 1931 if (vma_merge(mm, prev, addr, addr + len, flags,
1914 NULL, NULL, pgoff, NULL)) 1932 NULL, NULL, pgoff, NULL))
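[Editor's sketch] A brief sketch of a caller for the vm_get_page_prot() helper exported above, e.g. a driver mmap() handing the derived protections to remap_pfn_range(); the base pfn is a hypothetical placeholder.

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* derive the initial protections from the final vm_flags */
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        return remap_pfn_range(vma, vma->vm_start,
                               mydev_base_pfn,          /* hypothetical */
                               vma->vm_end - vma->vm_start,
                               vma->vm_page_prot);
}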
diff --git a/mm/mmzone.c b/mm/mmzone.c
index b022370e612e..febea1c98168 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -5,7 +5,6 @@
5 */ 5 */
6 6
7 7
8#include <linux/config.h>
9#include <linux/stddef.h> 8#include <linux/stddef.h>
10#include <linux/mmzone.h> 9#include <linux/mmzone.h>
11#include <linux/module.h> 10#include <linux/module.h>
@@ -15,7 +14,7 @@ struct pglist_data *first_online_pgdat(void)
15 return NODE_DATA(first_online_node); 14 return NODE_DATA(first_online_node);
16} 15}
17 16
18EXPORT_SYMBOL(first_online_pgdat); 17EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
19 18
20struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) 19struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
21{ 20{
@@ -25,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
25 return NULL; 24 return NULL;
26 return NODE_DATA(nid); 25 return NODE_DATA(nid);
27} 26}
28EXPORT_SYMBOL(next_online_pgdat); 27EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
29 28
30 29
31/* 30/*
@@ -46,5 +45,5 @@ struct zone *next_zone(struct zone *zone)
46 } 45 }
47 return zone; 46 return zone;
48} 47}
49EXPORT_SYMBOL(next_zone); 48EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
50 49
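[Editor's sketch] For reference, the iterators whose exports are downgraded above are normally consumed as in this sketch of a whole-system zone walk (not code from this patch).

/* Walk every zone of every online node using the mmzone.c helpers. */
static unsigned long count_present_pages(void)
{
        struct zone *zone;
        unsigned long total = 0;

        for (zone = first_online_pgdat()->node_zones; zone;
             zone = next_zone(zone))
                total += zone->present_pages;

        return total;
}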
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff71..955f9d0e38aa 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,7 +27,8 @@
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
30 unsigned long addr, unsigned long end, pgprot_t newprot) 30 unsigned long addr, unsigned long end, pgprot_t newprot,
31 int dirty_accountable)
31{ 32{
32 pte_t *pte, oldpte; 33 pte_t *pte, oldpte;
33 spinlock_t *ptl; 34 spinlock_t *ptl;
@@ -42,7 +43,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
42 * bits by wiping the pte and then setting the new pte 43 * bits by wiping the pte and then setting the new pte
43 * into place. 44 * into place.
44 */ 45 */
45 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); 46 ptent = ptep_get_and_clear(mm, addr, pte);
47 ptent = pte_modify(ptent, newprot);
48 /*
49 * Avoid taking write faults for pages we know to be
50 * dirty.
51 */
52 if (dirty_accountable && pte_dirty(ptent))
53 ptent = pte_mkwrite(ptent);
46 set_pte_at(mm, addr, pte, ptent); 54 set_pte_at(mm, addr, pte, ptent);
47 lazy_mmu_prot_update(ptent); 55 lazy_mmu_prot_update(ptent);
48#ifdef CONFIG_MIGRATION 56#ifdef CONFIG_MIGRATION
@@ -66,7 +74,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
66} 74}
67 75
68static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 76static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
69 unsigned long addr, unsigned long end, pgprot_t newprot) 77 unsigned long addr, unsigned long end, pgprot_t newprot,
78 int dirty_accountable)
70{ 79{
71 pmd_t *pmd; 80 pmd_t *pmd;
72 unsigned long next; 81 unsigned long next;
@@ -76,12 +85,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
76 next = pmd_addr_end(addr, end); 85 next = pmd_addr_end(addr, end);
77 if (pmd_none_or_clear_bad(pmd)) 86 if (pmd_none_or_clear_bad(pmd))
78 continue; 87 continue;
79 change_pte_range(mm, pmd, addr, next, newprot); 88 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
80 } while (pmd++, addr = next, addr != end); 89 } while (pmd++, addr = next, addr != end);
81} 90}
82 91
83static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 92static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
84 unsigned long addr, unsigned long end, pgprot_t newprot) 93 unsigned long addr, unsigned long end, pgprot_t newprot,
94 int dirty_accountable)
85{ 95{
86 pud_t *pud; 96 pud_t *pud;
87 unsigned long next; 97 unsigned long next;
@@ -91,12 +101,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
91 next = pud_addr_end(addr, end); 101 next = pud_addr_end(addr, end);
92 if (pud_none_or_clear_bad(pud)) 102 if (pud_none_or_clear_bad(pud))
93 continue; 103 continue;
94 change_pmd_range(mm, pud, addr, next, newprot); 104 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
95 } while (pud++, addr = next, addr != end); 105 } while (pud++, addr = next, addr != end);
96} 106}
97 107
98static void change_protection(struct vm_area_struct *vma, 108static void change_protection(struct vm_area_struct *vma,
99 unsigned long addr, unsigned long end, pgprot_t newprot) 109 unsigned long addr, unsigned long end, pgprot_t newprot,
110 int dirty_accountable)
100{ 111{
101 struct mm_struct *mm = vma->vm_mm; 112 struct mm_struct *mm = vma->vm_mm;
102 pgd_t *pgd; 113 pgd_t *pgd;
@@ -110,7 +121,7 @@ static void change_protection(struct vm_area_struct *vma,
110 next = pgd_addr_end(addr, end); 121 next = pgd_addr_end(addr, end);
111 if (pgd_none_or_clear_bad(pgd)) 122 if (pgd_none_or_clear_bad(pgd))
112 continue; 123 continue;
113 change_pud_range(mm, pgd, addr, next, newprot); 124 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
114 } while (pgd++, addr = next, addr != end); 125 } while (pgd++, addr = next, addr != end);
115 flush_tlb_range(vma, start, end); 126 flush_tlb_range(vma, start, end);
116} 127}
@@ -123,10 +134,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
123 unsigned long oldflags = vma->vm_flags; 134 unsigned long oldflags = vma->vm_flags;
124 long nrpages = (end - start) >> PAGE_SHIFT; 135 long nrpages = (end - start) >> PAGE_SHIFT;
125 unsigned long charged = 0; 136 unsigned long charged = 0;
126 unsigned int mask;
127 pgprot_t newprot;
128 pgoff_t pgoff; 137 pgoff_t pgoff;
129 int error; 138 int error;
139 int dirty_accountable = 0;
130 140
131 if (newflags == oldflags) { 141 if (newflags == oldflags) {
132 *pprev = vma; 142 *pprev = vma;
@@ -176,24 +186,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
176 } 186 }
177 187
178success: 188success:
179 /* Don't make the VMA automatically writable if it's shared, but the
180 * backer wishes to know when pages are first written to */
181 mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
182 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
183 mask &= ~VM_SHARED;
184
185 newprot = protection_map[newflags & mask];
186
187 /* 189 /*
188 * vm_flags and vm_page_prot are protected by the mmap_sem 190 * vm_flags and vm_page_prot are protected by the mmap_sem
189 * held in write mode. 191 * held in write mode.
190 */ 192 */
191 vma->vm_flags = newflags; 193 vma->vm_flags = newflags;
192 vma->vm_page_prot = newprot; 194 vma->vm_page_prot = protection_map[newflags &
195 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
196 if (vma_wants_writenotify(vma)) {
197 vma->vm_page_prot = protection_map[newflags &
198 (VM_READ|VM_WRITE|VM_EXEC)];
199 dirty_accountable = 1;
200 }
201
193 if (is_vm_hugetlb_page(vma)) 202 if (is_vm_hugetlb_page(vma))
194 hugetlb_change_protection(vma, start, end, newprot); 203 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
195 else 204 else
196 change_protection(vma, start, end, newprot); 205 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
197 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 206 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
198 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 207 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
199 return 0; 208 return 0;
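[Editor's sketch] The per-pte decision added to change_pte_range() above boils down to the helper below; it only restates the hunk for readability.

/* Apply the new protections and, if the mapping is dirty-accountable and the
 * page is already dirty, keep the pte writable so no write fault is taken
 * just to re-dirty it. */
static pte_t mprotect_new_pte(pte_t old, pgprot_t newprot, int dirty_accountable)
{
        pte_t ptent = pte_modify(old, newprot);

        if (dirty_accountable && pte_dirty(ptent))
                ptent = pte_mkwrite(ptent);
        return ptent;
}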
diff --git a/mm/mremap.c b/mm/mremap.c
index 1903bdf65e42..7c15cf3373ad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -97,7 +97,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
97 new_pte = pte_offset_map_nested(new_pmd, new_addr); 97 new_pte = pte_offset_map_nested(new_pmd, new_addr);
98 new_ptl = pte_lockptr(mm, new_pmd); 98 new_ptl = pte_lockptr(mm, new_pmd);
99 if (new_ptl != old_ptl) 99 if (new_ptl != old_ptl)
100 spin_lock(new_ptl); 100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
101 101
102 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, 102 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
103 new_pte++, new_addr += PAGE_SIZE) { 103 new_pte++, new_addr += PAGE_SIZE) {
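[Editor's sketch] The move_ptes() change above keeps lockdep quiet when two page-table locks of the same class are held together; a minimal sketch of the pattern with placeholder lock pointers:

/* Take two locks of one lock class without a lockdep false positive:
 * the second acquisition is annotated as intentionally nested. */
static void lock_two_ptls(spinlock_t *old_ptl, spinlock_t *new_ptl)
{
        spin_lock(old_ptl);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
}

static void unlock_two_ptls(spinlock_t *old_ptl, spinlock_t *new_ptl)
{
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        spin_unlock(old_ptl);
}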
diff --git a/mm/msync.c b/mm/msync.c
index d083544df21b..358d73cf7b78 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -7,149 +7,33 @@
7/* 7/*
8 * The msync() system call. 8 * The msync() system call.
9 */ 9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h>
12#include <linux/fs.h> 10#include <linux/fs.h>
13#include <linux/mm.h> 11#include <linux/mm.h>
14#include <linux/mman.h> 12#include <linux/mman.h>
15#include <linux/hugetlb.h>
16#include <linux/writeback.h>
17#include <linux/file.h> 13#include <linux/file.h>
18#include <linux/syscalls.h> 14#include <linux/syscalls.h>
19 15
20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
22
23static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
24 unsigned long addr, unsigned long end)
25{
26 pte_t *pte;
27 spinlock_t *ptl;
28 int progress = 0;
29 unsigned long ret = 0;
30
31again:
32 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
33 do {
34 struct page *page;
35
36 if (progress >= 64) {
37 progress = 0;
38 if (need_resched() || need_lockbreak(ptl))
39 break;
40 }
41 progress++;
42 if (!pte_present(*pte))
43 continue;
44 if (!pte_maybe_dirty(*pte))
45 continue;
46 page = vm_normal_page(vma, addr, *pte);
47 if (!page)
48 continue;
49 if (ptep_clear_flush_dirty(vma, addr, pte) ||
50 page_test_and_clear_dirty(page))
51 ret += set_page_dirty(page);
52 progress += 3;
53 } while (pte++, addr += PAGE_SIZE, addr != end);
54 pte_unmap_unlock(pte - 1, ptl);
55 cond_resched();
56 if (addr != end)
57 goto again;
58 return ret;
59}
60
61static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
62 pud_t *pud, unsigned long addr, unsigned long end)
63{
64 pmd_t *pmd;
65 unsigned long next;
66 unsigned long ret = 0;
67
68 pmd = pmd_offset(pud, addr);
69 do {
70 next = pmd_addr_end(addr, end);
71 if (pmd_none_or_clear_bad(pmd))
72 continue;
73 ret += msync_pte_range(vma, pmd, addr, next);
74 } while (pmd++, addr = next, addr != end);
75 return ret;
76}
77
78static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
79 pgd_t *pgd, unsigned long addr, unsigned long end)
80{
81 pud_t *pud;
82 unsigned long next;
83 unsigned long ret = 0;
84
85 pud = pud_offset(pgd, addr);
86 do {
87 next = pud_addr_end(addr, end);
88 if (pud_none_or_clear_bad(pud))
89 continue;
90 ret += msync_pmd_range(vma, pud, addr, next);
91 } while (pud++, addr = next, addr != end);
92 return ret;
93}
94
95static unsigned long msync_page_range(struct vm_area_struct *vma,
96 unsigned long addr, unsigned long end)
97{
98 pgd_t *pgd;
99 unsigned long next;
100 unsigned long ret = 0;
101
102 /* For hugepages we can't go walking the page table normally,
103 * but that's ok, hugetlbfs is memory based, so we don't need
104 * to do anything more on an msync().
105 */
106 if (vma->vm_flags & VM_HUGETLB)
107 return 0;
108
109 BUG_ON(addr >= end);
110 pgd = pgd_offset(vma->vm_mm, addr);
111 flush_cache_range(vma, addr, end);
112 do {
113 next = pgd_addr_end(addr, end);
114 if (pgd_none_or_clear_bad(pgd))
115 continue;
116 ret += msync_pud_range(vma, pgd, addr, next);
117 } while (pgd++, addr = next, addr != end);
118 return ret;
119}
120
121/* 16/*
122 * MS_SYNC syncs the entire file - including mappings. 17 * MS_SYNC syncs the entire file - including mappings.
123 * 18 *
124 * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just 19 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
 125 marks the relevant pages dirty. The application may now run fsync() to 20 * Nor does it mark the relevant pages dirty (it used to, up to 2.6.17).
21 * Now it doesn't do anything, since dirty pages are properly tracked.
22 *
23 * The application may now run fsync() to
126 * write out the dirty pages and wait on the writeout and check the result. 24 * write out the dirty pages and wait on the writeout and check the result.
127 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start 25 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
128 * async writeout immediately. 26 * async writeout immediately.
129 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to 27 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
130 * applications. 28 * applications.
131 */ 29 */
132static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
133 unsigned long end, int flags,
134 unsigned long *nr_pages_dirtied)
135{
136 struct file *file = vma->vm_file;
137
138 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
139 return -EBUSY;
140
141 if (file && (vma->vm_flags & VM_SHARED))
142 *nr_pages_dirtied = msync_page_range(vma, addr, end);
143 return 0;
144}
145
146asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 30asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
147{ 31{
148 unsigned long end; 32 unsigned long end;
33 struct mm_struct *mm = current->mm;
149 struct vm_area_struct *vma; 34 struct vm_area_struct *vma;
150 int unmapped_error = 0; 35 int unmapped_error = 0;
151 int error = -EINVAL; 36 int error = -EINVAL;
152 int done = 0;
153 37
154 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 38 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
155 goto out; 39 goto out;
@@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
169 * If the interval [start,end) covers some unmapped address ranges, 53 * If the interval [start,end) covers some unmapped address ranges,
170 * just ignore them, but return -ENOMEM at the end. 54 * just ignore them, but return -ENOMEM at the end.
171 */ 55 */
172 down_read(&current->mm->mmap_sem); 56 down_read(&mm->mmap_sem);
173 vma = find_vma(current->mm, start); 57 vma = find_vma(mm, start);
174 if (!vma) { 58 for (;;) {
175 error = -ENOMEM;
176 goto out_unlock;
177 }
178 do {
179 unsigned long nr_pages_dirtied = 0;
180 struct file *file; 59 struct file *file;
181 60
61 /* Still start < end. */
62 error = -ENOMEM;
63 if (!vma)
64 goto out_unlock;
182 /* Here start < vma->vm_end. */ 65 /* Here start < vma->vm_end. */
183 if (start < vma->vm_start) { 66 if (start < vma->vm_start) {
184 unmapped_error = -ENOMEM;
185 start = vma->vm_start; 67 start = vma->vm_start;
68 if (start >= end)
69 goto out_unlock;
70 unmapped_error = -ENOMEM;
186 } 71 }
187 /* Here vma->vm_start <= start < vma->vm_end. */ 72 /* Here vma->vm_start <= start < vma->vm_end. */
188 if (end <= vma->vm_end) { 73 if ((flags & MS_INVALIDATE) &&
189 if (start < end) { 74 (vma->vm_flags & VM_LOCKED)) {
190 error = msync_interval(vma, start, end, flags, 75 error = -EBUSY;
191 &nr_pages_dirtied); 76 goto out_unlock;
192 if (error)
193 goto out_unlock;
194 }
195 error = unmapped_error;
196 done = 1;
197 } else {
198 /* Here vma->vm_start <= start < vma->vm_end < end. */
199 error = msync_interval(vma, start, vma->vm_end, flags,
200 &nr_pages_dirtied);
201 if (error)
202 goto out_unlock;
203 } 77 }
204 file = vma->vm_file; 78 file = vma->vm_file;
205 start = vma->vm_end; 79 start = vma->vm_end;
206 if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { 80 if ((flags & MS_SYNC) && file &&
207 get_file(file);
208 up_read(&current->mm->mmap_sem);
209 balance_dirty_pages_ratelimited_nr(file->f_mapping,
210 nr_pages_dirtied);
211 fput(file);
212 down_read(&current->mm->mmap_sem);
213 vma = find_vma(current->mm, start);
214 } else if ((flags & MS_SYNC) && file &&
215 (vma->vm_flags & VM_SHARED)) { 81 (vma->vm_flags & VM_SHARED)) {
216 get_file(file); 82 get_file(file);
217 up_read(&current->mm->mmap_sem); 83 up_read(&mm->mmap_sem);
218 error = do_fsync(file, 0); 84 error = do_fsync(file, 0);
219 fput(file); 85 fput(file);
220 down_read(&current->mm->mmap_sem); 86 if (error || start >= end)
221 if (error) 87 goto out;
222 goto out_unlock; 88 down_read(&mm->mmap_sem);
223 vma = find_vma(current->mm, start); 89 vma = find_vma(mm, start);
224 } else { 90 } else {
91 if (start >= end) {
92 error = 0;
93 goto out_unlock;
94 }
225 vma = vma->vm_next; 95 vma = vma->vm_next;
226 } 96 }
227 } while (vma && !done); 97 }
228out_unlock: 98out_unlock:
229 up_read(&current->mm->mmap_sem); 99 up_read(&mm->mmap_sem);
230out: 100out:
231 return error; 101 return error ? : unmapped_error;
232} 102}
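[Editor's sketch] Seen from userspace, the msync() semantics described above amount to the sketch below; the mapping and file descriptor are assumed to already exist.

#include <sys/mman.h>
#include <unistd.h>

/* MS_ASYNC is now effectively a no-op because dirty pages are tracked anyway;
 * use fsync()/fdatasync() to start and wait for writeback explicitly. */
int flush_mapping(void *addr, size_t len, int fd, int wait)
{
        if (wait)
                return msync(addr, len, MS_SYNC);   /* write back and wait */

        if (msync(addr, len, MS_ASYNC) < 0)         /* no I/O is started */
                return -1;
        return fsync(fd);                           /* push the dirty pages */
}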
diff --git a/mm/nommu.c b/mm/nommu.c
index 029fadac0fb5..564540662192 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
122} 122}
123 123
124/* 124/*
125 * The nommu dodgy version :-) 125 * get a list of pages in an address range belonging to the specified process
126 * and indicate the VMA that covers each page
 127 * - this is potentially dodgy as we may end up incrementing the page count of a
128 * slab page or a secondary page from a compound page
129 * - don't permit access to VMAs that don't support it, such as I/O mappings
126 */ 130 */
127int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 131int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int len, int write, int force, 132 unsigned long start, int len, int write, int force,
129 struct page **pages, struct vm_area_struct **vmas) 133 struct page **pages, struct vm_area_struct **vmas)
130{ 134{
135 struct vm_area_struct *vma;
136 unsigned long vm_flags;
131 int i; 137 int i;
132 static struct vm_area_struct dummy_vma; 138
139 /* calculate required read or write permissions.
140 * - if 'force' is set, we only require the "MAY" flags.
141 */
142 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
143 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
133 144
134 for (i = 0; i < len; i++) { 145 for (i = 0; i < len; i++) {
146 vma = find_vma(mm, start);
147 if (!vma)
148 goto finish_or_fault;
149
150 /* protect what we can, including chardevs */
151 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
152 !(vm_flags & vma->vm_flags))
153 goto finish_or_fault;
154
135 if (pages) { 155 if (pages) {
136 pages[i] = virt_to_page(start); 156 pages[i] = virt_to_page(start);
137 if (pages[i]) 157 if (pages[i])
138 page_cache_get(pages[i]); 158 page_cache_get(pages[i]);
139 } 159 }
140 if (vmas) 160 if (vmas)
141 vmas[i] = &dummy_vma; 161 vmas[i] = vma;
142 start += PAGE_SIZE; 162 start += PAGE_SIZE;
143 } 163 }
144 return(i); 164
165 return i;
166
167finish_or_fault:
168 return i ? : -EFAULT;
145} 169}
146 170
147EXPORT_SYMBOL(get_user_pages); 171EXPORT_SYMBOL(get_user_pages);
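[Editor's sketch] The permission calculation added to the nommu get_user_pages() above can be read in isolation as the helper below; it is a restatement of the two lines in the hunk.

/* Required VMA flags: with 'force' only the MAY bits are demanded,
 * otherwise the real READ/WRITE bits as well. */
static unsigned long gup_required_flags(int write, int force)
{
        unsigned long vm_flags;

        vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
        vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
        return vm_flags;
}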
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
286} 310}
287#endif /* DEBUG */ 311#endif /* DEBUG */
288 312
313/*
314 * add a VMA into a process's mm_struct in the appropriate place in the list
315 * - should be called with mm->mmap_sem held writelocked
316 */
317static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
318{
319 struct vm_list_struct **ppv;
320
321 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
322 if ((*ppv)->vma->vm_start > vml->vma->vm_start)
323 break;
324
325 vml->next = *ppv;
326 *ppv = vml;
327}
328
329/*
330 * look up the first VMA in which addr resides, NULL if none
331 * - should be called with mm->mmap_sem at least held readlocked
332 */
333struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
334{
335 struct vm_list_struct *loop, *vml;
336
337 /* search the vm_start ordered list */
338 vml = NULL;
339 for (loop = mm->context.vmlist; loop; loop = loop->next) {
340 if (loop->vma->vm_start > addr)
341 break;
342 vml = loop;
343 }
344
345 if (vml && vml->vma->vm_end > addr)
346 return vml->vma;
347
348 return NULL;
349}
350EXPORT_SYMBOL(find_vma);
351
352/*
353 * find a VMA
354 * - we don't extend stack VMAs under NOMMU conditions
355 */
356struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
357{
358 return find_vma(mm, addr);
359}
360
361/*
 362 * look up the first VMA that exactly matches addr
363 * - should be called with mm->mmap_sem at least held readlocked
364 */
365static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
366 unsigned long addr)
367{
368 struct vm_list_struct *vml;
369
370 /* search the vm_start ordered list */
371 for (vml = mm->context.vmlist; vml; vml = vml->next) {
372 if (vml->vma->vm_start == addr)
373 return vml->vma;
374 if (vml->vma->vm_start > addr)
375 break;
376 }
377
378 return NULL;
379}
380
381/*
382 * find a VMA in the global tree
383 */
289static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 384static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
290{ 385{
291 struct vm_area_struct *vma; 386 struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
305 return NULL; 400 return NULL;
306} 401}
307 402
403/*
404 * add a VMA in the global tree
405 */
308static void add_nommu_vma(struct vm_area_struct *vma) 406static void add_nommu_vma(struct vm_area_struct *vma)
309{ 407{
310 struct vm_area_struct *pvma; 408 struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
351 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 449 rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
352} 450}
353 451
452/*
453 * delete a VMA from the global list
454 */
354static void delete_nommu_vma(struct vm_area_struct *vma) 455static void delete_nommu_vma(struct vm_area_struct *vma)
355{ 456{
356 struct address_space *mapping; 457 struct address_space *mapping;
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
828 realalloc += kobjsize(vml); 929 realalloc += kobjsize(vml);
829 askedalloc += sizeof(*vml); 930 askedalloc += sizeof(*vml);
830 931
831 vml->next = current->mm->context.vmlist; 932 add_vma_to_mm(current->mm, vml);
832 current->mm->context.vmlist = vml;
833 933
834 up_write(&nommu_vma_sem); 934 up_write(&nommu_vma_sem);
835 935
@@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma)
908 } 1008 }
909} 1009}
910 1010
1011/*
1012 * release a mapping
 1013 * - under NOMMU conditions the parameters must exactly match the mapping to be
 1014 *   removed
1015 */
911int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1016int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
912{ 1017{
913 struct vm_list_struct *vml, **parent; 1018 struct vm_list_struct *vml, **parent;
@@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
917 printk("do_munmap:\n"); 1022 printk("do_munmap:\n");
918#endif 1023#endif
919 1024
920 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) 1025 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
1026 if ((*parent)->vma->vm_start > addr)
1027 break;
921 if ((*parent)->vma->vm_start == addr && 1028 if ((*parent)->vma->vm_start == addr &&
922 ((len == 0) || ((*parent)->vma->vm_end == end))) 1029 ((len == 0) || ((*parent)->vma->vm_end == end)))
923 goto found; 1030 goto found;
1031 }
924 1032
925 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1033 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
926 current->pid, current->comm, (void *) addr); 1034 current->pid, current->comm, (void *) addr);
@@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
946 return 0; 1054 return 0;
947} 1055}
948 1056
949/* Release all mmaps. */ 1057asmlinkage long sys_munmap(unsigned long addr, size_t len)
1058{
1059 int ret;
1060 struct mm_struct *mm = current->mm;
1061
1062 down_write(&mm->mmap_sem);
1063 ret = do_munmap(mm, addr, len);
1064 up_write(&mm->mmap_sem);
1065 return ret;
1066}
1067
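With the list sorted, do_munmap() can now stop scanning as soon as it passes addr, and sys_munmap() is simply the same operation wrapped in the mmap_sem write lock. A userspace model of that unlink-with-early-termination, with a pthread mutex standing in for the semaphore (names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define EINVAL 22

struct region {
    unsigned long  start, end;
    struct region *next;
};

static struct region *region_list;
static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

/* Core operation: caller must hold map_lock.  Only an exact match may be
 * removed, and the sorted order lets the scan stop early. */
static int do_unmap(unsigned long start, unsigned long end)
{
    struct region **pp;

    for (pp = &region_list; *pp; pp = &(*pp)->next) {
        if ((*pp)->start > start)
            break;                      /* past the spot: no exact match exists */
        if ((*pp)->start == start && (*pp)->end == end) {
            *pp = (*pp)->next;          /* unlink (freeing omitted for brevity) */
            return 0;
        }
    }
    return -EINVAL;
}

/* "Syscall" wrapper: take the lock, do the work, drop the lock. */
static int sys_unmap(unsigned long start, unsigned long end)
{
    int ret;

    pthread_mutex_lock(&map_lock);
    ret = do_unmap(start, end);
    pthread_mutex_unlock(&map_lock);
    return ret;
}

int main(void)
{
    struct region r = { 0x1000, 0x2000, NULL };

    region_list = &r;
    printf("%d\n", sys_unmap(0x1000, 0x1800));  /* -22: not an exact match */
    printf("%d\n", sys_unmap(0x1000, 0x2000));  /* 0  : removed            */
    return 0;
}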
1068/*
1069 * Release all mappings
1070 */
950void exit_mmap(struct mm_struct * mm) 1071void exit_mmap(struct mm_struct * mm)
951{ 1072{
952 struct vm_list_struct *tmp; 1073 struct vm_list_struct *tmp;
@@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm)
973 } 1094 }
974} 1095}
975 1096
976asmlinkage long sys_munmap(unsigned long addr, size_t len)
977{
978 int ret;
979 struct mm_struct *mm = current->mm;
980
981 down_write(&mm->mmap_sem);
982 ret = do_munmap(mm, addr, len);
983 up_write(&mm->mmap_sem);
984 return ret;
985}
986
987unsigned long do_brk(unsigned long addr, unsigned long len) 1097unsigned long do_brk(unsigned long addr, unsigned long len)
988{ 1098{
989 return -ENOMEM; 1099 return -ENOMEM;
990} 1100}
991 1101
992/* 1102/*
993 * Expand (or shrink) an existing mapping, potentially moving it at the 1103 * expand (or shrink) an existing mapping, potentially moving it at the same
994 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1104 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
995 * 1105 *
996 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 1106 * under NOMMU conditions, we only permit changing a mapping's size, and only
997 * This option implies MREMAP_MAYMOVE. 1107 * as long as it stays within the hole allocated by the kmalloc() call in
1108 * do_mmap_pgoff() and the block is not shareable
998 * 1109 *
999 * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the 1110 * MREMAP_FIXED is not supported under NOMMU conditions
1000 * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
1001 */ 1111 */
1002unsigned long do_mremap(unsigned long addr, 1112unsigned long do_mremap(unsigned long addr,
1003 unsigned long old_len, unsigned long new_len, 1113 unsigned long old_len, unsigned long new_len,
1004 unsigned long flags, unsigned long new_addr) 1114 unsigned long flags, unsigned long new_addr)
1005{ 1115{
1006 struct vm_list_struct *vml = NULL; 1116 struct vm_area_struct *vma;
1007 1117
1008 /* insanity checks first */ 1118 /* insanity checks first */
1009 if (new_len == 0) 1119 if (new_len == 0)
@@ -1012,64 +1122,53 @@ unsigned long do_mremap(unsigned long addr,
1012 if (flags & MREMAP_FIXED && new_addr != addr) 1122 if (flags & MREMAP_FIXED && new_addr != addr)
1013 return (unsigned long) -EINVAL; 1123 return (unsigned long) -EINVAL;
1014 1124
1015 for (vml = current->mm->context.vmlist; vml; vml = vml->next) 1125 vma = find_vma_exact(current->mm, addr);
1016 if (vml->vma->vm_start == addr) 1126 if (!vma)
1017 goto found; 1127 return (unsigned long) -EINVAL;
1018
1019 return (unsigned long) -EINVAL;
1020 1128
1021 found: 1129 if (vma->vm_end != vma->vm_start + old_len)
1022 if (vml->vma->vm_end != vml->vma->vm_start + old_len)
1023 return (unsigned long) -EFAULT; 1130 return (unsigned long) -EFAULT;
1024 1131
1025 if (vml->vma->vm_flags & VM_MAYSHARE) 1132 if (vma->vm_flags & VM_MAYSHARE)
1026 return (unsigned long) -EPERM; 1133 return (unsigned long) -EPERM;
1027 1134
1028 if (new_len > kobjsize((void *) addr)) 1135 if (new_len > kobjsize((void *) addr))
1029 return (unsigned long) -ENOMEM; 1136 return (unsigned long) -ENOMEM;
1030 1137
1031 /* all checks complete - do it */ 1138 /* all checks complete - do it */
1032 vml->vma->vm_end = vml->vma->vm_start + new_len; 1139 vma->vm_end = vma->vm_start + new_len;
1033 1140
1034 askedalloc -= old_len; 1141 askedalloc -= old_len;
1035 askedalloc += new_len; 1142 askedalloc += new_len;
1036 1143
1037 return vml->vma->vm_start; 1144 return vma->vm_start;
1038} 1145}
1039 1146
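The reworked do_mremap() expresses the NOMMU resize rules directly against a vm_area_struct: the old length must describe the whole mapping, shared mappings cannot be resized, and the new length must still fit inside the block originally handed out by kmalloc() (as reported by kobjsize()). A small model of those checks, where an alloc_size field stands in for kobjsize() (all names illustrative):

#include <stdio.h>

#define EINVAL 22
#define EFAULT 14
#define EPERM   1
#define ENOMEM 12

struct mapping {
    unsigned long start, end;
    unsigned long alloc_size;   /* usable size of the backing allocation */
    int           shared;
};

/* Grow or shrink in place; return the (unchanged) start or a negative errno. */
static long resize_mapping(struct mapping *m, unsigned long old_len,
                           unsigned long new_len)
{
    if (new_len == 0)
        return -EINVAL;
    if (m->end != m->start + old_len)
        return -EFAULT;                 /* old_len must cover the whole mapping */
    if (m->shared)
        return -EPERM;                  /* shared mappings cannot be resized */
    if (new_len > m->alloc_size)
        return -ENOMEM;                 /* must stay inside the original block */

    m->end = m->start + new_len;
    return (long)m->start;
}

int main(void)
{
    struct mapping m = { 0x1000, 0x3000, 0x4000, 0 };

    printf("%ld\n", resize_mapping(&m, 0x2000, 0x3000));  /* 4096: grew in place */
    printf("%ld\n", resize_mapping(&m, 0x3000, 0x8000));  /* -12 : exceeds block */
    return 0;
}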
1040/* 1147asmlinkage unsigned long sys_mremap(unsigned long addr,
1041 * Look up the first VMA which satisfies addr < vm_end, NULL if none 1148 unsigned long old_len, unsigned long new_len,
1042 */ 1149 unsigned long flags, unsigned long new_addr)
1043struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1044{ 1150{
1045 struct vm_list_struct *vml; 1151 unsigned long ret;
1046
1047 for (vml = mm->context.vmlist; vml; vml = vml->next)
1048 if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
1049 return vml->vma;
1050 1152
1051 return NULL; 1153 down_write(&current->mm->mmap_sem);
1154 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1155 up_write(&current->mm->mmap_sem);
1156 return ret;
1052} 1157}
1053 1158
1054EXPORT_SYMBOL(find_vma);
1055
1056struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1159struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1057 unsigned int foll_flags) 1160 unsigned int foll_flags)
1058{ 1161{
1059 return NULL; 1162 return NULL;
1060} 1163}
1061 1164
1062struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1063{
1064 return NULL;
1065}
1066
1067int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1165int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1068 unsigned long to, unsigned long size, pgprot_t prot) 1166 unsigned long to, unsigned long size, pgprot_t prot)
1069{ 1167{
1070 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; 1168 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
1071 return 0; 1169 return 0;
1072} 1170}
1171EXPORT_SYMBOL(remap_pfn_range);
1073 1172
1074void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1173void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1075{ 1174{
@@ -1090,6 +1189,7 @@ void unmap_mapping_range(struct address_space *mapping,
1090 int even_cows) 1189 int even_cows)
1091{ 1190{
1092} 1191}
1192EXPORT_SYMBOL(unmap_mapping_range);
1093 1193
1094/* 1194/*
1095 * Check that a process has enough memory to allocate a new virtual 1195 * Check that a process has enough memory to allocate a new virtual
@@ -1122,7 +1222,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
1122 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1222 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1123 unsigned long n; 1223 unsigned long n;
1124 1224
1125 free = get_page_cache_size(); 1225 free = global_page_state(NR_FILE_PAGES);
1126 free += nr_swap_pages; 1226 free += nr_swap_pages;
1127 1227
1128 /* 1228 /*
@@ -1131,7 +1231,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
1131 * which are reclaimable, under pressure. The dentry 1231 * which are reclaimable, under pressure. The dentry
1132 * cache and most inode caches should fall into this 1232 * cache and most inode caches should fall into this
1133 */ 1233 */
1134 free += atomic_read(&slab_reclaim_pages); 1234 free += global_page_state(NR_SLAB_RECLAIMABLE);
1135 1235
1136 /* 1236 /*
1137 * Leave the last 3% for root 1237 * Leave the last 3% for root
@@ -1204,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1204 BUG(); 1304 BUG();
1205 return NULL; 1305 return NULL;
1206} 1306}
1307
1308/*
1309 * Access another process' address space.
1310 * - source/target buffer must be kernel space
1311 */
1312int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1313{
1314 struct vm_area_struct *vma;
1315 struct mm_struct *mm;
1316
1317 if (addr + len < addr)
1318 return 0;
1319
1320 mm = get_task_mm(tsk);
1321 if (!mm)
1322 return 0;
1323
1324 down_read(&mm->mmap_sem);
1325
1326 /* the access must start within one of the target process's mappings */
1327 vma = find_vma(mm, addr);
1328 if (vma) {
1329 /* don't overrun this mapping */
1330 if (addr + len >= vma->vm_end)
1331 len = vma->vm_end - addr;
1332
1333 /* only read or write mappings where it is permitted */
1334 if (write && vma->vm_flags & VM_MAYWRITE)
1335 len -= copy_to_user((void *) addr, buf, len);
1336 else if (!write && vma->vm_flags & VM_MAYREAD)
1337 len -= copy_from_user(buf, (void *) addr, len);
1338 else
1339 len = 0;
1340 } else {
1341 len = 0;
1342 }
1343
1344 up_read(&mm->mmap_sem);
1345 mmput(mm);
1346 return len;
1347}
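access_process_vm() above pins the target mm, finds the mapping containing addr, clamps the transfer so it cannot run past vm_end, and only copies in the direction the mapping's VM_MAYREAD/VM_MAYWRITE bits allow, returning the number of bytes moved. A userspace model of the clamp-then-copy logic, with plain memcpy standing in for copy_to_user/copy_from_user (names are illustrative):

#include <stdio.h>
#include <string.h>

#define MAYREAD  0x1
#define MAYWRITE 0x2

struct mapping {
    unsigned long  start, end;  /* [start, end) in the "target" address space */
    unsigned char *backing;     /* memory backing the mapping */
    unsigned int   flags;
};

/* Copy up to len bytes at addr in the mapping; returns bytes transferred. */
static size_t access_mapping(struct mapping *m, unsigned long addr,
                             void *buf, size_t len, int write)
{
    if (addr < m->start || addr >= m->end)
        return 0;                               /* no mapping covers addr */
    if (addr + len >= m->end)
        len = m->end - addr;                    /* don't overrun the mapping */

    if (write && (m->flags & MAYWRITE))
        memcpy(m->backing + (addr - m->start), buf, len);
    else if (!write && (m->flags & MAYREAD))
        memcpy(buf, m->backing + (addr - m->start), len);
    else
        len = 0;                                /* direction not permitted */

    return len;
}

int main(void)
{
    unsigned char backing[16] = "target-process!";
    struct mapping m = { 0x1000, 0x1010, backing, MAYREAD };
    char buf[32] = { 0 };

    printf("%zu %s\n", access_mapping(&m, 0x1008, buf, 32, 0), buf);
    /* prints: 8 rocess!  (clamped to the 8 bytes left in the mapping) */
    return 0;
}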
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d46ed0f1dc06..20f41b082e16 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -21,6 +21,8 @@
21#include <linux/timex.h> 21#include <linux/timex.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/module.h>
25#include <linux/notifier.h>
24 26
25int sysctl_panic_on_oom; 27int sysctl_panic_on_oom;
26/* #define DEBUG */ 28/* #define DEBUG */
@@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 } 60 }
59 61
60 /* 62 /*
63 * swapoff can easily use up all memory, so kill those first.
64 */
65 if (p->flags & PF_SWAPOFF)
66 return ULONG_MAX;
67
68 /*
61 * The memory size of the process is the basis for the badness. 69 * The memory size of the process is the basis for the badness.
62 */ 70 */
63 points = mm->total_vm; 71 points = mm->total_vm;
@@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
127 points /= 4; 135 points /= 4;
128 136
129 /* 137 /*
138 * If p's nodes don't overlap ours, it may still help to kill p
139 * because p may have allocated or otherwise mapped memory on
140 * this node before. However it will be less likely.
141 */
142 if (!cpuset_excl_nodes_overlap(p))
143 points /= 8;
144
145 /*
130 * Adjust the score by oomkilladj. 146 * Adjust the score by oomkilladj.
131 */ 147 */
132 if (p->oomkilladj) { 148 if (p->oomkilladj) {
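The badness() changes above slot two new heuristics into the score: a task in the middle of swapoff gets the maximal score outright, and a task whose cpuset nodes do not overlap the allocating task's stays a candidate but is penalised by a factor of eight. A compact model of that scoring shape (the oomkilladj shift at the end follows the usual badness() behaviour and is an assumption here, not shown in the hunk):

#include <limits.h>
#include <stdio.h>

struct task {
    unsigned long total_vm;     /* size of the address space, in pages */
    int swapoff_in_progress;    /* PF_SWAPOFF in the real code          */
    int nodes_overlap;          /* cpuset_excl_nodes_overlap() result   */
    int oomkilladj;             /* user-tunable bias, may be negative   */
};

static unsigned long badness(const struct task *p)
{
    unsigned long points;

    if (p->swapoff_in_progress)
        return ULONG_MAX;               /* swapoff can eat all memory: kill first */

    points = p->total_vm;               /* memory footprint is the base score */

    if (!p->nodes_overlap)
        points /= 8;                    /* may still help, but less likely */

    /* Assumed from the surrounding function: oomkilladj scales the score
     * exponentially in either direction. */
    if (p->oomkilladj > 0)
        points <<= p->oomkilladj;
    else if (p->oomkilladj < 0)
        points >>= -p->oomkilladj;

    return points;
}

int main(void)
{
    struct task t = { 40000, 0, 0, -2 };

    printf("%lu\n", badness(&t));       /* 40000 / 8 >> 2 = 1250 */
    return 0;
}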
@@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
161 177
162 for (z = zonelist->zones; *z; z++) 178 for (z = zonelist->zones; *z; z++)
163 if (cpuset_zone_allowed(*z, gfp_mask)) 179 if (cpuset_zone_allowed(*z, gfp_mask))
164 node_clear((*z)->zone_pgdat->node_id, 180 node_clear(zone_to_nid(*z), nodes);
165 nodes);
166 else 181 else
167 return CONSTRAINT_CPUSET; 182 return CONSTRAINT_CPUSET;
168 183
@@ -189,27 +204,49 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
189 do_posix_clock_monotonic_gettime(&uptime); 204 do_posix_clock_monotonic_gettime(&uptime);
190 do_each_thread(g, p) { 205 do_each_thread(g, p) {
191 unsigned long points; 206 unsigned long points;
192 int releasing;
193 207
194 /* skip the init task with pid == 1 */ 208 /*
195 if (p->pid == 1) 209 * skip kernel threads and tasks which have already released
196 continue; 210 * their mm.
197 if (p->oomkilladj == OOM_DISABLE) 211 */
212 if (!p->mm)
198 continue; 213 continue;
199 /* If p's nodes don't overlap ours, it won't help to kill p. */ 214 /* skip the init task */
200 if (!cpuset_excl_nodes_overlap(p)) 215 if (is_init(p))
201 continue; 216 continue;
202 217
203 /* 218 /*
219 * This task already has access to memory reserves and is
220 * being killed. Don't allow any other task access to the
221 * memory reserve.
222 *
223 * Note: this may have a chance of deadlock if it gets
224 * blocked waiting for another task which itself is waiting
225 * for memory. Is there a better alternative?
226 */
227 if (test_tsk_thread_flag(p, TIF_MEMDIE))
228 return ERR_PTR(-1UL);
229
230 /*
204 * This is in the process of releasing memory so wait for it 231 * This is in the process of releasing memory so wait for it
205 * to finish before killing some other task by mistake. 232 * to finish before killing some other task by mistake.
233 *
234 * However, if p is the current task, we allow the 'kill' to
235 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
236 * which will allow it to gain access to memory reserves in
237 * the process of exiting and releasing its resources.
238 * Otherwise we could get an easy OOM deadlock.
206 */ 239 */
207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 240 if (p->flags & PF_EXITING) {
208 p->flags & PF_EXITING; 241 if (p != current)
209 if (releasing && !(p->flags & PF_DEAD)) 242 return ERR_PTR(-1UL);
210 return ERR_PTR(-1UL); 243
211 if (p->flags & PF_SWAPOFF) 244 chosen = p;
212 return p; 245 *ppoints = ULONG_MAX;
246 }
247
248 if (p->oomkilladj == OOM_DISABLE)
249 continue;
213 250
214 points = badness(p, uptime.tv_sec); 251 points = badness(p, uptime.tv_sec);
215 if (points > *ppoints || !chosen) { 252 if (points > *ppoints || !chosen) {
@@ -217,32 +254,33 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
217 *ppoints = points; 254 *ppoints = points;
218 } 255 }
219 } while_each_thread(g, p); 256 } while_each_thread(g, p);
257
220 return chosen; 258 return chosen;
221} 259}
222 260
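select_bad_process() now filters out mm-less kernel threads and init, aborts when some task already holds TIF_MEMDIE (a kill is already in flight), fast-tracks a task that is exiting on its own (the real code additionally distinguishes whether that task is current), and otherwise keeps the highest badness score seen. A simplified sketch of that selection loop, with an index and -1 standing in for the task pointer and ERR_PTR(-1UL) (names illustrative):

#include <stdio.h>

struct task {
    const char   *name;
    unsigned long score;        /* badness() result                 */
    int           has_mm;       /* 0 for kernel threads             */
    int           is_init;
    int           memdie;       /* already being OOM-killed         */
    int           exiting;      /* releasing its memory on its own  */
    int           oom_disabled; /* oomkilladj == OOM_DISABLE        */
};

/* Returns the index of the chosen victim, or -1 to abort the kill because
 * another task is already on its way out. */
static int select_victim(const struct task *t, int n)
{
    int i, chosen = -1;
    unsigned long best = 0;

    for (i = 0; i < n; i++) {
        if (!t[i].has_mm || t[i].is_init)
            continue;                   /* never touch kernel threads or init */
        if (t[i].memdie)
            return -1;                  /* a kill is already in progress */
        if (t[i].exiting)
            return i;                   /* let it finish dying instead */
        if (t[i].oom_disabled)
            continue;
        if (chosen < 0 || t[i].score > best) {
            chosen = i;
            best = t[i].score;
        }
    }
    return chosen;
}

int main(void)
{
    struct task tasks[] = {
        { "init",    9999, 1, 1, 0, 0, 0 },
        { "kthread",    0, 0, 0, 0, 0, 0 },
        { "hog",     5000, 1, 0, 0, 0, 0 },
        { "small",    100, 1, 0, 0, 0, 0 },
    };

    printf("%s\n", tasks[select_victim(tasks, 4)].name);   /* hog */
    return 0;
}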
223/** 261/**
224 * We must be careful though to never send SIGKILL a process with 262 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
225 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that 263 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
226 * we select a process with CAP_SYS_RAW_IO set). 264 * set.
227 */ 265 */
228static void __oom_kill_task(task_t *p, const char *message) 266static void __oom_kill_task(struct task_struct *p, const char *message)
229{ 267{
230 if (p->pid == 1) { 268 if (is_init(p)) {
231 WARN_ON(1); 269 WARN_ON(1);
232 printk(KERN_WARNING "tried to kill init!\n"); 270 printk(KERN_WARNING "tried to kill init!\n");
233 return; 271 return;
234 } 272 }
235 273
236 task_lock(p); 274 if (!p->mm) {
237 if (!p->mm || p->mm == &init_mm) {
238 WARN_ON(1); 275 WARN_ON(1);
239 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 276 printk(KERN_WARNING "tried to kill an mm-less task!\n");
240 task_unlock(p);
241 return; 277 return;
242 } 278 }
243 task_unlock(p); 279
244 printk(KERN_ERR "%s: Killed process %d (%s).\n", 280 if (message) {
281 printk(KERN_ERR "%s: Killed process %d (%s).\n",
245 message, p->pid, p->comm); 282 message, p->pid, p->comm);
283 }
246 284
247 /* 285 /*
248 * We give our sacrificial lamb high priority and access to 286 * We give our sacrificial lamb high priority and access to
@@ -255,10 +293,10 @@ static void __oom_kill_task(task_t *p, const char *message)
255 force_sig(SIGKILL, p); 293 force_sig(SIGKILL, p);
256} 294}
257 295
258static int oom_kill_task(task_t *p, const char *message) 296static int oom_kill_task(struct task_struct *p, const char *message)
259{ 297{
260 struct mm_struct *mm; 298 struct mm_struct *mm;
261 task_t * g, * q; 299 struct task_struct *g, *q;
262 300
263 mm = p->mm; 301 mm = p->mm;
264 302
@@ -271,7 +309,7 @@ static int oom_kill_task(task_t *p, const char *message)
271 * However, this is of no concern to us. 309 * However, this is of no concern to us.
272 */ 310 */
273 311
274 if (mm == NULL || mm == &init_mm) 312 if (mm == NULL)
275 return 1; 313 return 1;
276 314
277 __oom_kill_task(p, message); 315 __oom_kill_task(p, message);
@@ -293,8 +331,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
293 struct task_struct *c; 331 struct task_struct *c;
294 struct list_head *tsk; 332 struct list_head *tsk;
295 333
296 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " 334 /*
297 "children.\n", p->pid, p->comm, points); 335 * If the task is already exiting, don't alarm the sysadmin or kill
336 * its children or threads, just set TIF_MEMDIE so it can die quickly
337 */
338 if (p->flags & PF_EXITING) {
339 __oom_kill_task(p, NULL);
340 return 0;
341 }
342
343 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
344 " and children.\n", p->pid, p->comm, points);
298 /* Try to kill a child first */ 345 /* Try to kill a child first */
299 list_for_each(tsk, &p->children) { 346 list_for_each(tsk, &p->children) {
300 c = list_entry(tsk, struct task_struct, sibling); 347 c = list_entry(tsk, struct task_struct, sibling);
@@ -306,6 +353,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306 return oom_kill_task(p, message); 353 return oom_kill_task(p, message);
307} 354}
308 355
356static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
357
358int register_oom_notifier(struct notifier_block *nb)
359{
360 return blocking_notifier_chain_register(&oom_notify_list, nb);
361}
362EXPORT_SYMBOL_GPL(register_oom_notifier);
363
364int unregister_oom_notifier(struct notifier_block *nb)
365{
366 return blocking_notifier_chain_unregister(&oom_notify_list, nb);
367}
368EXPORT_SYMBOL_GPL(unregister_oom_notifier);
369
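out_of_memory() below first runs the new notifier chain and skips the kill entirely if any subscriber managed to give memory back; each callback receives a pointer to a running "pages freed" total it can add to. A userspace model of that contract, with a plain callback array standing in for the blocking notifier chain (names illustrative):

#include <stdio.h>

typedef int (*oom_cb)(unsigned long *freed);

static oom_cb callbacks[8];
static int    nr_callbacks;

static void register_oom_callback(oom_cb cb)
{
    if (nr_callbacks < 8)
        callbacks[nr_callbacks++] = cb;
}

/* A subscriber (think: a balloon driver or a cache) drops some pages and
 * reports how many through the shared counter. */
static int shrink_my_cache(unsigned long *freed)
{
    *freed += 128;              /* pretend we released 128 pages */
    return 0;
}

static void out_of_memory_model(void)
{
    unsigned long freed = 0;

    for (int i = 0; i < nr_callbacks; i++)
        callbacks[i](&freed);

    if (freed > 0) {            /* got memory back: no need to kill anything */
        printf("notifiers freed %lu pages, skipping the kill\n", freed);
        return;
    }
    printf("no memory reclaimed, selecting a victim\n");
}

int main(void)
{
    out_of_memory_model();              /* no subscribers yet: would kill */
    register_oom_callback(shrink_my_cache);
    out_of_memory_model();              /* 128 pages freed: kill skipped  */
    return 0;
}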
309/** 370/**
310 * out_of_memory - kill the "best" process when we run out of memory 371 * out_of_memory - kill the "best" process when we run out of memory
311 * 372 *
@@ -316,12 +377,19 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
316 */ 377 */
317void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 378void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
318{ 379{
319 task_t *p; 380 struct task_struct *p;
320 unsigned long points = 0; 381 unsigned long points = 0;
382 unsigned long freed = 0;
383
384 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
385 if (freed > 0)
386 /* Got some memory back in the last second. */
387 return;
321 388
322 if (printk_ratelimit()) { 389 if (printk_ratelimit()) {
323 printk("oom-killer: gfp_mask=0x%x, order=%d\n", 390 printk(KERN_WARNING "%s invoked oom-killer: "
324 gfp_mask, order); 391 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
392 current->comm, gfp_mask, order, current->oomkilladj);
325 dump_stack(); 393 dump_stack();
326 show_mem(); 394 show_mem();
327 } 395 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8ccf6f1b1473..488b7088557c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/mpage.h> 25#include <linux/mpage.h>
26#include <linux/rmap.h>
26#include <linux/percpu.h> 27#include <linux/percpu.h>
27#include <linux/notifier.h> 28#include <linux/notifier.h>
28#include <linux/smp.h> 29#include <linux/smp.h>
@@ -45,7 +46,6 @@
45 */ 46 */
46static long ratelimit_pages = 32; 47static long ratelimit_pages = 32;
47 48
48static long total_pages; /* The total number of pages in the machine. */
49static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ 49static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
50 50
51/* 51/*
@@ -99,22 +99,6 @@ EXPORT_SYMBOL(laptop_mode);
99 99
100static void background_writeout(unsigned long _min_pages); 100static void background_writeout(unsigned long _min_pages);
101 101
102struct writeback_state
103{
104 unsigned long nr_dirty;
105 unsigned long nr_unstable;
106 unsigned long nr_mapped;
107 unsigned long nr_writeback;
108};
109
110static void get_writeback_state(struct writeback_state *wbs)
111{
112 wbs->nr_dirty = read_page_state(nr_dirty);
113 wbs->nr_unstable = read_page_state(nr_unstable);
114 wbs->nr_mapped = read_page_state(nr_mapped);
115 wbs->nr_writeback = read_page_state(nr_writeback);
116}
117
118/* 102/*
119 * Work out the current dirty-memory clamping and background writeout 103 * Work out the current dirty-memory clamping and background writeout
120 * thresholds. 104 * thresholds.
@@ -133,19 +117,17 @@ static void get_writeback_state(struct writeback_state *wbs)
133 * clamping level. 117 * clamping level.
134 */ 118 */
135static void 119static void
136get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, 120get_dirty_limits(long *pbackground, long *pdirty,
137 struct address_space *mapping) 121 struct address_space *mapping)
138{ 122{
139 int background_ratio; /* Percentages */ 123 int background_ratio; /* Percentages */
140 int dirty_ratio; 124 int dirty_ratio;
141 int unmapped_ratio; 125 int unmapped_ratio;
142 long background; 126 long background;
143 long dirty; 127 long dirty;
144 unsigned long available_memory = total_pages; 128 unsigned long available_memory = vm_total_pages;
145 struct task_struct *tsk; 129 struct task_struct *tsk;
146 130
147 get_writeback_state(wbs);
148
149#ifdef CONFIG_HIGHMEM 131#ifdef CONFIG_HIGHMEM
150 /* 132 /*
151 * If this mapping can only allocate from low memory, 133 * If this mapping can only allocate from low memory,
@@ -156,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
156#endif 138#endif
157 139
158 140
159 unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; 141 unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
142 global_page_state(NR_ANON_PAGES)) * 100) /
143 vm_total_pages;
160 144
161 dirty_ratio = vm_dirty_ratio; 145 dirty_ratio = vm_dirty_ratio;
162 if (dirty_ratio > unmapped_ratio / 2) 146 if (dirty_ratio > unmapped_ratio / 2)
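get_dirty_limits() now derives its numbers from the per-zone counters: the unmapped ratio comes from NR_FILE_MAPPED + NR_ANON_PAGES against vm_total_pages, vm_dirty_ratio is capped at half of that, and the background and dirty thresholds are those percentages of available memory. A worked sketch of the arithmetic (the halving of an oversized background ratio follows the rest of the function and is not shown in this hunk; the real-time-task boost and small-ratio clamps are elided):

#include <stdio.h>

/* Compute (background, dirty) page thresholds from global counters.
 * All figures are in pages. */
static void dirty_limits(unsigned long total_pages, unsigned long mapped_pages,
                         int vm_dirty_ratio, int background_ratio,
                         long *pbackground, long *pdirty)
{
    int unmapped_ratio = 100 - (int)((mapped_pages * 100) / total_pages);
    int dirty_ratio = vm_dirty_ratio;

    if (dirty_ratio > unmapped_ratio / 2)
        dirty_ratio = unmapped_ratio / 2;   /* don't let dirty pages crowd out
                                               mapped (working-set) memory */
    if (background_ratio >= dirty_ratio)
        background_ratio = dirty_ratio / 2;

    *pbackground = (background_ratio * (long)total_pages) / 100;
    *pdirty      = (dirty_ratio * (long)total_pages) / 100;
}

int main(void)
{
    long background, dirty;

    /* 1 GiB of 4 KiB pages, 40% of them mapped, default 40/10 ratios */
    dirty_limits(262144, 104858, 40, 10, &background, &dirty);
    printf("background=%ld dirty=%ld\n", background, dirty);
    /* unmapped_ratio = 60, dirty_ratio clamped to 30:
       background = 26214 pages, dirty = 78643 pages */
    return 0;
}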
@@ -189,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
189 */ 173 */
190static void balance_dirty_pages(struct address_space *mapping) 174static void balance_dirty_pages(struct address_space *mapping)
191{ 175{
192 struct writeback_state wbs;
193 long nr_reclaimable; 176 long nr_reclaimable;
194 long background_thresh; 177 long background_thresh;
195 long dirty_thresh; 178 long dirty_thresh;
@@ -207,11 +190,12 @@ static void balance_dirty_pages(struct address_space *mapping)
207 .range_cyclic = 1, 190 .range_cyclic = 1,
208 }; 191 };
209 192
210 get_dirty_limits(&wbs, &background_thresh, 193 get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
211 &dirty_thresh, mapping); 194 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
212 nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; 195 global_page_state(NR_UNSTABLE_NFS);
213 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 196 if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
214 break; 197 dirty_thresh)
198 break;
215 199
216 if (!dirty_exceeded) 200 if (!dirty_exceeded)
217 dirty_exceeded = 1; 201 dirty_exceeded = 1;
@@ -224,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping)
224 */ 208 */
225 if (nr_reclaimable) { 209 if (nr_reclaimable) {
226 writeback_inodes(&wbc); 210 writeback_inodes(&wbc);
227 get_dirty_limits(&wbs, &background_thresh, 211 get_dirty_limits(&background_thresh,
228 &dirty_thresh, mapping); 212 &dirty_thresh, mapping);
229 nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; 213 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
230 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 214 global_page_state(NR_UNSTABLE_NFS);
231 break; 215 if (nr_reclaimable +
216 global_page_state(NR_WRITEBACK)
217 <= dirty_thresh)
218 break;
232 pages_written += write_chunk - wbc.nr_to_write; 219 pages_written += write_chunk - wbc.nr_to_write;
233 if (pages_written >= write_chunk) 220 if (pages_written >= write_chunk)
234 break; /* We've done our duty */ 221 break; /* We've done our duty */
@@ -236,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping)
236 blk_congestion_wait(WRITE, HZ/10); 223 blk_congestion_wait(WRITE, HZ/10);
237 } 224 }
238 225
239 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) 226 if (nr_reclaimable + global_page_state(NR_WRITEBACK)
240 dirty_exceeded = 0; 227 <= dirty_thresh && dirty_exceeded)
228 dirty_exceeded = 0;
241 229
242 if (writeback_in_progress(bdi)) 230 if (writeback_in_progress(bdi))
243 return; /* pdflush is already working this queue */ 231 return; /* pdflush is already working this queue */
@@ -255,6 +243,16 @@ static void balance_dirty_pages(struct address_space *mapping)
255 pdflush_operation(background_writeout, 0); 243 pdflush_operation(background_writeout, 0);
256} 244}
257 245
246void set_page_dirty_balance(struct page *page)
247{
248 if (set_page_dirty(page)) {
249 struct address_space *mapping = page_mapping(page);
250
251 if (mapping)
252 balance_dirty_pages_ratelimited(mapping);
253 }
254}
255
258/** 256/**
259 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 257 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
260 * @mapping: address_space which was dirtied 258 * @mapping: address_space which was dirtied
@@ -299,12 +297,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
299 297
300void throttle_vm_writeout(void) 298void throttle_vm_writeout(void)
301{ 299{
302 struct writeback_state wbs;
303 long background_thresh; 300 long background_thresh;
304 long dirty_thresh; 301 long dirty_thresh;
305 302
306 for ( ; ; ) { 303 for ( ; ; ) {
307 get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); 304 get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
308 305
309 /* 306 /*
310 * Boost the allowable dirty threshold a bit for page 307 * Boost the allowable dirty threshold a bit for page
@@ -312,8 +309,9 @@ void throttle_vm_writeout(void)
312 */ 309 */
313 dirty_thresh += dirty_thresh / 10; /* wheeee... */ 310 dirty_thresh += dirty_thresh / 10; /* wheeee... */
314 311
315 if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) 312 if (global_page_state(NR_UNSTABLE_NFS) +
316 break; 313 global_page_state(NR_WRITEBACK) <= dirty_thresh)
314 break;
317 blk_congestion_wait(WRITE, HZ/10); 315 blk_congestion_wait(WRITE, HZ/10);
318 } 316 }
319} 317}
@@ -336,12 +334,12 @@ static void background_writeout(unsigned long _min_pages)
336 }; 334 };
337 335
338 for ( ; ; ) { 336 for ( ; ; ) {
339 struct writeback_state wbs;
340 long background_thresh; 337 long background_thresh;
341 long dirty_thresh; 338 long dirty_thresh;
342 339
343 get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); 340 get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
344 if (wbs.nr_dirty + wbs.nr_unstable < background_thresh 341 if (global_page_state(NR_FILE_DIRTY) +
342 global_page_state(NR_UNSTABLE_NFS) < background_thresh
345 && min_pages <= 0) 343 && min_pages <= 0)
346 break; 344 break;
347 wbc.encountered_congestion = 0; 345 wbc.encountered_congestion = 0;
@@ -365,12 +363,9 @@ static void background_writeout(unsigned long _min_pages)
365 */ 363 */
366int wakeup_pdflush(long nr_pages) 364int wakeup_pdflush(long nr_pages)
367{ 365{
368 if (nr_pages == 0) { 366 if (nr_pages == 0)
369 struct writeback_state wbs; 367 nr_pages = global_page_state(NR_FILE_DIRTY) +
370 368 global_page_state(NR_UNSTABLE_NFS);
371 get_writeback_state(&wbs);
372 nr_pages = wbs.nr_dirty + wbs.nr_unstable;
373 }
374 return pdflush_operation(background_writeout, nr_pages); 369 return pdflush_operation(background_writeout, nr_pages);
375} 370}
376 371
@@ -401,7 +396,6 @@ static void wb_kupdate(unsigned long arg)
401 unsigned long start_jif; 396 unsigned long start_jif;
402 unsigned long next_jif; 397 unsigned long next_jif;
403 long nr_to_write; 398 long nr_to_write;
404 struct writeback_state wbs;
405 struct writeback_control wbc = { 399 struct writeback_control wbc = {
406 .bdi = NULL, 400 .bdi = NULL,
407 .sync_mode = WB_SYNC_NONE, 401 .sync_mode = WB_SYNC_NONE,
@@ -414,11 +408,11 @@ static void wb_kupdate(unsigned long arg)
414 408
415 sync_supers(); 409 sync_supers();
416 410
417 get_writeback_state(&wbs);
418 oldest_jif = jiffies - dirty_expire_interval; 411 oldest_jif = jiffies - dirty_expire_interval;
419 start_jif = jiffies; 412 start_jif = jiffies;
420 next_jif = start_jif + dirty_writeback_interval; 413 next_jif = start_jif + dirty_writeback_interval;
421 nr_to_write = wbs.nr_dirty + wbs.nr_unstable + 414 nr_to_write = global_page_state(NR_FILE_DIRTY) +
415 global_page_state(NR_UNSTABLE_NFS) +
422 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 416 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
423 while (nr_to_write > 0) { 417 while (nr_to_write > 0) {
424 wbc.encountered_congestion = 0; 418 wbc.encountered_congestion = 0;
@@ -507,23 +501,23 @@ void laptop_sync_completion(void)
507 * will write six megabyte chunks, max. 501 * will write six megabyte chunks, max.
508 */ 502 */
509 503
510static void set_ratelimit(void) 504void writeback_set_ratelimit(void)
511{ 505{
512 ratelimit_pages = total_pages / (num_online_cpus() * 32); 506 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
513 if (ratelimit_pages < 16) 507 if (ratelimit_pages < 16)
514 ratelimit_pages = 16; 508 ratelimit_pages = 16;
515 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) 509 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
516 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; 510 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
517} 511}
518 512
519static int 513static int __cpuinit
520ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 514ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
521{ 515{
522 set_ratelimit(); 516 writeback_set_ratelimit();
523 return 0; 517 return 0;
524} 518}
525 519
526static struct notifier_block ratelimit_nb = { 520static struct notifier_block __cpuinitdata ratelimit_nb = {
527 .notifier_call = ratelimit_handler, 521 .notifier_call = ratelimit_handler,
528 .next = NULL, 522 .next = NULL,
529}; 523};
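writeback_set_ratelimit() sizes the per-CPU dirtying quota from the machine: total pages divided across the online CPUs (times 32), clamped between 16 pages and 4 MiB worth, and it is re-run from the CPU hotplug notifier above so the figure tracks the online CPU count. A standalone version of the same clamp:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL

static long set_ratelimit(unsigned long total_pages, int online_cpus)
{
    long ratelimit_pages = total_pages / (online_cpus * 32);

    if (ratelimit_pages < 16)
        ratelimit_pages = 16;
    if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
        ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;

    return ratelimit_pages;
}

int main(void)
{
    printf("%ld\n", set_ratelimit(262144, 4));    /* 1024: capped at 4 MiB        */
    printf("%ld\n", set_ratelimit(8192, 32));     /* 16  : floor for tiny machines */
    return 0;
}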
@@ -538,9 +532,7 @@ void __init page_writeback_init(void)
538 long buffer_pages = nr_free_buffer_pages(); 532 long buffer_pages = nr_free_buffer_pages();
539 long correction; 533 long correction;
540 534
541 total_pages = nr_free_pagecache_pages(); 535 correction = (100 * 4 * buffer_pages) / vm_total_pages;
542
543 correction = (100 * 4 * buffer_pages) / total_pages;
544 536
545 if (correction < 100) { 537 if (correction < 100) {
546 dirty_background_ratio *= correction; 538 dirty_background_ratio *= correction;
@@ -554,7 +546,7 @@ void __init page_writeback_init(void)
554 vm_dirty_ratio = 1; 546 vm_dirty_ratio = 1;
555 } 547 }
556 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 548 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
557 set_ratelimit(); 549 writeback_set_ratelimit();
558 register_cpu_notifier(&ratelimit_nb); 550 register_cpu_notifier(&ratelimit_nb);
559} 551}
560 552
@@ -566,7 +558,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
566 return 0; 558 return 0;
567 wbc->for_writepages = 1; 559 wbc->for_writepages = 1;
568 if (mapping->a_ops->writepages) 560 if (mapping->a_ops->writepages)
569 ret = mapping->a_ops->writepages(mapping, wbc); 561 ret = mapping->a_ops->writepages(mapping, wbc);
570 else 562 else
571 ret = generic_writepages(mapping, wbc); 563 ret = generic_writepages(mapping, wbc);
572 wbc->for_writepages = 0; 564 wbc->for_writepages = 0;
@@ -640,7 +632,8 @@ int __set_page_dirty_nobuffers(struct page *page)
640 if (mapping2) { /* Race with truncate? */ 632 if (mapping2) { /* Race with truncate? */
641 BUG_ON(mapping2 != mapping); 633 BUG_ON(mapping2 != mapping);
642 if (mapping_cap_account_dirty(mapping)) 634 if (mapping_cap_account_dirty(mapping))
643 inc_page_state(nr_dirty); 635 __inc_zone_page_state(page,
636 NR_FILE_DIRTY);
644 radix_tree_tag_set(&mapping->page_tree, 637 radix_tree_tag_set(&mapping->page_tree,
645 page_index(page), PAGECACHE_TAG_DIRTY); 638 page_index(page), PAGECACHE_TAG_DIRTY);
646 } 639 }
@@ -705,7 +698,7 @@ int set_page_dirty_lock(struct page *page)
705{ 698{
706 int ret; 699 int ret;
707 700
708 lock_page(page); 701 lock_page_nosync(page);
709 ret = set_page_dirty(page); 702 ret = set_page_dirty(page);
710 unlock_page(page); 703 unlock_page(page);
711 return ret; 704 return ret;
@@ -728,8 +721,14 @@ int test_clear_page_dirty(struct page *page)
728 page_index(page), 721 page_index(page),
729 PAGECACHE_TAG_DIRTY); 722 PAGECACHE_TAG_DIRTY);
730 write_unlock_irqrestore(&mapping->tree_lock, flags); 723 write_unlock_irqrestore(&mapping->tree_lock, flags);
731 if (mapping_cap_account_dirty(mapping)) 724 /*
732 dec_page_state(nr_dirty); 725 * We can continue to use `mapping' here because the
726 * page is locked, which pins the address_space
727 */
728 if (mapping_cap_account_dirty(mapping)) {
729 page_mkclean(page);
730 dec_zone_page_state(page, NR_FILE_DIRTY);
731 }
733 return 1; 732 return 1;
734 } 733 }
735 write_unlock_irqrestore(&mapping->tree_lock, flags); 734 write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -759,8 +758,10 @@ int clear_page_dirty_for_io(struct page *page)
759 758
760 if (mapping) { 759 if (mapping) {
761 if (TestClearPageDirty(page)) { 760 if (TestClearPageDirty(page)) {
762 if (mapping_cap_account_dirty(mapping)) 761 if (mapping_cap_account_dirty(mapping)) {
763 dec_page_state(nr_dirty); 762 page_mkclean(page);
763 dec_zone_page_state(page, NR_FILE_DIRTY);
764 }
764 return 1; 765 return 1;
765 } 766 }
766 return 0; 767 return 0;
@@ -818,6 +819,15 @@ int test_set_page_writeback(struct page *page)
818EXPORT_SYMBOL(test_set_page_writeback); 819EXPORT_SYMBOL(test_set_page_writeback);
819 820
820/* 821/*
822 * Wakes up tasks that are being throttled due to writeback congestion
823 */
824void writeback_congestion_end(void)
825{
826 blk_congestion_end(WRITE);
827}
828EXPORT_SYMBOL(writeback_congestion_end);
829
830/*
821 * Return true if any of the pages in the mapping are marked with the 831
822 * passed tag. 832 * passed tag.
823 */ 833 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 423db0db7c02..4f59d90b81e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -14,7 +14,6 @@
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17#include <linux/config.h>
18#include <linux/stddef.h> 17#include <linux/stddef.h>
19#include <linux/mm.h> 18#include <linux/mm.h>
20#include <linux/swap.h> 19#include <linux/swap.h>
@@ -38,6 +37,8 @@
38#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
39#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
40#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h>
41#include <linux/pfn.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -52,7 +53,6 @@ EXPORT_SYMBOL(node_online_map);
52nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 53nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
53EXPORT_SYMBOL(node_possible_map); 54EXPORT_SYMBOL(node_possible_map);
54unsigned long totalram_pages __read_mostly; 55unsigned long totalram_pages __read_mostly;
55unsigned long totalhigh_pages __read_mostly;
56unsigned long totalreserve_pages __read_mostly; 56unsigned long totalreserve_pages __read_mostly;
57long nr_swap_pages; 57long nr_swap_pages;
58int percpu_pagelist_fraction; 58int percpu_pagelist_fraction;
@@ -70,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order);
70 * TBD: should special case ZONE_DMA32 machines here - in those we normally 70 * TBD: should special case ZONE_DMA32 machines here - in those we normally
71 * don't need any ZONE_NORMAL reservation 71 * don't need any ZONE_NORMAL reservation
72 */ 72 */
73int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; 73int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
74 256,
75#ifdef CONFIG_ZONE_DMA32
76 256,
77#endif
78#ifdef CONFIG_HIGHMEM
79 32
80#endif
81};
74 82
75EXPORT_SYMBOL(totalram_pages); 83EXPORT_SYMBOL(totalram_pages);
76 84
@@ -81,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages);
81struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 89struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
82EXPORT_SYMBOL(zone_table); 90EXPORT_SYMBOL(zone_table);
83 91
84static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 92static char *zone_names[MAX_NR_ZONES] = {
93 "DMA",
94#ifdef CONFIG_ZONE_DMA32
95 "DMA32",
96#endif
97 "Normal",
98#ifdef CONFIG_HIGHMEM
99 "HighMem"
100#endif
101};
102
85int min_free_kbytes = 1024; 103int min_free_kbytes = 1024;
86 104
87unsigned long __meminitdata nr_kernel_pages; 105unsigned long __meminitdata nr_kernel_pages;
88unsigned long __meminitdata nr_all_pages; 106unsigned long __meminitdata nr_all_pages;
107static unsigned long __initdata dma_reserve;
108
109#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
110 /*
 111 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
112 * ranges of memory (RAM) that may be registered with add_active_range().
113 * Ranges passed to add_active_range() will be merged if possible
114 * so the number of times add_active_range() can be called is
115 * related to the number of nodes and the number of holes
116 */
117 #ifdef CONFIG_MAX_ACTIVE_REGIONS
118 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
119 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
120 #else
121 #if MAX_NUMNODES >= 32
122 /* If there can be many nodes, allow up to 50 holes per node */
123 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
124 #else
125 /* By default, allow up to 256 distinct regions */
126 #define MAX_ACTIVE_REGIONS 256
127 #endif
128 #endif
129
130 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
131 int __initdata nr_nodemap_entries;
132 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
133 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
134#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
135 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
136 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
137#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
138#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
89 139
90#ifdef CONFIG_DEBUG_VM 140#ifdef CONFIG_DEBUG_VM
91static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 141static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -128,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page)
128 178
129 return 0; 179 return 0;
130} 180}
131
132#else 181#else
133static inline int bad_range(struct zone *zone, struct page *page) 182static inline int bad_range(struct zone *zone, struct page *page)
134{ 183{
@@ -219,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
219{ 268{
220 int i; 269 int i;
221 270
222 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 271 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
223 /* 272 /*
224 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 273 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
225 * and __GFP_HIGHMEM from hard or soft interrupt context. 274 * and __GFP_HIGHMEM from hard or soft interrupt context.
226 */ 275 */
227 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 276 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
228 for (i = 0; i < (1 << order); i++) 277 for (i = 0; i < (1 << order); i++)
229 clear_highpage(page + i); 278 clear_highpage(page + i);
230} 279}
@@ -266,7 +315,7 @@ static inline void rmv_page_order(struct page *page)
266 * satisfies the following equation: 315 * satisfies the following equation:
267 * P = B & ~(1 << O) 316 * P = B & ~(1 << O)
268 * 317 *
269 * Assumption: *_mem_map is contigious at least up to MAX_ORDER 318 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
270 */ 319 */
271static inline struct page * 320static inline struct page *
272__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 321__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
@@ -348,8 +397,8 @@ static inline void __free_one_page(struct page *page,
348 397
349 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 398 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
350 399
351 BUG_ON(page_idx & (order_size - 1)); 400 VM_BUG_ON(page_idx & (order_size - 1));
352 BUG_ON(bad_range(zone, page)); 401 VM_BUG_ON(bad_range(zone, page));
353 402
354 zone->free_pages += order_size; 403 zone->free_pages += order_size;
355 while (order < MAX_ORDER-1) { 404 while (order < MAX_ORDER-1) {
@@ -422,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count,
422 while (count--) { 471 while (count--) {
423 struct page *page; 472 struct page *page;
424 473
425 BUG_ON(list_empty(list)); 474 VM_BUG_ON(list_empty(list));
426 page = list_entry(list->prev, struct page, lru); 475 page = list_entry(list->prev, struct page, lru);
427 /* have to delete it as __free_one_page list manipulates */ 476 /* have to delete it as __free_one_page list manipulates */
428 list_del(&page->lru); 477 list_del(&page->lru);
@@ -433,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count,
433 482
434static void free_one_page(struct zone *zone, struct page *page, int order) 483static void free_one_page(struct zone *zone, struct page *page, int order)
435{ 484{
436 LIST_HEAD(list); 485 spin_lock(&zone->lock);
437 list_add(&page->lru, &list); 486 zone->all_unreclaimable = 0;
438 free_pages_bulk(zone, 1, &list, order); 487 zone->pages_scanned = 0;
488 __free_one_page(page, zone ,order);
489 spin_unlock(&zone->lock);
439} 490}
440 491
441static void __free_pages_ok(struct page *page, unsigned int order) 492static void __free_pages_ok(struct page *page, unsigned int order)
@@ -446,8 +497,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
446 497
447 arch_free_page(page, order); 498 arch_free_page(page, order);
448 if (!PageHighMem(page)) 499 if (!PageHighMem(page))
449 mutex_debug_check_no_locks_freed(page_address(page), 500 debug_check_no_locks_freed(page_address(page),
450 PAGE_SIZE<<order); 501 PAGE_SIZE<<order);
451 502
452 for (i = 0 ; i < (1 << order) ; ++i) 503 for (i = 0 ; i < (1 << order) ; ++i)
453 reserved += free_pages_check(page + i); 504 reserved += free_pages_check(page + i);
@@ -456,7 +507,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
456 507
457 kernel_map_pages(page, 1 << order, 0); 508 kernel_map_pages(page, 1 << order, 0);
458 local_irq_save(flags); 509 local_irq_save(flags);
459 __mod_page_state(pgfree, 1 << order); 510 __count_vm_events(PGFREE, 1 << order);
460 free_one_page(page_zone(page), page, order); 511 free_one_page(page_zone(page), page, order);
461 local_irq_restore(flags); 512 local_irq_restore(flags);
462} 513}
@@ -513,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page,
513 area--; 564 area--;
514 high--; 565 high--;
515 size >>= 1; 566 size >>= 1;
516 BUG_ON(bad_range(zone, &page[size])); 567 VM_BUG_ON(bad_range(zone, &page[size]));
517 list_add(&page[size].lru, &area->free_list); 568 list_add(&page[size].lru, &area->free_list);
518 area->nr_free++; 569 area->nr_free++;
519 set_page_order(&page[size], high); 570 set_page_order(&page[size], high);
@@ -616,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
616#ifdef CONFIG_NUMA 667#ifdef CONFIG_NUMA
617/* 668/*
618 * Called from the slab reaper to drain pagesets on a particular node that 669 * Called from the slab reaper to drain pagesets on a particular node that
619 * belong to the currently executing processor. 670 * belongs to the currently executing processor.
620 * Note that this function must be called with the thread pinned to 671 * Note that this function must be called with the thread pinned to
621 * a single processor. 672 * a single processor.
622 */ 673 */
623void drain_node_pages(int nodeid) 674void drain_node_pages(int nodeid)
624{ 675{
625 int i, z; 676 int i;
677 enum zone_type z;
626 unsigned long flags; 678 unsigned long flags;
627 679
628 for (z = 0; z < MAX_NR_ZONES; z++) { 680 for (z = 0; z < MAX_NR_ZONES; z++) {
629 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 681 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
630 struct per_cpu_pageset *pset; 682 struct per_cpu_pageset *pset;
631 683
684 if (!populated_zone(zone))
685 continue;
686
632 pset = zone_pcp(zone, smp_processor_id()); 687 pset = zone_pcp(zone, smp_processor_id());
633 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 688 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
634 struct per_cpu_pages *pcp; 689 struct per_cpu_pages *pcp;
@@ -673,7 +728,8 @@ static void __drain_pages(unsigned int cpu)
673 728
674void mark_free_pages(struct zone *zone) 729void mark_free_pages(struct zone *zone)
675{ 730{
676 unsigned long zone_pfn, flags; 731 unsigned long pfn, max_zone_pfn;
732 unsigned long flags;
677 int order; 733 int order;
678 struct list_head *curr; 734 struct list_head *curr;
679 735
@@ -681,18 +737,25 @@ void mark_free_pages(struct zone *zone)
681 return; 737 return;
682 738
683 spin_lock_irqsave(&zone->lock, flags); 739 spin_lock_irqsave(&zone->lock, flags);
684 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 740
685 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 741 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
742 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
743 if (pfn_valid(pfn)) {
744 struct page *page = pfn_to_page(pfn);
745
746 if (!PageNosave(page))
747 ClearPageNosaveFree(page);
748 }
686 749
687 for (order = MAX_ORDER - 1; order >= 0; --order) 750 for (order = MAX_ORDER - 1; order >= 0; --order)
688 list_for_each(curr, &zone->free_area[order].free_list) { 751 list_for_each(curr, &zone->free_area[order].free_list) {
689 unsigned long start_pfn, i; 752 unsigned long i;
690 753
691 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 754 pfn = page_to_pfn(list_entry(curr, struct page, lru));
755 for (i = 0; i < (1UL << order); i++)
756 SetPageNosaveFree(pfn_to_page(pfn + i));
757 }
692 758
693 for (i=0; i < (1<<order); i++)
694 SetPageNosaveFree(pfn_to_page(start_pfn+i));
695 }
696 spin_unlock_irqrestore(&zone->lock, flags); 759 spin_unlock_irqrestore(&zone->lock, flags);
697} 760}
698 761
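The rewritten mark_free_pages() walks the zone by absolute PFN (skipping holes with pfn_valid()) to clear the "nosave free" bit, then walks each buddy free list and sets it again for every page in each 1<<order block. A toy model of that clear-then-mark pass over a flag array (pfn_valid() and the page flag are faked; names illustrative):

#include <stdio.h>
#include <string.h>

#define SPAN 64                         /* pages spanned by the toy "zone" */

struct free_block { unsigned long pfn; int order; };

static int pfn_valid(unsigned long pfn)
{
    return pfn < 32 || pfn >= 40;       /* pretend PFNs 32..39 are a hole */
}

static void mark_free_pages(unsigned char *nosave_free,
                            const struct free_block *blk, int nblk)
{
    unsigned long pfn;

    /* First pass: clear the flag everywhere the zone actually has memory. */
    for (pfn = 0; pfn < SPAN; pfn++)
        if (pfn_valid(pfn))
            nosave_free[pfn] = 0;

    /* Second pass: set it for every page sitting in a buddy free block. */
    for (int i = 0; i < nblk; i++)
        for (unsigned long j = 0; j < (1UL << blk[i].order); j++)
            nosave_free[blk[i].pfn + j] = 1;
}

int main(void)
{
    unsigned char flags[SPAN];
    struct free_block free_list[] = { { 8, 2 }, { 48, 3 } };

    memset(flags, 1, sizeof(flags));
    mark_free_pages(flags, free_list, 2);

    for (int pfn = 0; pfn < SPAN; pfn++)
        if (flags[pfn])
            printf("%d ", pfn);         /* 8-11, 32-39 (the hole), 48-55 */
    printf("\n");
    return 0;
}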
@@ -709,27 +772,6 @@ void drain_local_pages(void)
709} 772}
710#endif /* CONFIG_PM */ 773#endif /* CONFIG_PM */
711 774
712static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
713{
714#ifdef CONFIG_NUMA
715 pg_data_t *pg = z->zone_pgdat;
716 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
717 struct per_cpu_pageset *p;
718
719 p = zone_pcp(z, cpu);
720 if (pg == orig) {
721 p->numa_hit++;
722 } else {
723 p->numa_miss++;
724 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
725 }
726 if (pg == NODE_DATA(numa_node_id()))
727 p->local_node++;
728 else
729 p->other_node++;
730#endif
731}
732
733/* 775/*
734 * Free a 0-order page 776 * Free a 0-order page
735 */ 777 */
@@ -750,7 +792,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
750 792
751 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 793 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
752 local_irq_save(flags); 794 local_irq_save(flags);
753 __inc_page_state(pgfree); 795 __count_vm_event(PGFREE);
754 list_add(&page->lru, &pcp->list); 796 list_add(&page->lru, &pcp->list);
755 pcp->count++; 797 pcp->count++;
756 if (pcp->count >= pcp->high) { 798 if (pcp->count >= pcp->high) {
@@ -783,8 +825,8 @@ void split_page(struct page *page, unsigned int order)
783{ 825{
784 int i; 826 int i;
785 827
786 BUG_ON(PageCompound(page)); 828 VM_BUG_ON(PageCompound(page));
787 BUG_ON(!page_count(page)); 829 VM_BUG_ON(!page_count(page));
788 for (i = 1; i < (1 << order); i++) 830 for (i = 1; i < (1 << order); i++)
789 set_page_refcounted(page + i); 831 set_page_refcounted(page + i);
790} 832}
@@ -826,12 +868,12 @@ again:
826 goto failed; 868 goto failed;
827 } 869 }
828 870
829 __mod_page_state_zone(zone, pgalloc, 1 << order); 871 __count_zone_vm_events(PGALLOC, zone, 1 << order);
830 zone_statistics(zonelist, zone, cpu); 872 zone_statistics(zonelist, zone);
831 local_irq_restore(flags); 873 local_irq_restore(flags);
832 put_cpu(); 874 put_cpu();
833 875
834 BUG_ON(bad_range(zone, page)); 876 VM_BUG_ON(bad_range(zone, page));
835 if (prep_new_page(page, order, gfp_flags)) 877 if (prep_new_page(page, order, gfp_flags))
836 goto again; 878 goto again;
837 return page; 879 return page;
@@ -892,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
892 struct zone **z = zonelist->zones; 934 struct zone **z = zonelist->zones;
893 struct page *page = NULL; 935 struct page *page = NULL;
894 int classzone_idx = zone_idx(*z); 936 int classzone_idx = zone_idx(*z);
937 struct zone *zone;
895 938
896 /* 939 /*
897 * Go through the zonelist once, looking for a zone with enough free. 940 * Go through the zonelist once, looking for a zone with enough free.
898 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 941 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
899 */ 942 */
900 do { 943 do {
944 zone = *z;
945 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
946 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
947 break;
901 if ((alloc_flags & ALLOC_CPUSET) && 948 if ((alloc_flags & ALLOC_CPUSET) &&
902 !cpuset_zone_allowed(*z, gfp_mask)) 949 !cpuset_zone_allowed(zone, gfp_mask))
903 continue; 950 continue;
904 951
905 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 952 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
906 unsigned long mark; 953 unsigned long mark;
907 if (alloc_flags & ALLOC_WMARK_MIN) 954 if (alloc_flags & ALLOC_WMARK_MIN)
908 mark = (*z)->pages_min; 955 mark = zone->pages_min;
909 else if (alloc_flags & ALLOC_WMARK_LOW) 956 else if (alloc_flags & ALLOC_WMARK_LOW)
910 mark = (*z)->pages_low; 957 mark = zone->pages_low;
911 else 958 else
912 mark = (*z)->pages_high; 959 mark = zone->pages_high;
913 if (!zone_watermark_ok(*z, order, mark, 960 if (!zone_watermark_ok(zone , order, mark,
914 classzone_idx, alloc_flags)) 961 classzone_idx, alloc_flags))
915 if (!zone_reclaim_mode || 962 if (!zone_reclaim_mode ||
916 !zone_reclaim(*z, gfp_mask, order)) 963 !zone_reclaim(zone, gfp_mask, order))
917 continue; 964 continue;
918 } 965 }
919 966
920 page = buffered_rmqueue(zonelist, *z, order, gfp_mask); 967 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
921 if (page) { 968 if (page) {
922 break; 969 break;
923 } 970 }
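get_page_from_freelist() now caches *z in a local zone pointer, honours __GFP_THISNODE by refusing to leave the first zone's node, and picks which watermark to test (pages_min, pages_low, or pages_high) from the alloc_flags before trying the zone. A reduced model of the "pick a watermark, test it, skip or fall through" walk (names illustrative):

#include <stdio.h>

#define ALLOC_WMARK_MIN  0x1
#define ALLOC_WMARK_LOW  0x2
#define ALLOC_WMARK_HIGH 0x4

struct zone {
    const char   *name;
    unsigned long free_pages;
    unsigned long pages_min, pages_low, pages_high;
};

/* Walk the zonelist in order and return the first zone whose free page
 * count clears the watermark selected by alloc_flags. */
static struct zone *pick_zone(struct zone **zonelist, int alloc_flags)
{
    for (struct zone **z = zonelist; *z; z++) {
        struct zone *zone = *z;
        unsigned long mark;

        if (alloc_flags & ALLOC_WMARK_MIN)
            mark = zone->pages_min;
        else if (alloc_flags & ALLOC_WMARK_LOW)
            mark = zone->pages_low;
        else
            mark = zone->pages_high;

        if (zone->free_pages < mark)
            continue;               /* too low: try the next (more remote) zone */
        return zone;
    }
    return NULL;                    /* every zone below its watermark */
}

int main(void)
{
    struct zone highmem = { "HighMem", 10, 32, 64, 96 };
    struct zone normal  = { "Normal", 200, 32, 64, 96 };
    struct zone *zonelist[] = { &highmem, &normal, NULL };

    printf("%s\n", pick_zone(zonelist, ALLOC_WMARK_LOW)->name);  /* Normal */
    return 0;
}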
@@ -957,8 +1004,7 @@ restart:
957 goto got_pg; 1004 goto got_pg;
958 1005
959 do { 1006 do {
960 if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) 1007 wakeup_kswapd(*z, order);
961 wakeup_kswapd(*z, order);
962 } while (*(++z)); 1008 } while (*(++z));
963 1009
964 /* 1010 /*
@@ -1106,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1106 * get_zeroed_page() returns a 32-bit address, which cannot represent 1152 * get_zeroed_page() returns a 32-bit address, which cannot represent
1107 * a highmem page 1153 * a highmem page
1108 */ 1154 */
1109 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1155 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1110 1156
1111 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1157 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1112 if (page) 1158 if (page)
@@ -1139,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages);
1139fastcall void free_pages(unsigned long addr, unsigned int order) 1185fastcall void free_pages(unsigned long addr, unsigned int order)
1140{ 1186{
1141 if (addr != 0) { 1187 if (addr != 0) {
1142 BUG_ON(!virt_addr_valid((void *)addr)); 1188 VM_BUG_ON(!virt_addr_valid((void *)addr));
1143 __free_pages(virt_to_page((void *)addr), order); 1189 __free_pages(virt_to_page((void *)addr), order);
1144 } 1190 }
1145} 1191}
@@ -1165,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages);
1165#ifdef CONFIG_NUMA 1211#ifdef CONFIG_NUMA
1166unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1212unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1167{ 1213{
1168 unsigned int i, sum = 0; 1214 unsigned int sum = 0;
1215 enum zone_type i;
1169 1216
1170 for (i = 0; i < MAX_NR_ZONES; i++) 1217 for (i = 0; i < MAX_NR_ZONES; i++)
1171 sum += pgdat->node_zones[i].free_pages; 1218 sum += pgdat->node_zones[i].free_pages;
@@ -1210,161 +1257,10 @@ unsigned int nr_free_pagecache_pages(void)
1210 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1257 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1211} 1258}
1212 1259
1213#ifdef CONFIG_HIGHMEM 1260static inline void show_node(struct zone *zone)
1214unsigned int nr_free_highpages (void)
1215{
1216 pg_data_t *pgdat;
1217 unsigned int pages = 0;
1218
1219 for_each_online_pgdat(pgdat)
1220 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1221
1222 return pages;
1223}
1224#endif
1225
1226#ifdef CONFIG_NUMA
1227static void show_node(struct zone *zone)
1228{
1229 printk("Node %d ", zone->zone_pgdat->node_id);
1230}
1231#else
1232#define show_node(zone) do { } while (0)
1233#endif
1234
1235/*
1236 * Accumulate the page_state information across all CPUs.
1237 * The result is unavoidably approximate - it can change
1238 * during and after execution of this function.
1239 */
1240static DEFINE_PER_CPU(struct page_state, page_states) = {0};
1241
1242atomic_t nr_pagecache = ATOMIC_INIT(0);
1243EXPORT_SYMBOL(nr_pagecache);
1244#ifdef CONFIG_SMP
1245DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1246#endif
1247
1248static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1249{
1250 unsigned cpu;
1251
1252 memset(ret, 0, nr * sizeof(unsigned long));
1253 cpus_and(*cpumask, *cpumask, cpu_online_map);
1254
1255 for_each_cpu_mask(cpu, *cpumask) {
1256 unsigned long *in;
1257 unsigned long *out;
1258 unsigned off;
1259 unsigned next_cpu;
1260
1261 in = (unsigned long *)&per_cpu(page_states, cpu);
1262
1263 next_cpu = next_cpu(cpu, *cpumask);
1264 if (likely(next_cpu < NR_CPUS))
1265 prefetch(&per_cpu(page_states, next_cpu));
1266
1267 out = (unsigned long *)ret;
1268 for (off = 0; off < nr; off++)
1269 *out++ += *in++;
1270 }
1271}
1272
1273void get_page_state_node(struct page_state *ret, int node)
1274{
1275 int nr;
1276 cpumask_t mask = node_to_cpumask(node);
1277
1278 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1279 nr /= sizeof(unsigned long);
1280
1281 __get_page_state(ret, nr+1, &mask);
1282}
1283
1284void get_page_state(struct page_state *ret)
1285{
1286 int nr;
1287 cpumask_t mask = CPU_MASK_ALL;
1288
1289 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1290 nr /= sizeof(unsigned long);
1291
1292 __get_page_state(ret, nr + 1, &mask);
1293}
1294
1295void get_full_page_state(struct page_state *ret)
1296{
1297 cpumask_t mask = CPU_MASK_ALL;
1298
1299 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1300}
1301
1302unsigned long read_page_state_offset(unsigned long offset)
1303{
1304 unsigned long ret = 0;
1305 int cpu;
1306
1307 for_each_online_cpu(cpu) {
1308 unsigned long in;
1309
1310 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1311 ret += *((unsigned long *)in);
1312 }
1313 return ret;
1314}
1315
1316void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1317{ 1261{
1318 void *ptr; 1262 if (NUMA_BUILD)
1319 1263 printk("Node %ld ", zone_to_nid(zone));
1320 ptr = &__get_cpu_var(page_states);
1321 *(unsigned long *)(ptr + offset) += delta;
1322}
1323EXPORT_SYMBOL(__mod_page_state_offset);
1324
1325void mod_page_state_offset(unsigned long offset, unsigned long delta)
1326{
1327 unsigned long flags;
1328 void *ptr;
1329
1330 local_irq_save(flags);
1331 ptr = &__get_cpu_var(page_states);
1332 *(unsigned long *)(ptr + offset) += delta;
1333 local_irq_restore(flags);
1334}
1335EXPORT_SYMBOL(mod_page_state_offset);
1336
1337void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1338 unsigned long *free, struct pglist_data *pgdat)
1339{
1340 struct zone *zones = pgdat->node_zones;
1341 int i;
1342
1343 *active = 0;
1344 *inactive = 0;
1345 *free = 0;
1346 for (i = 0; i < MAX_NR_ZONES; i++) {
1347 *active += zones[i].nr_active;
1348 *inactive += zones[i].nr_inactive;
1349 *free += zones[i].free_pages;
1350 }
1351}
1352
1353void get_zone_counts(unsigned long *active,
1354 unsigned long *inactive, unsigned long *free)
1355{
1356 struct pglist_data *pgdat;
1357
1358 *active = 0;
1359 *inactive = 0;
1360 *free = 0;
1361 for_each_online_pgdat(pgdat) {
1362 unsigned long l, m, n;
1363 __get_zone_counts(&l, &m, &n, pgdat);
1364 *active += l;
1365 *inactive += m;
1366 *free += n;
1367 }
1368} 1264}
1369 1265
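The block removed above folded an entire struct page_state by walking every online CPU and summing each per-CPU copy field by field; the replacement code later in this hunk reads individual counters through global_page_state() instead. Both rely on the same idea: per-CPU deltas are cheap to update locally and are only summed when a total is needed. A minimal user-space sketch of that folding pattern, with invented array sizes and values rather than kernel definitions:

#include <stdio.h>

#define NR_CPUS     4   /* illustrative only */
#define NR_COUNTERS 3

/* per-CPU counter deltas, one flat array per CPU */
static long per_cpu_counters[NR_CPUS][NR_COUNTERS] = {
        { 10, 2, 0 },
        {  7, 0, 1 },
        {  3, 5, 2 },
        {  0, 1, 0 },
};

/* fold every CPU's deltas into one global total per counter */
static void fold_counters(long *global)
{
        int cpu, i;

        for (i = 0; i < NR_COUNTERS; i++)
                global[i] = 0;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                for (i = 0; i < NR_COUNTERS; i++)
                        global[i] += per_cpu_counters[cpu][i];
}

int main(void)
{
        long global[NR_COUNTERS];
        int i;

        fold_counters(global);
        for (i = 0; i < NR_COUNTERS; i++)
                printf("counter %d = %ld\n", i, global[i]);
        return 0;       /* prints 20, 8, 3 */
}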
1370void si_meminfo(struct sysinfo *val) 1266void si_meminfo(struct sysinfo *val)
@@ -1373,13 +1269,8 @@ void si_meminfo(struct sysinfo *val)
1373 val->sharedram = 0; 1269 val->sharedram = 0;
1374 val->freeram = nr_free_pages(); 1270 val->freeram = nr_free_pages();
1375 val->bufferram = nr_blockdev_pages(); 1271 val->bufferram = nr_blockdev_pages();
1376#ifdef CONFIG_HIGHMEM
1377 val->totalhigh = totalhigh_pages; 1272 val->totalhigh = totalhigh_pages;
1378 val->freehigh = nr_free_highpages(); 1273 val->freehigh = nr_free_highpages();
1379#else
1380 val->totalhigh = 0;
1381 val->freehigh = 0;
1382#endif
1383 val->mem_unit = PAGE_SIZE; 1274 val->mem_unit = PAGE_SIZE;
1384} 1275}
1385 1276
@@ -1392,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1392 1283
1393 val->totalram = pgdat->node_present_pages; 1284 val->totalram = pgdat->node_present_pages;
1394 val->freeram = nr_free_pages_pgdat(pgdat); 1285 val->freeram = nr_free_pages_pgdat(pgdat);
1286#ifdef CONFIG_HIGHMEM
1395 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1287 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1396 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1288 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1289#else
1290 val->totalhigh = 0;
1291 val->freehigh = 0;
1292#endif
1397 val->mem_unit = PAGE_SIZE; 1293 val->mem_unit = PAGE_SIZE;
1398} 1294}
1399#endif 1295#endif
@@ -1407,60 +1303,54 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1407 */ 1303 */
1408void show_free_areas(void) 1304void show_free_areas(void)
1409{ 1305{
1410 struct page_state ps; 1306 int cpu;
1411 int cpu, temperature;
1412 unsigned long active; 1307 unsigned long active;
1413 unsigned long inactive; 1308 unsigned long inactive;
1414 unsigned long free; 1309 unsigned long free;
1415 struct zone *zone; 1310 struct zone *zone;
1416 1311
1417 for_each_zone(zone) { 1312 for_each_zone(zone) {
1418 show_node(zone); 1313 if (!populated_zone(zone))
1419 printk("%s per-cpu:", zone->name);
1420
1421 if (!populated_zone(zone)) {
1422 printk(" empty\n");
1423 continue; 1314 continue;
1424 } else 1315
1425 printk("\n"); 1316 show_node(zone);
1317 printk("%s per-cpu:\n", zone->name);
1426 1318
1427 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1428 struct per_cpu_pageset *pageset; 1320 struct per_cpu_pageset *pageset;
1429 1321
1430 pageset = zone_pcp(zone, cpu); 1322 pageset = zone_pcp(zone, cpu);
1431 1323
1432 for (temperature = 0; temperature < 2; temperature++) 1324 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1433 printk("cpu %d %s: high %d, batch %d used:%d\n", 1325 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1434 cpu, 1326 cpu, pageset->pcp[0].high,
1435 temperature ? "cold" : "hot", 1327 pageset->pcp[0].batch, pageset->pcp[0].count,
1436 pageset->pcp[temperature].high, 1328 pageset->pcp[1].high, pageset->pcp[1].batch,
1437 pageset->pcp[temperature].batch, 1329 pageset->pcp[1].count);
1438 pageset->pcp[temperature].count);
1439 } 1330 }
1440 } 1331 }
1441 1332
1442 get_page_state(&ps);
1443 get_zone_counts(&active, &inactive, &free); 1333 get_zone_counts(&active, &inactive, &free);
1444 1334
1445 printk("Free pages: %11ukB (%ukB HighMem)\n",
1446 K(nr_free_pages()),
1447 K(nr_free_highpages()));
1448
1449 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1335 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1450 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1336 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1451 active, 1337 active,
1452 inactive, 1338 inactive,
1453 ps.nr_dirty, 1339 global_page_state(NR_FILE_DIRTY),
1454 ps.nr_writeback, 1340 global_page_state(NR_WRITEBACK),
1455 ps.nr_unstable, 1341 global_page_state(NR_UNSTABLE_NFS),
1456 nr_free_pages(), 1342 nr_free_pages(),
1457 ps.nr_slab, 1343 global_page_state(NR_SLAB_RECLAIMABLE) +
1458 ps.nr_mapped, 1344 global_page_state(NR_SLAB_UNRECLAIMABLE),
1459 ps.nr_page_table_pages); 1345 global_page_state(NR_FILE_MAPPED),
1346 global_page_state(NR_PAGETABLE));
1460 1347
1461 for_each_zone(zone) { 1348 for_each_zone(zone) {
1462 int i; 1349 int i;
1463 1350
1351 if (!populated_zone(zone))
1352 continue;
1353
1464 show_node(zone); 1354 show_node(zone);
1465 printk("%s" 1355 printk("%s"
1466 " free:%lukB" 1356 " free:%lukB"
@@ -1493,12 +1383,11 @@ void show_free_areas(void)
1493 for_each_zone(zone) { 1383 for_each_zone(zone) {
1494 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1384 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1495 1385
1386 if (!populated_zone(zone))
1387 continue;
1388
1496 show_node(zone); 1389 show_node(zone);
1497 printk("%s: ", zone->name); 1390 printk("%s: ", zone->name);
1498 if (!populated_zone(zone)) {
1499 printk("empty\n");
1500 continue;
1501 }
1502 1391
1503 spin_lock_irqsave(&zone->lock, flags); 1392 spin_lock_irqsave(&zone->lock, flags);
1504 for (order = 0; order < MAX_ORDER; order++) { 1393 for (order = 0; order < MAX_ORDER; order++) {
@@ -1520,39 +1409,25 @@ void show_free_areas(void)
1520 * Add all populated zones of a node to the zonelist. 1409 * Add all populated zones of a node to the zonelist.
1521 */ 1410 */
1522static int __meminit build_zonelists_node(pg_data_t *pgdat, 1411static int __meminit build_zonelists_node(pg_data_t *pgdat,
1523 struct zonelist *zonelist, int nr_zones, int zone_type) 1412 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1524{ 1413{
1525 struct zone *zone; 1414 struct zone *zone;
1526 1415
1527 BUG_ON(zone_type > ZONE_HIGHMEM); 1416 BUG_ON(zone_type >= MAX_NR_ZONES);
1417 zone_type++;
1528 1418
1529 do { 1419 do {
1420 zone_type--;
1530 zone = pgdat->node_zones + zone_type; 1421 zone = pgdat->node_zones + zone_type;
1531 if (populated_zone(zone)) { 1422 if (populated_zone(zone)) {
1532#ifndef CONFIG_HIGHMEM
1533 BUG_ON(zone_type > ZONE_NORMAL);
1534#endif
1535 zonelist->zones[nr_zones++] = zone; 1423 zonelist->zones[nr_zones++] = zone;
1536 check_highest_zone(zone_type); 1424 check_highest_zone(zone_type);
1537 } 1425 }
1538 zone_type--;
1539 1426
1540 } while (zone_type >= 0); 1427 } while (zone_type);
1541 return nr_zones; 1428 return nr_zones;
1542} 1429}
1543 1430
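Since zone_type is now an enum, which a compiler may treat as an unsigned type, the old "while (zone_type >= 0)" exit test might never become false; the rewrite above bumps the index once before entering the loop and stops when it reaches zero. A standalone sketch of the resulting visiting order (the zone enum here is illustrative and need not match any particular configuration):

#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static void walk_down_from(enum zone_type zone_type)
{
        zone_type++;                    /* pre-bump, as in the patched loop */
        do {
                zone_type--;
                printf("visit zone index %d\n", (int)zone_type);
        } while (zone_type);            /* stops after visiting index 0 */
}

int main(void)
{
        walk_down_from(ZONE_HIGHMEM);   /* prints 3 2 1 0 */
        walk_down_from(ZONE_DMA);       /* prints just 0 */
        return 0;
}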
1544static inline int highest_zone(int zone_bits)
1545{
1546 int res = ZONE_NORMAL;
1547 if (zone_bits & (__force int)__GFP_HIGHMEM)
1548 res = ZONE_HIGHMEM;
1549 if (zone_bits & (__force int)__GFP_DMA32)
1550 res = ZONE_DMA32;
1551 if (zone_bits & (__force int)__GFP_DMA)
1552 res = ZONE_DMA;
1553 return res;
1554}
1555
1556#ifdef CONFIG_NUMA 1431#ifdef CONFIG_NUMA
1557#define MAX_NODE_LOAD (num_online_nodes()) 1432#define MAX_NODE_LOAD (num_online_nodes())
1558static int __meminitdata node_load[MAX_NUMNODES]; 1433static int __meminitdata node_load[MAX_NUMNODES];
@@ -1618,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1618 1493
1619static void __meminit build_zonelists(pg_data_t *pgdat) 1494static void __meminit build_zonelists(pg_data_t *pgdat)
1620{ 1495{
1621 int i, j, k, node, local_node; 1496 int j, node, local_node;
1497 enum zone_type i;
1622 int prev_node, load; 1498 int prev_node, load;
1623 struct zonelist *zonelist; 1499 struct zonelist *zonelist;
1624 nodemask_t used_mask; 1500 nodemask_t used_mask;
1625 1501
1626 /* initialize zonelists */ 1502 /* initialize zonelists */
1627 for (i = 0; i < GFP_ZONETYPES; i++) { 1503 for (i = 0; i < MAX_NR_ZONES; i++) {
1628 zonelist = pgdat->node_zonelists + i; 1504 zonelist = pgdat->node_zonelists + i;
1629 zonelist->zones[0] = NULL; 1505 zonelist->zones[0] = NULL;
1630 } 1506 }
@@ -1654,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1654 node_load[node] += load; 1530 node_load[node] += load;
1655 prev_node = node; 1531 prev_node = node;
1656 load--; 1532 load--;
1657 for (i = 0; i < GFP_ZONETYPES; i++) { 1533 for (i = 0; i < MAX_NR_ZONES; i++) {
1658 zonelist = pgdat->node_zonelists + i; 1534 zonelist = pgdat->node_zonelists + i;
1659 for (j = 0; zonelist->zones[j] != NULL; j++); 1535 for (j = 0; zonelist->zones[j] != NULL; j++);
1660 1536
1661 k = highest_zone(i); 1537 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1662
1663 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1664 zonelist->zones[j] = NULL; 1538 zonelist->zones[j] = NULL;
1665 } 1539 }
1666 } 1540 }
@@ -1670,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1670 1544
1671static void __meminit build_zonelists(pg_data_t *pgdat) 1545static void __meminit build_zonelists(pg_data_t *pgdat)
1672{ 1546{
1673 int i, j, k, node, local_node; 1547 int node, local_node;
1548 enum zone_type i,j;
1674 1549
1675 local_node = pgdat->node_id; 1550 local_node = pgdat->node_id;
1676 for (i = 0; i < GFP_ZONETYPES; i++) { 1551 for (i = 0; i < MAX_NR_ZONES; i++) {
1677 struct zonelist *zonelist; 1552 struct zonelist *zonelist;
1678 1553
1679 zonelist = pgdat->node_zonelists + i; 1554 zonelist = pgdat->node_zonelists + i;
1680 1555
1681 j = 0; 1556 j = build_zonelists_node(pgdat, zonelist, 0, i);
1682 k = highest_zone(i);
1683 j = build_zonelists_node(pgdat, zonelist, j, k);
1684 /* 1557 /*
1685 * Now we build the zonelist so that it contains the zones 1558 * Now we build the zonelist so that it contains the zones
1686 * of all the other nodes. 1559 * of all the other nodes.
@@ -1692,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1692 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1565 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1693 if (!node_online(node)) 1566 if (!node_online(node))
1694 continue; 1567 continue;
1695 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1568 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1696 } 1569 }
1697 for (node = 0; node < local_node; node++) { 1570 for (node = 0; node < local_node; node++) {
1698 if (!node_online(node)) 1571 if (!node_online(node))
1699 continue; 1572 continue;
1700 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1573 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1701 } 1574 }
1702 1575
1703 zonelist->zones[j] = NULL; 1576 zonelist->zones[j] = NULL;
@@ -1718,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
1718void __meminit build_all_zonelists(void) 1591void __meminit build_all_zonelists(void)
1719{ 1592{
1720 if (system_state == SYSTEM_BOOTING) { 1593 if (system_state == SYSTEM_BOOTING) {
1721 __build_all_zonelists(0); 1594 __build_all_zonelists(NULL);
1722 cpuset_init_current_mems_allowed(); 1595 cpuset_init_current_mems_allowed();
1723 } else { 1596 } else {
1724 /* we have to stop all cpus to guarantee there is no user 1597
@@ -1799,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
1799 1672
1800#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1673#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1801 1674
1802static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1803 unsigned long *zones_size, unsigned long *zholes_size)
1804{
1805 unsigned long realtotalpages, totalpages = 0;
1806 int i;
1807
1808 for (i = 0; i < MAX_NR_ZONES; i++)
1809 totalpages += zones_size[i];
1810 pgdat->node_spanned_pages = totalpages;
1811
1812 realtotalpages = totalpages;
1813 if (zholes_size)
1814 for (i = 0; i < MAX_NR_ZONES; i++)
1815 realtotalpages -= zholes_size[i];
1816 pgdat->node_present_pages = realtotalpages;
1817 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1818}
1819
1820
1821/* 1675/*
1822 * Initially all pages are reserved - free ones are freed 1676 * Initially all pages are reserved - free ones are freed
1823 * up by free_all_bootmem() once the early boot process is 1677 * up by free_all_bootmem() once the early boot process is
@@ -1858,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1858} 1712}
1859 1713
1860#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1714#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1861void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1715void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1862 unsigned long size) 1716 unsigned long pfn, unsigned long size)
1863{ 1717{
1864 unsigned long snum = pfn_to_section_nr(pfn); 1718 unsigned long snum = pfn_to_section_nr(pfn);
1865 unsigned long end = pfn_to_section_nr(pfn + size); 1719 unsigned long end = pfn_to_section_nr(pfn + size);
@@ -1975,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
1975 1829
1976 for_each_zone(zone) { 1830 for_each_zone(zone) {
1977 1831
1832 if (!populated_zone(zone))
1833 continue;
1834
1978 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 1835 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1979 GFP_KERNEL, cpu_to_node(cpu)); 1836 GFP_KERNEL, cpu_to_node(cpu));
1980 if (!zone_pcp(zone, cpu)) 1837 if (!zone_pcp(zone, cpu))
@@ -2005,12 +1862,14 @@ static inline void free_zone_pagesets(int cpu)
2005 for_each_zone(zone) { 1862 for_each_zone(zone) {
2006 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1863 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2007 1864
1865 /* Free per_cpu_pageset if it is slab allocated */
1866 if (pset != &boot_pageset[cpu])
1867 kfree(pset);
2008 zone_pcp(zone, cpu) = NULL; 1868 zone_pcp(zone, cpu) = NULL;
2009 kfree(pset);
2010 } 1869 }
2011} 1870}
2012 1871
2013static int pageset_cpuup_callback(struct notifier_block *nfb, 1872static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2014 unsigned long action, 1873 unsigned long action,
2015 void *hcpu) 1874 void *hcpu)
2016{ 1875{
@@ -2032,7 +1891,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb,
2032 return ret; 1891 return ret;
2033} 1892}
2034 1893
2035static struct notifier_block pageset_notifier = 1894static struct notifier_block __cpuinitdata pageset_notifier =
2036 { &pageset_cpuup_callback, NULL, 0 }; 1895 { &pageset_cpuup_callback, NULL, 0 };
2037 1896
2038void __init setup_per_cpu_pageset(void) 1897void __init setup_per_cpu_pageset(void)
@@ -2132,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2132 return 0; 1991 return 0;
2133} 1992}
2134 1993
1994#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
1995/*
1996 * Basic iterator support. Return the first range of PFNs for a node
1997 * Note: nid == MAX_NUMNODES returns first region regardless of node
1998 */
1999static int __init first_active_region_index_in_nid(int nid)
2000{
2001 int i;
2002
2003 for (i = 0; i < nr_nodemap_entries; i++)
2004 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2005 return i;
2006
2007 return -1;
2008}
2009
2010/*
2011 * Basic iterator support. Return the next active range of PFNs for a node
2012 * Note: nid == MAX_NUMNODES returns next region regardless of node
2013 */
2014static int __init next_active_region_index_in_nid(int index, int nid)
2015{
2016 for (index = index + 1; index < nr_nodemap_entries; index++)
2017 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2018 return index;
2019
2020 return -1;
2021}
2022
2023#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2024/*
2025 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2026 * Architectures may implement their own version but if add_active_range()
2027 * was used and there are no special requirements, this is a convenient
2028 * alternative
2029 */
2030int __init early_pfn_to_nid(unsigned long pfn)
2031{
2032 int i;
2033
2034 for (i = 0; i < nr_nodemap_entries; i++) {
2035 unsigned long start_pfn = early_node_map[i].start_pfn;
2036 unsigned long end_pfn = early_node_map[i].end_pfn;
2037
2038 if (start_pfn <= pfn && pfn < end_pfn)
2039 return early_node_map[i].nid;
2040 }
2041
2042 return 0;
2043}
2044#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2045
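early_pfn_to_nid() above is a plain linear scan: the first registered [start_pfn, end_pfn) interval containing the PFN decides the node, and node 0 is the fallback when nothing matches. The same lookup as a self-contained sketch over a made-up two-node map:

#include <stdio.h>

struct range { int nid; unsigned long start_pfn, end_pfn; };

/* hypothetical early_node_map contents, not real hardware values */
static struct range map[] = {
        { 0, 0x000, 0x400 },
        { 1, 0x400, 0x800 },
};

static int pfn_to_nid(unsigned long pfn)
{
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
                if (map[i].start_pfn <= pfn && pfn < map[i].end_pfn)
                        return map[i].nid;
        return 0;       /* same fallback as the helper above */
}

int main(void)
{
        printf("%d %d %d\n", pfn_to_nid(0x100), pfn_to_nid(0x500), pfn_to_nid(0x900));
        return 0;       /* prints: 0 1 0 */
}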
2046/* Basic iterator support to walk early_node_map[] */
2047#define for_each_active_range_index_in_nid(i, nid) \
2048 for (i = first_active_region_index_in_nid(nid); i != -1; \
2049 i = next_active_region_index_in_nid(i, nid))
2050
2051/**
2052 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2053 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
2054 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2055 *
2056 * If an architecture guarantees that all ranges registered with
2057 * add_active_ranges() contain no holes and may be freed,
2058 * this function may be used instead of calling free_bootmem() manually.
2059 */
2060void __init free_bootmem_with_active_regions(int nid,
2061 unsigned long max_low_pfn)
2062{
2063 int i;
2064
2065 for_each_active_range_index_in_nid(i, nid) {
2066 unsigned long size_pages = 0;
2067 unsigned long end_pfn = early_node_map[i].end_pfn;
2068
2069 if (early_node_map[i].start_pfn >= max_low_pfn)
2070 continue;
2071
2072 if (end_pfn > max_low_pfn)
2073 end_pfn = max_low_pfn;
2074
2075 size_pages = end_pfn - early_node_map[i].start_pfn;
2076 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2077 PFN_PHYS(early_node_map[i].start_pfn),
2078 size_pages << PAGE_SHIFT);
2079 }
2080}
2081
2082/**
2083 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2084 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
2085 *
2086 * If an architecture guarantees that all ranges registered with
2087 * add_active_ranges() contain no holes and may be freed,
2088 * this function may be used instead of calling memory_present() manually.
2089 */
2090void __init sparse_memory_present_with_active_regions(int nid)
2091{
2092 int i;
2093
2094 for_each_active_range_index_in_nid(i, nid)
2095 memory_present(early_node_map[i].nid,
2096 early_node_map[i].start_pfn,
2097 early_node_map[i].end_pfn);
2098}
2099
2100/**
2101 * push_node_boundaries - Push node boundaries to at least the requested boundary
2102 * @nid: The nid of the node to push the boundary for
2103 * @start_pfn: The start pfn of the node
2104 * @end_pfn: The end pfn of the node
2105 *
2106 * In reserve-based hot-add, mem_map is allocated for memory that is unused until hotadd
2107 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2108 * be hotplugged even though no physical memory exists. This function allows
2109 * an arch to push out the node boundaries so mem_map is allocated that can
2110 * be used later.
2111 */
2112#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2113void __init push_node_boundaries(unsigned int nid,
2114 unsigned long start_pfn, unsigned long end_pfn)
2115{
2116 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2117 nid, start_pfn, end_pfn);
2118
2119 /* Initialise the boundary for this node if necessary */
2120 if (node_boundary_end_pfn[nid] == 0)
2121 node_boundary_start_pfn[nid] = -1UL;
2122
2123 /* Update the boundaries */
2124 if (node_boundary_start_pfn[nid] > start_pfn)
2125 node_boundary_start_pfn[nid] = start_pfn;
2126 if (node_boundary_end_pfn[nid] < end_pfn)
2127 node_boundary_end_pfn[nid] = end_pfn;
2128}
2129
2130/* If necessary, push the node boundary out for reserve hotadd */
2131static void __init account_node_boundary(unsigned int nid,
2132 unsigned long *start_pfn, unsigned long *end_pfn)
2133{
2134 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2135 nid, *start_pfn, *end_pfn);
2136
2137 /* Return if boundary information has not been provided */
2138 if (node_boundary_end_pfn[nid] == 0)
2139 return;
2140
2141 /* Check the boundaries and update if necessary */
2142 if (node_boundary_start_pfn[nid] < *start_pfn)
2143 *start_pfn = node_boundary_start_pfn[nid];
2144 if (node_boundary_end_pfn[nid] > *end_pfn)
2145 *end_pfn = node_boundary_end_pfn[nid];
2146}
2147#else
2148void __init push_node_boundaries(unsigned int nid,
2149 unsigned long start_pfn, unsigned long end_pfn) {}
2150
2151static void __init account_node_boundary(unsigned int nid,
2152 unsigned long *start_pfn, unsigned long *end_pfn) {}
2153#endif
2154
2155
2156/**
2157 * get_pfn_range_for_nid - Return the start and end page frames for a node
2158 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
2159 * @start_pfn: Passed by reference. On return, it will have the node start_pfn
2160 * @end_pfn: Passed by reference. On return, it will have the node end_pfn
2161 *
2162 * It returns the start and end page frame of a node based on information
2163 * provided by an arch calling add_active_range(). If called for a node
2164 * with no available memory, a warning is printed and the start and end
2165 * PFNs will be 0
2166 */
2167void __init get_pfn_range_for_nid(unsigned int nid,
2168 unsigned long *start_pfn, unsigned long *end_pfn)
2169{
2170 int i;
2171 *start_pfn = -1UL;
2172 *end_pfn = 0;
2173
2174 for_each_active_range_index_in_nid(i, nid) {
2175 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2176 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2177 }
2178
2179 if (*start_pfn == -1UL) {
2180 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2181 *start_pfn = 0;
2182 }
2183
2184 /* Push the node boundaries out if requested */
2185 account_node_boundary(nid, start_pfn, end_pfn);
2186}
2187
2188/*
2189 * Return the number of pages a zone spans in a node, including holes
2190 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2191 */
2192unsigned long __init zone_spanned_pages_in_node(int nid,
2193 unsigned long zone_type,
2194 unsigned long *ignored)
2195{
2196 unsigned long node_start_pfn, node_end_pfn;
2197 unsigned long zone_start_pfn, zone_end_pfn;
2198
2199 /* Get the start and end of the node and zone */
2200 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2201 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2202 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2203
2204 /* Check that this node has pages within the zone's required range */
2205 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2206 return 0;
2207
2208 /* Move the zone boundaries inside the node if necessary */
2209 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2210 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2211
2212 /* Return the spanned pages */
2213 return zone_end_pfn - zone_start_pfn;
2214}
2215
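zone_spanned_pages_in_node() above is interval arithmetic: clamp the architecture's zone limits to the node's limits and return the remaining width, or zero when the two intervals do not overlap. The same arithmetic as a tiny standalone helper (all PFN values invented for the example):

#include <stdio.h>

static unsigned long spanned(unsigned long zone_start, unsigned long zone_end,
                             unsigned long node_start, unsigned long node_end)
{
        if (zone_end < node_start || zone_start > node_end)
                return 0;               /* zone and node do not overlap */
        if (zone_end > node_end)
                zone_end = node_end;    /* clamp the zone to the node */
        if (zone_start < node_start)
                zone_start = node_start;
        return zone_end - zone_start;
}

int main(void)
{
        /* zone covers PFNs 0x1000-0x5000, node covers 0x3000-0x8000 */
        printf("%lu\n", spanned(0x1000, 0x5000, 0x3000, 0x8000));
        return 0;       /* prints 8192, i.e. 0x2000 pages of overlap */
}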
2216/*
2217 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2218 * then all holes in the requested range will be accounted for
2219 */
2220unsigned long __init __absent_pages_in_range(int nid,
2221 unsigned long range_start_pfn,
2222 unsigned long range_end_pfn)
2223{
2224 int i = 0;
2225 unsigned long prev_end_pfn = 0, hole_pages = 0;
2226 unsigned long start_pfn;
2227
2228 /* Find the end_pfn of the first active range of pfns in the node */
2229 i = first_active_region_index_in_nid(nid);
2230 if (i == -1)
2231 return 0;
2232
2233 /* Account for ranges before physical memory on this node */
2234 if (early_node_map[i].start_pfn > range_start_pfn)
2235 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2236
2237 prev_end_pfn = early_node_map[i].start_pfn;
2238
2239 /* Find all holes for the zone within the node */
2240 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2241
2242 /* No need to continue if prev_end_pfn is outside the zone */
2243 if (prev_end_pfn >= range_end_pfn)
2244 break;
2245
2246 /* Make sure the end of the zone is not within the hole */
2247 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2248 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2249
2250 /* Update the hole size count and move on */
2251 if (start_pfn > range_start_pfn) {
2252 BUG_ON(prev_end_pfn > start_pfn);
2253 hole_pages += start_pfn - prev_end_pfn;
2254 }
2255 prev_end_pfn = early_node_map[i].end_pfn;
2256 }
2257
2258 /* Account for ranges past physical memory on this node */
2259 if (range_end_pfn > prev_end_pfn)
2260 hole_pages = range_end_pfn -
2261 max(range_start_pfn, prev_end_pfn);
2262
2263 return hole_pages;
2264}
2265
2266/**
2267 * absent_pages_in_range - Return number of page frames in holes within a range
2268 * @start_pfn: The start PFN to start searching for holes
2269 * @end_pfn: The end PFN to stop searching for holes
2270 *
2271 * It returns the number of page frames in memory holes within a range
2272 */
2273unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2274 unsigned long end_pfn)
2275{
2276 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2277}
2278
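For disjoint active ranges, the hole count that __absent_pages_in_range() walks out can also be expressed as the requested span minus however many pages the registered ranges cover inside it. A simplified user-space sketch of that equivalent formulation (it assumes disjoint ranges and does not reproduce the kernel's exact traversal; the ranges below are invented):

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* invented, disjoint active ranges inside one node */
static struct range active[] = {
        { 0x000, 0x100 },
        { 0x180, 0x200 },
};

/* holes = span minus the pages the active ranges cover inside it */
static unsigned long holes(unsigned long start, unsigned long end)
{
        unsigned long covered = 0;
        unsigned int i;

        for (i = 0; i < sizeof(active) / sizeof(active[0]); i++) {
                unsigned long s = active[i].start_pfn > start ? active[i].start_pfn : start;
                unsigned long e = active[i].end_pfn < end ? active[i].end_pfn : end;

                if (e > s)
                        covered += e - s;
        }
        return (end - start) - covered;
}

int main(void)
{
        printf("%lu\n", holes(0x000, 0x200));
        return 0;       /* prints 128: the 0x100-0x180 gap */
}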
2279/* Return the number of page frames in holes in a zone on a node */
2280unsigned long __init zone_absent_pages_in_node(int nid,
2281 unsigned long zone_type,
2282 unsigned long *ignored)
2283{
2284 unsigned long node_start_pfn, node_end_pfn;
2285 unsigned long zone_start_pfn, zone_end_pfn;
2286
2287 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2288 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2289 node_start_pfn);
2290 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2291 node_end_pfn);
2292
2293 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2294}
2295
2296/* Return the zone index a PFN is in */
2297int memmap_zone_idx(struct page *lmem_map)
2298{
2299 int i;
2300 unsigned long phys_addr = virt_to_phys(lmem_map);
2301 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2302
2303 for (i = 0; i < MAX_NR_ZONES; i++)
2304 if (pfn < arch_zone_highest_possible_pfn[i])
2305 break;
2306
2307 return i;
2308}
2309#else
2310static inline unsigned long zone_spanned_pages_in_node(int nid,
2311 unsigned long zone_type,
2312 unsigned long *zones_size)
2313{
2314 return zones_size[zone_type];
2315}
2316
2317static inline unsigned long zone_absent_pages_in_node(int nid,
2318 unsigned long zone_type,
2319 unsigned long *zholes_size)
2320{
2321 if (!zholes_size)
2322 return 0;
2323
2324 return zholes_size[zone_type];
2325}
2326
2327static inline int memmap_zone_idx(struct page *lmem_map)
2328{
2329 return MAX_NR_ZONES;
2330}
2331#endif
2332
2333static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2334 unsigned long *zones_size, unsigned long *zholes_size)
2335{
2336 unsigned long realtotalpages, totalpages = 0;
2337 enum zone_type i;
2338
2339 for (i = 0; i < MAX_NR_ZONES; i++)
2340 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2341 zones_size);
2342 pgdat->node_spanned_pages = totalpages;
2343
2344 realtotalpages = totalpages;
2345 for (i = 0; i < MAX_NR_ZONES; i++)
2346 realtotalpages -=
2347 zone_absent_pages_in_node(pgdat->node_id, i,
2348 zholes_size);
2349 pgdat->node_present_pages = realtotalpages;
2350 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2351 realtotalpages);
2352}
2353
2135/* 2354/*
2136 * Set up the zone data structures: 2355 * Set up the zone data structures:
2137 * - mark all pages reserved 2356 * - mark all pages reserved
@@ -2141,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2141static void __meminit free_area_init_core(struct pglist_data *pgdat, 2360static void __meminit free_area_init_core(struct pglist_data *pgdat,
2142 unsigned long *zones_size, unsigned long *zholes_size) 2361 unsigned long *zones_size, unsigned long *zholes_size)
2143{ 2362{
2144 unsigned long j; 2363 enum zone_type j;
2145 int nid = pgdat->node_id; 2364 int nid = pgdat->node_id;
2146 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2365 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2147 int ret; 2366 int ret;
@@ -2153,18 +2372,47 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2153 2372
2154 for (j = 0; j < MAX_NR_ZONES; j++) { 2373 for (j = 0; j < MAX_NR_ZONES; j++) {
2155 struct zone *zone = pgdat->node_zones + j; 2374 struct zone *zone = pgdat->node_zones + j;
2156 unsigned long size, realsize; 2375 unsigned long size, realsize, memmap_pages;
2157 2376
2158 realsize = size = zones_size[j]; 2377 size = zone_spanned_pages_in_node(nid, j, zones_size);
2159 if (zholes_size) 2378 realsize = size - zone_absent_pages_in_node(nid, j,
2160 realsize -= zholes_size[j]; 2379 zholes_size);
2161 2380
2162 if (j < ZONE_HIGHMEM) 2381 /*
2382 * Adjust realsize so that it accounts for how much memory
2383 * is used by this zone for memmap. This affects the watermark
2384 * and per-cpu initialisations
2385 */
2386 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2387 if (realsize >= memmap_pages) {
2388 realsize -= memmap_pages;
2389 printk(KERN_DEBUG
2390 " %s zone: %lu pages used for memmap\n",
2391 zone_names[j], memmap_pages);
2392 } else
2393 printk(KERN_WARNING
2394 " %s zone: %lu pages exceeds realsize %lu\n",
2395 zone_names[j], memmap_pages, realsize);
2396
2397 /* Account for reserved DMA pages */
2398 if (j == ZONE_DMA && realsize > dma_reserve) {
2399 realsize -= dma_reserve;
2400 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2401 dma_reserve);
2402 }
2403
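                /*
                 * Worked example of the memmap adjustment above, with purely
                 * illustrative numbers (PAGE_SHIFT == 12, i.e. 4 KiB pages, and
                 * an assumed 56-byte struct page): a zone spanning 262144 pages
                 * (1 GiB) needs
                 *
                 *      memmap_pages = (262144 * 56) >> 12 = 3584 pages (14 MiB),
                 *
                 * so realsize drops from 262144 to 258560 before the watermarks
                 * and per-cpu batch sizes are derived from it.
                 */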
2404 if (!is_highmem_idx(j))
2163 nr_kernel_pages += realsize; 2405 nr_kernel_pages += realsize;
2164 nr_all_pages += realsize; 2406 nr_all_pages += realsize;
2165 2407
2166 zone->spanned_pages = size; 2408 zone->spanned_pages = size;
2167 zone->present_pages = realsize; 2409 zone->present_pages = realsize;
2410#ifdef CONFIG_NUMA
2411 zone->node = nid;
2412 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2413 / 100;
2414 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2415#endif
2168 zone->name = zone_names[j]; 2416 zone->name = zone_names[j];
2169 spin_lock_init(&zone->lock); 2417 spin_lock_init(&zone->lock);
2170 spin_lock_init(&zone->lru_lock); 2418 spin_lock_init(&zone->lru_lock);
@@ -2181,6 +2429,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2181 zone->nr_scan_inactive = 0; 2429 zone->nr_scan_inactive = 0;
2182 zone->nr_active = 0; 2430 zone->nr_active = 0;
2183 zone->nr_inactive = 0; 2431 zone->nr_inactive = 0;
2432 zap_zone_vm_stats(zone);
2184 atomic_set(&zone->reclaim_in_progress, 0); 2433 atomic_set(&zone->reclaim_in_progress, 0);
2185 if (!size) 2434 if (!size)
2186 continue; 2435 continue;
@@ -2222,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2222 /* 2471 /*
2223 * With no DISCONTIG, the global mem_map is just set as node 0's 2472 * With no DISCONTIG, the global mem_map is just set as node 0's
2224 */ 2473 */
2225 if (pgdat == NODE_DATA(0)) 2474 if (pgdat == NODE_DATA(0)) {
2226 mem_map = NODE_DATA(0)->node_mem_map; 2475 mem_map = NODE_DATA(0)->node_mem_map;
2476#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2477 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2478 mem_map -= pgdat->node_start_pfn;
2479#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2480 }
2227#endif 2481#endif
2228#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2482#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2229} 2483}
@@ -2234,327 +2488,280 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2234{ 2488{
2235 pgdat->node_id = nid; 2489 pgdat->node_id = nid;
2236 pgdat->node_start_pfn = node_start_pfn; 2490 pgdat->node_start_pfn = node_start_pfn;
2237 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2491 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2238 2492
2239 alloc_node_mem_map(pgdat); 2493 alloc_node_mem_map(pgdat);
2240 2494
2241 free_area_init_core(pgdat, zones_size, zholes_size); 2495 free_area_init_core(pgdat, zones_size, zholes_size);
2242} 2496}
2243 2497
2244#ifndef CONFIG_NEED_MULTIPLE_NODES 2498#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2245static bootmem_data_t contig_bootmem_data; 2499/**
2246struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2500 * add_active_range - Register a range of PFNs backed by physical memory
2501 * @nid: The node ID the range resides on
2502 * @start_pfn: The start PFN of the available physical memory
2503 * @end_pfn: The end PFN of the available physical memory
2504 *
2505 * These ranges are stored in an early_node_map[] and later used by
2506 * free_area_init_nodes() to calculate zone sizes and holes. If the
2507 * range spans a memory hole, it is up to the architecture to ensure
2508 * the memory is not freed by the bootmem allocator. If possible
2509 * the range being registered will be merged with existing ranges.
2510 */
2511void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2512 unsigned long end_pfn)
2513{
2514 int i;
2247 2515
2248EXPORT_SYMBOL(contig_page_data); 2516 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2249#endif 2517 "%d entries of %d used\n",
2518 nid, start_pfn, end_pfn,
2519 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2250 2520
2251void __init free_area_init(unsigned long *zones_size) 2521 /* Merge with existing active regions if possible */
2252{ 2522 for (i = 0; i < nr_nodemap_entries; i++) {
2253 free_area_init_node(0, NODE_DATA(0), zones_size, 2523 if (early_node_map[i].nid != nid)
2254 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2524 continue;
2255}
2256 2525
2257#ifdef CONFIG_PROC_FS 2526 /* Skip if an existing region covers this new one */
2527 if (start_pfn >= early_node_map[i].start_pfn &&
2528 end_pfn <= early_node_map[i].end_pfn)
2529 return;
2258 2530
2259#include <linux/seq_file.h> 2531 /* Merge forward if suitable */
2532 if (start_pfn <= early_node_map[i].end_pfn &&
2533 end_pfn > early_node_map[i].end_pfn) {
2534 early_node_map[i].end_pfn = end_pfn;
2535 return;
2536 }
2260 2537
2261static void *frag_start(struct seq_file *m, loff_t *pos) 2538 /* Merge backward if suitable */
2262{ 2539 if (start_pfn < early_node_map[i].end_pfn &&
2263 pg_data_t *pgdat; 2540 end_pfn >= early_node_map[i].start_pfn) {
2264 loff_t node = *pos; 2541 early_node_map[i].start_pfn = start_pfn;
2265 for (pgdat = first_online_pgdat(); 2542 return;
2266 pgdat && node; 2543 }
2267 pgdat = next_online_pgdat(pgdat)) 2544 }
2268 --node;
2269 2545
2270 return pgdat; 2546 /* Check that early_node_map is large enough */
2547 if (i >= MAX_ACTIVE_REGIONS) {
2548 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2549 MAX_ACTIVE_REGIONS);
2550 return;
2551 }
2552
2553 early_node_map[i].nid = nid;
2554 early_node_map[i].start_pfn = start_pfn;
2555 early_node_map[i].end_pfn = end_pfn;
2556 nr_nodemap_entries = i + 1;
2271} 2557}
2272 2558
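add_active_range() above either grows an existing entry or appends a new one: a range that is already covered is dropped, and a range that touches or overlaps an entry for the same node extends that entry instead of duplicating it. A self-contained sketch that mirrors those merge rules on a toy map (node IDs and PFNs invented):

#include <stdio.h>

#define MAX_REGIONS 8

struct region { int nid; unsigned long start, end; };

static struct region map[MAX_REGIONS];
static int nr_entries;

static void add_range(int nid, unsigned long start, unsigned long end)
{
        int i;

        for (i = 0; i < nr_entries; i++) {
                if (map[i].nid != nid)
                        continue;
                if (start >= map[i].start && end <= map[i].end)
                        return;                 /* already covered */
                if (start <= map[i].end && end > map[i].end) {
                        map[i].end = end;       /* merge forward */
                        return;
                }
                if (start < map[i].end && end >= map[i].start) {
                        map[i].start = start;   /* merge backward */
                        return;
                }
        }
        if (i >= MAX_REGIONS)
                return;                         /* table full: drop the range */
        map[i].nid = nid;
        map[i].start = start;
        map[i].end = end;
        nr_entries = i + 1;
}

int main(void)
{
        int i;

        add_range(0, 0x000, 0x100);
        add_range(0, 0x100, 0x200);     /* merges forward with the first entry */
        add_range(1, 0x400, 0x500);     /* different node: separate entry */
        for (i = 0; i < nr_entries; i++)
                printf("nid %d: 0x%lx-0x%lx\n", map[i].nid, map[i].start, map[i].end);
        return 0;       /* prints nid 0: 0x0-0x200 and nid 1: 0x400-0x500 */
}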
2273static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 2559/**
2560 * shrink_active_range - Shrink an existing registered range of PFNs
2561 * @nid: The node id the range is on that should be shrunk
2562 * @old_end_pfn: The old end PFN of the range
2563 * @new_end_pfn: The new end PFN of the range
2564 *
2565 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
2566 * The map is kept at the end physical page range that has already been
2567 * registered with add_active_range(). This function allows an arch to shrink
2568 * an existing registered range.
2569 */
2570void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2571 unsigned long new_end_pfn)
2274{ 2572{
2275 pg_data_t *pgdat = (pg_data_t *)arg; 2573 int i;
2276 2574
2277 (*pos)++; 2575 /* Find the old active region end and shrink */
2278 return next_online_pgdat(pgdat); 2576 for_each_active_range_index_in_nid(i, nid)
2577 if (early_node_map[i].end_pfn == old_end_pfn) {
2578 early_node_map[i].end_pfn = new_end_pfn;
2579 break;
2580 }
2279} 2581}
2280 2582
2281static void frag_stop(struct seq_file *m, void *arg) 2583/**
2584 * remove_all_active_ranges - Remove all currently registered regions
2585 * During discovery, it may be found that a table like SRAT is invalid
2586 * and an alternative discovery method must be used. This function removes
2587 * all currently registered regions.
2588 */
2589void __init remove_all_active_ranges()
2282{ 2590{
2591 memset(early_node_map, 0, sizeof(early_node_map));
2592 nr_nodemap_entries = 0;
2593#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2594 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2595 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2596#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2283} 2597}
2284 2598
2285/* 2599/* Compare two active node_active_regions */
2286 * This walks the free areas for each zone. 2600static int __init cmp_node_active_region(const void *a, const void *b)
2287 */
2288static int frag_show(struct seq_file *m, void *arg)
2289{ 2601{
2290 pg_data_t *pgdat = (pg_data_t *)arg; 2602 struct node_active_region *arange = (struct node_active_region *)a;
2291 struct zone *zone; 2603 struct node_active_region *brange = (struct node_active_region *)b;
2292 struct zone *node_zones = pgdat->node_zones;
2293 unsigned long flags;
2294 int order;
2295 2604
2296 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2605 /* Done this way to avoid overflows */
2297 if (!populated_zone(zone)) 2606 if (arange->start_pfn > brange->start_pfn)
2298 continue; 2607 return 1;
2608 if (arange->start_pfn < brange->start_pfn)
2609 return -1;
2299 2610
2300 spin_lock_irqsave(&zone->lock, flags);
2301 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
2302 for (order = 0; order < MAX_ORDER; ++order)
2303 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
2304 spin_unlock_irqrestore(&zone->lock, flags);
2305 seq_putc(m, '\n');
2306 }
2307 return 0; 2611 return 0;
2308} 2612}
2309 2613
2310struct seq_operations fragmentation_op = { 2614/* sort the node_map by start_pfn */
2311 .start = frag_start, 2615static void __init sort_node_map(void)
2312 .next = frag_next,
2313 .stop = frag_stop,
2314 .show = frag_show,
2315};
2316
2317/*
2318 * Output information about zones in @pgdat.
2319 */
2320static int zoneinfo_show(struct seq_file *m, void *arg)
2321{ 2616{
2322 pg_data_t *pgdat = arg; 2617 sort(early_node_map, (size_t)nr_nodemap_entries,
2323 struct zone *zone; 2618 sizeof(struct node_active_region),
2324 struct zone *node_zones = pgdat->node_zones; 2619 cmp_node_active_region, NULL);
2325 unsigned long flags; 2620}
2326
2327 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2328 int i;
2329 2621
2330 if (!populated_zone(zone)) 2622/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2331 continue; 2623unsigned long __init find_min_pfn_for_node(unsigned long nid)
2624{
2625 int i;
2332 2626
2333 spin_lock_irqsave(&zone->lock, flags); 2627 /* Assuming a sorted map, the first range found has the starting pfn */
2334 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 2628 for_each_active_range_index_in_nid(i, nid)
2335 seq_printf(m, 2629 return early_node_map[i].start_pfn;
2336 "\n pages free %lu"
2337 "\n min %lu"
2338 "\n low %lu"
2339 "\n high %lu"
2340 "\n active %lu"
2341 "\n inactive %lu"
2342 "\n scanned %lu (a: %lu i: %lu)"
2343 "\n spanned %lu"
2344 "\n present %lu",
2345 zone->free_pages,
2346 zone->pages_min,
2347 zone->pages_low,
2348 zone->pages_high,
2349 zone->nr_active,
2350 zone->nr_inactive,
2351 zone->pages_scanned,
2352 zone->nr_scan_active, zone->nr_scan_inactive,
2353 zone->spanned_pages,
2354 zone->present_pages);
2355 seq_printf(m,
2356 "\n protection: (%lu",
2357 zone->lowmem_reserve[0]);
2358 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2359 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2360 seq_printf(m,
2361 ")"
2362 "\n pagesets");
2363 for_each_online_cpu(i) {
2364 struct per_cpu_pageset *pageset;
2365 int j;
2366 2630
2367 pageset = zone_pcp(zone, i); 2631 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2368 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2369 if (pageset->pcp[j].count)
2370 break;
2371 }
2372 if (j == ARRAY_SIZE(pageset->pcp))
2373 continue;
2374 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2375 seq_printf(m,
2376 "\n cpu: %i pcp: %i"
2377 "\n count: %i"
2378 "\n high: %i"
2379 "\n batch: %i",
2380 i, j,
2381 pageset->pcp[j].count,
2382 pageset->pcp[j].high,
2383 pageset->pcp[j].batch);
2384 }
2385#ifdef CONFIG_NUMA
2386 seq_printf(m,
2387 "\n numa_hit: %lu"
2388 "\n numa_miss: %lu"
2389 "\n numa_foreign: %lu"
2390 "\n interleave_hit: %lu"
2391 "\n local_node: %lu"
2392 "\n other_node: %lu",
2393 pageset->numa_hit,
2394 pageset->numa_miss,
2395 pageset->numa_foreign,
2396 pageset->interleave_hit,
2397 pageset->local_node,
2398 pageset->other_node);
2399#endif
2400 }
2401 seq_printf(m,
2402 "\n all_unreclaimable: %u"
2403 "\n prev_priority: %i"
2404 "\n temp_priority: %i"
2405 "\n start_pfn: %lu",
2406 zone->all_unreclaimable,
2407 zone->prev_priority,
2408 zone->temp_priority,
2409 zone->zone_start_pfn);
2410 spin_unlock_irqrestore(&zone->lock, flags);
2411 seq_putc(m, '\n');
2412 }
2413 return 0; 2632 return 0;
2414} 2633}
2415 2634
2416struct seq_operations zoneinfo_op = { 2635/**
2417 .start = frag_start, /* iterate over all zones. The same as in 2636 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2418 * fragmentation. */ 2637 *
2419 .next = frag_next, 2638 * It returns the minimum PFN based on information provided via
2420 .stop = frag_stop, 2639 * add_active_range()
2421 .show = zoneinfo_show, 2640 */
2422}; 2641unsigned long __init find_min_pfn_with_active_regions(void)
2423 2642{
2424static char *vmstat_text[] = { 2643 return find_min_pfn_for_node(MAX_NUMNODES);
2425 "nr_dirty", 2644}
2426 "nr_writeback",
2427 "nr_unstable",
2428 "nr_page_table_pages",
2429 "nr_mapped",
2430 "nr_slab",
2431
2432 "pgpgin",
2433 "pgpgout",
2434 "pswpin",
2435 "pswpout",
2436
2437 "pgalloc_high",
2438 "pgalloc_normal",
2439 "pgalloc_dma32",
2440 "pgalloc_dma",
2441
2442 "pgfree",
2443 "pgactivate",
2444 "pgdeactivate",
2445
2446 "pgfault",
2447 "pgmajfault",
2448
2449 "pgrefill_high",
2450 "pgrefill_normal",
2451 "pgrefill_dma32",
2452 "pgrefill_dma",
2453
2454 "pgsteal_high",
2455 "pgsteal_normal",
2456 "pgsteal_dma32",
2457 "pgsteal_dma",
2458
2459 "pgscan_kswapd_high",
2460 "pgscan_kswapd_normal",
2461 "pgscan_kswapd_dma32",
2462 "pgscan_kswapd_dma",
2463
2464 "pgscan_direct_high",
2465 "pgscan_direct_normal",
2466 "pgscan_direct_dma32",
2467 "pgscan_direct_dma",
2468
2469 "pginodesteal",
2470 "slabs_scanned",
2471 "kswapd_steal",
2472 "kswapd_inodesteal",
2473 "pageoutrun",
2474 "allocstall",
2475
2476 "pgrotated",
2477 "nr_bounce",
2478};
2479 2645
2480static void *vmstat_start(struct seq_file *m, loff_t *pos) 2646/**
2647 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2648 *
2649 * It returns the maximum PFN based on information provided via
2650 * add_active_range()
2651 */
2652unsigned long __init find_max_pfn_with_active_regions(void)
2481{ 2653{
2482 struct page_state *ps; 2654 int i;
2655 unsigned long max_pfn = 0;
2483 2656
2484 if (*pos >= ARRAY_SIZE(vmstat_text)) 2657 for (i = 0; i < nr_nodemap_entries; i++)
2485 return NULL; 2658 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2486 2659
2487 ps = kmalloc(sizeof(*ps), GFP_KERNEL); 2660 return max_pfn;
2488 m->private = ps;
2489 if (!ps)
2490 return ERR_PTR(-ENOMEM);
2491 get_full_page_state(ps);
2492 ps->pgpgin /= 2; /* sectors -> kbytes */
2493 ps->pgpgout /= 2;
2494 return (unsigned long *)ps + *pos;
2495} 2661}
2496 2662
2497static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 2663/**
2498{ 2664 * free_area_init_nodes - Initialise all pg_data_t and zone data
2499 (*pos)++; 2665 * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
2500 if (*pos >= ARRAY_SIZE(vmstat_text)) 2666 * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
2501 return NULL; 2667 * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
2502 return (unsigned long *)m->private + *pos; 2668 * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
2503} 2669 *
2670 * This will call free_area_init_node() for each active node in the system.
2671 * Using the page ranges provided by add_active_range(), the size of each
2672 * zone in each node and their holes is calculated. If the maximum PFN
2673 * between two adjacent zones matches, it is assumed that the zone is empty.
2674 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2675 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2676 * starts where the previous one ended. For example, ZONE_DMA32 starts
2677 * at arch_max_dma_pfn.
2678 */
2679void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2680{
2681 unsigned long nid;
2682 enum zone_type i;
2683
2684 /* Record where the zone boundaries are */
2685 memset(arch_zone_lowest_possible_pfn, 0,
2686 sizeof(arch_zone_lowest_possible_pfn));
2687 memset(arch_zone_highest_possible_pfn, 0,
2688 sizeof(arch_zone_highest_possible_pfn));
2689 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2690 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2691 for (i = 1; i < MAX_NR_ZONES; i++) {
2692 arch_zone_lowest_possible_pfn[i] =
2693 arch_zone_highest_possible_pfn[i-1];
2694 arch_zone_highest_possible_pfn[i] =
2695 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2696 }
2504 2697
2505static int vmstat_show(struct seq_file *m, void *arg) 2698 /* Regions in the early_node_map can be in any order */
2506{ 2699 sort_node_map();
2507 unsigned long *l = arg;
2508 unsigned long off = l - (unsigned long *)m->private;
2509 2700
2510 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 2701 /* Print out the zone ranges */
2511 return 0; 2702 printk("Zone PFN ranges:\n");
2703 for (i = 0; i < MAX_NR_ZONES; i++)
2704 printk(" %-8s %8lu -> %8lu\n",
2705 zone_names[i],
2706 arch_zone_lowest_possible_pfn[i],
2707 arch_zone_highest_possible_pfn[i]);
2708
2709 /* Print out the early_node_map[] */
2710 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2711 for (i = 0; i < nr_nodemap_entries; i++)
2712 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2713 early_node_map[i].start_pfn,
2714 early_node_map[i].end_pfn);
2715
2716 /* Initialise every node */
2717 for_each_online_node(nid) {
2718 pg_data_t *pgdat = NODE_DATA(nid);
2719 free_area_init_node(nid, pgdat, NULL,
2720 find_min_pfn_for_node(nid), NULL);
2721 }
2512} 2722}
2723#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2513 2724
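free_area_init_nodes() above derives every zone boundary from the per-zone maximum PFNs alone: each zone starts where the previous one ends, and a zone whose maximum does not exceed its start comes out empty, exactly as the comment describes for arch_max_dma_pfn == arch_max_dma32_pfn. A standalone sketch of that derivation with made-up PFNs (the minimum active PFN is simply taken as 0 here instead of querying early_node_map):

#include <stdio.h>

#define MAX_NR_ZONES 4

static unsigned long max_of(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

int main(void)
{
        /* invented per-zone maximum PFNs: DMA, DMA32, NORMAL, HIGHMEM */
        unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0x1000, 0x1000, 0x40000, 0x100000 };
        unsigned long lowest[MAX_NR_ZONES], highest[MAX_NR_ZONES];
        int i;

        lowest[0] = 0;          /* stands in for find_min_pfn_with_active_regions() */
        highest[0] = max_zone_pfn[0];
        for (i = 1; i < MAX_NR_ZONES; i++) {
                lowest[i] = highest[i - 1];
                highest[i] = max_of(max_zone_pfn[i], lowest[i]);
        }
        for (i = 0; i < MAX_NR_ZONES; i++)
                printf("zone %d: %8lx -> %8lx%s\n", i, lowest[i], highest[i],
                       lowest[i] == highest[i] ? " (empty)" : "");
        return 0;       /* zone 1 comes out empty because its max equals zone 0's */
}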
2514static void vmstat_stop(struct seq_file *m, void *arg) 2725/**
2726 * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
2727 * @new_dma_reserve - The number of pages to mark reserved
2728 *
2729 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2730 * In the DMA zone, a significant percentage may be consumed by kernel image
2731 * and other unfreeable allocations which can skew the watermarks badly. This
2732 * function may optionally be used to account for unfreeable pages in
2733 * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
2734 */
2735void __init set_dma_reserve(unsigned long new_dma_reserve)
2515{ 2736{
2516 kfree(m->private); 2737 dma_reserve = new_dma_reserve;
2517 m->private = NULL;
2518} 2738}
2519 2739
2520struct seq_operations vmstat_op = { 2740#ifndef CONFIG_NEED_MULTIPLE_NODES
2521 .start = vmstat_start, 2741static bootmem_data_t contig_bootmem_data;
2522 .next = vmstat_next, 2742struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2523 .stop = vmstat_stop, 2743
2524 .show = vmstat_show, 2744EXPORT_SYMBOL(contig_page_data);
2525}; 2745#endif
2526 2746
2527#endif /* CONFIG_PROC_FS */ 2747void __init free_area_init(unsigned long *zones_size)
2748{
2749 free_area_init_node(0, NODE_DATA(0), zones_size,
2750 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2751}
2528 2752
2529#ifdef CONFIG_HOTPLUG_CPU 2753#ifdef CONFIG_HOTPLUG_CPU
2530static int page_alloc_cpu_notify(struct notifier_block *self, 2754static int page_alloc_cpu_notify(struct notifier_block *self,
2531 unsigned long action, void *hcpu) 2755 unsigned long action, void *hcpu)
2532{ 2756{
2533 int cpu = (unsigned long)hcpu; 2757 int cpu = (unsigned long)hcpu;
2534 long *count;
2535 unsigned long *src, *dest;
2536 2758
2537 if (action == CPU_DEAD) { 2759 if (action == CPU_DEAD) {
2538 int i;
2539
2540 /* Drain local pagecache count. */
2541 count = &per_cpu(nr_pagecache_local, cpu);
2542 atomic_add(*count, &nr_pagecache);
2543 *count = 0;
2544 local_irq_disable(); 2760 local_irq_disable();
2545 __drain_pages(cpu); 2761 __drain_pages(cpu);
2546 2762 vm_events_fold_cpu(cpu);
2547 /* Add dead cpu's page_states to our own. */
2548 dest = (unsigned long *)&__get_cpu_var(page_states);
2549 src = (unsigned long *)&per_cpu(page_states, cpu);
2550
2551 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2552 i++) {
2553 dest[i] += src[i];
2554 src[i] = 0;
2555 }
2556
2557 local_irq_enable(); 2763 local_irq_enable();
2764 refresh_cpu_vm_stats(cpu);
2558 } 2765 }
2559 return NOTIFY_OK; 2766 return NOTIFY_OK;
2560} 2767}
@@ -2573,7 +2780,7 @@ static void calculate_totalreserve_pages(void)
2573{ 2780{
2574 struct pglist_data *pgdat; 2781 struct pglist_data *pgdat;
2575 unsigned long reserve_pages = 0; 2782 unsigned long reserve_pages = 0;
2576 int i, j; 2783 enum zone_type i, j;
2577 2784
2578 for_each_online_pgdat(pgdat) { 2785 for_each_online_pgdat(pgdat) {
2579 for (i = 0; i < MAX_NR_ZONES; i++) { 2786 for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -2606,7 +2813,7 @@ static void calculate_totalreserve_pages(void)
2606static void setup_per_zone_lowmem_reserve(void) 2813static void setup_per_zone_lowmem_reserve(void)
2607{ 2814{
2608 struct pglist_data *pgdat; 2815 struct pglist_data *pgdat;
2609 int j, idx; 2816 enum zone_type j, idx;
2610 2817
2611 for_each_online_pgdat(pgdat) { 2818 for_each_online_pgdat(pgdat) {
2612 for (j = 0; j < MAX_NR_ZONES; j++) { 2819 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -2615,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void)
2615 2822
2616 zone->lowmem_reserve[j] = 0; 2823 zone->lowmem_reserve[j] = 0;
2617 2824
2618 for (idx = j-1; idx >= 0; idx--) { 2825 idx = j;
2826 while (idx) {
2619 struct zone *lower_zone; 2827 struct zone *lower_zone;
2620 2828
2829 idx--;
2830
2621 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2831 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2622 sysctl_lowmem_reserve_ratio[idx] = 1; 2832 sysctl_lowmem_reserve_ratio[idx] = 1;
2623 2833
@@ -2746,6 +2956,40 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
2746 return 0; 2956 return 0;
2747} 2957}
2748 2958
2959#ifdef CONFIG_NUMA
2960int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
2961 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2962{
2963 struct zone *zone;
2964 int rc;
2965
2966 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2967 if (rc)
2968 return rc;
2969
2970 for_each_zone(zone)
2971 zone->min_unmapped_pages = (zone->present_pages *
2972 sysctl_min_unmapped_ratio) / 100;
2973 return 0;
2974}
2975
2976int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
2977 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2978{
2979 struct zone *zone;
2980 int rc;
2981
2982 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2983 if (rc)
2984 return rc;
2985
2986 for_each_zone(zone)
2987 zone->min_slab_pages = (zone->present_pages *
2988 sysctl_min_slab_ratio) / 100;
2989 return 0;
2990}
2991#endif
2992
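Both handlers above turn a sysctl percentage into a per-zone page count with the same arithmetic, pages = present_pages * ratio / 100, recomputed for every zone whenever the ratio is written. A one-line worked example with invented numbers:

#include <stdio.h>

int main(void)
{
        unsigned long present_pages = 262144;   /* invented zone size: 1 GiB of 4 KiB pages */
        int ratio = 1;                          /* e.g. a ratio sysctl set to 1 percent */

        printf("%lu\n", present_pages * ratio / 100);
        return 0;       /* prints 2621 pages */
}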
2749/* 2993/*
2750 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 2994 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
2751 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 2995 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
@@ -2789,7 +3033,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2789 return 0; 3033 return 0;
2790} 3034}
2791 3035
2792__initdata int hashdist = HASHDIST_DEFAULT; 3036int hashdist = HASHDIST_DEFAULT;
2793 3037
2794#ifdef CONFIG_NUMA 3038#ifdef CONFIG_NUMA
2795static int __init set_hashdist(char *str) 3039static int __init set_hashdist(char *str)
diff --git a/mm/page_io.c b/mm/page_io.c
index bb2b0d53889c..d4840ecbf8f9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
52 if (bio->bi_size) 52 if (bio->bi_size)
53 return 1; 53 return 1;
54 54
55 if (!uptodate) 55 if (!uptodate) {
56 SetPageError(page); 56 SetPageError(page);
57 /*
58 * We failed to write the page out to swap-space.
59 * Re-dirty the page in order to avoid it being reclaimed.
60 * Also print a dire warning that things will go BAD (tm)
61 * very quickly.
62 *
63 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
64 */
65 set_page_dirty(page);
66 printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
67 imajor(bio->bi_bdev->bd_inode),
68 iminor(bio->bi_bdev->bd_inode),
69 (unsigned long long)bio->bi_sector);
70 ClearPageReclaim(page);
71 }
57 end_page_writeback(page); 72 end_page_writeback(page);
58 bio_put(bio); 73 bio_put(bio);
59 return 0; 74 return 0;
60} 75}
61 76
62static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) 77int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
63{ 78{
64 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 79 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
65 struct page *page = bio->bi_io_vec[0].bv_page; 80 struct page *page = bio->bi_io_vec[0].bv_page;
@@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
70 if (!uptodate) { 85 if (!uptodate) {
71 SetPageError(page); 86 SetPageError(page);
72 ClearPageUptodate(page); 87 ClearPageUptodate(page);
88 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
89 imajor(bio->bi_bdev->bd_inode),
90 iminor(bio->bi_bdev->bd_inode),
91 (unsigned long long)bio->bi_sector);
73 } else { 92 } else {
74 SetPageUptodate(page); 93 SetPageUptodate(page);
75 } 94 }
@@ -101,7 +120,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
101 } 120 }
102 if (wbc->sync_mode == WB_SYNC_ALL) 121 if (wbc->sync_mode == WB_SYNC_ALL)
103 rw |= (1 << BIO_RW_SYNC); 122 rw |= (1 << BIO_RW_SYNC);
104 inc_page_state(pswpout); 123 count_vm_event(PSWPOUT);
105 set_page_writeback(page); 124 set_page_writeback(page);
106 unlock_page(page); 125 unlock_page(page);
107 submit_bio(rw, bio); 126 submit_bio(rw, bio);
@@ -123,7 +142,7 @@ int swap_readpage(struct file *file, struct page *page)
123 ret = -ENOMEM; 142 ret = -ENOMEM;
124 goto out; 143 goto out;
125 } 144 }
126 inc_page_state(pswpin); 145 count_vm_event(PSWPIN);
127 submit_bio(READ, bio); 146 submit_bio(READ, bio);
128out: 147out:
129 return ret; 148 return ret;
@@ -137,10 +156,12 @@ out:
137 * We use end_swap_bio_read() even for writes, because it happens to do what 156 * We use end_swap_bio_read() even for writes, because it happens to do what
138 * we want. 157 * we want.
139 */ 158 */
140int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) 159int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
160 struct bio **bio_chain)
141{ 161{
142 struct bio *bio; 162 struct bio *bio;
143 int ret = 0; 163 int ret = 0;
164 int bio_rw;
144 165
145 lock_page(page); 166 lock_page(page);
146 167
@@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
151 goto out; 172 goto out;
152 } 173 }
153 174
154 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 175 bio_rw = rw;
155 wait_on_page_locked(page); 176 if (!bio_chain)
156 177 bio_rw |= (1 << BIO_RW_SYNC);
157 if (!PageUptodate(page) || PageError(page)) 178 if (bio_chain)
158 ret = -EIO; 179 bio_get(bio);
180 submit_bio(bio_rw, bio);
181 if (bio_chain == NULL) {
182 wait_on_page_locked(page);
183
184 if (!PageUptodate(page) || PageError(page))
185 ret = -EIO;
186 }
187 if (bio_chain) {
188 bio->bi_private = *bio_chain;
189 *bio_chain = bio;
190 }
159out: 191out:
160 return ret; 192 return ret;
161} 193}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index df7e50b8f70c..b02102feeb4b 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work)
104 list_move(&my_work->list, &pdflush_list); 104 list_move(&my_work->list, &pdflush_list);
105 my_work->when_i_went_to_sleep = jiffies; 105 my_work->when_i_went_to_sleep = jiffies;
106 spin_unlock_irq(&pdflush_lock); 106 spin_unlock_irq(&pdflush_lock);
107
108 schedule(); 107 schedule();
109 if (try_to_freeze()) { 108 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 continue;
112 }
113
114 spin_lock_irq(&pdflush_lock); 109 spin_lock_irq(&pdflush_lock);
115 if (!list_empty(&my_work->list)) { 110 if (!list_empty(&my_work->list)) {
116 printk("pdflush: bogus wakeup!\n"); 111 /*
112 * Someone woke us up, but without removing our control
113 * structure from the global list. swsusp will do this
114 * in try_to_freeze()->refrigerator(). Handle it.
115 */
117 my_work->fn = NULL; 116 my_work->fn = NULL;
118 continue; 117 continue;
119 } 118 }
120 if (my_work->fn == NULL) { 119 if (my_work->fn == NULL) {
121 printk("pdflush: NULL work function\n"); 120 printk("pdflush: bogus wakeup\n");
122 continue; 121 continue;
123 } 122 }
124 spin_unlock_irq(&pdflush_lock); 123 spin_unlock_irq(&pdflush_lock);
diff --git a/mm/readahead.c b/mm/readahead.c
index 0f142a40984b..aa7ec424656a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -118,8 +118,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
118#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 118#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
119 119
120/** 120/**
121 * read_cache_pages - populate an address space with some pages, and 121 * read_cache_pages - populate an address space with some pages & start reads against them
122 * start reads against them.
123 * @mapping: the address_space 122 * @mapping: the address_space
124 * @pages: The address of a list_head which contains the target pages. These 123 * @pages: The address of a list_head which contains the target pages. These
125 * pages have their ->index populated and are otherwise uninitialised. 124 * pages have their ->index populated and are otherwise uninitialised.
@@ -182,14 +181,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
182 list_del(&page->lru); 181 list_del(&page->lru);
183 if (!add_to_page_cache(page, mapping, 182 if (!add_to_page_cache(page, mapping,
184 page->index, GFP_KERNEL)) { 183 page->index, GFP_KERNEL)) {
185 ret = mapping->a_ops->readpage(filp, page); 184 mapping->a_ops->readpage(filp, page);
186 if (ret != AOP_TRUNCATED_PAGE) { 185 if (!pagevec_add(&lru_pvec, page))
187 if (!pagevec_add(&lru_pvec, page)) 186 __pagevec_lru_add(&lru_pvec);
188 __pagevec_lru_add(&lru_pvec); 187 } else
189 continue; 188 page_cache_release(page);
190 } /* else fall through to release */
191 }
192 page_cache_release(page);
193 } 189 }
194 pagevec_lru_add(&lru_pvec); 190 pagevec_lru_add(&lru_pvec);
195 ret = 0; 191 ret = 0;
@@ -394,8 +390,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
394 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' 390 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
395 * is set wait till the read completes. Otherwise attempt to read without 391 * is set wait till the read completes. Otherwise attempt to read without
396 * blocking. 392 * blocking.
397 * Returns 1 meaning 'success' if read is succesfull without switching off 393 * Returns 1 meaning 'success' if read is successful without switching off
398 * readhaead mode. Otherwise return failure. 394 * readahead mode. Otherwise return failure.
399 */ 395 */
400static int 396static int
401blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, 397blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
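read_pages() above still relies on the pagevec idiom even after dropping the AOP_TRUNCATED_PAGE handling: pages are collected into a small fixed-size vector, flushed to the LRU in one batch when the vector fills, and flushed once more at the end for the partial remainder. A rough userspace sketch of that flush-on-full batching, with made-up names instead of the kernel's pagevec API:

#include <stdio.h>

#define BATCH 14        /* same spirit as PAGEVEC_SIZE: small and cheap */

struct batch {
        int nr;
        int items[BATCH];
};

/* Flush everything gathered so far (stands in for __pagevec_lru_add()). */
static void batch_flush(struct batch *b)
{
        for (int i = 0; i < b->nr; i++)
                printf("adding item %d to LRU\n", b->items[i]);
        b->nr = 0;
}

/* Returns remaining space, 0 meaning "full, flush me" (like pagevec_add). */
static int batch_add(struct batch *b, int item)
{
        b->items[b->nr++] = item;
        return BATCH - b->nr;
}

int main(void)
{
        struct batch b = { .nr = 0 };

        for (int page = 0; page < 40; page++)
                if (!batch_add(&b, page))
                        batch_flush(&b);

        batch_flush(&b);        /* final partial batch, like pagevec_lru_add() */
        return 0;
}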
diff --git a/mm/rmap.c b/mm/rmap.c
index 882a85826bb2..e2155d791d99 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
434 return referenced; 434 return referenced;
435} 435}
436 436
437static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
438{
439 struct mm_struct *mm = vma->vm_mm;
440 unsigned long address;
441 pte_t *pte, entry;
442 spinlock_t *ptl;
443 int ret = 0;
444
445 address = vma_address(page, vma);
446 if (address == -EFAULT)
447 goto out;
448
449 pte = page_check_address(page, mm, address, &ptl);
450 if (!pte)
451 goto out;
452
453 if (!pte_dirty(*pte) && !pte_write(*pte))
454 goto unlock;
455
456 entry = ptep_get_and_clear(mm, address, pte);
457 entry = pte_mkclean(entry);
458 entry = pte_wrprotect(entry);
459 ptep_establish(vma, address, pte, entry);
460 lazy_mmu_prot_update(entry);
461 ret = 1;
462
463unlock:
464 pte_unmap_unlock(pte, ptl);
465out:
466 return ret;
467}
468
469static int page_mkclean_file(struct address_space *mapping, struct page *page)
470{
471 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
472 struct vm_area_struct *vma;
473 struct prio_tree_iter iter;
474 int ret = 0;
475
476 BUG_ON(PageAnon(page));
477
478 spin_lock(&mapping->i_mmap_lock);
479 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
480 if (vma->vm_flags & VM_SHARED)
481 ret += page_mkclean_one(page, vma);
482 }
483 spin_unlock(&mapping->i_mmap_lock);
484 return ret;
485}
486
487int page_mkclean(struct page *page)
488{
489 int ret = 0;
490
491 BUG_ON(!PageLocked(page));
492
493 if (page_mapped(page)) {
494 struct address_space *mapping = page_mapping(page);
495 if (mapping)
496 ret = page_mkclean_file(mapping, page);
497 }
498
499 return ret;
500}
501
437/** 502/**
438 * page_set_anon_rmap - setup new anonymous rmap 503 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to 504 * @page: the page to add the mapping to
@@ -455,7 +520,7 @@ static void __page_set_anon_rmap(struct page *page,
455 * nr_mapped state can be updated without turning off 520 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt. 521 * interrupts because it is not modified via interrupt.
457 */ 522 */
458 __inc_page_state(nr_mapped); 523 __inc_zone_page_state(page, NR_ANON_PAGES);
459} 524}
460 525
461/** 526/**
@@ -499,7 +564,7 @@ void page_add_new_anon_rmap(struct page *page,
499void page_add_file_rmap(struct page *page) 564void page_add_file_rmap(struct page *page)
500{ 565{
501 if (atomic_inc_and_test(&page->_mapcount)) 566 if (atomic_inc_and_test(&page->_mapcount))
502 __inc_page_state(nr_mapped); 567 __inc_zone_page_state(page, NR_FILE_MAPPED);
503} 568}
504 569
505/** 570/**
@@ -531,7 +596,8 @@ void page_remove_rmap(struct page *page)
531 */ 596 */
532 if (page_test_and_clear_dirty(page)) 597 if (page_test_and_clear_dirty(page))
533 set_page_dirty(page); 598 set_page_dirty(page);
534 __dec_page_state(nr_mapped); 599 __dec_zone_page_state(page,
600 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
535 } 601 }
536} 602}
537 603
@@ -562,9 +628,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
562 * If it's recently referenced (perhaps page_referenced 628 * If it's recently referenced (perhaps page_referenced
563 * skipped over this mm) then we should reactivate it. 629 * skipped over this mm) then we should reactivate it.
564 */ 630 */
565 if ((vma->vm_flags & VM_LOCKED) || 631 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
566 (ptep_clear_flush_young(vma, address, pte) 632 (ptep_clear_flush_young(vma, address, pte)))) {
567 && !migration)) {
568 ret = SWAP_FAIL; 633 ret = SWAP_FAIL;
569 goto out_unmap; 634 goto out_unmap;
570 } 635 }
@@ -771,7 +836,7 @@ static int try_to_unmap_file(struct page *page, int migration)
771 836
772 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 837 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
773 shared.vm_set.list) { 838 shared.vm_set.list) {
774 if (vma->vm_flags & VM_LOCKED) 839 if ((vma->vm_flags & VM_LOCKED) && !migration)
775 continue; 840 continue;
776 cursor = (unsigned long) vma->vm_private_data; 841 cursor = (unsigned long) vma->vm_private_data;
777 if (cursor > max_nl_cursor) 842 if (cursor > max_nl_cursor)
@@ -805,7 +870,7 @@ static int try_to_unmap_file(struct page *page, int migration)
805 do { 870 do {
806 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 871 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
807 shared.vm_set.list) { 872 shared.vm_set.list) {
808 if (vma->vm_flags & VM_LOCKED) 873 if ((vma->vm_flags & VM_LOCKED) && !migration)
809 continue; 874 continue;
810 cursor = (unsigned long) vma->vm_private_data; 875 cursor = (unsigned long) vma->vm_private_data;
811 while ( cursor < max_nl_cursor && 876 while ( cursor < max_nl_cursor &&
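page_mkclean_one() above applies a fixed sequence to each writable shared mapping of the page: atomically fetch-and-clear the PTE, strip its dirty bit, strip its write bit, then reinstall it so the next store faults and dirtying can be observed again. A toy userspace sketch of just that bit-level transformation; the flag values are invented, not real PTE bits:

#include <stdio.h>

#define PTE_PRESENT 0x1u
#define PTE_WRITE   0x2u
#define PTE_DIRTY   0x4u

typedef unsigned int pte_t;

static pte_t mkclean(pte_t p)   { return p & ~PTE_DIRTY; }
static pte_t wrprotect(pte_t p) { return p & ~PTE_WRITE; }

int main(void)
{
        pte_t pte = PTE_PRESENT | PTE_WRITE | PTE_DIRTY;

        if ((pte & PTE_DIRTY) || (pte & PTE_WRITE)) {
                pte_t entry = pte;      /* "ptep_get_and_clear" of the live entry */
                entry = mkclean(entry);
                entry = wrprotect(entry);
                pte = entry;            /* "ptep_establish" the clean, read-only copy */
        }

        printf("pte now %#x (present, read-only, clean)\n", pte);
        return 0;
}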
diff --git a/mm/shmem.c b/mm/shmem.c
index 84b5cf9b63c5..b96de69f236b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -23,11 +23,11 @@
23 * which makes it a completely usable filesystem. 23 * which makes it a completely usable filesystem.
24 */ 24 */
25 25
26#include <linux/config.h>
27#include <linux/module.h> 26#include <linux/module.h>
28#include <linux/init.h> 27#include <linux/init.h>
29#include <linux/devfs_fs_kernel.h>
30#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/xattr.h>
30#include <linux/generic_acl.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/mman.h> 32#include <linux/mman.h>
33#include <linux/file.h> 33#include <linux/file.h>
@@ -47,6 +47,7 @@
47#include <linux/namei.h> 47#include <linux/namei.h>
48#include <linux/ctype.h> 48#include <linux/ctype.h>
49#include <linux/migrate.h> 49#include <linux/migrate.h>
50#include <linux/highmem.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52#include <asm/div64.h> 53#include <asm/div64.h>
@@ -174,10 +175,11 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
174} 175}
175 176
176static struct super_operations shmem_ops; 177static struct super_operations shmem_ops;
177static struct address_space_operations shmem_aops; 178static const struct address_space_operations shmem_aops;
178static struct file_operations shmem_file_operations; 179static struct file_operations shmem_file_operations;
179static struct inode_operations shmem_inode_operations; 180static struct inode_operations shmem_inode_operations;
180static struct inode_operations shmem_dir_inode_operations; 181static struct inode_operations shmem_dir_inode_operations;
182static struct inode_operations shmem_special_inode_operations;
181static struct vm_operations_struct shmem_vm_ops; 183static struct vm_operations_struct shmem_vm_ops;
182 184
183static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 185static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
@@ -638,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
638 struct page *page = NULL; 640 struct page *page = NULL;
639 int error; 641 int error;
640 642
641 if (attr->ia_valid & ATTR_SIZE) { 643 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
642 if (attr->ia_size < inode->i_size) { 644 if (attr->ia_size < inode->i_size) {
643 /* 645 /*
644 * If truncating down to a partial page, then 646 * If truncating down to a partial page, then
@@ -671,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
671 error = inode_change_ok(inode, attr); 673 error = inode_change_ok(inode, attr);
672 if (!error) 674 if (!error)
673 error = inode_setattr(inode, attr); 675 error = inode_setattr(inode, attr);
676#ifdef CONFIG_TMPFS_POSIX_ACL
677 if (!error && (attr->ia_valid & ATTR_MODE))
678 error = generic_acl_chmod(inode, &shmem_acl_ops);
679#endif
674 if (page) 680 if (page)
675 page_cache_release(page); 681 page_cache_release(page);
676 return error; 682 return error;
@@ -1046,12 +1052,12 @@ repeat:
1046 swappage = lookup_swap_cache(swap); 1052 swappage = lookup_swap_cache(swap);
1047 if (!swappage) { 1053 if (!swappage) {
1048 shmem_swp_unmap(entry); 1054 shmem_swp_unmap(entry);
1049 spin_unlock(&info->lock);
1050 /* here we actually do the io */ 1055 /* here we actually do the io */
1051 if (type && *type == VM_FAULT_MINOR) { 1056 if (type && *type == VM_FAULT_MINOR) {
1052 inc_page_state(pgmajfault); 1057 __count_vm_event(PGMAJFAULT);
1053 *type = VM_FAULT_MAJOR; 1058 *type = VM_FAULT_MAJOR;
1054 } 1059 }
1060 spin_unlock(&info->lock);
1055 swappage = shmem_swapin(info, swap, idx); 1061 swappage = shmem_swapin(info, swap, idx);
1056 if (!swappage) { 1062 if (!swappage) {
1057 spin_lock(&info->lock); 1063 spin_lock(&info->lock);
@@ -1352,7 +1358,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1352 inode->i_mode = mode; 1358 inode->i_mode = mode;
1353 inode->i_uid = current->fsuid; 1359 inode->i_uid = current->fsuid;
1354 inode->i_gid = current->fsgid; 1360 inode->i_gid = current->fsgid;
1355 inode->i_blksize = PAGE_CACHE_SIZE;
1356 inode->i_blocks = 0; 1361 inode->i_blocks = 0;
1357 inode->i_mapping->a_ops = &shmem_aops; 1362 inode->i_mapping->a_ops = &shmem_aops;
1358 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1363 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1364,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1364 1369
1365 switch (mode & S_IFMT) { 1370 switch (mode & S_IFMT) {
1366 default: 1371 default:
1372 inode->i_op = &shmem_special_inode_operations;
1367 init_special_inode(inode, mode, dev); 1373 init_special_inode(inode, mode, dev);
1368 break; 1374 break;
1369 case S_IFREG: 1375 case S_IFREG:
@@ -1684,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1684 iput(inode); 1690 iput(inode);
1685 return error; 1691 return error;
1686 } 1692 }
1687 error = 0; 1693 }
1694 error = shmem_acl_init(inode, dir);
1695 if (error) {
1696 iput(inode);
1697 return error;
1688 } 1698 }
1689 if (dir->i_mode & S_ISGID) { 1699 if (dir->i_mode & S_ISGID) {
1690 inode->i_gid = dir->i_gid; 1700 inode->i_gid = dir->i_gid;
@@ -1899,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = {
1899 .put_link = shmem_put_link, 1909 .put_link = shmem_put_link,
1900}; 1910};
1901 1911
1912#ifdef CONFIG_TMPFS_POSIX_ACL
1913/**
1914 * Superblocks without xattr inode operations will get security.* xattr
1915 * support from the VFS "for free". As soon as we have any other xattrs
1916 * like ACLs, we also need to implement the security.* handlers at
1917 * filesystem level, though.
1918 */
1919
1920static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1921 size_t list_len, const char *name,
1922 size_t name_len)
1923{
1924 return security_inode_listsecurity(inode, list, list_len);
1925}
1926
1927static int shmem_xattr_security_get(struct inode *inode, const char *name,
1928 void *buffer, size_t size)
1929{
1930 if (strcmp(name, "") == 0)
1931 return -EINVAL;
1932 return security_inode_getsecurity(inode, name, buffer, size,
1933 -EOPNOTSUPP);
1934}
1935
1936static int shmem_xattr_security_set(struct inode *inode, const char *name,
1937 const void *value, size_t size, int flags)
1938{
1939 if (strcmp(name, "") == 0)
1940 return -EINVAL;
1941 return security_inode_setsecurity(inode, name, value, size, flags);
1942}
1943
1944struct xattr_handler shmem_xattr_security_handler = {
1945 .prefix = XATTR_SECURITY_PREFIX,
1946 .list = shmem_xattr_security_list,
1947 .get = shmem_xattr_security_get,
1948 .set = shmem_xattr_security_set,
1949};
1950
1951static struct xattr_handler *shmem_xattr_handlers[] = {
1952 &shmem_xattr_acl_access_handler,
1953 &shmem_xattr_acl_default_handler,
1954 &shmem_xattr_security_handler,
1955 NULL
1956};
1957#endif
1958
1902static int shmem_parse_options(char *options, int *mode, uid_t *uid, 1959static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1903 gid_t *gid, unsigned long *blocks, unsigned long *inodes, 1960 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1904 int *policy, nodemask_t *policy_nodes) 1961 int *policy, nodemask_t *policy_nodes)
@@ -2096,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb,
2096 sb->s_magic = TMPFS_MAGIC; 2153 sb->s_magic = TMPFS_MAGIC;
2097 sb->s_op = &shmem_ops; 2154 sb->s_op = &shmem_ops;
2098 sb->s_time_gran = 1; 2155 sb->s_time_gran = 1;
2156#ifdef CONFIG_TMPFS_POSIX_ACL
2157 sb->s_xattr = shmem_xattr_handlers;
2158 sb->s_flags |= MS_POSIXACL;
2159#endif
2099 2160
2100 inode = shmem_get_inode(sb, S_IFDIR | mode, 0); 2161 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2101 if (!inode) 2162 if (!inode)
@@ -2132,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode)
2132 /* only struct inode is valid if it's an inline symlink */ 2193 /* only struct inode is valid if it's an inline symlink */
2133 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2194 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2134 } 2195 }
2196 shmem_acl_destroy_inode(inode);
2135 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2197 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2136} 2198}
2137 2199
@@ -2143,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep,
2143 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 2205 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2144 SLAB_CTOR_CONSTRUCTOR) { 2206 SLAB_CTOR_CONSTRUCTOR) {
2145 inode_init_once(&p->vfs_inode); 2207 inode_init_once(&p->vfs_inode);
2208#ifdef CONFIG_TMPFS_POSIX_ACL
2209 p->i_acl = NULL;
2210 p->i_default_acl = NULL;
2211#endif
2146 } 2212 }
2147} 2213}
2148 2214
@@ -2158,11 +2224,10 @@ static int init_inodecache(void)
2158 2224
2159static void destroy_inodecache(void) 2225static void destroy_inodecache(void)
2160{ 2226{
2161 if (kmem_cache_destroy(shmem_inode_cachep)) 2227 kmem_cache_destroy(shmem_inode_cachep);
2162 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2163} 2228}
2164 2229
2165static struct address_space_operations shmem_aops = { 2230static const struct address_space_operations shmem_aops = {
2166 .writepage = shmem_writepage, 2231 .writepage = shmem_writepage,
2167 .set_page_dirty = __set_page_dirty_nobuffers, 2232 .set_page_dirty = __set_page_dirty_nobuffers,
2168#ifdef CONFIG_TMPFS 2233#ifdef CONFIG_TMPFS
@@ -2187,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = {
2187 .truncate = shmem_truncate, 2252 .truncate = shmem_truncate,
2188 .setattr = shmem_notify_change, 2253 .setattr = shmem_notify_change,
2189 .truncate_range = shmem_truncate_range, 2254 .truncate_range = shmem_truncate_range,
2255#ifdef CONFIG_TMPFS_POSIX_ACL
2256 .setxattr = generic_setxattr,
2257 .getxattr = generic_getxattr,
2258 .listxattr = generic_listxattr,
2259 .removexattr = generic_removexattr,
2260 .permission = shmem_permission,
2261#endif
2262
2190}; 2263};
2191 2264
2192static struct inode_operations shmem_dir_inode_operations = { 2265static struct inode_operations shmem_dir_inode_operations = {
@@ -2201,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = {
2201 .mknod = shmem_mknod, 2274 .mknod = shmem_mknod,
2202 .rename = shmem_rename, 2275 .rename = shmem_rename,
2203#endif 2276#endif
2277#ifdef CONFIG_TMPFS_POSIX_ACL
2278 .setattr = shmem_notify_change,
2279 .setxattr = generic_setxattr,
2280 .getxattr = generic_getxattr,
2281 .listxattr = generic_listxattr,
2282 .removexattr = generic_removexattr,
2283 .permission = shmem_permission,
2284#endif
2285};
2286
2287static struct inode_operations shmem_special_inode_operations = {
2288#ifdef CONFIG_TMPFS_POSIX_ACL
2289 .setattr = shmem_notify_change,
2290 .setxattr = generic_setxattr,
2291 .getxattr = generic_getxattr,
2292 .listxattr = generic_listxattr,
2293 .removexattr = generic_removexattr,
2294 .permission = shmem_permission,
2295#endif
2204}; 2296};
2205 2297
2206static struct super_operations shmem_ops = { 2298static struct super_operations shmem_ops = {
@@ -2252,10 +2344,8 @@ static int __init init_tmpfs(void)
2252 printk(KERN_ERR "Could not register tmpfs\n"); 2344 printk(KERN_ERR "Could not register tmpfs\n");
2253 goto out2; 2345 goto out2;
2254 } 2346 }
2255#ifdef CONFIG_TMPFS 2347
2256 devfs_mk_dir("shm"); 2348 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2257#endif
2258 shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
2259 tmpfs_fs_type.name, NULL); 2349 tmpfs_fs_type.name, NULL);
2260 if (IS_ERR(shm_mnt)) { 2350 if (IS_ERR(shm_mnt)) {
2261 error = PTR_ERR(shm_mnt); 2351 error = PTR_ERR(shm_mnt);
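The new shmem_xattr_handlers table wired into sb->s_xattr dispatches extended attributes by name prefix (security., system.posix_acl_access, system.posix_acl_default), handing each handler the remainder of the name. A compact userspace sketch of that prefix-dispatch idea, with invented handler names and no real xattr plumbing:

#include <stdio.h>
#include <string.h>

struct handler {
        const char *prefix;
        int (*get)(const char *rest);
};

static int get_security(const char *rest) { printf("security xattr '%s'\n", rest); return 0; }
static int get_acl(const char *rest)      { printf("acl xattr '%s'\n", rest); return 0; }

static const struct handler handlers[] = {
        { "security.", get_security },
        { "system.posix_acl_access", get_acl },
        { NULL, NULL }
};

/* Pick the handler whose prefix matches and hand it the suffix. */
static int dispatch(const char *name)
{
        for (const struct handler *h = handlers; h->prefix; h++) {
                size_t n = strlen(h->prefix);
                if (!strncmp(name, h->prefix, n))
                        return h->get(name + n);
        }
        return -1;      /* roughly what -EOPNOTSUPP means in the kernel */
}

int main(void)
{
        dispatch("security.selinux");
        dispatch("system.posix_acl_access");
        return 0;
}

Note how the ACL handlers in the patch insist on an empty remainder (strcmp(name, "") == 0): the prefix is the whole attribute name for them.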
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
new file mode 100644
index 000000000000..c946bf468718
--- /dev/null
+++ b/mm/shmem_acl.c
@@ -0,0 +1,197 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
 38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = SHMEM_I(inode)->i_acl;
49 SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = SHMEM_I(inode)->i_default_acl;
54 SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Initialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
159 *
160 * This is done before destroying the actual inode.
161 */
162
163void
164shmem_acl_destroy_inode(struct inode *inode)
165{
166 if (SHMEM_I(inode)->i_acl)
167 posix_acl_release(SHMEM_I(inode)->i_acl);
168 SHMEM_I(inode)->i_acl = NULL;
169 if (SHMEM_I(inode)->i_default_acl)
170 posix_acl_release(SHMEM_I(inode)->i_default_acl);
171 SHMEM_I(inode)->i_default_acl = NULL;
172}
173
174/**
175 * shmem_check_acl - check_acl() callback for generic_permission()
176 */
177static int
178shmem_check_acl(struct inode *inode, int mask)
179{
180 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
181
182 if (acl) {
183 int error = posix_acl_permission(inode, acl, mask);
184 posix_acl_release(acl);
185 return error;
186 }
187 return -EAGAIN;
188}
189
190/**
191 * shmem_permission - permission() inode operation
192 */
193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
195{
196 return generic_permission(inode, mask, shmem_check_acl);
197}
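shmem_permission() passes shmem_check_acl() into generic_permission(); a callback return of -EAGAIN means "no ACL to consult", at which point the VFS falls back to the ordinary mode-bit check. A stripped-down userspace sketch of that callback-with-fallback shape (the boolean arguments stand in for real inode state):

#include <errno.h>
#include <stdio.h>

/* Callback: a definite answer, or -EAGAIN meaning "no ACL here". */
static int check_acl(int have_acl, int acl_allows)
{
        if (!have_acl)
                return -EAGAIN;
        return acl_allows ? 0 : -EACCES;
}

/* Simplified generic_permission(): try the ACL, else use the mode bits. */
static int permission(int mode_allows, int have_acl, int acl_allows)
{
        int err = check_acl(have_acl, acl_allows);

        if (err != -EAGAIN)
                return err;
        return mode_allows ? 0 : -EACCES;
}

int main(void)
{
        printf("no ACL, mode allows: %d\n", permission(1, 0, 0));
        printf("ACL denies:          %d\n", permission(1, 1, 0));
        printf("ACL grants:          %d\n", permission(0, 1, 1));
        return 0;
}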
diff --git a/mm/slab.c b/mm/slab.c
index 98ac20bc0de9..3dbd6f4e7477 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -89,6 +89,7 @@
89#include <linux/config.h> 89#include <linux/config.h>
90#include <linux/slab.h> 90#include <linux/slab.h>
91#include <linux/mm.h> 91#include <linux/mm.h>
92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
93#include <linux/cache.h> 94#include <linux/cache.h>
94#include <linux/interrupt.h> 95#include <linux/interrupt.h>
@@ -106,6 +107,7 @@
106#include <linux/nodemask.h> 107#include <linux/nodemask.h>
107#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
108#include <linux/mutex.h> 109#include <linux/mutex.h>
110#include <linux/rtmutex.h>
109 111
110#include <asm/uaccess.h> 112#include <asm/uaccess.h>
111#include <asm/cacheflush.h> 113#include <asm/cacheflush.h>
@@ -307,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
307#define SIZE_AC 1 309#define SIZE_AC 1
308#define SIZE_L3 (1 + MAX_NUMNODES) 310#define SIZE_L3 (1 + MAX_NUMNODES)
309 311
312static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree);
314static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node);
316static int enable_cpucache(struct kmem_cache *cachep);
317static void cache_reap(void *unused);
318
310/* 319/*
311 * This function must be completely optimized away if a constant is passed to 320 * This function must be completely optimized away if a constant is passed to
312 * it. Mostly the same as what is in linux/slab.h except it returns an index. 321 * it. Mostly the same as what is in linux/slab.h except it returns an index.
@@ -454,7 +463,7 @@ struct kmem_cache {
454#define STATS_DEC_ACTIVE(x) ((x)->num_active--) 463#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
455#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 464#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
456#define STATS_INC_GROWN(x) ((x)->grown++) 465#define STATS_INC_GROWN(x) ((x)->grown++)
457#define STATS_INC_REAPED(x) ((x)->reaped++) 466#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
458#define STATS_SET_HIGH(x) \ 467#define STATS_SET_HIGH(x) \
459 do { \ 468 do { \
460 if ((x)->num_active > (x)->high_mark) \ 469 if ((x)->num_active > (x)->high_mark) \
@@ -478,7 +487,7 @@ struct kmem_cache {
478#define STATS_DEC_ACTIVE(x) do { } while (0) 487#define STATS_DEC_ACTIVE(x) do { } while (0)
479#define STATS_INC_ALLOCED(x) do { } while (0) 488#define STATS_INC_ALLOCED(x) do { } while (0)
480#define STATS_INC_GROWN(x) do { } while (0) 489#define STATS_INC_GROWN(x) do { } while (0)
481#define STATS_INC_REAPED(x) do { } while (0) 490#define STATS_ADD_REAPED(x,y) do { } while (0)
482#define STATS_SET_HIGH(x) do { } while (0) 491#define STATS_SET_HIGH(x) do { } while (0)
483#define STATS_INC_ERR(x) do { } while (0) 492#define STATS_INC_ERR(x) do { } while (0)
484#define STATS_INC_NODEALLOCS(x) do { } while (0) 493#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -492,17 +501,6 @@ struct kmem_cache {
492#endif 501#endif
493 502
494#if DEBUG 503#if DEBUG
495/*
496 * Magic nums for obj red zoning.
497 * Placed in the first word before and the first word after an obj.
498 */
499#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
500#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */
501
502/* ...and for poisoning */
503#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
504#define POISON_FREE 0x6b /* for use-after-free poisoning */
505#define POISON_END 0xa5 /* end-byte of poisoning */
506 504
507/* 505/*
508 * memory layout of objects: 506 * memory layout of objects:
@@ -676,17 +674,66 @@ static struct kmem_cache cache_cache = {
676#endif 674#endif
677}; 675};
678 676
679/* Guard access to the cache-chain. */ 677#define BAD_ALIEN_MAGIC 0x01020304ul
680static DEFINE_MUTEX(cache_chain_mutex); 678
681static struct list_head cache_chain; 679#ifdef CONFIG_LOCKDEP
682 680
683/* 681/*
684 * vm_enough_memory() looks at this to determine how many slab-allocated pages 682 * Slab sometimes uses the kmalloc slabs to store the slab headers
685 * are possibly freeable under pressure 683 * for other slabs "off slab".
684 * The locking for this is tricky in that it nests within the locks
685 * of all other slabs in a few places; to deal with this special
686 * locking we put on-slab caches into a separate lock-class.
686 * 687 *
687 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 688 * We set lock class for alien array caches which are up during init.
689 * The lock annotation will be lost if all cpus of a node go down and
690 * then come back up during hotplug
688 */ 691 */
689atomic_t slab_reclaim_pages; 692static struct lock_class_key on_slab_l3_key;
693static struct lock_class_key on_slab_alc_key;
694
695static inline void init_lock_keys(void)
696
697{
698 int q;
699 struct cache_sizes *s = malloc_sizes;
700
701 while (s->cs_size != ULONG_MAX) {
702 for_each_node(q) {
703 struct array_cache **alc;
704 int r;
705 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
706 if (!l3 || OFF_SLAB(s->cs_cachep))
707 continue;
708 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
709 alc = l3->alien;
710 /*
711 * FIXME: This check for BAD_ALIEN_MAGIC
712 * should go away when common slab code is taught to
713 * work even without alien caches.
714 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
715 * for alloc_alien_cache,
716 */
717 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
718 continue;
719 for_each_node(r) {
720 if (alc[r])
721 lockdep_set_class(&alc[r]->lock,
722 &on_slab_alc_key);
723 }
724 }
725 s++;
726 }
727}
728#else
729static inline void init_lock_keys(void)
730{
731}
732#endif
733
734/* Guard access to the cache-chain. */
735static DEFINE_MUTEX(cache_chain_mutex);
736static struct list_head cache_chain;
690 737
691/* 738/*
692 * chicken and egg problem: delay the per-cpu array allocation 739 * chicken and egg problem: delay the per-cpu array allocation
@@ -709,12 +756,6 @@ int slab_is_available(void)
709 756
710static DEFINE_PER_CPU(struct work_struct, reap_work); 757static DEFINE_PER_CPU(struct work_struct, reap_work);
711 758
712static void free_block(struct kmem_cache *cachep, void **objpp, int len,
713 int node);
714static void enable_cpucache(struct kmem_cache *cachep);
715static void cache_reap(void *unused);
716static int __node_shrink(struct kmem_cache *cachep, int node);
717
718static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 759static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
719{ 760{
720 return cachep->array[smp_processor_id()]; 761 return cachep->array[smp_processor_id()];
@@ -745,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
745 return csizep->cs_cachep; 786 return csizep->cs_cachep;
746} 787}
747 788
748struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 789static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
749{ 790{
750 return __find_general_cachep(size, gfpflags); 791 return __find_general_cachep(size, gfpflags);
751} 792}
752EXPORT_SYMBOL(kmem_find_general_cachep);
753 793
754static size_t slab_mgmt_size(size_t nr_objs, size_t align) 794static size_t slab_mgmt_size(size_t nr_objs, size_t align)
755{ 795{
@@ -932,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
932 return nr; 972 return nr;
933} 973}
934 974
935#ifdef CONFIG_NUMA 975#ifndef CONFIG_NUMA
976
977#define drain_alien_cache(cachep, alien) do { } while (0)
978#define reap_alien(cachep, l3) do { } while (0)
979
980static inline struct array_cache **alloc_alien_cache(int node, int limit)
981{
982 return (struct array_cache **)BAD_ALIEN_MAGIC;
983}
984
985static inline void free_alien_cache(struct array_cache **ac_ptr)
986{
987}
988
989static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
990{
991 return 0;
992}
993
994static inline void *alternate_node_alloc(struct kmem_cache *cachep,
995 gfp_t flags)
996{
997 return NULL;
998}
999
1000static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1001 gfp_t flags, int nodeid)
1002{
1003 return NULL;
1004}
1005
1006#else /* CONFIG_NUMA */
1007
936static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1008static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
937static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1009static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
938 1010
@@ -1061,29 +1133,9 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1061 } 1133 }
1062 return 1; 1134 return 1;
1063} 1135}
1064
1065#else
1066
1067#define drain_alien_cache(cachep, alien) do { } while (0)
1068#define reap_alien(cachep, l3) do { } while (0)
1069
1070static inline struct array_cache **alloc_alien_cache(int node, int limit)
1071{
1072 return (struct array_cache **) 0x01020304ul;
1073}
1074
1075static inline void free_alien_cache(struct array_cache **ac_ptr)
1076{
1077}
1078
1079static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1080{
1081 return 0;
1082}
1083
1084#endif 1136#endif
1085 1137
1086static int cpuup_callback(struct notifier_block *nfb, 1138static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1087 unsigned long action, void *hcpu) 1139 unsigned long action, void *hcpu)
1088{ 1140{
1089 long cpu = (long)hcpu; 1141 long cpu = (long)hcpu;
@@ -1250,10 +1302,7 @@ free_array_cache:
1250 l3 = cachep->nodelists[node]; 1302 l3 = cachep->nodelists[node];
1251 if (!l3) 1303 if (!l3)
1252 continue; 1304 continue;
1253 spin_lock_irq(&l3->list_lock); 1305 drain_freelist(cachep, l3, l3->free_objects);
1254 /* free slabs belonging to this node */
1255 __node_shrink(cachep, node);
1256 spin_unlock_irq(&l3->list_lock);
1257 } 1306 }
1258 mutex_unlock(&cache_chain_mutex); 1307 mutex_unlock(&cache_chain_mutex);
1259 break; 1308 break;
@@ -1265,7 +1314,9 @@ bad:
1265 return NOTIFY_BAD; 1314 return NOTIFY_BAD;
1266} 1315}
1267 1316
1268static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 1317static struct notifier_block __cpuinitdata cpucache_notifier = {
1318 &cpuup_callback, NULL, 0
1319};
1269 1320
1270/* 1321/*
1271 * swap the static kmem_list3 with kmalloced memory 1322 * swap the static kmem_list3 with kmalloced memory
@@ -1281,6 +1332,11 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1281 1332
1282 local_irq_disable(); 1333 local_irq_disable();
1283 memcpy(ptr, list, sizeof(struct kmem_list3)); 1334 memcpy(ptr, list, sizeof(struct kmem_list3));
1335 /*
1336 * Do not assume that spinlocks can be initialized via memcpy:
1337 */
1338 spin_lock_init(&ptr->list_lock);
1339
1284 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1340 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1285 cachep->nodelists[nodeid] = ptr; 1341 cachep->nodelists[nodeid] = ptr;
1286 local_irq_enable(); 1342 local_irq_enable();
@@ -1407,7 +1463,7 @@ void __init kmem_cache_init(void)
1407 } 1463 }
1408 /* 4) Replace the bootstrap head arrays */ 1464 /* 4) Replace the bootstrap head arrays */
1409 { 1465 {
1410 void *ptr; 1466 struct array_cache *ptr;
1411 1467
1412 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1468 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1413 1469
@@ -1415,6 +1471,11 @@ void __init kmem_cache_init(void)
1415 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1471 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1416 memcpy(ptr, cpu_cache_get(&cache_cache), 1472 memcpy(ptr, cpu_cache_get(&cache_cache),
1417 sizeof(struct arraycache_init)); 1473 sizeof(struct arraycache_init));
1474 /*
1475 * Do not assume that spinlocks can be initialized via memcpy:
1476 */
1477 spin_lock_init(&ptr->lock);
1478
1418 cache_cache.array[smp_processor_id()] = ptr; 1479 cache_cache.array[smp_processor_id()] = ptr;
1419 local_irq_enable(); 1480 local_irq_enable();
1420 1481
@@ -1425,6 +1486,11 @@ void __init kmem_cache_init(void)
1425 != &initarray_generic.cache); 1486 != &initarray_generic.cache);
1426 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1487 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1427 sizeof(struct arraycache_init)); 1488 sizeof(struct arraycache_init));
1489 /*
1490 * Do not assume that spinlocks can be initialized via memcpy:
1491 */
1492 spin_lock_init(&ptr->lock);
1493
1428 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1494 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1429 ptr; 1495 ptr;
1430 local_irq_enable(); 1496 local_irq_enable();
@@ -1453,10 +1519,15 @@ void __init kmem_cache_init(void)
1453 struct kmem_cache *cachep; 1519 struct kmem_cache *cachep;
1454 mutex_lock(&cache_chain_mutex); 1520 mutex_lock(&cache_chain_mutex);
1455 list_for_each_entry(cachep, &cache_chain, next) 1521 list_for_each_entry(cachep, &cache_chain, next)
1456 enable_cpucache(cachep); 1522 if (enable_cpucache(cachep))
1523 BUG();
1457 mutex_unlock(&cache_chain_mutex); 1524 mutex_unlock(&cache_chain_mutex);
1458 } 1525 }
1459 1526
1527 /* Annotate slab for lockdep -- annotate the malloc caches */
1528 init_lock_keys();
1529
1530
1460 /* Done! */ 1531 /* Done! */
1461 g_cpucache_up = FULL; 1532 g_cpucache_up = FULL;
1462 1533
@@ -1505,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1505 */ 1576 */
1506 flags |= __GFP_COMP; 1577 flags |= __GFP_COMP;
1507#endif 1578#endif
1508 flags |= cachep->gfpflags; 1579
1580 /*
1581 * Under NUMA we want memory on the indicated node. We will handle
1582 * the needed fallback ourselves since we want to serve from our
1583 * per node object lists first for other nodes.
1584 */
1585 flags |= cachep->gfpflags | GFP_THISNODE;
1509 1586
1510 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1587 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1511 if (!page) 1588 if (!page)
@@ -1513,8 +1590,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1513 1590
1514 nr_pages = (1 << cachep->gfporder); 1591 nr_pages = (1 << cachep->gfporder);
1515 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1592 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1516 atomic_add(nr_pages, &slab_reclaim_pages); 1593 add_zone_page_state(page_zone(page),
1517 add_page_state(nr_slab, nr_pages); 1594 NR_SLAB_RECLAIMABLE, nr_pages);
1595 else
1596 add_zone_page_state(page_zone(page),
1597 NR_SLAB_UNRECLAIMABLE, nr_pages);
1518 for (i = 0; i < nr_pages; i++) 1598 for (i = 0; i < nr_pages; i++)
1519 __SetPageSlab(page + i); 1599 __SetPageSlab(page + i);
1520 return page_address(page); 1600 return page_address(page);
@@ -1529,17 +1609,20 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1529 struct page *page = virt_to_page(addr); 1609 struct page *page = virt_to_page(addr);
1530 const unsigned long nr_freed = i; 1610 const unsigned long nr_freed = i;
1531 1611
1612 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1613 sub_zone_page_state(page_zone(page),
1614 NR_SLAB_RECLAIMABLE, nr_freed);
1615 else
1616 sub_zone_page_state(page_zone(page),
1617 NR_SLAB_UNRECLAIMABLE, nr_freed);
1532 while (i--) { 1618 while (i--) {
1533 BUG_ON(!PageSlab(page)); 1619 BUG_ON(!PageSlab(page));
1534 __ClearPageSlab(page); 1620 __ClearPageSlab(page);
1535 page++; 1621 page++;
1536 } 1622 }
1537 sub_page_state(nr_slab, nr_freed);
1538 if (current->reclaim_state) 1623 if (current->reclaim_state)
1539 current->reclaim_state->reclaimed_slab += nr_freed; 1624 current->reclaim_state->reclaimed_slab += nr_freed;
1540 free_pages((unsigned long)addr, cachep->gfporder); 1625 free_pages((unsigned long)addr, cachep->gfporder);
1541 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1542 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1543} 1626}
1544 1627
1545static void kmem_rcu_free(struct rcu_head *head) 1628static void kmem_rcu_free(struct rcu_head *head)
@@ -1600,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1600static void dump_line(char *data, int offset, int limit) 1683static void dump_line(char *data, int offset, int limit)
1601{ 1684{
1602 int i; 1685 int i;
1686 unsigned char error = 0;
1687 int bad_count = 0;
1688
1603 printk(KERN_ERR "%03x:", offset); 1689 printk(KERN_ERR "%03x:", offset);
1604 for (i = 0; i < limit; i++) 1690 for (i = 0; i < limit; i++) {
1691 if (data[offset + i] != POISON_FREE) {
1692 error = data[offset + i];
1693 bad_count++;
1694 }
1605 printk(" %02x", (unsigned char)data[offset + i]); 1695 printk(" %02x", (unsigned char)data[offset + i]);
1696 }
1606 printk("\n"); 1697 printk("\n");
1698
1699 if (bad_count == 1) {
1700 error ^= POISON_FREE;
1701 if (!(error & (error - 1))) {
1702 printk(KERN_ERR "Single bit error detected. Probably "
1703 "bad RAM.\n");
1704#ifdef CONFIG_X86
1705 printk(KERN_ERR "Run memtest86+ or a similar memory "
1706 "test tool.\n");
1707#else
1708 printk(KERN_ERR "Run a memory test tool.\n");
1709#endif
1710 }
1711 }
1607} 1712}
1608#endif 1713#endif
1609 1714
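The new dump_line() heuristic flags likely bad RAM: when exactly one byte in a poisoned region differs from POISON_FREE and the difference is a single flipped bit, XORing with the poison value yields a power of two, which the (error & (error - 1)) test detects. A standalone sketch of that check:

#include <stdio.h>

#define POISON_FREE 0x6b        /* free-object poison pattern */

/* Returns 1 if 'byte' differs from the poison pattern by exactly one bit. */
static int single_bit_flip(unsigned char byte)
{
        unsigned char error = byte ^ POISON_FREE;

        return error && !(error & (error - 1));  /* non-zero power of two */
}

int main(void)
{
        printf("0x6b -> %d (intact)\n", single_bit_flip(0x6b));
        printf("0x6a -> %d (bit 0 flipped: probably bad RAM)\n", single_bit_flip(0x6a));
        printf("0x00 -> %d (many bits differ: software bug)\n", single_bit_flip(0x00));
        return 0;
}

The kernel path only runs the test when bad_count == 1, which is why a multi-byte corruption is reported as an ordinary use-after-free rather than a hardware problem.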
@@ -1796,6 +1901,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1796 } 1901 }
1797} 1902}
1798 1903
1904static void __kmem_cache_destroy(struct kmem_cache *cachep)
1905{
1906 int i;
1907 struct kmem_list3 *l3;
1908
1909 for_each_online_cpu(i)
1910 kfree(cachep->array[i]);
1911
1912 /* NUMA: free the list3 structures */
1913 for_each_online_node(i) {
1914 l3 = cachep->nodelists[i];
1915 if (l3) {
1916 kfree(l3->shared);
1917 free_alien_cache(l3->alien);
1918 kfree(l3);
1919 }
1920 }
1921 kmem_cache_free(&cache_cache, cachep);
1922}
1923
1924
1799/** 1925/**
1800 * calculate_slab_order - calculate size (page order) of slabs 1926 * calculate_slab_order - calculate size (page order) of slabs
1801 * @cachep: pointer to the cache that is being created 1927 * @cachep: pointer to the cache that is being created
@@ -1866,12 +1992,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
1866 return left_over; 1992 return left_over;
1867} 1993}
1868 1994
1869static void setup_cpu_cache(struct kmem_cache *cachep) 1995static int setup_cpu_cache(struct kmem_cache *cachep)
1870{ 1996{
1871 if (g_cpucache_up == FULL) { 1997 if (g_cpucache_up == FULL)
1872 enable_cpucache(cachep); 1998 return enable_cpucache(cachep);
1873 return; 1999
1874 }
1875 if (g_cpucache_up == NONE) { 2000 if (g_cpucache_up == NONE) {
1876 /* 2001 /*
1877 * Note: the first kmem_cache_create must create the cache 2002 * Note: the first kmem_cache_create must create the cache
@@ -1918,6 +2043,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
1918 cpu_cache_get(cachep)->touched = 0; 2043 cpu_cache_get(cachep)->touched = 0;
1919 cachep->batchcount = 1; 2044 cachep->batchcount = 1;
1920 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2045 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2046 return 0;
1921} 2047}
1922 2048
1923/** 2049/**
@@ -2059,6 +2185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2059 } else { 2185 } else {
2060 ralign = BYTES_PER_WORD; 2186 ralign = BYTES_PER_WORD;
2061 } 2187 }
2188
2189 /*
2190 * Redzoning and user store require word alignment. Note this will be
2191 * overridden by architecture or caller mandated alignment if either
2192 * is greater than BYTES_PER_WORD.
2193 */
2194 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2195 ralign = BYTES_PER_WORD;
2196
2062 /* 2) arch mandated alignment: disables debug if necessary */ 2197 /* 2) arch mandated alignment: disables debug if necessary */
2063 if (ralign < ARCH_SLAB_MINALIGN) { 2198 if (ralign < ARCH_SLAB_MINALIGN) {
2064 ralign = ARCH_SLAB_MINALIGN; 2199 ralign = ARCH_SLAB_MINALIGN;
@@ -2072,8 +2207,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2072 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2207 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2073 } 2208 }
2074 /* 2209 /*
2075 * 4) Store it. Note that the debug code below can reduce 2210 * 4) Store it.
2076 * the alignment to BYTES_PER_WORD.
2077 */ 2211 */
2078 align = ralign; 2212 align = ralign;
2079 2213
@@ -2085,20 +2219,19 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2085#if DEBUG 2219#if DEBUG
2086 cachep->obj_size = size; 2220 cachep->obj_size = size;
2087 2221
2222 /*
2223 * Both debugging options require word-alignment which is calculated
2224 * into align above.
2225 */
2088 if (flags & SLAB_RED_ZONE) { 2226 if (flags & SLAB_RED_ZONE) {
2089 /* redzoning only works with word aligned caches */
2090 align = BYTES_PER_WORD;
2091
2092 /* add space for red zone words */ 2227 /* add space for red zone words */
2093 cachep->obj_offset += BYTES_PER_WORD; 2228 cachep->obj_offset += BYTES_PER_WORD;
2094 size += 2 * BYTES_PER_WORD; 2229 size += 2 * BYTES_PER_WORD;
2095 } 2230 }
2096 if (flags & SLAB_STORE_USER) { 2231 if (flags & SLAB_STORE_USER) {
2097 /* user store requires word alignment and 2232 /* user store requires one word storage behind the end of
2098 * one word storage behind the end of the real 2233 * the real object.
2099 * object.
2100 */ 2234 */
2101 align = BYTES_PER_WORD;
2102 size += BYTES_PER_WORD; 2235 size += BYTES_PER_WORD;
2103 } 2236 }
2104#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2237#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
@@ -2162,14 +2295,26 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2162 cachep->gfpflags |= GFP_DMA; 2295 cachep->gfpflags |= GFP_DMA;
2163 cachep->buffer_size = size; 2296 cachep->buffer_size = size;
2164 2297
2165 if (flags & CFLGS_OFF_SLAB) 2298 if (flags & CFLGS_OFF_SLAB) {
2166 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2299 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2300 /*
2301 * This is a possibility for one of the malloc_sizes caches.
2302 * But since we go off slab only for object size greater than
2303 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2304 * this should not happen at all.
2305 * But leave a BUG_ON for some lucky dude.
2306 */
2307 BUG_ON(!cachep->slabp_cache);
2308 }
2167 cachep->ctor = ctor; 2309 cachep->ctor = ctor;
2168 cachep->dtor = dtor; 2310 cachep->dtor = dtor;
2169 cachep->name = name; 2311 cachep->name = name;
2170 2312
2171 2313 if (setup_cpu_cache(cachep)) {
2172 setup_cpu_cache(cachep); 2314 __kmem_cache_destroy(cachep);
2315 cachep = NULL;
2316 goto oops;
2317 }
2173 2318
2174 /* cache setup completed, link it into the list */ 2319 /* cache setup completed, link it into the list */
2175 list_add(&cachep->next, &cache_chain); 2320 list_add(&cachep->next, &cache_chain);
@@ -2255,32 +2400,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2255 } 2400 }
2256} 2401}
2257 2402
2258static int __node_shrink(struct kmem_cache *cachep, int node) 2403/*
2404 * Remove slabs from the list of free slabs.
2405 * Specify the number of slabs to drain in tofree.
2406 *
2407 * Returns the actual number of slabs released.
2408 */
2409static int drain_freelist(struct kmem_cache *cache,
2410 struct kmem_list3 *l3, int tofree)
2259{ 2411{
2412 struct list_head *p;
2413 int nr_freed;
2260 struct slab *slabp; 2414 struct slab *slabp;
2261 struct kmem_list3 *l3 = cachep->nodelists[node];
2262 int ret;
2263 2415
2264 for (;;) { 2416 nr_freed = 0;
2265 struct list_head *p; 2417 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2266 2418
2419 spin_lock_irq(&l3->list_lock);
2267 p = l3->slabs_free.prev; 2420 p = l3->slabs_free.prev;
2268 if (p == &l3->slabs_free) 2421 if (p == &l3->slabs_free) {
2269 break; 2422 spin_unlock_irq(&l3->list_lock);
2423 goto out;
2424 }
2270 2425
2271 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 2426 slabp = list_entry(p, struct slab, list);
2272#if DEBUG 2427#if DEBUG
2273 BUG_ON(slabp->inuse); 2428 BUG_ON(slabp->inuse);
2274#endif 2429#endif
2275 list_del(&slabp->list); 2430 list_del(&slabp->list);
2276 2431 /*
2277 l3->free_objects -= cachep->num; 2432 * Safe to drop the lock. The slab is no longer linked
2433 * to the cache.
2434 */
2435 l3->free_objects -= cache->num;
2278 spin_unlock_irq(&l3->list_lock); 2436 spin_unlock_irq(&l3->list_lock);
2279 slab_destroy(cachep, slabp); 2437 slab_destroy(cache, slabp);
2280 spin_lock_irq(&l3->list_lock); 2438 nr_freed++;
2281 } 2439 }
2282 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); 2440out:
2283 return ret; 2441 return nr_freed;
2284} 2442}
2285 2443
2286static int __cache_shrink(struct kmem_cache *cachep) 2444static int __cache_shrink(struct kmem_cache *cachep)
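drain_freelist() above replaces __node_shrink() with a bounded drain: pop one slab off slabs_free under the list lock, adjust the free-object count, drop the lock before the potentially slow slab_destroy(), and stop after tofree slabs or when the list is empty, returning how many were actually freed. A simplified userspace sketch of that drain-up-to-N pattern, with a pthread mutex and a plain linked list standing in for the kmem_list3 structures:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct slab { struct slab *next; };

#define OBJS_PER_SLAB 8

static struct slab *slabs_free;
static int free_objects;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Free at most 'tofree' slabs from the free list; return how many went. */
static int drain_freelist(int tofree)
{
        int nr_freed = 0;

        while (nr_freed < tofree) {
                pthread_mutex_lock(&list_lock);
                struct slab *s = slabs_free;
                if (!s) {
                        pthread_mutex_unlock(&list_lock);
                        break;
                }
                slabs_free = s->next;
                free_objects -= OBJS_PER_SLAB;
                /* Safe to drop the lock: the slab is no longer reachable. */
                pthread_mutex_unlock(&list_lock);

                free(s);        /* stands in for slab_destroy() */
                nr_freed++;
        }
        return nr_freed;
}

int main(void)
{
        for (int i = 0; i < 5; i++) {
                struct slab *s = malloc(sizeof(*s));
                s->next = slabs_free;
                slabs_free = s;
                free_objects += OBJS_PER_SLAB;
        }
        printf("freed %d slabs, %d free objects left\n",
               drain_freelist(3), free_objects);
        return 0;
}

Dropping the lock around the destroy step keeps list_lock hold times short, which is the same reason the patch unlinks the slab before calling slab_destroy().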
@@ -2293,11 +2451,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
2293 check_irq_on(); 2451 check_irq_on();
2294 for_each_online_node(i) { 2452 for_each_online_node(i) {
2295 l3 = cachep->nodelists[i]; 2453 l3 = cachep->nodelists[i];
2296 if (l3) { 2454 if (!l3)
2297 spin_lock_irq(&l3->list_lock); 2455 continue;
2298 ret += __node_shrink(cachep, i); 2456
2299 spin_unlock_irq(&l3->list_lock); 2457 drain_freelist(cachep, l3, l3->free_objects);
2300 } 2458
2459 ret += !list_empty(&l3->slabs_full) ||
2460 !list_empty(&l3->slabs_partial);
2301 } 2461 }
2302 return (ret ? 1 : 0); 2462 return (ret ? 1 : 0);
2303} 2463}
@@ -2322,7 +2482,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2322 * @cachep: the cache to destroy 2482 * @cachep: the cache to destroy
2323 * 2483 *
2324 * Remove a struct kmem_cache object from the slab cache. 2484 * Remove a struct kmem_cache object from the slab cache.
2325 * Returns 0 on success.
2326 * 2485 *
2327 * It is expected this function will be called by a module when it is 2486 * It is expected this function will be called by a module when it is
2328 * unloaded. This will remove the cache completely, and avoid a duplicate 2487 * unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2334,11 +2493,8 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2334 * The caller must guarantee that noone will allocate memory from the cache 2493 * The caller must guarantee that noone will allocate memory from the cache
2335 * during the kmem_cache_destroy(). 2494 * during the kmem_cache_destroy().
2336 */ 2495 */
2337int kmem_cache_destroy(struct kmem_cache *cachep) 2496void kmem_cache_destroy(struct kmem_cache *cachep)
2338{ 2497{
2339 int i;
2340 struct kmem_list3 *l3;
2341
2342 BUG_ON(!cachep || in_interrupt()); 2498 BUG_ON(!cachep || in_interrupt());
2343 2499
2344 /* Don't let CPUs to come and go */ 2500 /* Don't let CPUs to come and go */
@@ -2358,31 +2514,28 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2358 list_add(&cachep->next, &cache_chain); 2514 list_add(&cachep->next, &cache_chain);
2359 mutex_unlock(&cache_chain_mutex); 2515 mutex_unlock(&cache_chain_mutex);
2360 unlock_cpu_hotplug(); 2516 unlock_cpu_hotplug();
2361 return 1; 2517 return;
2362 } 2518 }
2363 2519
2364 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2520 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2365 synchronize_rcu(); 2521 synchronize_rcu();
2366 2522
2367 for_each_online_cpu(i) 2523 __kmem_cache_destroy(cachep);
2368 kfree(cachep->array[i]);
2369
2370 /* NUMA: free the list3 structures */
2371 for_each_online_node(i) {
2372 l3 = cachep->nodelists[i];
2373 if (l3) {
2374 kfree(l3->shared);
2375 free_alien_cache(l3->alien);
2376 kfree(l3);
2377 }
2378 }
2379 kmem_cache_free(&cache_cache, cachep);
2380 unlock_cpu_hotplug(); 2524 unlock_cpu_hotplug();
2381 return 0;
2382} 2525}
2383EXPORT_SYMBOL(kmem_cache_destroy); 2526EXPORT_SYMBOL(kmem_cache_destroy);
2384 2527
2385/* Get the memory for a slab management obj. */ 2528/*
2529 * Get the memory for a slab management obj.
2530 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2531 * always come from malloc_sizes caches. The slab descriptor cannot
2532 * come from the same cache which is getting created because,
2533 * when we are searching for an appropriate cache for these
2534 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2535 * If we are creating a malloc_sizes cache here it would not be visible to
2536 * kmem_find_general_cachep till the initialization is complete.
2537 * Hence we cannot have slabp_cache same as the original cache.
2538 */
2386static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2539static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2387 int colour_off, gfp_t local_flags, 2540 int colour_off, gfp_t local_flags,
2388 int nodeid) 2541 int nodeid)
@@ -2915,14 +3068,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2915 void *objp; 3068 void *objp;
2916 struct array_cache *ac; 3069 struct array_cache *ac;
2917 3070
2918#ifdef CONFIG_NUMA
2919 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2920 objp = alternate_node_alloc(cachep, flags);
2921 if (objp != NULL)
2922 return objp;
2923 }
2924#endif
2925
2926 check_irq_off(); 3071 check_irq_off();
2927 ac = cpu_cache_get(cachep); 3072 ac = cpu_cache_get(cachep);
2928 if (likely(ac->avail)) { 3073 if (likely(ac->avail)) {
@@ -2940,12 +3085,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2940 gfp_t flags, void *caller) 3085 gfp_t flags, void *caller)
2941{ 3086{
2942 unsigned long save_flags; 3087 unsigned long save_flags;
2943 void *objp; 3088 void *objp = NULL;
2944 3089
2945 cache_alloc_debugcheck_before(cachep, flags); 3090 cache_alloc_debugcheck_before(cachep, flags);
2946 3091
2947 local_irq_save(save_flags); 3092 local_irq_save(save_flags);
2948 objp = ____cache_alloc(cachep, flags); 3093
3094 if (unlikely(NUMA_BUILD &&
3095 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3096 objp = alternate_node_alloc(cachep, flags);
3097
3098 if (!objp)
3099 objp = ____cache_alloc(cachep, flags);
3100 /*
3101 * We may just have run out of memory on the local node.
3102 * __cache_alloc_node() knows how to locate memory on other nodes
3103 */
3104 if (NUMA_BUILD && !objp)
3105 objp = __cache_alloc_node(cachep, flags, numa_node_id());
2949 local_irq_restore(save_flags); 3106 local_irq_restore(save_flags);
2950 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3107 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2951 caller); 3108 caller);
@@ -2964,7 +3121,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2964{ 3121{
2965 int nid_alloc, nid_here; 3122 int nid_alloc, nid_here;
2966 3123
2967 if (in_interrupt()) 3124 if (in_interrupt() || (flags & __GFP_THISNODE))
2968 return NULL; 3125 return NULL;
2969 nid_alloc = nid_here = numa_node_id(); 3126 nid_alloc = nid_here = numa_node_id();
2970 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3127 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -2977,6 +3134,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2977} 3134}
2978 3135
2979/* 3136/*
3137 * Fallback function if there was no memory available and no objects on a
3138 * certain node and we are allowed to fall back. We mimic the behavior of
3139 * the page allocator. We fall back according to a zonelist determined by
3140 * the policy layer while obeying cpuset constraints.
3141 */
3142void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3143{
3144 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3145 ->node_zonelists[gfp_zone(flags)];
3146 struct zone **z;
3147 void *obj = NULL;
3148
3149 for (z = zonelist->zones; *z && !obj; z++)
3150 if (zone_idx(*z) <= ZONE_NORMAL &&
3151 cpuset_zone_allowed(*z, flags))
3152 obj = __cache_alloc_node(cache,
3153 flags | __GFP_THISNODE,
3154 zone_to_nid(*z));
3155 return obj;
3156}
3157
3158/*
 2980 * An interface to enable slab creation on nodeid 3159 * An interface to enable slab creation on nodeid
2981 */ 3160 */
2982static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3161static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
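
fallback_alloc() above walks the same zonelist the page allocator would use for the current memory policy and takes the first zone that the cpuset allows and that sits at or below ZONE_NORMAL, which is why the comment says it mimics the page allocator's fallback behaviour; __cache_alloc_node() calls it when growing the cache on the requested node fails and __GFP_THISNODE is not set. A hedged userspace sketch of the loop shape, with made-up zone fields standing in for zone_idx(), cpuset_zone_allowed() and __cache_alloc_node():

#include <stdio.h>
#include <stddef.h>

struct zone_stub {
	int nid;             /* node this zone belongs to */
	int normal_or_lower; /* stands in for zone_idx(z) <= ZONE_NORMAL */
	int cpuset_allowed;  /* stands in for cpuset_zone_allowed() */
};

/* Invented stand-in for __cache_alloc_node(cache, flags | __GFP_THISNODE, nid). */
static void *alloc_on_node(int nid)
{
	static char object[64];
	(void)nid;
	return object;
}

/* First permitted zone in the policy-ordered list wins, as in fallback_alloc(). */
static void *fallback_alloc_sketch(struct zone_stub **zonelist)
{
	struct zone_stub **z;
	void *obj = NULL;

	for (z = zonelist; *z && !obj; z++)
		if ((*z)->normal_or_lower && (*z)->cpuset_allowed)
			obj = alloc_on_node((*z)->nid);
	return obj;
}

int main(void)
{
	struct zone_stub highmem = { 0, 0, 1 };   /* skipped: above ZONE_NORMAL */
	struct zone_stub remote  = { 1, 1, 1 };   /* first acceptable zone */
	struct zone_stub *zonelist[] = { &highmem, &remote, NULL };

	printf("object %p taken from node %d\n",
	       fallback_alloc_sketch(zonelist), remote.nid);
	return 0;
}
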
@@ -3029,11 +3208,15 @@ retry:
3029must_grow: 3208must_grow:
3030 spin_unlock(&l3->list_lock); 3209 spin_unlock(&l3->list_lock);
3031 x = cache_grow(cachep, flags, nodeid); 3210 x = cache_grow(cachep, flags, nodeid);
3211 if (x)
3212 goto retry;
3032 3213
3033 if (!x) 3214 if (!(flags & __GFP_THISNODE))
3034 return NULL; 3215 /* Unable to grow the cache. Fall back to other nodes. */
3216 return fallback_alloc(cachep, flags);
3217
3218 return NULL;
3035 3219
3036 goto retry;
3037done: 3220done:
3038 return obj; 3221 return obj;
3039} 3222}
@@ -3066,6 +3249,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3066 if (slabp->inuse == 0) { 3249 if (slabp->inuse == 0) {
3067 if (l3->free_objects > l3->free_limit) { 3250 if (l3->free_objects > l3->free_limit) {
3068 l3->free_objects -= cachep->num; 3251 l3->free_objects -= cachep->num;
3252 /* No need to drop any previously held
 3253 * lock here; even if we have an off-slab slab
 3254 * descriptor, it is guaranteed to come from
3255 * a different cache, refer to comments before
3256 * alloc_slabmgmt.
3257 */
3069 slab_destroy(cachep, slabp); 3258 slab_destroy(cachep, slabp);
3070 } else { 3259 } else {
3071 list_add(&slabp->list, &l3->slabs_free); 3260 list_add(&slabp->list, &l3->slabs_free);
@@ -3171,7 +3360,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3171EXPORT_SYMBOL(kmem_cache_alloc); 3360EXPORT_SYMBOL(kmem_cache_alloc);
3172 3361
3173/** 3362/**
3174 * kmem_cache_alloc - Allocate an object. The memory is set to zero. 3363 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3175 * @cache: The cache to allocate from. 3364 * @cache: The cache to allocate from.
3176 * @flags: See kmalloc(). 3365 * @flags: See kmalloc().
3177 * 3366 *
@@ -3264,7 +3453,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3264} 3453}
3265EXPORT_SYMBOL(kmem_cache_alloc_node); 3454EXPORT_SYMBOL(kmem_cache_alloc_node);
3266 3455
3267void *kmalloc_node(size_t size, gfp_t flags, int node) 3456void *__kmalloc_node(size_t size, gfp_t flags, int node)
3268{ 3457{
3269 struct kmem_cache *cachep; 3458 struct kmem_cache *cachep;
3270 3459
@@ -3273,7 +3462,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node)
3273 return NULL; 3462 return NULL;
3274 return kmem_cache_alloc_node(cachep, flags, node); 3463 return kmem_cache_alloc_node(cachep, flags, node);
3275} 3464}
3276EXPORT_SYMBOL(kmalloc_node); 3465EXPORT_SYMBOL(__kmalloc_node);
3277#endif 3466#endif
3278 3467
3279/** 3468/**
@@ -3317,55 +3506,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3317EXPORT_SYMBOL(__kmalloc_track_caller); 3506EXPORT_SYMBOL(__kmalloc_track_caller);
3318#endif 3507#endif
3319 3508
3320#ifdef CONFIG_SMP
3321/**
3322 * __alloc_percpu - allocate one copy of the object for every present
3323 * cpu in the system, zeroing them.
3324 * Objects should be dereferenced using the per_cpu_ptr macro only.
3325 *
3326 * @size: how many bytes of memory are required.
3327 */
3328void *__alloc_percpu(size_t size)
3329{
3330 int i;
3331 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
3332
3333 if (!pdata)
3334 return NULL;
3335
3336 /*
3337 * Cannot use for_each_online_cpu since a cpu may come online
3338 * and we have no way of figuring out how to fix the array
3339 * that we have allocated then....
3340 */
3341 for_each_possible_cpu(i) {
3342 int node = cpu_to_node(i);
3343
3344 if (node_online(node))
3345 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
3346 else
3347 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
3348
3349 if (!pdata->ptrs[i])
3350 goto unwind_oom;
3351 memset(pdata->ptrs[i], 0, size);
3352 }
3353
3354 /* Catch derefs w/o wrappers */
3355 return (void *)(~(unsigned long)pdata);
3356
3357unwind_oom:
3358 while (--i >= 0) {
3359 if (!cpu_possible(i))
3360 continue;
3361 kfree(pdata->ptrs[i]);
3362 }
3363 kfree(pdata);
3364 return NULL;
3365}
3366EXPORT_SYMBOL(__alloc_percpu);
3367#endif
3368
3369/** 3509/**
3370 * kmem_cache_free - Deallocate an object 3510 * kmem_cache_free - Deallocate an object
3371 * @cachep: The cache the allocation was from. 3511 * @cachep: The cache the allocation was from.
@@ -3405,35 +3545,12 @@ void kfree(const void *objp)
3405 local_irq_save(flags); 3545 local_irq_save(flags);
3406 kfree_debugcheck(objp); 3546 kfree_debugcheck(objp);
3407 c = virt_to_cache(objp); 3547 c = virt_to_cache(objp);
3408 mutex_debug_check_no_locks_freed(objp, obj_size(c)); 3548 debug_check_no_locks_freed(objp, obj_size(c));
3409 __cache_free(c, (void *)objp); 3549 __cache_free(c, (void *)objp);
3410 local_irq_restore(flags); 3550 local_irq_restore(flags);
3411} 3551}
3412EXPORT_SYMBOL(kfree); 3552EXPORT_SYMBOL(kfree);
3413 3553
3414#ifdef CONFIG_SMP
3415/**
3416 * free_percpu - free previously allocated percpu memory
3417 * @objp: pointer returned by alloc_percpu.
3418 *
3419 * Don't free memory not originally allocated by alloc_percpu()
3420 * The complemented objp is to check for that.
3421 */
3422void free_percpu(const void *objp)
3423{
3424 int i;
3425 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3426
3427 /*
3428 * We allocate for all cpus so we cannot use for online cpu here.
3429 */
3430 for_each_possible_cpu(i)
3431 kfree(p->ptrs[i]);
3432 kfree(p);
3433}
3434EXPORT_SYMBOL(free_percpu);
3435#endif
3436
3437unsigned int kmem_cache_size(struct kmem_cache *cachep) 3554unsigned int kmem_cache_size(struct kmem_cache *cachep)
3438{ 3555{
3439 return obj_size(cachep); 3556 return obj_size(cachep);
@@ -3550,22 +3667,26 @@ static void do_ccupdate_local(void *info)
3550static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3667static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3551 int batchcount, int shared) 3668 int batchcount, int shared)
3552{ 3669{
3553 struct ccupdate_struct new; 3670 struct ccupdate_struct *new;
3554 int i, err; 3671 int i;
3672
3673 new = kzalloc(sizeof(*new), GFP_KERNEL);
3674 if (!new)
3675 return -ENOMEM;
3555 3676
3556 memset(&new.new, 0, sizeof(new.new));
3557 for_each_online_cpu(i) { 3677 for_each_online_cpu(i) {
3558 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, 3678 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3559 batchcount); 3679 batchcount);
3560 if (!new.new[i]) { 3680 if (!new->new[i]) {
3561 for (i--; i >= 0; i--) 3681 for (i--; i >= 0; i--)
3562 kfree(new.new[i]); 3682 kfree(new->new[i]);
3683 kfree(new);
3563 return -ENOMEM; 3684 return -ENOMEM;
3564 } 3685 }
3565 } 3686 }
3566 new.cachep = cachep; 3687 new->cachep = cachep;
3567 3688
3568 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); 3689 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3569 3690
3570 check_irq_on(); 3691 check_irq_on();
3571 cachep->batchcount = batchcount; 3692 cachep->batchcount = batchcount;
@@ -3573,7 +3694,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3573 cachep->shared = shared; 3694 cachep->shared = shared;
3574 3695
3575 for_each_online_cpu(i) { 3696 for_each_online_cpu(i) {
3576 struct array_cache *ccold = new.new[i]; 3697 struct array_cache *ccold = new->new[i];
3577 if (!ccold) 3698 if (!ccold)
3578 continue; 3699 continue;
3579 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3700 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
@@ -3581,18 +3702,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3581 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3702 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3582 kfree(ccold); 3703 kfree(ccold);
3583 } 3704 }
3584 3705 kfree(new);
3585 err = alloc_kmemlist(cachep); 3706 return alloc_kmemlist(cachep);
3586 if (err) {
3587 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3588 cachep->name, -err);
3589 BUG();
3590 }
3591 return 0;
3592} 3707}
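
do_tune_cpucache() now kzalloc()s its struct ccupdate_struct instead of keeping it on the stack; the structure embeds an array of NR_CPUS pointers, which can be far too large for a kernel stack frame, and heap allocation also gives a clean -ENOMEM error path. The general pattern, sketched in plain C with a deliberately oversized per-CPU array (the size and names are illustrative only):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS_SKETCH 4096   /* big enough that a stack copy would be unreasonable */

struct ccupdate_sketch {
	void *percpu[NR_CPUS_SKETCH];   /* one slot per possible CPU */
};

static int tune_sketch(void)
{
	/* Zeroed heap allocation, mirroring the move from a stack struct to kzalloc(). */
	struct ccupdate_sketch *new = calloc(1, sizeof(*new));

	if (!new)
		return -1;   /* clean error path instead of risking stack overflow */

	/* ... populate new->percpu[] and run the per-CPU update here ... */

	free(new);           /* the function that allocated the temporary releases it */
	return 0;
}

int main(void)
{
	printf("tune_sketch() = %d, temporary is %zu bytes\n",
	       tune_sketch(), sizeof(struct ccupdate_sketch));
	return 0;
}
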
3593 3708
3594/* Called with cache_chain_mutex held always */ 3709/* Called with cache_chain_mutex held always */
3595static void enable_cpucache(struct kmem_cache *cachep) 3710static int enable_cpucache(struct kmem_cache *cachep)
3596{ 3711{
3597 int err; 3712 int err;
3598 int limit, shared; 3713 int limit, shared;
@@ -3644,6 +3759,7 @@ static void enable_cpucache(struct kmem_cache *cachep)
3644 if (err) 3759 if (err)
3645 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3760 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3646 cachep->name, -err); 3761 cachep->name, -err);
3762 return err;
3647} 3763}
3648 3764
3649/* 3765/*
@@ -3701,10 +3817,6 @@ static void cache_reap(void *unused)
3701 } 3817 }
3702 3818
3703 list_for_each_entry(searchp, &cache_chain, next) { 3819 list_for_each_entry(searchp, &cache_chain, next) {
3704 struct list_head *p;
3705 int tofree;
3706 struct slab *slabp;
3707
3708 check_irq_on(); 3820 check_irq_on();
3709 3821
3710 /* 3822 /*
@@ -3729,47 +3841,22 @@ static void cache_reap(void *unused)
3729 3841
3730 drain_array(searchp, l3, l3->shared, 0, node); 3842 drain_array(searchp, l3, l3->shared, 0, node);
3731 3843
3732 if (l3->free_touched) { 3844 if (l3->free_touched)
3733 l3->free_touched = 0; 3845 l3->free_touched = 0;
3734 goto next; 3846 else {
3735 } 3847 int freed;
3736 3848
3737 tofree = (l3->free_limit + 5 * searchp->num - 1) / 3849 freed = drain_freelist(searchp, l3, (l3->free_limit +
3738 (5 * searchp->num); 3850 5 * searchp->num - 1) / (5 * searchp->num));
3739 do { 3851 STATS_ADD_REAPED(searchp, freed);
3740 /* 3852 }
3741 * Do not lock if there are no free blocks.
3742 */
3743 if (list_empty(&l3->slabs_free))
3744 break;
3745
3746 spin_lock_irq(&l3->list_lock);
3747 p = l3->slabs_free.next;
3748 if (p == &(l3->slabs_free)) {
3749 spin_unlock_irq(&l3->list_lock);
3750 break;
3751 }
3752
3753 slabp = list_entry(p, struct slab, list);
3754 BUG_ON(slabp->inuse);
3755 list_del(&slabp->list);
3756 STATS_INC_REAPED(searchp);
3757
3758 /*
3759 * Safe to drop the lock. The slab is no longer linked
3760 * to the cache. searchp cannot disappear, we hold
3761 * cache_chain_lock
3762 */
3763 l3->free_objects -= searchp->num;
3764 spin_unlock_irq(&l3->list_lock);
3765 slab_destroy(searchp, slabp);
3766 } while (--tofree > 0);
3767next: 3853next:
3768 cond_resched(); 3854 cond_resched();
3769 } 3855 }
3770 check_irq_on(); 3856 check_irq_on();
3771 mutex_unlock(&cache_chain_mutex); 3857 mutex_unlock(&cache_chain_mutex);
3772 next_reap_node(); 3858 next_reap_node();
3859 refresh_cpu_vm_stats(smp_processor_id());
3773 /* Set up the next iteration */ 3860 /* Set up the next iteration */
3774 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3861 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3775} 3862}
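
cache_reap() no longer open-codes the loop that pops free slabs off l3->slabs_free and destroys them; that work is delegated to drain_freelist(), which returns how many slabs it actually freed so the caller can feed STATS_ADD_REAPED. A simplified userspace sketch of such a helper over a plain linked list (types invented; the real function also takes and drops the l3 list lock around each removal):

#include <stdio.h>
#include <stdlib.h>

struct slab_sketch {
	struct slab_sketch *next;
};

struct freelist_sketch {
	struct slab_sketch *head;
};

/* Free at most 'tofree' slabs from the free list; report how many were freed. */
static int drain_freelist_sketch(struct freelist_sketch *fl, int tofree)
{
	int freed = 0;

	while (freed < tofree && fl->head) {
		struct slab_sketch *s = fl->head;   /* detach (under the list lock in the kernel) */

		fl->head = s->next;
		free(s);                            /* destroy after the lock is dropped */
		freed++;
	}
	return freed;
}

int main(void)
{
	struct freelist_sketch fl = { NULL };
	int i;

	for (i = 0; i < 5; i++) {
		struct slab_sketch *s = malloc(sizeof(*s));

		s->next = fl.head;
		fl.head = s;
	}
	printf("freed %d of the 3 requested\n", drain_freelist_sketch(&fl, 3));
	drain_freelist_sketch(&fl, 100);            /* drain the remainder */
	return 0;
}
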
@@ -4133,6 +4220,7 @@ static int leaks_show(struct seq_file *m, void *p)
4133 show_symbol(m, n[2*i+2]); 4220 show_symbol(m, n[2*i+2]);
4134 seq_putc(m, '\n'); 4221 seq_putc(m, '\n');
4135 } 4222 }
4223
4136 return 0; 4224 return 0;
4137} 4225}
4138 4226
diff --git a/mm/slob.c b/mm/slob.c
index a68255ba4553..542394184a58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -29,7 +29,6 @@
29 * essentially no allocation space overhead. 29 * essentially no allocation space overhead.
30 */ 30 */
31 31
32#include <linux/config.h>
33#include <linux/slab.h> 32#include <linux/slab.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/cache.h> 34#include <linux/cache.h>
@@ -271,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
271} 270}
272EXPORT_SYMBOL(kmem_cache_create); 271EXPORT_SYMBOL(kmem_cache_create);
273 272
274int kmem_cache_destroy(struct kmem_cache *c) 273void kmem_cache_destroy(struct kmem_cache *c)
275{ 274{
276 slob_free(c, sizeof(struct kmem_cache)); 275 slob_free(c, sizeof(struct kmem_cache));
277 return 0;
278} 276}
279EXPORT_SYMBOL(kmem_cache_destroy); 277EXPORT_SYMBOL(kmem_cache_destroy);
280 278
@@ -340,52 +338,3 @@ void kmem_cache_init(void)
340 338
341 mod_timer(&slob_timer, jiffies + HZ); 339 mod_timer(&slob_timer, jiffies + HZ);
342} 340}
343
344atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
345EXPORT_SYMBOL(slab_reclaim_pages);
346
347#ifdef CONFIG_SMP
348
349void *__alloc_percpu(size_t size)
350{
351 int i;
352 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
353
354 if (!pdata)
355 return NULL;
356
357 for_each_possible_cpu(i) {
358 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
359 if (!pdata->ptrs[i])
360 goto unwind_oom;
361 memset(pdata->ptrs[i], 0, size);
362 }
363
364 /* Catch derefs w/o wrappers */
365 return (void *) (~(unsigned long) pdata);
366
367unwind_oom:
368 while (--i >= 0) {
369 if (!cpu_possible(i))
370 continue;
371 kfree(pdata->ptrs[i]);
372 }
373 kfree(pdata);
374 return NULL;
375}
376EXPORT_SYMBOL(__alloc_percpu);
377
378void
379free_percpu(const void *objp)
380{
381 int i;
382 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
383
384 for_each_possible_cpu(i)
385 kfree(p->ptrs[i]);
386
387 kfree(p);
388}
389EXPORT_SYMBOL(free_percpu);
390
391#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index e0a3fe48aa37..86c52ab80878 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -1,7 +1,6 @@
1/* 1/*
2 * sparse memory mappings. 2 * sparse memory mappings.
3 */ 3 */
4#include <linux/config.h>
5#include <linux/mm.h> 4#include <linux/mm.h>
6#include <linux/mmzone.h> 5#include <linux/mmzone.h>
7#include <linux/bootmem.h> 6#include <linux/bootmem.h>
@@ -45,7 +44,7 @@ static struct mem_section *sparse_index_alloc(int nid)
45 44
46static int sparse_index_init(unsigned long section_nr, int nid) 45static int sparse_index_init(unsigned long section_nr, int nid)
47{ 46{
48 static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; 47 static DEFINE_SPINLOCK(index_init_lock);
49 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 48 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
50 struct mem_section *section; 49 struct mem_section *section;
51 int ret = 0; 50 int ret = 0;
diff --git a/mm/swap.c b/mm/swap.c
index 03ae2076f92f..2e0e871f542f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,25 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37/*
38 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking.
40 */
41static void fastcall __page_cache_release(struct page *page)
42{
43 if (PageLRU(page)) {
44 unsigned long flags;
45 struct zone *zone = page_zone(page);
46
47 spin_lock_irqsave(&zone->lru_lock, flags);
48 VM_BUG_ON(!PageLRU(page));
49 __ClearPageLRU(page);
50 del_page_from_lru(zone, page);
51 spin_unlock_irqrestore(&zone->lru_lock, flags);
52 }
53 free_hot_page(page);
54}
55
37static void put_compound_page(struct page *page) 56static void put_compound_page(struct page *page)
38{ 57{
39 page = (struct page *)page_private(page); 58 page = (struct page *)page_private(page);
@@ -54,6 +73,26 @@ void put_page(struct page *page)
54} 73}
55EXPORT_SYMBOL(put_page); 74EXPORT_SYMBOL(put_page);
56 75
76/**
77 * put_pages_list(): release a list of pages
78 *
79 * Release a list of pages which are strung together on page.lru. Currently
80 * used by read_cache_pages() and related error recovery code.
81 *
82 * @pages: list of pages threaded on page->lru
83 */
84void put_pages_list(struct list_head *pages)
85{
86 while (!list_empty(pages)) {
87 struct page *victim;
88
89 victim = list_entry(pages->prev, struct page, lru);
90 list_del(&victim->lru);
91 page_cache_release(victim);
92 }
93}
94EXPORT_SYMBOL(put_pages_list);
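
put_pages_list() walks the list from the tail, unthreads each page and drops a reference on it; read_cache_pages() and related error paths use it to throw away whatever readahead had queued. A rough userspace analogue with a singly linked list and a plain reference count (names invented; the kernel version operates on struct page and list_head):

#include <stdio.h>
#include <stdlib.h>

struct page_sketch {
	struct page_sketch *next;
	int refcount;
};

static void put_page_sketch(struct page_sketch *p)
{
	if (--p->refcount == 0)
		free(p);                     /* last reference gone: release it */
}

/* Release every page on the list, leaving the list empty (cf. put_pages_list). */
static void put_pages_list_sketch(struct page_sketch **head)
{
	while (*head) {
		struct page_sketch *victim = *head;

		*head = victim->next;        /* unlink first, then drop the reference */
		put_page_sketch(victim);
	}
}

int main(void)
{
	struct page_sketch *head = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		struct page_sketch *p = malloc(sizeof(*p));

		p->refcount = 1;
		p->next = head;
		head = p;
	}
	put_pages_list_sketch(&head);
	printf("list emptied, head = %p\n", (void *)head);
	return 0;
}
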
95
57/* 96/*
58 * Writeback is about to end against a page which has been marked for immediate 97 * Writeback is about to end against a page which has been marked for immediate
59 * reclaim. If it still appears to be reclaimable, move it to the tail of the 98 * reclaim. If it still appears to be reclaimable, move it to the tail of the
@@ -86,9 +125,8 @@ int rotate_reclaimable_page(struct page *page)
86 zone = page_zone(page); 125 zone = page_zone(page);
87 spin_lock_irqsave(&zone->lru_lock, flags); 126 spin_lock_irqsave(&zone->lru_lock, flags);
88 if (PageLRU(page) && !PageActive(page)) { 127 if (PageLRU(page) && !PageActive(page)) {
89 list_del(&page->lru); 128 list_move_tail(&page->lru, &zone->inactive_list);
90 list_add_tail(&page->lru, &zone->inactive_list); 129 __count_vm_event(PGROTATED);
91 inc_page_state(pgrotated);
92 } 130 }
93 if (!test_clear_page_writeback(page)) 131 if (!test_clear_page_writeback(page))
94 BUG(); 132 BUG();
@@ -108,7 +146,7 @@ void fastcall activate_page(struct page *page)
108 del_page_from_inactive_list(zone, page); 146 del_page_from_inactive_list(zone, page);
109 SetPageActive(page); 147 SetPageActive(page);
110 add_page_to_active_list(zone, page); 148 add_page_to_active_list(zone, page);
111 inc_page_state(pgactivate); 149 __count_vm_event(PGACTIVATE);
112 } 150 }
113 spin_unlock_irq(&zone->lru_lock); 151 spin_unlock_irq(&zone->lru_lock);
114} 152}
@@ -204,26 +242,6 @@ int lru_add_drain_all(void)
204#endif 242#endif
205 243
206/* 244/*
207 * This path almost never happens for VM activity - pages are normally
208 * freed via pagevecs. But it gets used by networking.
209 */
210void fastcall __page_cache_release(struct page *page)
211{
212 if (PageLRU(page)) {
213 unsigned long flags;
214 struct zone *zone = page_zone(page);
215
216 spin_lock_irqsave(&zone->lru_lock, flags);
217 BUG_ON(!PageLRU(page));
218 __ClearPageLRU(page);
219 del_page_from_lru(zone, page);
220 spin_unlock_irqrestore(&zone->lru_lock, flags);
221 }
222 free_hot_page(page);
223}
224EXPORT_SYMBOL(__page_cache_release);
225
226/*
227 * Batched page_cache_release(). Decrement the reference count on all the 245 * Batched page_cache_release(). Decrement the reference count on all the
228 * passed pages. If it fell to zero then remove the page from the LRU and 246 * passed pages. If it fell to zero then remove the page from the LRU and
229 * free it. 247 * free it.
@@ -265,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold)
265 zone = pagezone; 283 zone = pagezone;
266 spin_lock_irq(&zone->lru_lock); 284 spin_lock_irq(&zone->lru_lock);
267 } 285 }
268 BUG_ON(!PageLRU(page)); 286 VM_BUG_ON(!PageLRU(page));
269 __ClearPageLRU(page); 287 __ClearPageLRU(page);
270 del_page_from_lru(zone, page); 288 del_page_from_lru(zone, page);
271 } 289 }
@@ -318,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
318 for (i = 0; i < pagevec_count(pvec); i++) { 336 for (i = 0; i < pagevec_count(pvec); i++) {
319 struct page *page = pvec->pages[i]; 337 struct page *page = pvec->pages[i];
320 338
321 BUG_ON(PageLRU(page)); 339 VM_BUG_ON(PageLRU(page));
322 if (put_page_testzero(page)) 340 if (put_page_testzero(page))
323 pagevec_add(&pages_to_free, page); 341 pagevec_add(&pages_to_free, page);
324 } 342 }
@@ -345,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
345 zone = pagezone; 363 zone = pagezone;
346 spin_lock_irq(&zone->lru_lock); 364 spin_lock_irq(&zone->lru_lock);
347 } 365 }
348 BUG_ON(PageLRU(page)); 366 VM_BUG_ON(PageLRU(page));
349 SetPageLRU(page); 367 SetPageLRU(page);
350 add_page_to_inactive_list(zone, page); 368 add_page_to_inactive_list(zone, page);
351 } 369 }
@@ -372,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
372 zone = pagezone; 390 zone = pagezone;
373 spin_lock_irq(&zone->lru_lock); 391 spin_lock_irq(&zone->lru_lock);
374 } 392 }
375 BUG_ON(PageLRU(page)); 393 VM_BUG_ON(PageLRU(page));
376 SetPageLRU(page); 394 SetPageLRU(page);
377 BUG_ON(PageActive(page)); 395 VM_BUG_ON(PageActive(page));
378 SetPageActive(page); 396 SetPageActive(page);
379 add_page_to_active_list(zone, page); 397 add_page_to_active_list(zone, page);
380 } 398 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0e1583f32c2..5f7cf2a4cb55 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,7 +24,7 @@
24 * vmscan's shrink_list, to make sync_page look nicer, and to allow 24 * vmscan's shrink_list, to make sync_page look nicer, and to allow
25 * future use of radix_tree tags in the swap cache. 25 * future use of radix_tree tags in the swap cache.
26 */ 26 */
27static struct address_space_operations swap_aops = { 27static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 28 .writepage = swap_writepage,
29 .sync_page = block_sync_page, 29 .sync_page = block_sync_page,
30 .set_page_dirty = __set_page_dirty_nobuffers, 30 .set_page_dirty = __set_page_dirty_nobuffers,
@@ -38,7 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = {
38 38
39struct address_space swapper_space = { 39struct address_space swapper_space = {
40 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 40 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
41 .tree_lock = RW_LOCK_UNLOCKED, 41 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
42 .a_ops = &swap_aops, 42 .a_ops = &swap_aops,
43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
44 .backing_dev_info = &swap_backing_dev_info, 44 .backing_dev_info = &swap_backing_dev_info,
@@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
87 SetPageSwapCache(page); 87 SetPageSwapCache(page);
88 set_page_private(page, entry.val); 88 set_page_private(page, entry.val);
89 total_swapcache_pages++; 89 total_swapcache_pages++;
90 pagecache_acct(1); 90 __inc_zone_page_state(page, NR_FILE_PAGES);
91 } 91 }
92 write_unlock_irq(&swapper_space.tree_lock); 92 write_unlock_irq(&swapper_space.tree_lock);
93 radix_tree_preload_end(); 93 radix_tree_preload_end();
@@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page)
132 set_page_private(page, 0); 132 set_page_private(page, 0);
133 ClearPageSwapCache(page); 133 ClearPageSwapCache(page);
134 total_swapcache_pages--; 134 total_swapcache_pages--;
135 pagecache_acct(-1); 135 __dec_zone_page_state(page, NR_FILE_PAGES);
136 INC_CACHE_INFO(del_total); 136 INC_CACHE_INFO(del_total);
137} 137}
138 138
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cc367f7e75d8..a15def63f28f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -5,7 +5,6 @@
5 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */ 6 */
7 7
8#include <linux/config.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/hugetlb.h> 9#include <linux/hugetlb.h>
11#include <linux/mman.h> 10#include <linux/mman.h>
@@ -443,11 +442,12 @@ int swap_type_of(dev_t device)
443 442
444 if (!(swap_info[i].flags & SWP_WRITEOK)) 443 if (!(swap_info[i].flags & SWP_WRITEOK))
445 continue; 444 continue;
445
446 if (!device) { 446 if (!device) {
447 spin_unlock(&swap_lock); 447 spin_unlock(&swap_lock);
448 return i; 448 return i;
449 } 449 }
450 inode = swap_info->swap_file->f_dentry->d_inode; 450 inode = swap_info[i].swap_file->f_dentry->d_inode;
451 if (S_ISBLK(inode->i_mode) && 451 if (S_ISBLK(inode->i_mode) &&
452 device == MKDEV(imajor(inode), iminor(inode))) { 452 device == MKDEV(imajor(inode), iminor(inode))) {
453 spin_unlock(&swap_lock); 453 spin_unlock(&swap_lock);
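
The swap_type_of() change is a pure indexing fix: swap_info is an array, so swap_info->swap_file always reads element 0 no matter what the loop counter says, whereas swap_info[i].swap_file inspects the entry actually being tested against the device. A two-line demonstration of why the two spellings differ (struct and field names are illustrative):

#include <stdio.h>

struct swap_entry_sketch {
	int prio;
};

int main(void)
{
	struct swap_entry_sketch table[3] = { { 10 }, { 20 }, { 30 } };
	int i = 2;

	/* table->prio is table[0].prio: the loop index is silently ignored. */
	printf("table->prio    = %d\n", table->prio);
	/* table[i].prio is what the loop actually meant to inspect. */
	printf("table[%d].prio = %d\n", i, table[i].prio);
	return 0;
}
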
@@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type)
1723 */ 1723 */
1724int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1724int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1725{ 1725{
1726 int ret = 0, i = 1 << page_cluster; 1726 int our_page_cluster = page_cluster;
1727 int ret = 0, i = 1 << our_page_cluster;
1727 unsigned long toff; 1728 unsigned long toff;
1728 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1729 struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
1729 1730
1730 if (!page_cluster) /* no readahead */ 1731 if (!our_page_cluster) /* no readahead */
1731 return 0; 1732 return 0;
1732 toff = (swp_offset(entry) >> page_cluster) << page_cluster; 1733 toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
1733 if (!toff) /* first page is swap header */ 1734 if (!toff) /* first page is swap header */
1734 toff++, i--; 1735 toff++, i--;
1735 *offset = toff; 1736 *offset = toff;
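
valid_swaphandles() now snapshots page_cluster into a local variable before using it; the sysctl can change between the "no readahead" test and the shift arithmetic, and mixing two different values in one calculation could yield an inconsistent readahead window. A sketch of the pattern, reading a shared tunable once and using only the snapshot (volatile loosely models "may change underneath us"):

#include <stdio.h>

static volatile int page_cluster_tunable = 3;   /* may be changed concurrently via sysctl */

static int readahead_window_sketch(unsigned long offset, unsigned long *start)
{
	int cluster = page_cluster_tunable;     /* single read: every later use is consistent */

	if (!cluster)                           /* readahead disabled */
		return 0;
	*start = (offset >> cluster) << cluster;
	return 1 << cluster;                    /* same 'cluster' as used for *start */
}

int main(void)
{
	unsigned long start;
	int pages = readahead_window_sketch(37, &start);

	printf("window: %d pages starting at %lu\n", pages, start);
	return 0;
}
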
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index f9d6a9cc91c4..5f2cbf0f153c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -12,7 +12,6 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/devfs_fs_kernel.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/mount.h> 16#include <linux/mount.h>
18#include <linux/file.h> 17#include <linux/file.h>
@@ -33,9 +32,6 @@ static int __init init_tmpfs(void)
33{ 32{
34 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
35 34
36#ifdef CONFIG_TMPFS
37 devfs_mk_dir("shm");
38#endif
39 shm_mnt = kern_mount(&tmpfs_fs_type); 35 shm_mnt = kern_mount(&tmpfs_fs_type);
40 BUG_ON(IS_ERR(shm_mnt)); 36 BUG_ON(IS_ERR(shm_mnt));
41 37
diff --git a/mm/truncate.c b/mm/truncate.c
index cf1b015df4a7..a654928323dc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/pagevec.h> 15#include <linux/pagevec.h>
@@ -52,33 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
52/* 53/*
53 * This is for invalidate_inode_pages(). That function can be called at 54 * This is for invalidate_inode_pages(). That function can be called at
54 * any time, and is not supposed to throw away dirty pages. But pages can 55 * any time, and is not supposed to throw away dirty pages. But pages can
55 * be marked dirty at any time too. So we re-check the dirtiness inside 56 * be marked dirty at any time too, so use remove_mapping which safely
56 * ->tree_lock. That provides exclusion against the __set_page_dirty 57 * discards clean, unused pages.
57 * functions.
58 * 58 *
59 * Returns non-zero if the page was successfully invalidated. 59 * Returns non-zero if the page was successfully invalidated.
60 */ 60 */
61static int 61static int
62invalidate_complete_page(struct address_space *mapping, struct page *page) 62invalidate_complete_page(struct address_space *mapping, struct page *page)
63{ 63{
64 int ret;
65
64 if (page->mapping != mapping) 66 if (page->mapping != mapping)
65 return 0; 67 return 0;
66 68
67 if (PagePrivate(page) && !try_to_release_page(page, 0)) 69 if (PagePrivate(page) && !try_to_release_page(page, 0))
68 return 0; 70 return 0;
69 71
70 write_lock_irq(&mapping->tree_lock); 72 ret = remove_mapping(mapping, page);
71 if (PageDirty(page)) {
72 write_unlock_irq(&mapping->tree_lock);
73 return 0;
74 }
75
76 BUG_ON(PagePrivate(page));
77 __remove_from_page_cache(page);
78 write_unlock_irq(&mapping->tree_lock);
79 ClearPageUptodate(page); 73 ClearPageUptodate(page);
80 page_cache_release(page); /* pagecache ref */ 74
81 return 1; 75 return ret;
82} 76}
83 77
84/** 78/**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f8553f893a..1ac191ce5641 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,9 @@
24DEFINE_RWLOCK(vmlist_lock); 24DEFINE_RWLOCK(vmlist_lock);
25struct vm_struct *vmlist; 25struct vm_struct *vmlist;
26 26
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28 int node);
29
27static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
28{ 31{
29 pte_t *pte; 32 pte_t *pte;
@@ -238,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
238 241
239/** 242/**
 240 * get_vm_area - reserve a contiguous kernel virtual area 243 * get_vm_area - reserve a contiguous kernel virtual area
241 *
242 * @size: size of the area 244 * @size: size of the area
243 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 245 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
244 * 246 *
@@ -270,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr)
270} 272}
271 273
272/* Caller must hold vmlist_lock */ 274/* Caller must hold vmlist_lock */
273struct vm_struct *__remove_vm_area(void *addr) 275static struct vm_struct *__remove_vm_area(void *addr)
274{ 276{
275 struct vm_struct **p, *tmp; 277 struct vm_struct **p, *tmp;
276 278
@@ -293,7 +295,6 @@ found:
293 295
294/** 296/**
 295 * remove_vm_area - find and remove a contiguous kernel virtual area 297 * remove_vm_area - find and remove a contiguous kernel virtual area
296 *
297 * @addr: base address 298 * @addr: base address
298 * 299 *
299 * Search for the kernel VM area starting at @addr, and remove it. 300 * Search for the kernel VM area starting at @addr, and remove it.
@@ -330,6 +331,8 @@ void __vunmap(void *addr, int deallocate_pages)
330 return; 331 return;
331 } 332 }
332 333
334 debug_check_no_locks_freed(addr, area->size);
335
333 if (deallocate_pages) { 336 if (deallocate_pages) {
334 int i; 337 int i;
335 338
@@ -338,7 +341,7 @@ void __vunmap(void *addr, int deallocate_pages)
338 __free_page(area->pages[i]); 341 __free_page(area->pages[i]);
339 } 342 }
340 343
341 if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) 344 if (area->flags & VM_VPAGES)
342 vfree(area->pages); 345 vfree(area->pages);
343 else 346 else
344 kfree(area->pages); 347 kfree(area->pages);
@@ -350,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages)
350 353
351/** 354/**
352 * vfree - release memory allocated by vmalloc() 355 * vfree - release memory allocated by vmalloc()
353 *
354 * @addr: memory base address 356 * @addr: memory base address
355 * 357 *
356 * Free the virtually contiguous memory area starting at @addr, as 358 * Free the virtually contiguous memory area starting at @addr, as
@@ -368,7 +370,6 @@ EXPORT_SYMBOL(vfree);
368 370
369/** 371/**
370 * vunmap - release virtual mapping obtained by vmap() 372 * vunmap - release virtual mapping obtained by vmap()
371 *
372 * @addr: memory base address 373 * @addr: memory base address
373 * 374 *
374 * Free the virtually contiguous memory area starting at @addr, 375 * Free the virtually contiguous memory area starting at @addr,
@@ -385,7 +386,6 @@ EXPORT_SYMBOL(vunmap);
385 386
386/** 387/**
387 * vmap - map an array of pages into virtually contiguous space 388 * vmap - map an array of pages into virtually contiguous space
388 *
389 * @pages: array of page pointers 389 * @pages: array of page pointers
390 * @count: number of pages to map 390 * @count: number of pages to map
391 * @flags: vm_area->flags 391 * @flags: vm_area->flags
@@ -425,9 +425,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
425 425
426 area->nr_pages = nr_pages; 426 area->nr_pages = nr_pages;
427 /* Please note that the recursion is strictly bounded. */ 427 /* Please note that the recursion is strictly bounded. */
428 if (array_size > PAGE_SIZE) 428 if (array_size > PAGE_SIZE) {
429 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); 429 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
430 else 430 area->flags |= VM_VPAGES;
431 } else
431 pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); 432 pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
432 area->pages = pages; 433 area->pages = pages;
433 if (!area->pages) { 434 if (!area->pages) {
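
When the pages[] array of a vmalloc area is itself too large for kmalloc(), it is allocated with __vmalloc_node() and the area is tagged VM_VPAGES; __vunmap() then checks the flag rather than re-deriving the size to decide between vfree() and kfree(). The underlying pattern, record how a buffer was obtained so the free path can use the matching deallocator, in a small sketch with invented names:

#include <stdio.h>
#include <stdlib.h>

#define FROM_POOL_A 0x1                 /* stand-in for VM_VPAGES */

struct area_sketch {
	unsigned int flags;
	void *pages;
};

static void *pool_a_alloc(size_t n) { return malloc(n); }   /* pretend: vmalloc-like */
static void pool_a_free(void *p)    { free(p); }
static void *pool_b_alloc(size_t n) { return malloc(n); }   /* pretend: kmalloc-like */
static void pool_b_free(void *p)    { free(p); }

static void area_alloc_pages(struct area_sketch *a, size_t bytes, size_t threshold)
{
	if (bytes > threshold) {
		a->pages = pool_a_alloc(bytes);
		a->flags |= FROM_POOL_A;        /* remember which allocator was used */
	} else {
		a->pages = pool_b_alloc(bytes);
	}
}

static void area_free_pages(struct area_sketch *a)
{
	if (a->flags & FROM_POOL_A)             /* free with the matching allocator */
		pool_a_free(a->pages);
	else
		pool_b_free(a->pages);
}

int main(void)
{
	struct area_sketch a = { 0, NULL };

	area_alloc_pages(&a, 8192, 4096);
	printf("flags = %#x\n", a.flags);
	area_free_pages(&a);
	return 0;
}
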
@@ -465,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
465 466
466/** 467/**
467 * __vmalloc_node - allocate virtually contiguous memory 468 * __vmalloc_node - allocate virtually contiguous memory
468 *
469 * @size: allocation size 469 * @size: allocation size
470 * @gfp_mask: flags for the page level allocator 470 * @gfp_mask: flags for the page level allocator
471 * @prot: protection mask for the allocated pages 471 * @prot: protection mask for the allocated pages
@@ -475,8 +475,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
475 * allocator with @gfp_mask flags. Map them into contiguous 475 * allocator with @gfp_mask flags. Map them into contiguous
476 * kernel virtual space, using a pagetable protection of @prot. 476 * kernel virtual space, using a pagetable protection of @prot.
477 */ 477 */
478void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 478static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
479 int node) 479 int node)
480{ 480{
481 struct vm_struct *area; 481 struct vm_struct *area;
482 482
@@ -490,7 +490,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
490 490
491 return __vmalloc_area_node(area, gfp_mask, prot, node); 491 return __vmalloc_area_node(area, gfp_mask, prot, node);
492} 492}
493EXPORT_SYMBOL(__vmalloc_node);
494 493
495void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 494void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
496{ 495{
@@ -500,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc);
500 499
501/** 500/**
502 * vmalloc - allocate virtually contiguous memory 501 * vmalloc - allocate virtually contiguous memory
503 *
504 * @size: allocation size 502 * @size: allocation size
505 *
506 * Allocate enough pages to cover @size from the page level 503 * Allocate enough pages to cover @size from the page level
507 * allocator and map them into contiguous kernel virtual space. 504 * allocator and map them into contiguous kernel virtual space.
508 * 505 *
@@ -516,11 +513,11 @@ void *vmalloc(unsigned long size)
516EXPORT_SYMBOL(vmalloc); 513EXPORT_SYMBOL(vmalloc);
517 514
518/** 515/**
519 * vmalloc_user - allocate virtually contiguous memory which has 516 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
520 * been zeroed so it can be mapped to userspace without 517 * @size: allocation size
521 * leaking data.
522 * 518 *
523 * @size: allocation size 519 * The resulting memory area is zeroed so it can be mapped to userspace
520 * without leaking data.
524 */ 521 */
525void *vmalloc_user(unsigned long size) 522void *vmalloc_user(unsigned long size)
526{ 523{
@@ -539,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user);
539 536
540/** 537/**
541 * vmalloc_node - allocate memory on a specific node 538 * vmalloc_node - allocate memory on a specific node
542 *
543 * @size: allocation size 539 * @size: allocation size
544 * @node: numa node 540 * @node: numa node
545 * 541 *
@@ -561,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node);
561 557
562/** 558/**
563 * vmalloc_exec - allocate virtually contiguous, executable memory 559 * vmalloc_exec - allocate virtually contiguous, executable memory
564 *
565 * @size: allocation size 560 * @size: allocation size
566 * 561 *
567 * Kernel-internal function to allocate enough pages to cover @size 562 * Kernel-internal function to allocate enough pages to cover @size
@@ -579,7 +574,6 @@ void *vmalloc_exec(unsigned long size)
579 574
580/** 575/**
581 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 576 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
582 *
583 * @size: allocation size 577 * @size: allocation size
584 * 578 *
585 * Allocate enough 32bit PA addressable pages to cover @size from the 579 * Allocate enough 32bit PA addressable pages to cover @size from the
@@ -592,11 +586,11 @@ void *vmalloc_32(unsigned long size)
592EXPORT_SYMBOL(vmalloc_32); 586EXPORT_SYMBOL(vmalloc_32);
593 587
594/** 588/**
595 * vmalloc_32_user - allocate virtually contiguous memory (32bit 589 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
596 * addressable) which is zeroed so it can be
597 * mapped to userspace without leaking data.
598 *
599 * @size: allocation size 590 * @size: allocation size
591 *
592 * The resulting memory area is 32bit addressable and zeroed so it can be
593 * mapped to userspace without leaking data.
600 */ 594 */
601void *vmalloc_32_user(unsigned long size) 595void *vmalloc_32_user(unsigned long size)
602{ 596{
@@ -690,7 +684,6 @@ finished:
690 684
691/** 685/**
692 * remap_vmalloc_range - map vmalloc pages to userspace 686 * remap_vmalloc_range - map vmalloc pages to userspace
693 *
694 * @vma: vma to cover (map full range of vma) 687 * @vma: vma to cover (map full range of vma)
695 * @addr: vmalloc memory 688 * @addr: vmalloc memory
696 * @pgoff: number of pages into addr before first page to map 689 * @pgoff: number of pages into addr before first page to map
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72babac71dea..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmstat.h>
22#include <linux/file.h> 23#include <linux/file.h>
23#include <linux/writeback.h> 24#include <linux/writeback.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
@@ -34,6 +35,7 @@
34#include <linux/notifier.h> 35#include <linux/notifier.h>
35#include <linux/rwsem.h> 36#include <linux/rwsem.h>
36#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h>
37 39
38#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
39#include <asm/div64.h> 41#include <asm/div64.h>
@@ -46,8 +48,6 @@ struct scan_control {
46 /* Incremented by the number of inactive pages that were scanned */ 48 /* Incremented by the number of inactive pages that were scanned */
47 unsigned long nr_scanned; 49 unsigned long nr_scanned;
48 50
49 unsigned long nr_mapped; /* From page_state */
50
51 /* This context's GFP mask */ 51 /* This context's GFP mask */
52 gfp_t gfp_mask; 52 gfp_t gfp_mask;
53 53
@@ -63,6 +63,8 @@ struct scan_control {
63 int swap_cluster_max; 63 int swap_cluster_max;
64 64
65 int swappiness; 65 int swappiness;
66
67 int all_unreclaimable;
66}; 68};
67 69
68/* 70/*
@@ -216,7 +218,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
216 break; 218 break;
217 if (shrink_ret < nr_before) 219 if (shrink_ret < nr_before)
218 ret += nr_before - shrink_ret; 220 ret += nr_before - shrink_ret;
219 mod_page_state(slabs_scanned, this_scan); 221 count_vm_events(SLABS_SCANNED, this_scan);
220 total_scan -= this_scan; 222 total_scan -= this_scan;
221 223
222 cond_resched(); 224 cond_resched();
@@ -369,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
369 /* synchronous write or broken a_ops? */ 371 /* synchronous write or broken a_ops? */
370 ClearPageReclaim(page); 372 ClearPageReclaim(page);
371 } 373 }
372 374 inc_zone_page_state(page, NR_VMSCAN_WRITE);
373 return PAGE_SUCCESS; 375 return PAGE_SUCCESS;
374 } 376 }
375 377
@@ -378,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
378 380
379int remove_mapping(struct address_space *mapping, struct page *page) 381int remove_mapping(struct address_space *mapping, struct page *page)
380{ 382{
381 if (!mapping) 383 BUG_ON(!PageLocked(page));
382 return 0; /* truncate got there first */ 384 BUG_ON(mapping != page_mapping(page));
383 385
384 write_lock_irq(&mapping->tree_lock); 386 write_lock_irq(&mapping->tree_lock);
385
386 /* 387 /*
387 * The non-racy check for busy page. It is critical to check 388 * The non racy check for a busy page.
388 * PageDirty _after_ making sure that the page is freeable and 389 *
389 * not in use by anybody. (pagecache + us == 2) 390 * Must be careful with the order of the tests. When someone has
391 * a ref to the page, it may be possible that they dirty it then
392 * drop the reference. So if PageDirty is tested before page_count
393 * here, then the following race may occur:
394 *
395 * get_user_pages(&page);
396 * [user mapping goes away]
397 * write_to(page);
398 * !PageDirty(page) [good]
399 * SetPageDirty(page);
400 * put_page(page);
401 * !page_count(page) [good, discard it]
402 *
403 * [oops, our write_to data is lost]
404 *
405 * Reversing the order of the tests ensures such a situation cannot
406 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
407 * load is not satisfied before that of page->_count.
408 *
409 * Note that if SetPageDirty is always performed via set_page_dirty,
410 * and thus under tree_lock, then this ordering is not required.
390 */ 411 */
391 if (unlikely(page_count(page) != 2)) 412 if (unlikely(page_count(page) != 2))
392 goto cannot_free; 413 goto cannot_free;
@@ -441,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
441 if (TestSetPageLocked(page)) 462 if (TestSetPageLocked(page))
442 goto keep; 463 goto keep;
443 464
444 BUG_ON(PageActive(page)); 465 VM_BUG_ON(PageActive(page));
445 466
446 sc->nr_scanned++; 467 sc->nr_scanned++;
447 468
@@ -548,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
548 goto free_it; 569 goto free_it;
549 } 570 }
550 571
551 if (!remove_mapping(mapping, page)) 572 if (!mapping || !remove_mapping(mapping, page))
552 goto keep_locked; 573 goto keep_locked;
553 574
554free_it: 575free_it:
@@ -565,12 +586,12 @@ keep_locked:
565 unlock_page(page); 586 unlock_page(page);
566keep: 587keep:
567 list_add(&page->lru, &ret_pages); 588 list_add(&page->lru, &ret_pages);
568 BUG_ON(PageLRU(page)); 589 VM_BUG_ON(PageLRU(page));
569 } 590 }
570 list_splice(&ret_pages, page_list); 591 list_splice(&ret_pages, page_list);
571 if (pagevec_count(&freed_pvec)) 592 if (pagevec_count(&freed_pvec))
572 __pagevec_release_nonlru(&freed_pvec); 593 __pagevec_release_nonlru(&freed_pvec);
573 mod_page_state(pgactivate, pgactivate); 594 count_vm_events(PGACTIVATE, pgactivate);
574 return nr_reclaimed; 595 return nr_reclaimed;
575} 596}
576 597
@@ -604,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
604 page = lru_to_page(src); 625 page = lru_to_page(src);
605 prefetchw_prev_lru_page(page, src, flags); 626 prefetchw_prev_lru_page(page, src, flags);
606 627
607 BUG_ON(!PageLRU(page)); 628 VM_BUG_ON(!PageLRU(page));
608 629
609 list_del(&page->lru); 630 list_del(&page->lru);
610 target = src; 631 target = src;
@@ -660,11 +681,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
660 nr_reclaimed += nr_freed; 681 nr_reclaimed += nr_freed;
661 local_irq_disable(); 682 local_irq_disable();
662 if (current_is_kswapd()) { 683 if (current_is_kswapd()) {
663 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 684 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
664 __mod_page_state(kswapd_steal, nr_freed); 685 __count_vm_events(KSWAPD_STEAL, nr_freed);
665 } else 686 } else
666 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 687 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
 667 __mod_page_state_zone(zone, pgsteal, nr_freed); 688 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
668 689
669 if (nr_taken == 0) 690 if (nr_taken == 0)
670 goto done; 691 goto done;
@@ -675,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
675 */ 696 */
676 while (!list_empty(&page_list)) { 697 while (!list_empty(&page_list)) {
677 page = lru_to_page(&page_list); 698 page = lru_to_page(&page_list);
678 BUG_ON(PageLRU(page)); 699 VM_BUG_ON(PageLRU(page));
679 SetPageLRU(page); 700 SetPageLRU(page);
680 list_del(&page->lru); 701 list_del(&page->lru);
681 if (PageActive(page)) 702 if (PageActive(page))
@@ -696,6 +717,11 @@ done:
696 return nr_reclaimed; 717 return nr_reclaimed;
697} 718}
698 719
720static inline int zone_is_near_oom(struct zone *zone)
721{
722 return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
723}
724
699/* 725/*
700 * This moves pages from the active list to the inactive list. 726 * This moves pages from the active list to the inactive list.
701 * 727 *
@@ -731,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
731 long distress; 757 long distress;
732 long swap_tendency; 758 long swap_tendency;
733 759
760 if (zone_is_near_oom(zone))
761 goto force_reclaim_mapped;
762
734 /* 763 /*
735 * `distress' is a measure of how much trouble we're having 764 * `distress' is a measure of how much trouble we're having
736 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 765 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
@@ -743,7 +772,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
743 * how much memory 772 * how much memory
744 * is mapped. 773 * is mapped.
745 */ 774 */
746 mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; 775 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
776 global_page_state(NR_ANON_PAGES)) * 100) /
777 vm_total_pages;
747 778
748 /* 779 /*
749 * Now decide how much we really want to unmap some pages. The 780 * Now decide how much we really want to unmap some pages. The
@@ -764,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
764 * memory onto the inactive list. 795 * memory onto the inactive list.
765 */ 796 */
766 if (swap_tendency >= 100) 797 if (swap_tendency >= 100)
798force_reclaim_mapped:
767 reclaim_mapped = 1; 799 reclaim_mapped = 1;
768 } 800 }
769 801
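
With the old per-CPU page-state counters gone, the mapped ratio is computed directly from the zone counters: mapped_ratio = (NR_FILE_MAPPED + NR_ANON_PAGES) * 100 / vm_total_pages, and the new force_reclaim_mapped label lets a nearly-OOM zone skip straight to reclaiming mapped pages. A worked example of the arithmetic with made-up numbers; the swap_tendency combination below is taken from the surrounding (unchanged) code and should be read as an assumption, not as part of this hunk:

#include <stdio.h>

int main(void)
{
	unsigned long file_mapped = 30000, anon = 20000, total_pages = 250000;
	int distress = 25, swappiness = 60;

	/* Integer percentage of memory currently mapped into address spaces. */
	int mapped_ratio = (int)(((file_mapped + anon) * 100) / total_pages);  /* 20 */

	/* Combination assumed from the surrounding code, not shown in this hunk. */
	int swap_tendency = mapped_ratio / 2 + distress + swappiness;          /* 95 */

	printf("mapped_ratio=%d swap_tendency=%d reclaim_mapped=%d\n",
	       mapped_ratio, swap_tendency, swap_tendency >= 100);
	return 0;
}
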
@@ -796,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
796 while (!list_empty(&l_inactive)) { 828 while (!list_empty(&l_inactive)) {
797 page = lru_to_page(&l_inactive); 829 page = lru_to_page(&l_inactive);
798 prefetchw_prev_lru_page(page, &l_inactive, flags); 830 prefetchw_prev_lru_page(page, &l_inactive, flags);
799 BUG_ON(PageLRU(page)); 831 VM_BUG_ON(PageLRU(page));
800 SetPageLRU(page); 832 SetPageLRU(page);
801 BUG_ON(!PageActive(page)); 833 VM_BUG_ON(!PageActive(page));
802 ClearPageActive(page); 834 ClearPageActive(page);
803 835
804 list_move(&page->lru, &zone->inactive_list); 836 list_move(&page->lru, &zone->inactive_list);
@@ -826,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
826 while (!list_empty(&l_active)) { 858 while (!list_empty(&l_active)) {
827 page = lru_to_page(&l_active); 859 page = lru_to_page(&l_active);
828 prefetchw_prev_lru_page(page, &l_active, flags); 860 prefetchw_prev_lru_page(page, &l_active, flags);
829 BUG_ON(PageLRU(page)); 861 VM_BUG_ON(PageLRU(page));
830 SetPageLRU(page); 862 SetPageLRU(page);
831 BUG_ON(!PageActive(page)); 863 VM_BUG_ON(!PageActive(page));
832 list_move(&page->lru, &zone->active_list); 864 list_move(&page->lru, &zone->active_list);
833 pgmoved++; 865 pgmoved++;
834 if (!pagevec_add(&pvec, page)) { 866 if (!pagevec_add(&pvec, page)) {
@@ -840,11 +872,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
840 } 872 }
841 } 873 }
842 zone->nr_active += pgmoved; 874 zone->nr_active += pgmoved;
843 spin_unlock(&zone->lru_lock);
844 875
845 __mod_page_state_zone(zone, pgrefill, pgscanned); 876 __count_zone_vm_events(PGREFILL, zone, pgscanned);
846 __mod_page_state(pgdeactivate, pgdeactivate); 877 __count_vm_events(PGDEACTIVATE, pgdeactivate);
847 local_irq_enable(); 878 spin_unlock_irq(&zone->lru_lock);
848 879
849 pagevec_release(&pvec); 880 pagevec_release(&pvec);
850} 881}
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
925 unsigned long nr_reclaimed = 0; 956 unsigned long nr_reclaimed = 0;
926 int i; 957 int i;
927 958
959 sc->all_unreclaimable = 1;
928 for (i = 0; zones[i] != NULL; i++) { 960 for (i = 0; zones[i] != NULL; i++) {
929 struct zone *zone = zones[i]; 961 struct zone *zone = zones[i];
930 962
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
941 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 973 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
942 continue; /* Let kswapd poll it */ 974 continue; /* Let kswapd poll it */
943 975
976 sc->all_unreclaimable = 0;
977
944 nr_reclaimed += shrink_zone(priority, zone, sc); 978 nr_reclaimed += shrink_zone(priority, zone, sc);
945 } 979 }
946 return nr_reclaimed; 980 return nr_reclaimed;
@@ -976,7 +1010,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
976 .swappiness = vm_swappiness, 1010 .swappiness = vm_swappiness,
977 }; 1011 };
978 1012
979 inc_page_state(allocstall); 1013 count_vm_event(ALLOCSTALL);
980 1014
981 for (i = 0; zones[i] != NULL; i++) { 1015 for (i = 0; zones[i] != NULL; i++) {
982 struct zone *zone = zones[i]; 1016 struct zone *zone = zones[i];
@@ -989,7 +1023,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
989 } 1023 }
990 1024
991 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1025 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
992 sc.nr_mapped = read_page_state(nr_mapped);
993 sc.nr_scanned = 0; 1026 sc.nr_scanned = 0;
994 if (!priority) 1027 if (!priority)
995 disable_swap_token(); 1028 disable_swap_token();
@@ -1022,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1022 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1055 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1023 blk_congestion_wait(WRITE, HZ/10); 1056 blk_congestion_wait(WRITE, HZ/10);
1024 } 1057 }
1058 /* top priority shrink_caches still had more to do? don't OOM, then */
1059 if (!sc.all_unreclaimable)
1060 ret = 1;
1025out: 1061out:
1026 for (i = 0; zones[i] != 0; i++) { 1062 for (i = 0; zones[i] != 0; i++) {
1027 struct zone *zone = zones[i]; 1063 struct zone *zone = zones[i];
@@ -1074,9 +1110,7 @@ loop_again:
1074 total_scanned = 0; 1110 total_scanned = 0;
1075 nr_reclaimed = 0; 1111 nr_reclaimed = 0;
1076 sc.may_writepage = !laptop_mode; 1112 sc.may_writepage = !laptop_mode;
1077 sc.nr_mapped = read_page_state(nr_mapped); 1113 count_vm_event(PAGEOUTRUN);
1078
1079 inc_page_state(pageoutrun);
1080 1114
1081 for (i = 0; i < pgdat->nr_zones; i++) { 1115 for (i = 0; i < pgdat->nr_zones; i++) {
1082 struct zone *zone = pgdat->node_zones + i; 1116 struct zone *zone = pgdat->node_zones + i;
@@ -1156,7 +1190,7 @@ scan:
1156 if (zone->all_unreclaimable) 1190 if (zone->all_unreclaimable)
1157 continue; 1191 continue;
1158 if (nr_slab == 0 && zone->pages_scanned >= 1192 if (nr_slab == 0 && zone->pages_scanned >=
1159 (zone->nr_active + zone->nr_inactive) * 4) 1193 (zone->nr_active + zone->nr_inactive) * 6)
1160 zone->all_unreclaimable = 1; 1194 zone->all_unreclaimable = 1;
1161 /* 1195 /*
1162 * If we've done a decent amount of scanning and 1196 * If we've done a decent amount of scanning and
@@ -1223,7 +1257,6 @@ static int kswapd(void *p)
1223 }; 1257 };
1224 cpumask_t cpumask; 1258 cpumask_t cpumask;
1225 1259
1226 daemonize("kswapd%d", pgdat->node_id);
1227 cpumask = node_to_cpumask(pgdat->node_id); 1260 cpumask = node_to_cpumask(pgdat->node_id);
1228 if (!cpus_empty(cpumask)) 1261 if (!cpus_empty(cpumask))
1229 set_cpus_allowed(tsk, cpumask); 1262 set_cpus_allowed(tsk, cpumask);
@@ -1365,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1365 for_each_zone(zone) 1398 for_each_zone(zone)
1366 lru_pages += zone->nr_active + zone->nr_inactive; 1399 lru_pages += zone->nr_active + zone->nr_inactive;
1367 1400
1368 nr_slab = read_page_state(nr_slab); 1401 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1369 /* If slab caches are huge, it's better to hit them first */ 1402 /* If slab caches are huge, it's better to hit them first */
1370 while (nr_slab >= lru_pages) { 1403 while (nr_slab >= lru_pages) {
1371 reclaim_state.reclaimed_slab = 0; 1404 reclaim_state.reclaimed_slab = 0;
@@ -1407,9 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1407 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 1440 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
1408 unsigned long nr_to_scan = nr_pages - ret; 1441 unsigned long nr_to_scan = nr_pages - ret;
1409 1442
1410 sc.nr_mapped = read_page_state(nr_mapped);
1411 sc.nr_scanned = 0; 1443 sc.nr_scanned = 0;
1412
1413 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); 1444 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
1414 if (ret >= nr_pages) 1445 if (ret >= nr_pages)
1415 goto out; 1446 goto out;
@@ -1450,7 +1481,7 @@ out:
1450 not required for correctness. So if the last cpu in a node goes 1481 not required for correctness. So if the last cpu in a node goes
1451 away, we get changed to run anywhere: as the first one comes back, 1482 away, we get changed to run anywhere: as the first one comes back,
1452 restore their cpu bindings. */ 1483 restore their cpu bindings. */
1453static int cpu_callback(struct notifier_block *nfb, 1484static int __devinit cpu_callback(struct notifier_block *nfb,
1454 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1455{ 1486{
1456 pg_data_t *pgdat; 1487 pg_data_t *pgdat;
@@ -1468,20 +1499,35 @@ static int cpu_callback(struct notifier_block *nfb,
1468} 1499}
1469#endif /* CONFIG_HOTPLUG_CPU */ 1500#endif /* CONFIG_HOTPLUG_CPU */
1470 1501
1502/*
1503 * This kswapd start function will be called by init and node-hot-add.
1504 * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
1505 */
1506int kswapd_run(int nid)
1507{
1508 pg_data_t *pgdat = NODE_DATA(nid);
1509 int ret = 0;
1510
1511 if (pgdat->kswapd)
1512 return 0;
1513
1514 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
1515 if (IS_ERR(pgdat->kswapd)) {
1516 /* failure at boot is fatal */
1517 BUG_ON(system_state == SYSTEM_BOOTING);
 1518 printk("Failed to start kswapd on node %d\n", nid);
1519 ret = -1;
1520 }
1521 return ret;
1522}
1523
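
kswapd_run() replaces the old kernel_thread()/find_task_by_pid() sequence with kthread_run(), returns immediately if the node already has a kswapd, and only treats failure as fatal while booting, so memory hot-add can report an error instead of crashing. A rough pthread-based analogue of that start-once-per-node shape (pthreads merely stand in for the kthread API; nothing here is kernel code):

#include <pthread.h>
#include <stdio.h>

#define MAX_NODES 4

static pthread_t node_worker[MAX_NODES];
static int node_worker_running[MAX_NODES];

static void *worker_fn(void *arg)
{
	/* A real kswapd would loop here reclaiming memory for its node. */
	printf("worker for node %ld started\n", (long)arg);
	return NULL;
}

static int worker_run(int nid, int booting)
{
	if (node_worker_running[nid])
		return 0;                    /* already running: nothing to do */

	if (pthread_create(&node_worker[nid], NULL, worker_fn, (void *)(long)nid)) {
		if (booting)                 /* at boot the kernel treats this as fatal */
			fprintf(stderr, "fatal: no worker for node %d\n", nid);
		return -1;                   /* hot-add path: report, do not crash */
	}
	node_worker_running[nid] = 1;
	return 0;
}

int main(void)
{
	int nid;

	for (nid = 0; nid < 2; nid++)        /* cf. for_each_online_node() */
		worker_run(nid, 1);
	for (nid = 0; nid < 2; nid++)
		if (node_worker_running[nid])
			pthread_join(node_worker[nid], NULL);
	return 0;
}
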
1471static int __init kswapd_init(void) 1524static int __init kswapd_init(void)
1472{ 1525{
1473 pg_data_t *pgdat; 1526 int nid;
1474 1527
1475 swap_setup(); 1528 swap_setup();
1476 for_each_online_pgdat(pgdat) { 1529 for_each_online_node(nid)
1477 pid_t pid; 1530 kswapd_run(nid);
1478
1479 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1480 BUG_ON(pid < 0);
1481 read_lock(&tasklist_lock);
1482 pgdat->kswapd = find_task_by_pid(pid);
1483 read_unlock(&tasklist_lock);
1484 }
1485 hotcpu_notifier(cpu_callback, 0); 1531 hotcpu_notifier(cpu_callback, 0);
1486 return 0; 1532 return 0;
1487} 1533}
@@ -1494,10 +1540,6 @@ module_init(kswapd_init)
1494 * 1540 *
1495 * If non-zero call zone_reclaim when the number of free pages falls below 1541 * If non-zero call zone_reclaim when the number of free pages falls below
1496 * the watermarks. 1542 * the watermarks.
1497 *
1498 * In the future we may add flags to the mode. However, the page allocator
1499 * should only have to check that zone_reclaim_mode != 0 before calling
1500 * zone_reclaim().
1501 */ 1543 */
1502int zone_reclaim_mode __read_mostly; 1544int zone_reclaim_mode __read_mostly;
1503 1545
@@ -1505,12 +1547,6 @@ int zone_reclaim_mode __read_mostly;
1505#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1547#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1506#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1548#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1507#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1549#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1508#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1509
1510/*
1511 * Mininum time between zone reclaim scans
1512 */
1513int zone_reclaim_interval __read_mostly = 30*HZ;
1514 1550
1515/* 1551/*
1516 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1552 * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1520,6 +1556,18 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
1520#define ZONE_RECLAIM_PRIORITY 4 1556#define ZONE_RECLAIM_PRIORITY 4
1521 1557
1522/* 1558/*
1559 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
1560 * occur.
1561 */
1562int sysctl_min_unmapped_ratio = 1;
1563
1564/*
1565 * If the number of slab pages in a zone grows beyond this percentage then
1566 * slab reclaim needs to occur.
1567 */
1568int sysctl_min_slab_ratio = 5;
1569
1570/*
1523 * Try to free up some pages from this zone through reclaim. 1571 * Try to free up some pages from this zone through reclaim.
1524 */ 1572 */
1525static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1573static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1533,12 +1581,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1533 struct scan_control sc = { 1581 struct scan_control sc = {
1534 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 1582 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1535 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 1583 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1536 .nr_mapped = read_page_state(nr_mapped),
1537 .swap_cluster_max = max_t(unsigned long, nr_pages, 1584 .swap_cluster_max = max_t(unsigned long, nr_pages,
1538 SWAP_CLUSTER_MAX), 1585 SWAP_CLUSTER_MAX),
1539 .gfp_mask = gfp_mask, 1586 .gfp_mask = gfp_mask,
1540 .swappiness = vm_swappiness, 1587 .swappiness = vm_swappiness,
1541 }; 1588 };
1589 unsigned long slab_reclaimable;
1542 1590
1543 disable_swap_token(); 1591 disable_swap_token();
1544 cond_resched(); 1592 cond_resched();
@@ -1551,43 +1599,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1551 reclaim_state.reclaimed_slab = 0; 1599 reclaim_state.reclaimed_slab = 0;
1552 p->reclaim_state = &reclaim_state; 1600 p->reclaim_state = &reclaim_state;
1553 1601
1554 /* 1602 if (zone_page_state(zone, NR_FILE_PAGES) -
1555 * Free memory by calling shrink zone with increasing priorities 1603 zone_page_state(zone, NR_FILE_MAPPED) >
1556 * until we have enough memory freed. 1604 zone->min_unmapped_pages) {
1557 */ 1605 /*
1558 priority = ZONE_RECLAIM_PRIORITY; 1606 * Free memory by calling shrink zone with increasing
1559 do { 1607 * priorities until we have enough memory freed.
1560 nr_reclaimed += shrink_zone(priority, zone, &sc); 1608 */
1561 priority--; 1609 priority = ZONE_RECLAIM_PRIORITY;
1562 } while (priority >= 0 && nr_reclaimed < nr_pages); 1610 do {
1611 nr_reclaimed += shrink_zone(priority, zone, &sc);
1612 priority--;
1613 } while (priority >= 0 && nr_reclaimed < nr_pages);
1614 }
1563 1615
1564 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { 1616 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1617 if (slab_reclaimable > zone->min_slab_pages) {
1565 /* 1618 /*
1566 * shrink_slab() does not currently allow us to determine how 1619 * shrink_slab() does not currently allow us to determine how
1567 * many pages were freed in this zone. So we just shake the slab 1620 * many pages were freed in this zone. So we take the current
1568 * a bit and then go off node for this particular allocation 1621 * number of slab pages and shake the slab until it is reduced
1569 * despite possibly having freed enough memory to allocate in 1622 * by the same nr_pages that we used for reclaiming unmapped
1570 * this zone. If we freed local memory then the next 1623 * pages.
1571 * allocations will be local again.
1572 * 1624 *
1573 * shrink_slab will free memory on all zones and may take 1625 * Note that shrink_slab will free memory on all zones and may
1574 * a long time. 1626 * take a long time.
1575 */ 1627 */
1576 shrink_slab(sc.nr_scanned, gfp_mask, order); 1628 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
1577 } 1629 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
1630 slab_reclaimable - nr_pages)
1631 ;
1578 1632
1579 p->reclaim_state = NULL;
1580 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1581
1582 if (nr_reclaimed == 0) {
1583 /* 1633 /*
1584 * We were unable to reclaim enough pages to stay on node. We 1634 * Update nr_reclaimed by the number of slab pages we
1585 * now allow off node accesses for a certain time period before 1635 * reclaimed from this zone.
1586 * trying again to reclaim pages from the local zone.
1587 */ 1636 */
1588 zone->last_unsuccessful_zone_reclaim = jiffies; 1637 nr_reclaimed += slab_reclaimable -
1638 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
1589 } 1639 }
1590 1640
1641 p->reclaim_state = NULL;
1642 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1591 return nr_reclaimed >= nr_pages; 1643 return nr_reclaimed >= nr_pages;
1592} 1644}
1593 1645
@@ -1597,14 +1649,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1597 int node_id; 1649 int node_id;
1598 1650
1599 /* 1651 /*
1600 * Do not reclaim if there was a recent unsuccessful attempt at zone 1652 * Zone reclaim reclaims unmapped file backed pages and
1601 * reclaim. In that case we let allocations go off node for the 1653 * slab pages if we are over the defined limits.
1602 * zone_reclaim_interval. Otherwise we would scan for each off-node 1654 *
1603 * page allocation. 1655 * A small portion of unmapped file backed pages is needed for
1656 * file I/O otherwise pages read by file I/O will be immediately
1657 * thrown out if the zone is overallocated. So we do not reclaim
1658 * if less than a specified percentage of the zone is used by
1659 * unmapped file backed pages.
1604 */ 1660 */
1605 if (time_before(jiffies, 1661 if (zone_page_state(zone, NR_FILE_PAGES) -
1606 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1662 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
1607 return 0; 1663 && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
1664 <= zone->min_slab_pages)
1665 return 0;
1608 1666
1609 /* 1667 /*
1610 * Avoid concurrent zone reclaims, do not reclaim in a zone that does 1668 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
@@ -1623,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1623 * over remote processors and spread off node memory allocations 1681 * over remote processors and spread off node memory allocations
1624 * as wide as possible. 1682 * as wide as possible.
1625 */ 1683 */
1626 node_id = zone->zone_pgdat->node_id; 1684 node_id = zone_to_nid(zone);
1627 mask = node_to_cpumask(node_id); 1685 mask = node_to_cpumask(node_id);
1628 if (!cpus_empty(mask) && node_id != numa_node_id()) 1686 if (!cpus_empty(mask) && node_id != numa_node_id())
1629 return 0; 1687 return 0;
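
Taken together, the new checks mean zone_reclaim() runs at all only when the zone holds more unmapped page cache than min_unmapped_pages or more reclaimable slab than min_slab_pages, and __zone_reclaim() then works on whichever side is over its limit. A standalone sketch of that gate, with invented counter values:

#include <stdbool.h>
#include <stdio.h>

struct zone_counters {
	unsigned long file_pages;		/* NR_FILE_PAGES */
	unsigned long file_mapped;		/* NR_FILE_MAPPED */
	unsigned long slab_reclaimable;		/* NR_SLAB_RECLAIMABLE */
	unsigned long min_unmapped_pages;
	unsigned long min_slab_pages;
};

/* Mirrors the early return added to zone_reclaim() above. */
static bool worth_reclaiming(const struct zone_counters *z)
{
	unsigned long unmapped = z->file_pages - z->file_mapped;

	return unmapped > z->min_unmapped_pages ||
	       z->slab_reclaimable > z->min_slab_pages;
}

int main(void)
{
	struct zone_counters z = {
		.file_pages = 40000,
		.file_mapped = 39500,		/* only 500 unmapped pages */
		.slab_reclaimable = 20000,
		.min_unmapped_pages = 2621,	/* 1% of a 1 GB zone */
		.min_slab_pages = 13107,	/* 5% of a 1 GB zone */
	};

	/* Unmapped page cache is under its limit, but slab is over its
	 * limit, so reclaim still runs and will target the slab. */
	printf("reclaim worthwhile: %s\n", worth_reclaiming(&z) ? "yes" : "no");
	return 0;
}
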
diff --git a/mm/vmstat.c b/mm/vmstat.c
new file mode 100644
index 000000000000..a2b6a9f96e5c
--- /dev/null
+++ b/mm/vmstat.c
@@ -0,0 +1,706 @@
1/*
2 * linux/mm/vmstat.c
3 *
4 * Manages VM statistics
5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 *
7 * zoned VM statistics
8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com>
10 */
11
12#include <linux/config.h>
13#include <linux/mm.h>
14#include <linux/module.h>
15#include <linux/cpu.h>
16
17void __get_zone_counts(unsigned long *active, unsigned long *inactive,
18 unsigned long *free, struct pglist_data *pgdat)
19{
20 struct zone *zones = pgdat->node_zones;
21 int i;
22
23 *active = 0;
24 *inactive = 0;
25 *free = 0;
26 for (i = 0; i < MAX_NR_ZONES; i++) {
27 *active += zones[i].nr_active;
28 *inactive += zones[i].nr_inactive;
29 *free += zones[i].free_pages;
30 }
31}
32
33void get_zone_counts(unsigned long *active,
34 unsigned long *inactive, unsigned long *free)
35{
36 struct pglist_data *pgdat;
37
38 *active = 0;
39 *inactive = 0;
40 *free = 0;
41 for_each_online_pgdat(pgdat) {
42 unsigned long l, m, n;
43 __get_zone_counts(&l, &m, &n, pgdat);
44 *active += l;
45 *inactive += m;
46 *free += n;
47 }
48}
49
50#ifdef CONFIG_VM_EVENT_COUNTERS
51DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
52EXPORT_PER_CPU_SYMBOL(vm_event_states);
53
54static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
55{
56 int cpu = 0;
57 int i;
58
59 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
60
61 cpu = first_cpu(*cpumask);
62 while (cpu < NR_CPUS) {
63 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
64
65 cpu = next_cpu(cpu, *cpumask);
66
67 if (cpu < NR_CPUS)
68 prefetch(&per_cpu(vm_event_states, cpu));
69
70
71 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
72 ret[i] += this->event[i];
73 }
74}
75
76/*
77 * Accumulate the vm event counters across all CPUs.
78 * The result is unavoidably approximate - it can change
79 * during and after execution of this function.
80*/
81void all_vm_events(unsigned long *ret)
82{
83 sum_vm_events(ret, &cpu_online_map);
84}
85EXPORT_SYMBOL_GPL(all_vm_events);
86
87#ifdef CONFIG_HOTPLUG
88/*
89 * Fold the foreign cpu events into our own.
90 *
91 * This is adding to the events on one processor
92 * but keeps the global counts constant.
93 */
94void vm_events_fold_cpu(int cpu)
95{
96 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
97 int i;
98
99 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
100 count_vm_events(i, fold_state->event[i]);
101 fold_state->event[i] = 0;
102 }
103}
104#endif /* CONFIG_HOTPLUG */
105
106#endif /* CONFIG_VM_EVENT_COUNTERS */
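
The event counters never exist as a single global array: readers sum the per-cpu copies, and when a cpu is unplugged its counts are folded onto the cpu doing the teardown so the totals never go backwards. That invariant is easy to demonstrate in a userspace model (the cpu count and event names below are arbitrary):

#include <stdio.h>
#include <string.h>

#define NR_CPUS   4
#define NR_EVENTS 2			/* think "pgfault", "pgmajfault" */

static unsigned long events[NR_CPUS][NR_EVENTS];

static void sum_events(unsigned long *ret)
{
	memset(ret, 0, NR_EVENTS * sizeof(*ret));
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		for (int i = 0; i < NR_EVENTS; i++)
			ret[i] += events[cpu][i];
}

/* Like vm_events_fold_cpu(): move a dead cpu's counts onto 'to',
 * leaving every summed total unchanged. */
static void fold_cpu(int dead, int to)
{
	for (int i = 0; i < NR_EVENTS; i++) {
		events[to][i] += events[dead][i];
		events[dead][i] = 0;
	}
}

int main(void)
{
	unsigned long before[NR_EVENTS], after[NR_EVENTS];

	events[0][0] = 10; events[1][0] = 20; events[2][0] = 5;
	events[3][1] = 7;

	sum_events(before);
	fold_cpu(2, 0);			/* pretend cpu 2 was hot-removed */
	sum_events(after);

	for (int i = 0; i < NR_EVENTS; i++)
		printf("event %d: %lu -> %lu\n", i, before[i], after[i]);
	return 0;
}
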
107
108/*
109 * Manage combined zone based / global counters
110 *
111 * vm_stat contains the global counters
112 */
113atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
114EXPORT_SYMBOL(vm_stat);
115
116#ifdef CONFIG_SMP
117
118static int calculate_threshold(struct zone *zone)
119{
120 int threshold;
121 int mem; /* memory in 128 MB units */
122
123 /*
124 * The threshold scales with the number of processors and the amount
125 * of memory per zone. More memory means that we can defer updates for
126 * longer, more processors could lead to more contention.
127 * fls() is used to have a cheap way of logarithmic scaling.
128 *
129 * Some sample thresholds:
130 *
131 * Threshold Processors (fls) Zonesize fls(mem+1)
132 * ------------------------------------------------------------------
133 * 8 1 1 0.9-1 GB 4
134 * 16 2 2 0.9-1 GB 4
135 * 20 2 2 1-2 GB 5
136 * 24 2 2 2-4 GB 6
137 * 28 2 2 4-8 GB 7
138 * 32 2 2 8-16 GB 8
139 * 4 2 2 <128M 1
140 * 30 4 3 2-4 GB 5
141 * 48 4 3 8-16 GB 8
142 * 32 8 4 1-2 GB 4
143 * 32 8 4 0.9-1GB 4
144 * 10 16 5 <128M 1
145 * 40 16 5 900M 4
146 * 70 64 7 2-4 GB 5
147 * 84 64 7 4-8 GB 6
148 * 108 512 9 4-8 GB 6
149 * 125 1024 10 8-16 GB 8
150 * 125 1024 10 16-32 GB 9
151 */
152
153 mem = zone->present_pages >> (27 - PAGE_SHIFT);
154
155 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
156
157 /*
158 * Maximum threshold is 125
159 */
160 threshold = min(125, threshold);
161
162 return threshold;
163}
164
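
The formula is cheap enough to recompute on cpu hotplug and easy to check by hand. The standalone program below reproduces it with a userspace fls(); the zone sizes are examples chosen to line up with rows of the table above, so treat the output as indicative rather than authoritative:

#include <stdio.h>

/* fls(): 1-based position of the most significant set bit, 0 for 0. */
static int my_fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static int threshold(int cpus, unsigned long zone_mb)
{
	int mem = zone_mb / 128;	/* memory in 128 MB units */
	int t = 2 * my_fls(cpus) * (1 + my_fls(mem));

	return t < 125 ? t : 125;	/* maximum threshold is 125 */
}

int main(void)
{
	printf("1 cpu,     900 MB zone: %d\n", threshold(1, 900));
	printf("2 cpus,    900 MB zone: %d\n", threshold(2, 900));
	printf("16 cpus,   120 MB zone: %d\n", threshold(16, 120));
	printf("8 cpus,    960 MB zone: %d\n", threshold(8, 960));
	printf("1024 cpus,  16 GB zone: %d\n", threshold(1024, 16384));
	return 0;
}

This prints 8, 16, 10, 32 and 125, matching the sampled rows for those configurations.
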
165/*
166 * Refresh the thresholds for each zone.
167 */
168static void refresh_zone_stat_thresholds(void)
169{
170 struct zone *zone;
171 int cpu;
172 int threshold;
173
174 for_each_zone(zone) {
175
176 if (!zone->present_pages)
177 continue;
178
179 threshold = calculate_threshold(zone);
180
181 for_each_online_cpu(cpu)
182 zone_pcp(zone, cpu)->stat_threshold = threshold;
183 }
184}
185
186/*
187 * For use when we know that interrupts are disabled.
188 */
189void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
190 int delta)
191{
192 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
193 s8 *p = pcp->vm_stat_diff + item;
194 long x;
195
196 x = delta + *p;
197
198 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
199 zone_page_state_add(x, zone, item);
200 x = 0;
201 }
202 *p = x;
203}
204EXPORT_SYMBOL(__mod_zone_page_state);
205
206/*
207 * For an unknown interrupt state
208 */
209void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
210 int delta)
211{
212 unsigned long flags;
213
214 local_irq_save(flags);
215 __mod_zone_page_state(zone, item, delta);
216 local_irq_restore(flags);
217}
218EXPORT_SYMBOL(mod_zone_page_state);
219
220/*
221 * Optimized increment and decrement functions.
222 *
223 * These are only for a single page and therefore can take a struct page *
224 * argument instead of struct zone *. This allows the inclusion of the code
225 * generated for page_zone(page) into the optimized functions.
226 *
227 * No overflow check is necessary and therefore the differential can be
228 * incremented or decremented in place which may allow the compilers to
229 * generate better code.
230 * The increment or decrement is known and therefore one boundary check can
231 * be omitted.
232 *
233 * NOTE: These functions are very performance sensitive. Change only
234 * with care.
235 *
236 * Some processors have inc/dec instructions that are atomic vs an interrupt.
237 * However, the code must first determine the differential location in a zone
238 * based on the processor number and then inc/dec the counter. There is no
239 * guarantee without disabling preemption that the processor will not change
240 * in between and therefore the atomicity vs. interrupt cannot be exploited
241 * in a useful way here.
242 */
243static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
244{
245 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
246 s8 *p = pcp->vm_stat_diff + item;
247
248 (*p)++;
249
250 if (unlikely(*p > pcp->stat_threshold)) {
251 int overstep = pcp->stat_threshold / 2;
252
253 zone_page_state_add(*p + overstep, zone, item);
254 *p = -overstep;
255 }
256}
257
258void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
259{
260 __inc_zone_state(page_zone(page), item);
261}
262EXPORT_SYMBOL(__inc_zone_page_state);
263
264void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
265{
266 struct zone *zone = page_zone(page);
267 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
268 s8 *p = pcp->vm_stat_diff + item;
269
270 (*p)--;
271
272 if (unlikely(*p < - pcp->stat_threshold)) {
273 int overstep = pcp->stat_threshold / 2;
274
275 zone_page_state_add(*p - overstep, zone, item);
276 *p = overstep;
277 }
278}
279EXPORT_SYMBOL(__dec_zone_page_state);
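
The fast-path counters therefore touch the shared counter only when the per-cpu signed diff crosses stat_threshold, and the inc/dec variants overstep by half a threshold so a counter that keeps moving in one direction folds less often. A single-threaded userspace model of one counter (the threshold is deliberately tiny so the folding is visible; in the kernel it is the value computed by calculate_threshold()):

#include <stdio.h>

#define NR_CPUS 2

static long global_count;		/* the shared counter zone_page_state_add() updates */
static signed char diff[NR_CPUS];	/* stands in for the per-cpu vm_stat_diff[item] */
static const int threshold = 8;

static void inc(int cpu)
{
	signed char *p = &diff[cpu];

	if (++(*p) > threshold) {
		int overstep = threshold / 2;

		global_count += *p + overstep;
		*p = -overstep;
	}
}

static long approximate(void)	/* what a reader of the shared counter sees */
{
	return global_count;
}

static long exact(void)		/* effect of folding every cpu's diff, as refresh_cpu_vm_stats() does */
{
	long v = global_count;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		v += diff[cpu];
	return v;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		inc(i % NR_CPUS);

	printf("true 100, approximate %ld, after folding %ld\n",
	       approximate(), exact());
	return 0;
}

The invariant is that the shared value plus the outstanding per-cpu diffs always equals the true count; the shared value alone may lag or lead the truth by up to about one threshold per cpu.
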
280
281void inc_zone_state(struct zone *zone, enum zone_stat_item item)
282{
283 unsigned long flags;
284
285 local_irq_save(flags);
286 __inc_zone_state(zone, item);
287 local_irq_restore(flags);
288}
289
290void inc_zone_page_state(struct page *page, enum zone_stat_item item)
291{
292 unsigned long flags;
293 struct zone *zone;
294
295 zone = page_zone(page);
296 local_irq_save(flags);
297 __inc_zone_state(zone, item);
298 local_irq_restore(flags);
299}
300EXPORT_SYMBOL(inc_zone_page_state);
301
302void dec_zone_page_state(struct page *page, enum zone_stat_item item)
303{
304 unsigned long flags;
305
306 local_irq_save(flags);
307 __dec_zone_page_state(page, item);
308 local_irq_restore(flags);
309}
310EXPORT_SYMBOL(dec_zone_page_state);
311
312/*
313 * Update the zone counters for one cpu.
314 */
315void refresh_cpu_vm_stats(int cpu)
316{
317 struct zone *zone;
318 int i;
319 unsigned long flags;
320
321 for_each_zone(zone) {
322 struct per_cpu_pageset *pcp;
323
324 if (!populated_zone(zone))
325 continue;
326
327 pcp = zone_pcp(zone, cpu);
328
329 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
330 if (pcp->vm_stat_diff[i]) {
331 local_irq_save(flags);
332 zone_page_state_add(pcp->vm_stat_diff[i],
333 zone, i);
334 pcp->vm_stat_diff[i] = 0;
335 local_irq_restore(flags);
336 }
337 }
338}
339
340static void __refresh_cpu_vm_stats(void *dummy)
341{
342 refresh_cpu_vm_stats(smp_processor_id());
343}
344
345/*
346 * Consolidate all counters.
347 *
348 * Note that the result is less inaccurate than the raw per-cpu values,
349 * but still approximate if concurrent processes are allowed to run.
350 */
351void refresh_vm_stats(void)
352{
353 on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
354}
355EXPORT_SYMBOL(refresh_vm_stats);
356
357#endif
358
359#ifdef CONFIG_NUMA
360/*
361 * zonelist = the list of zones passed to the allocator
362 * z = the zone from which the allocation occurred.
363 *
364 * Must be called with interrupts disabled.
365 */
366void zone_statistics(struct zonelist *zonelist, struct zone *z)
367{
368 if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
369 __inc_zone_state(z, NUMA_HIT);
370 } else {
371 __inc_zone_state(z, NUMA_MISS);
372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
373 }
374 if (z->node == numa_node_id())
375 __inc_zone_state(z, NUMA_LOCAL);
376 else
377 __inc_zone_state(z, NUMA_OTHER);
378}
379#endif
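
For every allocation the function answers two independent questions: did the page come from the first (preferred) zone of the zonelist, and did it come from the node the allocating cpu is running on. NUMA_FOREIGN is the mirror image of a miss, charged to the zone that was wanted rather than the one that delivered. A small standalone model (node ids invented):

#include <stdio.h>

struct numa_stats {
	unsigned long hit, miss, local, other;
};

/* preferred: node of zonelist->zones[0]; got: node the page came from;
 * running: numa_node_id() of the allocating cpu. On a miss the kernel
 * additionally bumps NUMA_FOREIGN on the preferred zone (not modelled). */
static void account(struct numa_stats *s, int preferred, int got, int running)
{
	if (got == preferred)
		s->hit++;
	else
		s->miss++;
	if (got == running)
		s->local++;
	else
		s->other++;
}

int main(void)
{
	struct numa_stats s = { 0 };

	account(&s, 0, 0, 0);	/* wanted node 0, got node 0:      hit + local  */
	account(&s, 0, 1, 0);	/* wanted node 0, got node 1:      miss + other */
	account(&s, 1, 1, 0);	/* wanted node 1, got it remotely: hit + other  */

	printf("hit %lu miss %lu local %lu other %lu\n",
	       s.hit, s.miss, s.local, s.other);
	return 0;
}
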
380
381#ifdef CONFIG_PROC_FS
382
383#include <linux/seq_file.h>
384
385static void *frag_start(struct seq_file *m, loff_t *pos)
386{
387 pg_data_t *pgdat;
388 loff_t node = *pos;
389 for (pgdat = first_online_pgdat();
390 pgdat && node;
391 pgdat = next_online_pgdat(pgdat))
392 --node;
393
394 return pgdat;
395}
396
397static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
398{
399 pg_data_t *pgdat = (pg_data_t *)arg;
400
401 (*pos)++;
402 return next_online_pgdat(pgdat);
403}
404
405static void frag_stop(struct seq_file *m, void *arg)
406{
407}
408
409/*
410 * This walks the free areas for each zone.
411 */
412static int frag_show(struct seq_file *m, void *arg)
413{
414 pg_data_t *pgdat = (pg_data_t *)arg;
415 struct zone *zone;
416 struct zone *node_zones = pgdat->node_zones;
417 unsigned long flags;
418 int order;
419
420 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
421 if (!populated_zone(zone))
422 continue;
423
424 spin_lock_irqsave(&zone->lock, flags);
425 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
426 for (order = 0; order < MAX_ORDER; ++order)
427 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
428 spin_unlock_irqrestore(&zone->lock, flags);
429 seq_putc(m, '\n');
430 }
431 return 0;
432}
433
434struct seq_operations fragmentation_op = {
435 .start = frag_start,
436 .next = frag_next,
437 .stop = frag_stop,
438 .show = frag_show,
439};
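
fragmentation_op is hooked up by the proc code (as /proc/buddyinfo), so each line is the node and zone name followed by nr_free for every order, exactly as formatted above. A minimal userspace consumer that turns those per-order counts back into a free-page total (nothing beyond reading that file is assumed):

#include <stdio.h>

int main(void)
{
	char node[16], zone[16];
	FILE *f = fopen("/proc/buddyinfo", "r");

	if (!f)
		return 1;
	while (fscanf(f, " Node %15[^,], zone %15s", node, zone) == 2) {
		unsigned long nr, pages = 0;
		int order = 0;

		/* free pages = sum over orders of nr_free << order */
		while (fscanf(f, "%lu", &nr) == 1)
			pages += nr << order++;
		printf("node %s zone %-8s free pages %lu\n", node, zone, pages);
	}
	fclose(f);
	return 0;
}
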
440
441#ifdef CONFIG_ZONE_DMA32
442#define TEXT_FOR_DMA32(xx) xx "_dma32",
443#else
444#define TEXT_FOR_DMA32(xx)
445#endif
446
447#ifdef CONFIG_HIGHMEM
448#define TEXT_FOR_HIGHMEM(xx) xx "_high",
449#else
450#define TEXT_FOR_HIGHMEM(xx)
451#endif
452
453#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
454 TEXT_FOR_HIGHMEM(xx)
455
456static char *vmstat_text[] = {
457 /* Zoned VM counters */
458 "nr_anon_pages",
459 "nr_mapped",
460 "nr_file_pages",
461 "nr_slab_reclaimable",
462 "nr_slab_unreclaimable",
463 "nr_page_table_pages",
464 "nr_dirty",
465 "nr_writeback",
466 "nr_unstable",
467 "nr_bounce",
468 "nr_vmscan_write",
469
470#ifdef CONFIG_NUMA
471 "numa_hit",
472 "numa_miss",
473 "numa_foreign",
474 "numa_interleave",
475 "numa_local",
476 "numa_other",
477#endif
478
479#ifdef CONFIG_VM_EVENT_COUNTERS
480 "pgpgin",
481 "pgpgout",
482 "pswpin",
483 "pswpout",
484
485 TEXTS_FOR_ZONES("pgalloc")
486
487 "pgfree",
488 "pgactivate",
489 "pgdeactivate",
490
491 "pgfault",
492 "pgmajfault",
493
494 TEXTS_FOR_ZONES("pgrefill")
495 TEXTS_FOR_ZONES("pgsteal")
496 TEXTS_FOR_ZONES("pgscan_kswapd")
497 TEXTS_FOR_ZONES("pgscan_direct")
498
499 "pginodesteal",
500 "slabs_scanned",
501 "kswapd_steal",
502 "kswapd_inodesteal",
503 "pageoutrun",
504 "allocstall",
505
506 "pgrotated",
507#endif
508};
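
Because vmstat_start() and zoneinfo_show() index this array directly by enum value, the first block of strings has to stay in the same order as enum zone_stat_item, and the zoned event strings have to track the compile-time zone list. With both CONFIG_ZONE_DMA32 and CONFIG_HIGHMEM enabled, for instance, TEXTS_FOR_ZONES("pgalloc") expands to the four names below (with neither option, only the _dma and _normal entries remain); the little program is there only to show the expansion:

#include <stdio.h>

static const char *pgalloc_names[] = {
	"pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_high",
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(pgalloc_names) / sizeof(*pgalloc_names); i++)
		puts(pgalloc_names[i]);
	return 0;
}
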
509
510/*
511 * Output information about zones in @pgdat.
512 */
513static int zoneinfo_show(struct seq_file *m, void *arg)
514{
515 pg_data_t *pgdat = arg;
516 struct zone *zone;
517 struct zone *node_zones = pgdat->node_zones;
518 unsigned long flags;
519
520 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
521 int i;
522
523 if (!populated_zone(zone))
524 continue;
525
526 spin_lock_irqsave(&zone->lock, flags);
527 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
528 seq_printf(m,
529 "\n pages free %lu"
530 "\n min %lu"
531 "\n low %lu"
532 "\n high %lu"
533 "\n active %lu"
534 "\n inactive %lu"
535 "\n scanned %lu (a: %lu i: %lu)"
536 "\n spanned %lu"
537 "\n present %lu",
538 zone->free_pages,
539 zone->pages_min,
540 zone->pages_low,
541 zone->pages_high,
542 zone->nr_active,
543 zone->nr_inactive,
544 zone->pages_scanned,
545 zone->nr_scan_active, zone->nr_scan_inactive,
546 zone->spanned_pages,
547 zone->present_pages);
548
549 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
550 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
551 zone_page_state(zone, i));
552
553 seq_printf(m,
554 "\n protection: (%lu",
555 zone->lowmem_reserve[0]);
556 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
557 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
558 seq_printf(m,
559 ")"
560 "\n pagesets");
561 for_each_online_cpu(i) {
562 struct per_cpu_pageset *pageset;
563 int j;
564
565 pageset = zone_pcp(zone, i);
566 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
567 if (pageset->pcp[j].count)
568 break;
569 }
570 if (j == ARRAY_SIZE(pageset->pcp))
571 continue;
572 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
573 seq_printf(m,
574 "\n cpu: %i pcp: %i"
575 "\n count: %i"
576 "\n high: %i"
577 "\n batch: %i",
578 i, j,
579 pageset->pcp[j].count,
580 pageset->pcp[j].high,
581 pageset->pcp[j].batch);
582 }
583#ifdef CONFIG_SMP
584 seq_printf(m, "\n vm stats threshold: %d",
585 pageset->stat_threshold);
586#endif
587 }
588 seq_printf(m,
589 "\n all_unreclaimable: %u"
590 "\n prev_priority: %i"
591 "\n temp_priority: %i"
592 "\n start_pfn: %lu",
593 zone->all_unreclaimable,
594 zone->prev_priority,
595 zone->temp_priority,
596 zone->zone_start_pfn);
597 spin_unlock_irqrestore(&zone->lock, flags);
598 seq_putc(m, '\n');
599 }
600 return 0;
601}
602
603struct seq_operations zoneinfo_op = {
604 .start = frag_start, /* iterate over all zones; the same walk
605 * that fragmentation_op uses above */
606 .next = frag_next,
607 .stop = frag_stop,
608 .show = zoneinfo_show,
609};
610
611static void *vmstat_start(struct seq_file *m, loff_t *pos)
612{
613 unsigned long *v;
614#ifdef CONFIG_VM_EVENT_COUNTERS
615 unsigned long *e;
616#endif
617 int i;
618
619 if (*pos >= ARRAY_SIZE(vmstat_text))
620 return NULL;
621
622#ifdef CONFIG_VM_EVENT_COUNTERS
623 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
624 + sizeof(struct vm_event_state), GFP_KERNEL);
625#else
626 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
627 GFP_KERNEL);
628#endif
629 m->private = v;
630 if (!v)
631 return ERR_PTR(-ENOMEM);
632 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
633 v[i] = global_page_state(i);
634#ifdef CONFIG_VM_EVENT_COUNTERS
635 e = v + NR_VM_ZONE_STAT_ITEMS;
636 all_vm_events(e);
637 e[PGPGIN] /= 2; /* sectors -> kbytes */
638 e[PGPGOUT] /= 2;
639#endif
640 return v + *pos;
641}
642
643static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
644{
645 (*pos)++;
646 if (*pos >= ARRAY_SIZE(vmstat_text))
647 return NULL;
648 return (unsigned long *)m->private + *pos;
649}
650
651static int vmstat_show(struct seq_file *m, void *arg)
652{
653 unsigned long *l = arg;
654 unsigned long off = l - (unsigned long *)m->private;
655
656 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
657 return 0;
658}
659
660static void vmstat_stop(struct seq_file *m, void *arg)
661{
662 kfree(m->private);
663 m->private = NULL;
664}
665
666struct seq_operations vmstat_op = {
667 .start = vmstat_start,
668 .next = vmstat_next,
669 .stop = vmstat_stop,
670 .show = vmstat_show,
671};
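
vmstat_op is exported by the proc code as /proc/vmstat; because ->start snapshots all zone counters (and, when configured, the event counters) into one kmalloc'ed array, a single read of the file is internally consistent, and each line is simply a name from vmstat_text followed by its value. A minimal reader (the field names picked below are just examples from the array above):

#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char *wanted[] = { "nr_mapped", "nr_dirty", "pgfault" };
	char name[64];
	unsigned long value;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%63s %lu", name, &value) == 2)
		for (unsigned int i = 0; i < sizeof(wanted) / sizeof(*wanted); i++)
			if (!strcmp(name, wanted[i]))
				printf("%s = %lu\n", name, value);
	fclose(f);
	return 0;
}
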
672
673#endif /* CONFIG_PROC_FS */
674
675#ifdef CONFIG_SMP
676/*
677 * Use the cpu notifier to ensure that the thresholds are recalculated
678 * when necessary.
679 */
680static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
681 unsigned long action,
682 void *hcpu)
683{
684 switch (action) {
685 case CPU_UP_PREPARE:
686 case CPU_UP_CANCELED:
687 case CPU_DEAD:
688 refresh_zone_stat_thresholds();
689 break;
690 default:
691 break;
692 }
693 return NOTIFY_OK;
694}
695
696static struct notifier_block __cpuinitdata vmstat_notifier =
697 { &vmstat_cpuup_callback, NULL, 0 };
698
699int __init setup_vmstat(void)
700{
701 refresh_zone_stat_thresholds();
702 register_cpu_notifier(&vmstat_notifier);
703 return 0;
704}
705module_init(setup_vmstat)
706#endif