author    Ingo Molnar <mingo@elte.hu>  2008-07-26 11:48:49 -0400
committer Ingo Molnar <mingo@elte.hu>  2008-07-26 11:48:49 -0400
commit    c3cc99ff5d24e2eeaf7ec2032e720681916990e3 (patch)
tree      c3e74171bbbd2adde9d60b9db1c440415c8d2831 /mm
parent    38ffbe66d59051fd9cfcfc8545f164700e2fa3bc (diff)
parent    024e8ac04453b3525448c31ef39848cf675ba6db (diff)
Merge branch 'linus' into x86/xen
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               2
-rw-r--r--  mm/Makefile              2
-rw-r--r--  mm/bootmem.c           935
-rw-r--r--  mm/filemap.c           168
-rw-r--r--  mm/hugetlb.c          1612
-rw-r--r--  mm/internal.h           61
-rw-r--r--  mm/memcontrol.c        364
-rw-r--r--  mm/memory.c            243
-rw-r--r--  mm/memory_hotplug.c     80
-rw-r--r--  mm/mempolicy.c           9
-rw-r--r--  mm/migrate.c            24
-rw-r--r--  mm/mm_init.c           152
-rw-r--r--  mm/mmap.c               12
-rw-r--r--  mm/mprotect.c            6
-rw-r--r--  mm/page_alloc.c        152
-rw-r--r--  mm/pdflush.c             4
-rw-r--r--  mm/rmap.c               14
-rw-r--r--  mm/shmem.c              91
-rw-r--r--  mm/slob.c               12
-rw-r--r--  mm/slub.c               65
-rw-r--r--  mm/sparse.c            115
-rw-r--r--  mm/swap.c                8
-rw-r--r--  mm/swapfile.c           49
-rw-r--r--  mm/vmalloc.c            20
-rw-r--r--  mm/vmscan.c              5
-rw-r--r--  mm/vmstat.c              1
26 files changed, 2858 insertions, 1348 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..aa799007a11b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA
+	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful for
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..06ca2381fef1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   page_isolation.o mm_init.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..4af15d0340ad 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
 /*
- *  linux/mm/bootmem.c
+ *  bootmem - A boot-time physical memory allocator and configurator
  *
  *  Copyright (C) 1999 Ingo Molnar
- *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *                1999 Kanoj Sarcar, SGI
+ *                2008 Johannes Weiner
  *
- *  simple boot-time physical memory area allocator and
- *  free memory collector. It's used to deal with reserved
- *  system memory and memory holes as well.
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
  */
 #include <linux/init.h>
 #include <linux/pfn.h>
@@ -19,15 +19,10 @@
 
 #include "internal.h"
 
-/*
- * Access to this subsystem has to be serialized externally. (this is
- * true for the boot process anyway)
- */
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
  * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
 unsigned long saved_max_pfn;
 #endif
 
-/* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
+
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
+
+static int bootmem_debug;
+
+static int __init bootmem_debug_setup(char *buf)
 {
-	unsigned long mapsize;
+	bootmem_debug = 1;
+	return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
 
-	mapsize = (pages+7)/8;
-	mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
-	mapsize >>= PAGE_SHIFT;
+#define bdebug(fmt, args...) ({				\
+	if (unlikely(bootmem_debug))			\
+		printk(KERN_INFO			\
+			"bootmem::%s " fmt,		\
+			__FUNCTION__, ## args);		\
+})
 
-	return mapsize;
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+	unsigned long bytes = (pages + 7) / 8;
+
+	return ALIGN(bytes, sizeof(long));
 }
 
-/*
- * link bdata in order
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
  */
-static void __init link_bootmem(bootmem_data_t *bdata)
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
 {
-	bootmem_data_t *ent;
+	unsigned long bytes = bootmap_bytes(pages);
 
-	if (list_empty(&bdata_list)) {
-		list_add(&bdata->list, &bdata_list);
-		return;
-	}
-	/* insert in order */
-	list_for_each_entry(ent, &bdata_list, list) {
-		if (bdata->node_boot_start < ent->node_boot_start) {
-			list_add_tail(&bdata->list, &ent->list);
-			return;
-		}
-	}
-	list_add_tail(&bdata->list, &bdata_list);
+	return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
 }
 
 /*
- * Given an initialised bdata, it returns the size of the boot bitmap
+ * link bdata in order
  */
-static unsigned long __init get_mapsize(bootmem_data_t *bdata)
+static void __init link_bootmem(bootmem_data_t *bdata)
 {
-	unsigned long mapsize;
-	unsigned long start = PFN_DOWN(bdata->node_boot_start);
-	unsigned long end = bdata->node_low_pfn;
+	struct list_head *iter;
 
-	mapsize = ((end - start) + 7) / 8;
-	return ALIGN(mapsize, sizeof(long));
+	list_for_each(iter, &bdata_list) {
+		bootmem_data_t *ent;
+
+		ent = list_entry(iter, bootmem_data_t, list);
+		if (bdata->node_min_pfn < ent->node_min_pfn)
+			break;
+	}
+	list_add_tail(&bdata->list, iter);
 }
 
 /*
  * Called once to set up the allocator itself.
  */
-static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
 	unsigned long mapstart, unsigned long start, unsigned long end)
 {
-	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long mapsize;
 
+	mminit_validate_memmodel_limits(&start, &end);
 	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
-	bdata->node_boot_start = PFN_PHYS(start);
+	bdata->node_min_pfn = start;
 	bdata->node_low_pfn = end;
 	link_bootmem(bdata);
 
@@ -100,429 +104,461 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
121 144
122 /* out of range, don't hold other */ 145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
123 if (addr + size < bdata->node_boot_start || 146{
124 PFN_DOWN(addr) > bdata->node_low_pfn) 147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
150
151 if (!bdata->node_bootmem_map)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145 168
146 return 0; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
147 172
148} 173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
149 175
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 176 __free_pages_bootmem(pfn_to_page(start), order);
151 unsigned long addr, unsigned long size, int flags) 177 count += BITS_PER_LONG;
152{ 178 } else {
153 unsigned long sidx, eidx; 179 unsigned long off = 0;
154 unsigned long i;
155
156 BUG_ON(!size);
157 180
158 /* out of range */ 181 while (vec && off < BITS_PER_LONG) {
159 if (addr + size < bdata->node_boot_start || 182 if (vec & 1) {
160 PFN_DOWN(addr) > bdata->node_low_pfn) 183 page = pfn_to_page(start + off);
161 return; 184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
190 }
191 start += BITS_PER_LONG;
192 }
162 193
163 /* 194 page = virt_to_page(bdata->node_bootmem_map);
164 * Round up to index to the range. 195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
165 */ 196 pages = bootmem_bootmap_pages(pages);
166 if (addr > bdata->node_boot_start) 197 count += pages;
167 sidx= PFN_DOWN(addr - bdata->node_boot_start); 198 while (pages--)
168 else 199 __free_pages_bootmem(page++, 0);
169 sidx = 0;
170 200
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 202
175 for (i = sidx; i < eidx; i++) { 203 return count;
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
177#ifdef CONFIG_DEBUG_BOOTMEM
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
179#endif
180 }
181 }
182} 204}
183 205
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 206/**
185 unsigned long size) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
186{ 213{
187 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
188 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
189 216}
190 BUG_ON(!size);
191 217
192 /* out range */ 218/**
193 if (addr + size < bdata->node_boot_start || 219 * free_all_bootmem - release free pages to the buddy allocator
194 PFN_DOWN(addr) > bdata->node_low_pfn) 220 *
195 return; 221 * Returns the number of pages actually released.
196 /* 222 */
197 * round down end of usable mem, partially free pages are 223unsigned long __init free_all_bootmem(void)
198 * considered reserved. 224{
199 */ 225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
200 227
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 228static void __init __free(bootmem_data_t *bdata,
202 bdata->last_success = addr; 229 unsigned long sidx, unsigned long eidx)
230{
231 unsigned long idx;
203 232
204 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
205 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
206 */ 235 eidx + bdata->node_min_pfn);
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else
210 sidx = 0;
211 236
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
215 239
216 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
218 BUG(); 242 BUG();
219 }
220} 243}
221 244
222/* 245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
223 * We 'merge' subsequent allocations to save space. We might 'lose' 246 unsigned long eidx, int flags)
224 * some fraction of a page if allocations cannot be satisfied due to
225 * size constraints on boxes where there is physical RAM space
226 * fragmentation - in these cases (mostly large memory boxes) this
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 *
231 * alignment has to be a power of 2 value.
232 *
233 * NOTE: This function is _not_ reentrant.
234 */
235void * __init
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 247{
239 unsigned long areasize, preferred; 248 unsigned long idx;
240 unsigned long i, start = 0, incr, eidx, end_pfn; 249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
241 void *ret; 250
242 unsigned long node_boot_start; 251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
243 void *node_bootmem_map; 252 bdata - bootmem_node_data,
244 253 sidx + bdata->node_min_pfn,
245 if (!size) { 254 eidx + bdata->node_min_pfn,
246 printk("__alloc_bootmem_core(): zero-sized request\n"); 255 flags);
247 BUG(); 256
248 } 257 for (idx = sidx; idx < eidx; idx++)
249 BUG_ON(align & (align-1)); 258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
250 259 if (exclusive) {
251 /* on nodes without memory - bootmem_map is NULL */ 260 __free(bdata, sidx, idx);
252 if (!bdata->node_bootmem_map) 261 return -EBUSY;
253 return NULL; 262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
265 }
266 return 0;
267}
254 268
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
256 node_boot_start = bdata->node_boot_start; 270 unsigned long start, unsigned long end,
257 node_bootmem_map = bdata->node_bootmem_map; 271 int reserve, int flags)
258 if (align) { 272{
259 node_boot_start = ALIGN(bdata->node_boot_start, align); 273 unsigned long sidx, eidx;
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264 274
265 if (limit && node_boot_start >= limit) 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
266 return NULL; 276 bdata - bootmem_node_data, start, end, reserve, flags);
267 277
268 end_pfn = bdata->node_low_pfn; 278 BUG_ON(start < bdata->node_min_pfn);
269 limit = PFN_DOWN(limit); 279 BUG_ON(end > bdata->node_low_pfn);
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 280
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 281 sidx = start - bdata->node_min_pfn;
282 eidx = end - bdata->node_min_pfn;
274 283
275 /* 284 if (reserve)
276 * We try to allocate bootmem pages above 'goal' 285 return __reserve(bdata, sidx, eidx, flags);
277 * first, then we try to allocate lower pages. 286 else
278 */ 287 __free(bdata, sidx, eidx);
279 preferred = 0; 288 return 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) { 289}
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 290
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 292 int reserve, int flags)
292 incr = align >> PAGE_SHIFT ? : 1; 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
293 296
294restart_scan: 297 pos = start;
295 for (i = preferred; i < eidx;) { 298 list_for_each_entry(bdata, &bdata_list, list) {
296 unsigned long j; 299 int err;
300 unsigned long max;
297 301
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 302 if (pos < bdata->node_min_pfn ||
299 i = ALIGN(i, incr); 303 pos >= bdata->node_low_pfn) {
300 if (i >= eidx) 304 BUG_ON(pos != start);
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue; 305 continue;
305 } 306 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 307
320 if (preferred > 0) { 308 max = min(bdata->node_low_pfn, end);
321 preferred = 0;
322 goto restart_scan;
323 }
324 return NULL;
325 309
326found: 310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 311 if (reserve && err) {
328 BUG_ON(start >= eidx); 312 mark_bootmem(start, pos, 0, 0);
329 313 return err;
330 /*
331 * Is the next page of the previous allocation-end the start
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 } 314 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 315
362 /* 316 if (max == end)
363 * Reserve the area now: 317 return 0;
364 */ 318 pos = bdata->node_low_pfn;
365 for (i = start; i < start + areasize; i++) 319 }
366 if (unlikely(test_and_set_bit(i, node_bootmem_map))) 320 BUG();
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 321}
371 322
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 323/**
324 * free_bootmem_node - mark a page range as usable
325 * @pgdat: node the range resides on
326 * @physaddr: starting address of the range
327 * @size: size of the range in bytes
328 *
329 * Partial pages will be considered reserved and left as they are.
330 *
331 * The range must reside completely on the specified node.
332 */
333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
334 unsigned long size)
373{ 335{
374 struct page *page; 336 unsigned long start, end;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 337
422 /* 338 start = PFN_UP(physaddr);
423 * Now free the allocator bitmap itself, it's not 339 end = PFN_DOWN(physaddr + size);
424 * needed anymore:
425 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 340
436 return total; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
437} 342}
438 343
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 344/**
440 unsigned long startpfn, unsigned long endpfn) 345 * free_bootmem - mark a page range as usable
441{ 346 * @addr: starting address of the range
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 347 * @size: size of the range in bytes
443} 348 *
444 349 * Partial pages will be considered reserved and left as they are.
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 350 *
446 unsigned long size, int flags) 351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
447{ 354{
448 int ret; 355 unsigned long start, end;
449 356
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 357 start = PFN_UP(addr);
451 if (ret < 0) 358 end = PFN_DOWN(addr + size);
452 return -ENOMEM;
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 359
455 return 0; 360 mark_bootmem(start, end, 0, 0);
456} 361}
457 362
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 363/**
459 unsigned long size) 364 * reserve_bootmem_node - mark a page range as reserved
365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
460{ 376{
461 free_bootmem_core(pgdat->bdata, physaddr, size); 377 unsigned long start, end;
462}
463 378
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 379 start = PFN_DOWN(physaddr);
465{ 380 end = PFN_UP(physaddr + size);
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 381
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
471{
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475} 383}
476 384
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/**
387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 396int __init reserve_bootmem(unsigned long addr, unsigned long size,
479 int flags) 397 int flags)
480{ 398{
481 bootmem_data_t *bdata; 399 unsigned long start, end;
482 int ret;
483 400
484 list_for_each_entry(bdata, &bdata_list, list) { 401 start = PFN_DOWN(addr);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 402 end = PFN_UP(addr + size);
486 if (ret < 0)
487 return ret;
488 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 403
492 return 0; 404 return mark_bootmem(start, end, 1, flags);
493} 405}
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
495 407
496void __init free_bootmem(unsigned long addr, unsigned long size) 408static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
409 unsigned long size, unsigned long align,
410 unsigned long goal, unsigned long limit)
497{ 411{
498 bootmem_data_t *bdata; 412 unsigned long fallback = 0;
499 list_for_each_entry(bdata, &bdata_list, list) 413 unsigned long min, max, start, sidx, midx, step;
500 free_bootmem_core(bdata, addr, size);
501}
502 414
503unsigned long __init free_all_bootmem(void) 415 BUG_ON(!size);
504{ 416 BUG_ON(align & (align - 1));
505 return free_all_bootmem_core(NODE_DATA(0)); 417 BUG_ON(limit && goal + size > limit);
418
419 if (!bdata->node_bootmem_map)
420 return NULL;
421
422 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
423 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
424 align, goal, limit);
425
426 min = bdata->node_min_pfn;
427 max = bdata->node_low_pfn;
428
429 goal >>= PAGE_SHIFT;
430 limit >>= PAGE_SHIFT;
431
432 if (limit && max > limit)
433 max = limit;
434 if (max <= min)
435 return NULL;
436
437 step = max(align >> PAGE_SHIFT, 1UL);
438
439 if (goal && min < goal && goal < max)
440 start = ALIGN(goal, step);
441 else
442 start = ALIGN(min, step);
443
444 sidx = start - bdata->node_min_pfn;;
445 midx = max - bdata->node_min_pfn;
446
447 if (bdata->hint_idx > sidx) {
448 /*
449 * Handle the valid case of sidx being zero and still
450 * catch the fallback below.
451 */
452 fallback = sidx + 1;
453 sidx = ALIGN(bdata->hint_idx, step);
454 }
455
456 while (1) {
457 int merge;
458 void *region;
459 unsigned long eidx, i, start_off, end_off;
460find_block:
461 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
462 sidx = ALIGN(sidx, step);
463 eidx = sidx + PFN_UP(size);
464
465 if (sidx >= midx || eidx > midx)
466 break;
467
468 for (i = sidx; i < eidx; i++)
469 if (test_bit(i, bdata->node_bootmem_map)) {
470 sidx = ALIGN(i, step);
471 if (sidx == i)
472 sidx += step;
473 goto find_block;
474 }
475
476 if (bdata->last_end_off &&
477 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
478 start_off = ALIGN(bdata->last_end_off, align);
479 else
480 start_off = PFN_PHYS(sidx);
481
482 merge = PFN_DOWN(start_off) < sidx;
483 end_off = start_off + size;
484
485 bdata->last_end_off = end_off;
486 bdata->hint_idx = PFN_UP(end_off);
487
488 /*
489 * Reserve the area now:
490 */
491 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
492 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
493 BUG();
494
495 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
496 start_off);
497 memset(region, 0, size);
498 return region;
499 }
500
501 if (fallback) {
502 sidx = ALIGN(fallback - 1, step);
503 fallback = 0;
504 goto find_block;
505 }
506
507 return NULL;
506} 508}
507 509
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 510static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 511 unsigned long align,
512 unsigned long goal,
513 unsigned long limit)
510{ 514{
511 bootmem_data_t *bdata; 515 bootmem_data_t *bdata;
512 void *ptr;
513 516
517restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 518 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 519 void *region;
516 if (ptr) 520
517 return ptr; 521 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
522 continue;
523 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
524 break;
525
526 region = alloc_bootmem_core(bdata, size, align, goal, limit);
527 if (region)
528 return region;
529 }
530
531 if (goal) {
532 goal = 0;
533 goto restart;
518 } 534 }
535
519 return NULL; 536 return NULL;
520} 537}
521 538
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 539/**
523 unsigned long goal) 540 * __alloc_bootmem_nopanic - allocate boot memory without panicking
541 * @size: size of the request in bytes
542 * @align: alignment of the region
543 * @goal: preferred starting address of the region
544 *
545 * The goal is dropped if it can not be satisfied and the allocation will
546 * fall back to memory below @goal.
547 *
548 * Allocation may happen on any node in the system.
549 *
550 * Returns NULL on failure.
551 */
552void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
553 unsigned long goal)
524{ 554{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 555 return ___alloc_bootmem_nopanic(size, align, goal, 0);
556}
557
558static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
559 unsigned long goal, unsigned long limit)
560{
561 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 562
527 if (mem) 563 if (mem)
528 return mem; 564 return mem;
@@ -534,78 +570,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 570 return NULL;
535} 571}
536 572
573/**
574 * __alloc_bootmem - allocate boot memory
575 * @size: size of the request in bytes
576 * @align: alignment of the region
577 * @goal: preferred starting address of the region
578 *
579 * The goal is dropped if it can not be satisfied and the allocation will
580 * fall back to memory below @goal.
581 *
582 * Allocation may happen on any node in the system.
583 *
584 * The function panics if the request can not be satisfied.
585 */
586void * __init __alloc_bootmem(unsigned long size, unsigned long align,
587 unsigned long goal)
588{
589 return ___alloc_bootmem(size, align, goal, 0);
590}
537 591
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 592static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 593 unsigned long size, unsigned long align,
594 unsigned long goal, unsigned long limit)
540{ 595{
541 void *ptr; 596 void *ptr;
542 597
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 598 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 599 if (ptr)
545 return ptr; 600 return ptr;
546 601
547 return __alloc_bootmem(size, align, goal); 602 return ___alloc_bootmem(size, align, goal, limit);
603}
604
605/**
606 * __alloc_bootmem_node - allocate boot memory from a specific node
607 * @pgdat: node to allocate from
608 * @size: size of the request in bytes
609 * @align: alignment of the region
610 * @goal: preferred starting address of the region
611 *
612 * The goal is dropped if it can not be satisfied and the allocation will
613 * fall back to memory below @goal.
614 *
615 * Allocation may fall back to any node in the system if the specified node
616 * can not hold the requested memory.
617 *
618 * The function panics if the request can not be satisfied.
619 */
620void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
621 unsigned long align, unsigned long goal)
622{
623 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 624}
549 625
550#ifdef CONFIG_SPARSEMEM 626#ifdef CONFIG_SPARSEMEM
627/**
628 * alloc_bootmem_section - allocate boot memory from a specific section
629 * @size: size of the request in bytes
630 * @section_nr: sparse map section to allocate from
631 *
632 * Return NULL on failure.
633 */
551void * __init alloc_bootmem_section(unsigned long size, 634void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 635 unsigned long section_nr)
553{ 636{
554 void *ptr; 637 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 638 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 639
558 pfn = section_nr_to_pfn(section_nr); 640 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 641 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 642 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 643 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 644
565 if (!ptr) 645 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 646}
647#endif
567 648
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 649void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 650 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 651{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 652 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 653
577 return ptr; 654 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
655 if (ptr)
656 return ptr;
657
658 return __alloc_bootmem_nopanic(size, align, goal);
578} 659}
579#endif
580 660
581#ifndef ARCH_LOW_ADDRESS_LIMIT 661#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 662#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 663#endif
584 664
665/**
666 * __alloc_bootmem_low - allocate low boot memory
667 * @size: size of the request in bytes
668 * @align: alignment of the region
669 * @goal: preferred starting address of the region
670 *
671 * The goal is dropped if it can not be satisfied and the allocation will
672 * fall back to memory below @goal.
673 *
674 * Allocation may happen on any node in the system.
675 *
676 * The function panics if the request can not be satisfied.
677 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 678void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 679 unsigned long goal)
587{ 680{
588 bootmem_data_t *bdata; 681 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 682}
605 683
684/**
685 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
686 * @pgdat: node to allocate from
687 * @size: size of the request in bytes
688 * @align: alignment of the region
689 * @goal: preferred starting address of the region
690 *
691 * The goal is dropped if it can not be satisfied and the allocation will
692 * fall back to memory below @goal.
693 *
694 * Allocation may fall back to any node in the system if the specified node
695 * can not hold the requested memory.
696 *
697 * The function panics if the request can not be satisfied.
698 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 699void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 700 unsigned long align, unsigned long goal)
608{ 701{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 702 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 703 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 704}
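
The kernel-doc comments added above document the external bootmem entry points: init_bootmem() registers the bitmap, free_bootmem() marks usable RAM, reserve_bootmem() marks ranges that must not be handed out, __alloc_bootmem() serves early allocations, and free_all_bootmem() later releases everything to the buddy allocator. The following is a minimal sketch of how a flat-memory architecture's setup code typically drives this interface; the PFN constants and the helper name example_setup_bootmem() are illustrative only and are not taken from this patch.

/*
 * Illustrative sketch only -- not part of this commit. Assumes a single
 * node and made-up memory layout constants.
 */
#include <linux/bootmem.h>
#include <linux/pfn.h>

#define EXAMPLE_RAM_START_PFN	0UL		/* hypothetical */
#define EXAMPLE_RAM_END_PFN	0x8000UL	/* hypothetical: 128 MB of 4k pages */

static unsigned long __init example_setup_bootmem(unsigned long bitmap_pfn)
{
	unsigned long bitmap_size;

	/* Register node 0: bitmap lives at bitmap_pfn, covers pages 0..END. */
	bitmap_size = init_bootmem(bitmap_pfn, EXAMPLE_RAM_END_PFN);

	/* All pages start out reserved; hand the usable RAM range back. */
	free_bootmem(PFN_PHYS(EXAMPLE_RAM_START_PFN),
		     PFN_PHYS(EXAMPLE_RAM_END_PFN - EXAMPLE_RAM_START_PFN));

	/*
	 * Re-reserve the bitmap itself (real setup code would also reserve
	 * the kernel image, initrd, firmware tables, and so on).
	 */
	reserve_bootmem(PFN_PHYS(bitmap_pfn), bitmap_size, BOOTMEM_DEFAULT);

	return bitmap_size;
}

Early boot allocations then go through __alloc_bootmem(size, align, goal), and mem_init() eventually calls free_all_bootmem() to release the remaining pages (and the bitmap) to the page allocator, as shown in free_all_bootmem_core() above.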
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..2d3ec1ffc66e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
 
 #include <asm/mman.h>
 
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs);
 
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -118,7 +115,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -477,12 +474,12 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 		} else
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_uncharge_cache_page(page);
 
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_uncharge_cache_page(page);
 out:
 	return error;
 }
@@ -1200,42 +1197,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 
 		mapping = filp->f_mapping;
 		inode = mapping->host;
-		retval = 0;
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
 		if (pos < size) {
-			retval = generic_file_direct_IO(READ, iocb,
-						iov, pos, nr_segs);
+			retval = filemap_write_and_wait(mapping);
+			if (!retval) {
+				retval = mapping->a_ops->direct_IO(READ, iocb,
+							iov, pos, nr_segs);
+			}
 			if (retval > 0)
 				*ppos = pos + retval;
-		}
-		if (likely(retval != 0)) {
-			file_accessed(filp);
-			goto out;
+			if (retval) {
+				file_accessed(filp);
+				goto out;
+			}
 		}
 	}
 
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
+	for (seg = 0; seg < nr_segs; seg++) {
+		read_descriptor_t desc;
 
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp,ppos,&desc,file_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-			if (desc.count > 0)
-				break;
+		desc.written = 0;
+		desc.arg.buf = iov[seg].iov_base;
+		desc.count = iov[seg].iov_len;
+		if (desc.count == 0)
+			continue;
+		desc.error = 0;
+		do_generic_file_read(filp, ppos, &desc, file_read_actor);
+		retval += desc.written;
+		if (desc.error) {
+			retval = retval ?: desc.error;
+			break;
 		}
+		if (desc.count > 0)
+			break;
 	}
 out:
 	return retval;
@@ -2004,11 +2000,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2000 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2001 struct inode *inode = mapping->host;
2006 ssize_t written; 2002 ssize_t written;
2003 size_t write_len;
2004 pgoff_t end;
2007 2005
2008 if (count != ocount) 2006 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2007 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2008
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2009 /*
2010 * Unmap all mmappings of the file up-front.
2011 *
2012 * This will cause any pte dirty bits to be propagated into the
2013 * pageframes for the subsequent filemap_write_and_wait().
2014 */
2015 write_len = iov_length(iov, *nr_segs);
2016 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2017 if (mapping_mapped(mapping))
2018 unmap_mapping_range(mapping, pos, write_len, 0);
2019
2020 written = filemap_write_and_wait(mapping);
2021 if (written)
2022 goto out;
2023
2024 /*
2025 * After a write we want buffered reads to be sure to go to disk to get
2026 * the new data. We invalidate clean cached page from the region we're
2027 * about to write. We do this *before* the write so that we can return
2028 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2029 */
2030 if (mapping->nrpages) {
2031 written = invalidate_inode_pages2_range(mapping,
2032 pos >> PAGE_CACHE_SHIFT, end);
2033 if (written)
2034 goto out;
2035 }
2036
2037 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2038
2039 /*
2040 * Finally, try again to invalidate clean pages which might have been
2041 * cached by non-direct readahead, or faulted in by get_user_pages()
2042 * if the source of the write was an mmap'ed region of the file
2043 * we're writing. Either one is a pretty crazy thing to do,
2044 * so we don't support it 100%. If this invalidation
2045 * fails, tough, the write still worked...
2046 */
2047 if (mapping->nrpages) {
2048 invalidate_inode_pages2_range(mapping,
2049 pos >> PAGE_CACHE_SHIFT, end);
2050 }
2051
2012 if (written > 0) { 2052 if (written > 0) {
2013 loff_t end = pos + written; 2053 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2054 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2064,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2064 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2065 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2066 */
2067out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2068 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2069 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2070 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2511,66 +2552,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2552}
2512EXPORT_SYMBOL(generic_file_aio_write); 2553EXPORT_SYMBOL(generic_file_aio_write);
2513 2554
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2555/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2556 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2557 *
@@ -2582,9 +2563,8 @@ out:
  * Otherwise return zero.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
  *
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
 {
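
With generic_file_direct_IO() removed above, generic_file_direct_write() now open-codes the pagecache coordination around ->direct_IO(). A condensed sketch of that ordering follows, assuming a regular file whose address_space provides ->direct_IO; the helper name example_dio_write_order() is illustrative and does not appear in the patch.

/* Illustrative sketch only -- a restatement of the ordering in the hunk above. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static ssize_t example_dio_write_order(struct kiocb *iocb, const struct iovec *iov,
				       loff_t pos, unsigned long nr_segs, size_t len)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	pgoff_t end = (pos + len - 1) >> PAGE_CACHE_SHIFT;
	ssize_t ret;

	/* Propagate pte dirty bits into the pageframes first. */
	if (mapping_mapped(mapping))
		unmap_mapping_range(mapping, pos, len, 0);

	/* Write back and wait for dirty pagecache in the range. */
	ret = filemap_write_and_wait(mapping);
	if (ret)
		return ret;

	/* Drop clean cached pages so later buffered reads go to disk. */
	if (mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				pos >> PAGE_CACHE_SHIFT, end);
		if (ret)
			return ret;
	}

	/* The actual direct I/O. */
	ret = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, nr_segs);

	/* Invalidate again: pages may have been re-instantiated meanwhile. */
	if (mapping->nrpages)
		invalidate_inode_pages2_range(mapping,
				pos >> PAGE_CACHE_SHIFT, end);

	return ret;
}

The invalidation is done both before the write (so -EIO can be returned without clobbering -EIOCBQUEUED) and after it (to catch pages faulted back in by readahead or get_user_pages() during the transfer), exactly as the comments in the hunk above explain.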
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..a8bf4ab01f86 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,8 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/bootmem.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -22,30 +24,340 @@
22#include "internal.h" 24#include "internal.h"
23 25
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 26const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 27static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 28unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 29
30static int max_hstate;
31unsigned int default_hstate_idx;
32struct hstate hstates[HUGE_MAX_HSTATE];
33
34__initdata LIST_HEAD(huge_boot_pages);
35
36/* for command line parsing */
37static struct hstate * __initdata parsed_hstate;
38static unsigned long __initdata default_hstate_max_huge_pages;
39static unsigned long __initdata default_hstate_size;
40
41#define for_each_hstate(h) \
42 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 43
38/* 44/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 45 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 46 */
41static DEFINE_SPINLOCK(hugetlb_lock); 47static DEFINE_SPINLOCK(hugetlb_lock);
42 48
43static void clear_huge_page(struct page *page, unsigned long addr) 49/*
50 * Region tracking -- allows tracking of reservations and instantiated pages
51 * across the pages in a mapping.
52 *
53 * The region data structures are protected by a combination of the mmap_sem
54 * and the hugetlb_instantion_mutex. To access or modify a region the caller
55 * must either hold the mmap_sem for write, or the mmap_sem for read and
56 * the hugetlb_instantiation mutex:
57 *
58 * down_write(&mm->mmap_sem);
59 * or
60 * down_read(&mm->mmap_sem);
61 * mutex_lock(&hugetlb_instantiation_mutex);
62 */
63struct file_region {
64 struct list_head link;
65 long from;
66 long to;
67};
68
69static long region_add(struct list_head *head, long f, long t)
70{
71 struct file_region *rg, *nrg, *trg;
72
73 /* Locate the region we are either in or before. */
74 list_for_each_entry(rg, head, link)
75 if (f <= rg->to)
76 break;
77
78 /* Round our left edge to the current segment if it encloses us. */
79 if (f > rg->from)
80 f = rg->from;
81
82 /* Check for and consume any regions we now overlap with. */
83 nrg = rg;
84 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
85 if (&rg->link == head)
86 break;
87 if (rg->from > t)
88 break;
89
90 /* If this area reaches higher then extend our area to
91 * include it completely. If this is not the first area
92 * which we intend to reuse, free it. */
93 if (rg->to > t)
94 t = rg->to;
95 if (rg != nrg) {
96 list_del(&rg->link);
97 kfree(rg);
98 }
99 }
100 nrg->from = f;
101 nrg->to = t;
102 return 0;
103}
104
105static long region_chg(struct list_head *head, long f, long t)
106{
107 struct file_region *rg, *nrg;
108 long chg = 0;
109
110 /* Locate the region we are before or in. */
111 list_for_each_entry(rg, head, link)
112 if (f <= rg->to)
113 break;
114
115 /* If we are below the current region then a new region is required.
116 * Subtle, allocate a new region at the position but make it zero
117 * size such that we can guarantee to record the reservation. */
118 if (&rg->link == head || t < rg->from) {
119 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
120 if (!nrg)
121 return -ENOMEM;
122 nrg->from = f;
123 nrg->to = f;
124 INIT_LIST_HEAD(&nrg->link);
125 list_add(&nrg->link, rg->link.prev);
126
127 return t - f;
128 }
129
130 /* Round our left edge to the current segment if it encloses us. */
131 if (f > rg->from)
132 f = rg->from;
133 chg = t - f;
134
135 /* Check for and consume any regions we now overlap with. */
136 list_for_each_entry(rg, rg->link.prev, link) {
137 if (&rg->link == head)
138 break;
139 if (rg->from > t)
140 return chg;
141
142 /* We overlap with this area, if it extends futher than
143 * us then we must extend ourselves. Account for its
144 * existing reservation. */
145 if (rg->to > t) {
146 chg += rg->to - t;
147 t = rg->to;
148 }
149 chg -= rg->to - rg->from;
150 }
151 return chg;
152}
153
154static long region_truncate(struct list_head *head, long end)
155{
156 struct file_region *rg, *trg;
157 long chg = 0;
158
159 /* Locate the region we are either in or before. */
160 list_for_each_entry(rg, head, link)
161 if (end <= rg->to)
162 break;
163 if (&rg->link == head)
164 return 0;
165
166 /* If we are in the middle of a region then adjust it. */
167 if (end > rg->from) {
168 chg = rg->to - end;
169 rg->to = end;
170 rg = list_entry(rg->link.next, typeof(*rg), link);
171 }
172
173 /* Drop any remaining regions. */
174 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
175 if (&rg->link == head)
176 break;
177 chg += rg->to - rg->from;
178 list_del(&rg->link);
179 kfree(rg);
180 }
181 return chg;
182}
183
184static long region_count(struct list_head *head, long f, long t)
185{
186 struct file_region *rg;
187 long chg = 0;
188
189 /* Locate each segment we overlap with, and count that overlap. */
190 list_for_each_entry(rg, head, link) {
191 int seg_from;
192 int seg_to;
193
194 if (rg->to <= f)
195 continue;
196 if (rg->from >= t)
197 break;
198
199 seg_from = max(rg->from, f);
200 seg_to = min(rg->to, t);
201
202 chg += seg_to - seg_from;
203 }
204
205 return chg;
206}
207
208/*
209 * Convert the address within this vma to the page offset within
210 * the mapping, in pagecache page units; huge pages here.
211 */
212static pgoff_t vma_hugecache_offset(struct hstate *h,
213 struct vm_area_struct *vma, unsigned long address)
214{
215 return ((address - vma->vm_start) >> huge_page_shift(h)) +
216 (vma->vm_pgoff >> huge_page_order(h));
217}
218
219/*
220 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
221 * bits of the reservation map pointer, which are always clear due to
222 * alignment.
223 */
224#define HPAGE_RESV_OWNER (1UL << 0)
225#define HPAGE_RESV_UNMAPPED (1UL << 1)
226#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
227
228/*
229 * These helpers are used to track how many pages are reserved for
230 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
231 * is guaranteed to have their future faults succeed.
232 *
233 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
234 * the reserve counters are updated with the hugetlb_lock held. It is safe
235 * to reset the VMA at fork() time as it is not in use yet and there is no
236 * chance of the global counters getting corrupted as a result of the values.
237 *
238 * The private mapping reservation is represented in a subtly different
239 * manner to a shared mapping. A shared mapping has a region map associated
240 * with the underlying file, this region map represents the backing file
241 * pages which have ever had a reservation assigned which this persists even
242 * after the page is instantiated. A private mapping has a region map
243 * associated with the original mmap which is attached to all VMAs which
244 * reference it, this region map represents those offsets which have consumed
245 * reservation ie. where pages have been instantiated.
246 */
247static unsigned long get_vma_private_data(struct vm_area_struct *vma)
248{
249 return (unsigned long)vma->vm_private_data;
250}
251
252static void set_vma_private_data(struct vm_area_struct *vma,
253 unsigned long value)
254{
255 vma->vm_private_data = (void *)value;
256}
257
258struct resv_map {
259 struct kref refs;
260 struct list_head regions;
261};
262
263struct resv_map *resv_map_alloc(void)
264{
265 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
266 if (!resv_map)
267 return NULL;
268
269 kref_init(&resv_map->refs);
270 INIT_LIST_HEAD(&resv_map->regions);
271
272 return resv_map;
273}
274
275void resv_map_release(struct kref *ref)
276{
277 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
278
279 /* Clear out any active regions before we release the map. */
280 region_truncate(&resv_map->regions, 0);
281 kfree(resv_map);
282}
283
284static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
285{
286 VM_BUG_ON(!is_vm_hugetlb_page(vma));
287 if (!(vma->vm_flags & VM_SHARED))
288 return (struct resv_map *)(get_vma_private_data(vma) &
289 ~HPAGE_RESV_MASK);
290 return 0;
291}
292
293static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
294{
295 VM_BUG_ON(!is_vm_hugetlb_page(vma));
296 VM_BUG_ON(vma->vm_flags & VM_SHARED);
297
298 set_vma_private_data(vma, (get_vma_private_data(vma) &
299 HPAGE_RESV_MASK) | (unsigned long)map);
300}
301
302static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
303{
304 VM_BUG_ON(!is_vm_hugetlb_page(vma));
305 VM_BUG_ON(vma->vm_flags & VM_SHARED);
306
307 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
308}
309
310static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
311{
312 VM_BUG_ON(!is_vm_hugetlb_page(vma));
313
314 return (get_vma_private_data(vma) & flag) != 0;
315}
316
317/* Decrement the reserved pages in the hugepage pool by one */
318static void decrement_hugepage_resv_vma(struct hstate *h,
319 struct vm_area_struct *vma)
320{
321 if (vma->vm_flags & VM_NORESERVE)
322 return;
323
324 if (vma->vm_flags & VM_SHARED) {
325 /* Shared mappings always use reserves */
326 h->resv_huge_pages--;
327 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
328 /*
329 * Only the process that called mmap() has reserves for
330 * private mappings.
331 */
332 h->resv_huge_pages--;
333 }
334}
335
336/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
337void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
338{
339 VM_BUG_ON(!is_vm_hugetlb_page(vma));
340 if (!(vma->vm_flags & VM_SHARED))
341 vma->vm_private_data = (void *)0;
342}
343
344/* Returns true if the VMA has associated reserve pages */
345static int vma_has_reserves(struct vm_area_struct *vma)
346{
347 if (vma->vm_flags & VM_SHARED)
348 return 1;
349 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
350 return 1;
351 return 0;
352}
353
354static void clear_huge_page(struct page *page,
355 unsigned long addr, unsigned long sz)
44{ 356{
45 int i; 357 int i;
46 358
47 might_sleep(); 359 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 360 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 361 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 362 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 363 }
@@ -55,42 +367,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 367 unsigned long addr, struct vm_area_struct *vma)
56{ 368{
57 int i; 369 int i;
370 struct hstate *h = hstate_vma(vma);
58 371
59 might_sleep(); 372 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 373 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 374 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 375 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 376 }
64} 377}
65 378
66static void enqueue_huge_page(struct page *page) 379static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 380{
68 int nid = page_to_nid(page); 381 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 382 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 383 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 384 h->free_huge_pages_node[nid]++;
72} 385}
73 386
74static struct page *dequeue_huge_page(void) 387static struct page *dequeue_huge_page(struct hstate *h)
75{ 388{
76 int nid; 389 int nid;
77 struct page *page = NULL; 390 struct page *page = NULL;
78 391
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 392 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 393 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 394 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 395 struct page, lru);
83 list_del(&page->lru); 396 list_del(&page->lru);
84 free_huge_pages--; 397 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 398 h->free_huge_pages_node[nid]--;
86 break; 399 break;
87 } 400 }
88 } 401 }
89 return page; 402 return page;
90} 403}
91 404
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 405static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 406 struct vm_area_struct *vma,
407 unsigned long address, int avoid_reserve)
94{ 408{
95 int nid; 409 int nid;
96 struct page *page = NULL; 410 struct page *page = NULL;
@@ -101,18 +415,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 415 struct zone *zone;
102 struct zoneref *z; 416 struct zoneref *z;
103 417
418 /*
 419 * A child process with MAP_PRIVATE mappings created by its parent
 420 * has no page reserves. This check ensures that reservations are
421 * not "stolen". The child may still get SIGKILLed
422 */
423 if (!vma_has_reserves(vma) &&
424 h->free_huge_pages - h->resv_huge_pages == 0)
425 return NULL;
426
427 /* If reserves cannot be used, ensure enough pages are in the pool */
428 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
429 return NULL;
430
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 431 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 432 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 433 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 434 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 435 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 436 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 437 struct page, lru);
111 list_del(&page->lru); 438 list_del(&page->lru);
112 free_huge_pages--; 439 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 440 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 441
115 resv_huge_pages--; 442 if (!avoid_reserve)
443 decrement_hugepage_resv_vma(h, vma);
444
116 break; 445 break;
117 } 446 }
118 } 447 }
@@ -120,12 +449,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 449 return page;
121} 450}
122 451
123static void update_and_free_page(struct page *page) 452static void update_and_free_page(struct hstate *h, struct page *page)
124{ 453{
125 int i; 454 int i;
126 nr_huge_pages--; 455
127 nr_huge_pages_node[page_to_nid(page)]--; 456 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 457 h->nr_huge_pages_node[page_to_nid(page)]--;
458 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 459 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 460 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 461 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +463,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 463 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 464 set_page_refcounted(page);
135 arch_release_hugepage(page); 465 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 466 __free_pages(page, huge_page_order(h));
467}
468
469struct hstate *size_to_hstate(unsigned long size)
470{
471 struct hstate *h;
472
473 for_each_hstate(h) {
474 if (huge_page_size(h) == size)
475 return h;
476 }
477 return NULL;
137} 478}
138 479
139static void free_huge_page(struct page *page) 480static void free_huge_page(struct page *page)
140{ 481{
482 /*
483 * Can't pass hstate in here because it is called from the
484 * compound page destructor.
485 */
486 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 487 int nid = page_to_nid(page);
142 struct address_space *mapping; 488 struct address_space *mapping;
143 489
@@ -147,12 +493,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 493 INIT_LIST_HEAD(&page->lru);
148 494
149 spin_lock(&hugetlb_lock); 495 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 496 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 497 update_and_free_page(h, page);
152 surplus_huge_pages--; 498 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 499 h->surplus_huge_pages_node[nid]--;
154 } else { 500 } else {
155 enqueue_huge_page(page); 501 enqueue_huge_page(h, page);
156 } 502 }
157 spin_unlock(&hugetlb_lock); 503 spin_unlock(&hugetlb_lock);
158 if (mapping) 504 if (mapping)
@@ -164,7 +510,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 510 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 511 * Returns 1 if an adjustment was made.
166 */ 512 */
167static int adjust_pool_surplus(int delta) 513static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 514{
169 static int prev_nid; 515 static int prev_nid;
170 int nid = prev_nid; 516 int nid = prev_nid;
@@ -177,15 +523,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 523 nid = first_node(node_online_map);
178 524
179 /* To shrink on this node, there must be a surplus page */ 525 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 526 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 527 continue;
182 /* Surplus cannot exceed the total number of pages */ 528 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 529 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 530 h->nr_huge_pages_node[nid])
185 continue; 531 continue;
186 532
187 surplus_huge_pages += delta; 533 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 534 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 535 ret = 1;
190 break; 536 break;
191 } while (nid != prev_nid); 537 } while (nid != prev_nid);
@@ -194,59 +540,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 540 return ret;
195} 541}
196 542
197static struct page *alloc_fresh_huge_page_node(int nid) 543static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
544{
545 set_compound_page_dtor(page, free_huge_page);
546 spin_lock(&hugetlb_lock);
547 h->nr_huge_pages++;
548 h->nr_huge_pages_node[nid]++;
549 spin_unlock(&hugetlb_lock);
550 put_page(page); /* free it into the hugepage allocator */
551}
552
553static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 554{
199 struct page *page; 555 struct page *page;
200 556
557 if (h->order >= MAX_ORDER)
558 return NULL;
559
201 page = alloc_pages_node(nid, 560 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 561 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 562 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 563 huge_page_order(h));
205 if (page) { 564 if (page) {
206 if (arch_prepare_hugepage(page)) { 565 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 566 __free_pages(page, HUGETLB_PAGE_ORDER);
208 return NULL; 567 return NULL;
209 } 568 }
210 set_compound_page_dtor(page, free_huge_page); 569 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 570 }
217 571
218 return page; 572 return page;
219} 573}
220 574
221static int alloc_fresh_huge_page(void) 575/*
576 * Use a helper variable to find the next node and then
577 * copy it back to hugetlb_next_nid afterwards:
578 * otherwise there's a window in which a racer might
579 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
580 * But we don't need to use a spin_lock here: it really
581 * doesn't matter if occasionally a racer chooses the
582 * same nid as we do. Move nid forward in the mask even
583 * if we just successfully allocated a hugepage so that
584 * the next caller gets hugepages on the next node.
585 */
586static int hstate_next_node(struct hstate *h)
587{
588 int next_nid;
589 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
590 if (next_nid == MAX_NUMNODES)
591 next_nid = first_node(node_online_map);
592 h->hugetlb_next_nid = next_nid;
593 return next_nid;
594}
595
596static int alloc_fresh_huge_page(struct hstate *h)
222{ 597{
223 struct page *page; 598 struct page *page;
224 int start_nid; 599 int start_nid;
225 int next_nid; 600 int next_nid;
226 int ret = 0; 601 int ret = 0;
227 602
228 start_nid = hugetlb_next_nid; 603 start_nid = h->hugetlb_next_nid;
229 604
230 do { 605 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 606 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 607 if (page)
233 ret = 1; 608 ret = 1;
234 /* 609 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 610 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 611
251 if (ret) 612 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 613 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +617,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 617 return ret;
257} 618}
258 619
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 620static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 621 struct vm_area_struct *vma, unsigned long address)
261{ 622{
262 struct page *page; 623 struct page *page;
263 unsigned int nid; 624 unsigned int nid;
264 625
626 if (h->order >= MAX_ORDER)
627 return NULL;
628
265 /* 629 /*
266 * Assume we will successfully allocate the surplus page to 630 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 631 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +650,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 650 * per-node value is checked there.
287 */ 651 */
288 spin_lock(&hugetlb_lock); 652 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 653 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
291 return NULL; 655 return NULL;
292 } else { 656 } else {
293 nr_huge_pages++; 657 h->nr_huge_pages++;
294 surplus_huge_pages++; 658 h->surplus_huge_pages++;
295 } 659 }
296 spin_unlock(&hugetlb_lock); 660 spin_unlock(&hugetlb_lock);
297 661
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 662 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 663 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 664 huge_page_order(h));
301 665
302 spin_lock(&hugetlb_lock); 666 spin_lock(&hugetlb_lock);
303 if (page) { 667 if (page) {
@@ -312,12 +676,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 676 /*
313 * We incremented the global counters already 677 * We incremented the global counters already
314 */ 678 */
315 nr_huge_pages_node[nid]++; 679 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 680 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 681 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 682 } else {
319 nr_huge_pages--; 683 h->nr_huge_pages--;
320 surplus_huge_pages--; 684 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 685 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 686 }
323 spin_unlock(&hugetlb_lock); 687 spin_unlock(&hugetlb_lock);
@@ -329,16 +693,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
329 * Increase the hugetlb pool such that it can accommodate a reservation 693
330 * of size 'delta'. 694 * of size 'delta'.
331 */ 695 */
332static int gather_surplus_pages(int delta) 696static int gather_surplus_pages(struct hstate *h, int delta)
333{ 697{
334 struct list_head surplus_list; 698 struct list_head surplus_list;
335 struct page *page, *tmp; 699 struct page *page, *tmp;
336 int ret, i; 700 int ret, i;
337 int needed, allocated; 701 int needed, allocated;
338 702
339 needed = (resv_huge_pages + delta) - free_huge_pages; 703 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 704 if (needed <= 0) {
341 resv_huge_pages += delta; 705 h->resv_huge_pages += delta;
342 return 0; 706 return 0;
343 } 707 }
344 708
@@ -349,7 +713,7 @@ static int gather_surplus_pages(int delta)
349retry: 713retry:
350 spin_unlock(&hugetlb_lock); 714 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 715 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 716 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 717 if (!page) {
354 /* 718 /*
355 * We were not able to allocate enough pages to 719 * We were not able to allocate enough pages to
@@ -370,7 +734,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 734 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 735 */
372 spin_lock(&hugetlb_lock); 736 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 737 needed = (h->resv_huge_pages + delta) -
738 (h->free_huge_pages + allocated);
374 if (needed > 0) 739 if (needed > 0)
375 goto retry; 740 goto retry;
376 741
@@ -383,7 +748,7 @@ retry:
383 * before they are reserved. 748 * before they are reserved.
384 */ 749 */
385 needed += allocated; 750 needed += allocated;
386 resv_huge_pages += delta; 751 h->resv_huge_pages += delta;
387 ret = 0; 752 ret = 0;
388free: 753free:
389 /* Free the needed pages to the hugetlb pool */ 754 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +756,7 @@ free:
391 if ((--needed) < 0) 756 if ((--needed) < 0)
392 break; 757 break;
393 list_del(&page->lru); 758 list_del(&page->lru);
394 enqueue_huge_page(page); 759 enqueue_huge_page(h, page);
395 } 760 }
396 761
397 /* Free unnecessary surplus pages to the buddy allocator */ 762 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +784,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 784 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 785 * never used.
421 */ 786 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 787static void return_unused_surplus_pages(struct hstate *h,
788 unsigned long unused_resv_pages)
423{ 789{
424 static int nid = -1; 790 static int nid = -1;
425 struct page *page; 791 struct page *page;
@@ -434,114 +800,231 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 800 unsigned long remaining_iterations = num_online_nodes();
435 801
436 /* Uncommit the reservation */ 802 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 803 h->resv_huge_pages -= unused_resv_pages;
804
805 /* Cannot return gigantic pages currently */
806 if (h->order >= MAX_ORDER)
807 return;
438 808
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 809 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 810
441 while (remaining_iterations-- && nr_pages) { 811 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 812 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 813 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 814 nid = first_node(node_online_map);
445 815
446 if (!surplus_huge_pages_node[nid]) 816 if (!h->surplus_huge_pages_node[nid])
447 continue; 817 continue;
448 818
449 if (!list_empty(&hugepage_freelists[nid])) { 819 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 820 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 821 struct page, lru);
452 list_del(&page->lru); 822 list_del(&page->lru);
453 update_and_free_page(page); 823 update_and_free_page(h, page);
454 free_huge_pages--; 824 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 825 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 826 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 827 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 828 nr_pages--;
459 remaining_iterations = num_online_nodes(); 829 remaining_iterations = num_online_nodes();
460 } 830 }
461 } 831 }
462} 832}
463 833
834/*
835 * Determine if the huge page at addr within the vma has an associated
 836 * reservation. Where it does not, we will need to logically increase the
 837 * reservation and actually increase quota before an allocation can occur.
 838 * Where any new reservation would be required, the reservation change is
 839 * prepared, but not committed. Once the page has been quota'd, allocated
 840 * and instantiated, the change should be committed via vma_commit_reservation.
841 * No action is required on failure.
842 */
843static int vma_needs_reservation(struct hstate *h,
844 struct vm_area_struct *vma, unsigned long addr)
845{
846 struct address_space *mapping = vma->vm_file->f_mapping;
847 struct inode *inode = mapping->host;
848
849 if (vma->vm_flags & VM_SHARED) {
850 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
851 return region_chg(&inode->i_mapping->private_list,
852 idx, idx + 1);
853
854 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
855 return 1;
464 856
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 857 } else {
466 unsigned long addr) 858 int err;
859 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
860 struct resv_map *reservations = vma_resv_map(vma);
861
862 err = region_chg(&reservations->regions, idx, idx + 1);
863 if (err < 0)
864 return err;
865 return 0;
866 }
867}
868static void vma_commit_reservation(struct hstate *h,
869 struct vm_area_struct *vma, unsigned long addr)
467{ 870{
468 struct page *page; 871 struct address_space *mapping = vma->vm_file->f_mapping;
872 struct inode *inode = mapping->host;
469 873
470 spin_lock(&hugetlb_lock); 874 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 875 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 876 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 877
878 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
879 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
880 struct resv_map *reservations = vma_resv_map(vma);
881
882 /* Mark this page used in the map. */
883 region_add(&reservations->regions, idx, idx + 1);
884 }
474} 885}
475 886
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 887static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 888 unsigned long addr, int avoid_reserve)
478{ 889{
479 struct page *page = NULL; 890 struct hstate *h = hstate_vma(vma);
891 struct page *page;
892 struct address_space *mapping = vma->vm_file->f_mapping;
893 struct inode *inode = mapping->host;
894 unsigned int chg;
480 895
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 896 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 897 * Processes that did not create the mapping will have no reserves and
 898 * will not have accounted against quota. Check that the quota charge can
 899 * be made before satisfying the allocation.
900 * MAP_NORESERVE mappings may also need pages and quota allocated
901 * if no reserve mapping overlaps.
902 */
903 chg = vma_needs_reservation(h, vma, addr);
904 if (chg < 0)
905 return ERR_PTR(chg);
906 if (chg)
907 if (hugetlb_get_quota(inode->i_mapping, chg))
908 return ERR_PTR(-ENOSPC);
483 909
484 spin_lock(&hugetlb_lock); 910 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 911 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 912 spin_unlock(&hugetlb_lock);
913
488 if (!page) { 914 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 915 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 916 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 917 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 918 return ERR_PTR(-VM_FAULT_OOM);
493 } 919 }
494 } 920 }
921
922 set_page_refcounted(page);
923 set_page_private(page, (unsigned long) mapping);
924
925 vma_commit_reservation(h, vma, addr);
926
495 return page; 927 return page;
496} 928}
497 929
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 930__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 931{
501 struct page *page; 932 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 933 int nr_nodes = nodes_weight(node_online_map);
503 934
504 if (vma->vm_flags & VM_MAYSHARE) 935 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 936 void *addr;
506 else 937
507 page = alloc_huge_page_private(vma, addr); 938 addr = __alloc_bootmem_node_nopanic(
939 NODE_DATA(h->hugetlb_next_nid),
940 huge_page_size(h), huge_page_size(h), 0);
941
942 if (addr) {
943 /*
944 * Use the beginning of the huge page to store the
945 * huge_bootmem_page struct (until gather_bootmem
946 * puts them into the mem_map).
947 */
948 m = addr;
949 if (m)
950 goto found;
951 }
952 hstate_next_node(h);
953 nr_nodes--;
954 }
955 return 0;
956
957found:
958 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
959 /* Put them into a private list first because mem_map is not up yet */
960 list_add(&m->list, &huge_boot_pages);
961 m->hstate = h;
962 return 1;
963}
508 964
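The order >= MAX_ORDER checks scattered through this patch and the bootmem path above are two views of the same limit. A rough back-of-the-envelope (assuming 4 KB base pages and the common MAX_ORDER of 11):

	largest buddy allocation = 2^(MAX_ORDER - 1) pages = 1024 * 4 KB = 4 MB
	2 MB huge page  -> order 9   (fits, alloc_pages() works)
	1 GB huge page  -> order 18  (too big, bootmem only)

which is why gigantic hstates are populated by alloc_bootmem_huge_page() during early boot and are never grown, shrunk or returned as surplus afterwards.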
509 if (!IS_ERR(page)) { 965/* Put bootmem huge pages into the standard lists after mem_map is up */
510 set_page_refcounted(page); 966static void __init gather_bootmem_prealloc(void)
511 set_page_private(page, (unsigned long) mapping); 967{
968 struct huge_bootmem_page *m;
969
970 list_for_each_entry(m, &huge_boot_pages, list) {
971 struct page *page = virt_to_page(m);
972 struct hstate *h = m->hstate;
973 __ClearPageReserved(page);
974 WARN_ON(page_count(page) != 1);
975 prep_compound_page(page, h->order);
976 prep_new_huge_page(h, page, page_to_nid(page));
512 } 977 }
513 return page;
514} 978}
515 979
516static int __init hugetlb_init(void) 980static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
517{ 981{
518 unsigned long i; 982 unsigned long i;
519 983
520 if (HPAGE_SHIFT == 0) 984 for (i = 0; i < h->max_huge_pages; ++i) {
521 return 0; 985 if (h->order >= MAX_ORDER) {
522 986 if (!alloc_bootmem_huge_page(h))
523 for (i = 0; i < MAX_NUMNODES; ++i) 987 break;
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 988 } else if (!alloc_fresh_huge_page(h))
989 break;
990 }
991 h->max_huge_pages = i;
992}
525 993
526 hugetlb_next_nid = first_node(node_online_map); 994static void __init hugetlb_init_hstates(void)
995{
996 struct hstate *h;
527 997
528 for (i = 0; i < max_huge_pages; ++i) { 998 for_each_hstate(h) {
529 if (!alloc_fresh_huge_page()) 999 /* oversize hugepages were init'ed in early boot */
530 break; 1000 if (h->order < MAX_ORDER)
1001 hugetlb_hstate_alloc_pages(h);
531 } 1002 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 1003}
536module_init(hugetlb_init);
537 1004
538static int __init hugetlb_setup(char *s) 1005static char * __init memfmt(char *buf, unsigned long n)
539{ 1006{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1007 if (n >= (1UL << 30))
541 max_huge_pages = 0; 1008 sprintf(buf, "%lu GB", n >> 30);
542 return 1; 1009 else if (n >= (1UL << 20))
1010 sprintf(buf, "%lu MB", n >> 20);
1011 else
1012 sprintf(buf, "%lu KB", n >> 10);
1013 return buf;
1014}
1015
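For reference, a few sample conversions through memfmt() as used by report_hugepages() below (illustrative values): 2097152 (a 2 MB page size) takes the >= 1 MB branch, 2097152 >> 20 = 2, printing "2 MB"; 17179869184 gives 17179869184 >> 30 = 16, printing "16 GB"; 65536 gives 65536 >> 10 = 64, printing "64 KB".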
1016static void __init report_hugepages(void)
1017{
1018 struct hstate *h;
1019
1020 for_each_hstate(h) {
1021 char buf[32];
1022 printk(KERN_INFO "HugeTLB registered %s page size, "
1023 "pre-allocated %ld pages\n",
1024 memfmt(buf, huge_page_size(h)),
1025 h->free_huge_pages);
1026 }
543} 1027}
544__setup("hugepages=", hugetlb_setup);
545 1028
546static unsigned int cpuset_mems_nr(unsigned int *array) 1029static unsigned int cpuset_mems_nr(unsigned int *array)
547{ 1030{
@@ -556,35 +1039,42 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
556 1039
557#ifdef CONFIG_SYSCTL 1040#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1041#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1042static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1043{
561 int i; 1044 int i;
562 1045
1046 if (h->order >= MAX_ORDER)
1047 return;
1048
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1049 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1050 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1051 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1052 list_for_each_entry_safe(page, next, freel, lru) {
1053 if (count >= h->nr_huge_pages)
567 return; 1054 return;
568 if (PageHighMem(page)) 1055 if (PageHighMem(page))
569 continue; 1056 continue;
570 list_del(&page->lru); 1057 list_del(&page->lru);
571 update_and_free_page(page); 1058 update_and_free_page(h, page);
572 free_huge_pages--; 1059 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1060 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1061 }
575 } 1062 }
576} 1063}
577#else 1064#else
578static inline void try_to_free_low(unsigned long count) 1065static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1066{
580} 1067}
581#endif 1068#endif
582 1069
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1070#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1071static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1072{
586 unsigned long min_count, ret; 1073 unsigned long min_count, ret;
587 1074
1075 if (h->order >= MAX_ORDER)
1076 return h->max_huge_pages;
1077
588 /* 1078 /*
589 * Increase the pool size 1079 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1080 * First take pages out of surplus state. Then make up the
@@ -597,20 +1087,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1087 * within all the constraints specified by the sysctls.
598 */ 1088 */
599 spin_lock(&hugetlb_lock); 1089 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1090 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1091 if (!adjust_pool_surplus(h, -1))
602 break; 1092 break;
603 } 1093 }
604 1094
605 while (count > persistent_huge_pages) { 1095 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1096 /*
608 * If this allocation races such that we no longer need the 1097 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1098 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1099 * and reducing the surplus.
611 */ 1100 */
612 spin_unlock(&hugetlb_lock); 1101 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1102 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1103 spin_lock(&hugetlb_lock);
615 if (!ret) 1104 if (!ret)
616 goto out; 1105 goto out;
@@ -632,31 +1121,288 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1121 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1122 * sysctls are changed, or the surplus pages go out of use.
634 */ 1123 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1124 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1125 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1126 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1127 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1128 struct page *page = dequeue_huge_page(h);
640 if (!page) 1129 if (!page)
641 break; 1130 break;
642 update_and_free_page(page); 1131 update_and_free_page(h, page);
643 } 1132 }
644 while (count < persistent_huge_pages) { 1133 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1134 if (!adjust_pool_surplus(h, 1))
646 break; 1135 break;
647 } 1136 }
648out: 1137out:
649 ret = persistent_huge_pages; 1138 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1139 spin_unlock(&hugetlb_lock);
651 return ret; 1140 return ret;
652} 1141}
653 1142
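A short worked example of the shrink clamp above (illustrative numbers): with nr_huge_pages = 10, free_huge_pages = 4 and resv_huge_pages = 2, six pages are currently mapped and two more are promised to reservations, so

	min_count = resv + nr - free = 2 + 10 - 4 = 8

and a request to shrink the pool to, say, 4 only brings the persistent pool down to 8 immediately; the remaining excess is converted to surplus by adjust_pool_surplus() and drifts back to the buddy allocator as those pages are freed.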
1143#define HSTATE_ATTR_RO(_name) \
1144 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1145
1146#define HSTATE_ATTR(_name) \
1147 static struct kobj_attribute _name##_attr = \
1148 __ATTR(_name, 0644, _name##_show, _name##_store)
1149
1150static struct kobject *hugepages_kobj;
1151static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1152
1153static struct hstate *kobj_to_hstate(struct kobject *kobj)
1154{
1155 int i;
1156 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1157 if (hstate_kobjs[i] == kobj)
1158 return &hstates[i];
1159 BUG();
1160 return NULL;
1161}
1162
1163static ssize_t nr_hugepages_show(struct kobject *kobj,
1164 struct kobj_attribute *attr, char *buf)
1165{
1166 struct hstate *h = kobj_to_hstate(kobj);
1167 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1168}
1169static ssize_t nr_hugepages_store(struct kobject *kobj,
1170 struct kobj_attribute *attr, const char *buf, size_t count)
1171{
1172 int err;
1173 unsigned long input;
1174 struct hstate *h = kobj_to_hstate(kobj);
1175
1176 err = strict_strtoul(buf, 10, &input);
1177 if (err)
1178 return 0;
1179
1180 h->max_huge_pages = set_max_huge_pages(h, input);
1181
1182 return count;
1183}
1184HSTATE_ATTR(nr_hugepages);
1185
1186static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1187 struct kobj_attribute *attr, char *buf)
1188{
1189 struct hstate *h = kobj_to_hstate(kobj);
1190 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1191}
1192static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1193 struct kobj_attribute *attr, const char *buf, size_t count)
1194{
1195 int err;
1196 unsigned long input;
1197 struct hstate *h = kobj_to_hstate(kobj);
1198
1199 err = strict_strtoul(buf, 10, &input);
1200 if (err)
1201 return 0;
1202
1203 spin_lock(&hugetlb_lock);
1204 h->nr_overcommit_huge_pages = input;
1205 spin_unlock(&hugetlb_lock);
1206
1207 return count;
1208}
1209HSTATE_ATTR(nr_overcommit_hugepages);
1210
1211static ssize_t free_hugepages_show(struct kobject *kobj,
1212 struct kobj_attribute *attr, char *buf)
1213{
1214 struct hstate *h = kobj_to_hstate(kobj);
1215 return sprintf(buf, "%lu\n", h->free_huge_pages);
1216}
1217HSTATE_ATTR_RO(free_hugepages);
1218
1219static ssize_t resv_hugepages_show(struct kobject *kobj,
1220 struct kobj_attribute *attr, char *buf)
1221{
1222 struct hstate *h = kobj_to_hstate(kobj);
1223 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1224}
1225HSTATE_ATTR_RO(resv_hugepages);
1226
1227static ssize_t surplus_hugepages_show(struct kobject *kobj,
1228 struct kobj_attribute *attr, char *buf)
1229{
1230 struct hstate *h = kobj_to_hstate(kobj);
1231 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1232}
1233HSTATE_ATTR_RO(surplus_hugepages);
1234
1235static struct attribute *hstate_attrs[] = {
1236 &nr_hugepages_attr.attr,
1237 &nr_overcommit_hugepages_attr.attr,
1238 &free_hugepages_attr.attr,
1239 &resv_hugepages_attr.attr,
1240 &surplus_hugepages_attr.attr,
1241 NULL,
1242};
1243
1244static struct attribute_group hstate_attr_group = {
1245 .attrs = hstate_attrs,
1246};
1247
1248static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1249{
1250 int retval;
1251
1252 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1253 hugepages_kobj);
1254 if (!hstate_kobjs[h - hstates])
1255 return -ENOMEM;
1256
1257 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1258 &hstate_attr_group);
1259 if (retval)
1260 kobject_put(hstate_kobjs[h - hstates]);
1261
1262 return retval;
1263}
1264
1265static void __init hugetlb_sysfs_init(void)
1266{
1267 struct hstate *h;
1268 int err;
1269
1270 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1271 if (!hugepages_kobj)
1272 return;
1273
1274 for_each_hstate(h) {
1275 err = hugetlb_sysfs_add_hstate(h);
1276 if (err)
1277 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1278 h->name);
1279 }
1280}
1281
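The kobject plumbing above ends up as one directory per page size under /sys/kernel/mm/hugepages/, named after h->name (e.g. hugepages-2048kB on x86 with 2 MB pages). A small user-space sketch of consuming that interface; the path is an assumption about which hstates were registered, not something this patch guarantees:

	#include <stdio.h>

	int main(void)
	{
		/* Path assumes a 2 MB hstate was registered; adjust to h->name. */
		const char *path =
			"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages";
		unsigned long nr;
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		if (fscanf(f, "%lu", &nr) != 1) {
			fclose(f);
			return 1;
		}
		printf("persistent 2MB huge pages: %lu\n", nr);
		fclose(f);
		return 0;
	}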
1282static void __exit hugetlb_exit(void)
1283{
1284 struct hstate *h;
1285
1286 for_each_hstate(h) {
1287 kobject_put(hstate_kobjs[h - hstates]);
1288 }
1289
1290 kobject_put(hugepages_kobj);
1291}
1292module_exit(hugetlb_exit);
1293
1294static int __init hugetlb_init(void)
1295{
1296 BUILD_BUG_ON(HPAGE_SHIFT == 0);
1297
1298 if (!size_to_hstate(default_hstate_size)) {
1299 default_hstate_size = HPAGE_SIZE;
1300 if (!size_to_hstate(default_hstate_size))
1301 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1302 }
1303 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1304 if (default_hstate_max_huge_pages)
1305 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1306
1307 hugetlb_init_hstates();
1308
1309 gather_bootmem_prealloc();
1310
1311 report_hugepages();
1312
1313 hugetlb_sysfs_init();
1314
1315 return 0;
1316}
1317module_init(hugetlb_init);
1318
1319/* Should be called on processing a hugepagesz=... option */
1320void __init hugetlb_add_hstate(unsigned order)
1321{
1322 struct hstate *h;
1323 unsigned long i;
1324
1325 if (size_to_hstate(PAGE_SIZE << order)) {
1326 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1327 return;
1328 }
1329 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1330 BUG_ON(order == 0);
1331 h = &hstates[max_hstate++];
1332 h->order = order;
1333 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1334 h->nr_huge_pages = 0;
1335 h->free_huge_pages = 0;
1336 for (i = 0; i < MAX_NUMNODES; ++i)
1337 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1338 h->hugetlb_next_nid = first_node(node_online_map);
1339 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1340 huge_page_size(h)/1024);
1341
1342 parsed_hstate = h;
1343}
1344
1345static int __init hugetlb_nrpages_setup(char *s)
1346{
1347 unsigned long *mhp;
1348 static unsigned long *last_mhp;
1349
1350 /*
1351 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1352 * so this hugepages= parameter goes to the "default hstate".
1353 */
1354 if (!max_hstate)
1355 mhp = &default_hstate_max_huge_pages;
1356 else
1357 mhp = &parsed_hstate->max_huge_pages;
1358
1359 if (mhp == last_mhp) {
1360 printk(KERN_WARNING "hugepages= specified twice without "
1361 "interleaving hugepagesz=, ignoring\n");
1362 return 1;
1363 }
1364
1365 if (sscanf(s, "%lu", mhp) <= 0)
1366 *mhp = 0;
1367
1368 /*
1369 * Global state is always initialized later in hugetlb_init.
1370 * But we need to allocate >= MAX_ORDER hstates here early to still
1371 * use the bootmem allocator.
1372 */
1373 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1374 hugetlb_hstate_alloc_pages(parsed_hstate);
1375
1376 last_mhp = mhp;
1377
1378 return 1;
1379}
1380__setup("hugepages=", hugetlb_nrpages_setup);
1381
1382static int __init hugetlb_default_setup(char *s)
1383{
1384 default_hstate_size = memparse(s, &s);
1385 return 1;
1386}
1387__setup("default_hugepagesz=", hugetlb_default_setup);
1388
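Putting the three __setup() handlers together, the pool can be described per page size on the kernel command line: each hugepages= count binds to the most recent hugepagesz=, and a bare hugepages= with no preceding hugepagesz= goes to the default hstate. An illustrative line (the sizes assume an architecture whose hugepagesz= handler accepts them, e.g. x86_64 with 1 GB page support):

	default_hugepagesz=2M hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=2

asks for 512 two-megabyte pages and two 1 GB pages; the 1 GB hstate exceeds MAX_ORDER, so hugetlb_nrpages_setup() allocates its pages immediately from bootmem instead of waiting for hugetlb_init().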
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1389int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1390 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1391 size_t *length, loff_t *ppos)
657{ 1392{
1393 struct hstate *h = &default_hstate;
1394 unsigned long tmp;
1395
1396 if (!write)
1397 tmp = h->max_huge_pages;
1398
1399 table->data = &tmp;
1400 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1401 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1402
1403 if (write)
1404 h->max_huge_pages = set_max_huge_pages(h, tmp);
1405
660 return 0; 1406 return 0;
661} 1407}
662 1408
@@ -676,10 +1422,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1422 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1423 size_t *length, loff_t *ppos)
678{ 1424{
1425 struct hstate *h = &default_hstate;
1426 unsigned long tmp;
1427
1428 if (!write)
1429 tmp = h->nr_overcommit_huge_pages;
1430
1431 table->data = &tmp;
1432 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1433 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1434
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1435 if (write) {
682 spin_unlock(&hugetlb_lock); 1436 spin_lock(&hugetlb_lock);
1437 h->nr_overcommit_huge_pages = tmp;
1438 spin_unlock(&hugetlb_lock);
1439 }
1440
683 return 0; 1441 return 0;
684} 1442}
685 1443
@@ -687,34 +1445,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
687 1445
688int hugetlb_report_meminfo(char *buf) 1446int hugetlb_report_meminfo(char *buf)
689{ 1447{
1448 struct hstate *h = &default_hstate;
690 return sprintf(buf, 1449 return sprintf(buf,
691 "HugePages_Total: %5lu\n" 1450 "HugePages_Total: %5lu\n"
692 "HugePages_Free: %5lu\n" 1451 "HugePages_Free: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1452 "HugePages_Rsvd: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1453 "HugePages_Surp: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1454 "Hugepagesize: %5lu kB\n",
696 nr_huge_pages, 1455 h->nr_huge_pages,
697 free_huge_pages, 1456 h->free_huge_pages,
698 resv_huge_pages, 1457 h->resv_huge_pages,
699 surplus_huge_pages, 1458 h->surplus_huge_pages,
700 HPAGE_SIZE/1024); 1459 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1460}
702 1461
703int hugetlb_report_node_meminfo(int nid, char *buf) 1462int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1463{
1464 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1465 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1466 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1467 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1468 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1469 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1470 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1471 nid, h->surplus_huge_pages_node[nid]);
712} 1472}
713 1473
714/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1474/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
715unsigned long hugetlb_total_pages(void) 1475unsigned long hugetlb_total_pages(void)
716{ 1476{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1477 struct hstate *h = &default_hstate;
1478 return h->nr_huge_pages * pages_per_huge_page(h);
1479}
1480
1481static int hugetlb_acct_memory(struct hstate *h, long delta)
1482{
1483 int ret = -ENOMEM;
1484
1485 spin_lock(&hugetlb_lock);
1486 /*
1487 * When cpuset is configured, it breaks the strict hugetlb page
1488 * reservation as the accounting is done on a global variable. Such
1489 * reservation is completely rubbish in the presence of cpuset because
1490 * the reservation is not checked against page availability for the
 1491 * current cpuset. An application can still be OOM'ed by the kernel for
 1492 * lack of free htlb pages in the cpuset that the task is in.
 1493 * Attempting to enforce strict accounting with cpuset is almost
 1494 * impossible (or too ugly) because cpuset is so fluid that a
 1495 * task or memory node can be dynamically moved between cpusets.
1496 *
1497 * The change of semantics for shared hugetlb mapping with cpuset is
1498 * undesirable. However, in order to preserve some of the semantics,
1499 * we fall back to check against current free page availability as
1500 * a best attempt and hopefully to minimize the impact of changing
1501 * semantics that cpuset has.
1502 */
1503 if (delta > 0) {
1504 if (gather_surplus_pages(h, delta) < 0)
1505 goto out;
1506
1507 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1508 return_unused_surplus_pages(h, delta);
1509 goto out;
1510 }
1511 }
1512
1513 ret = 0;
1514 if (delta < 0)
1515 return_unused_surplus_pages(h, (unsigned long) -delta);
1516
1517out:
1518 spin_unlock(&hugetlb_lock);
1519 return ret;
1520}
1521
1522static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1523{
1524 struct resv_map *reservations = vma_resv_map(vma);
1525
1526 /*
 1527 * This new VMA should share its sibling's reservation map if present.
1528 * The VMA will only ever have a valid reservation map pointer where
1529 * it is being copied for another still existing VMA. As that VMA
 1530 * has a reference to the reservation map, it cannot disappear until
1531 * after this open call completes. It is therefore safe to take a
1532 * new reference here without additional locking.
1533 */
1534 if (reservations)
1535 kref_get(&reservations->refs);
1536}
1537
1538static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1539{
1540 struct hstate *h = hstate_vma(vma);
1541 struct resv_map *reservations = vma_resv_map(vma);
1542 unsigned long reserve;
1543 unsigned long start;
1544 unsigned long end;
1545
1546 if (reservations) {
1547 start = vma_hugecache_offset(h, vma, vma->vm_start);
1548 end = vma_hugecache_offset(h, vma, vma->vm_end);
1549
1550 reserve = (end - start) -
1551 region_count(&reservations->regions, start, end);
1552
1553 kref_put(&reservations->refs, resv_map_release);
1554
1555 if (reserve) {
1556 hugetlb_acct_memory(h, -reserve);
1557 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1558 }
1559 }
718} 1560}
719 1561
720/* 1562/*
@@ -731,6 +1573,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1573
732struct vm_operations_struct hugetlb_vm_ops = { 1574struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1575 .fault = hugetlb_vm_op_fault,
1576 .open = hugetlb_vm_op_open,
1577 .close = hugetlb_vm_op_close,
734}; 1578};
735 1579
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1580static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1613,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1613 struct page *ptepage;
770 unsigned long addr; 1614 unsigned long addr;
771 int cow; 1615 int cow;
1616 struct hstate *h = hstate_vma(vma);
1617 unsigned long sz = huge_page_size(h);
772 1618
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1619 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1620
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1621 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1622 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1623 if (!src_pte)
778 continue; 1624 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1625 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1626 if (!dst_pte)
781 goto nomem; 1627 goto nomem;
782 1628
@@ -804,7 +1650,7 @@ nomem:
804} 1650}
805 1651
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1652void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1653 unsigned long end, struct page *ref_page)
808{ 1654{
809 struct mm_struct *mm = vma->vm_mm; 1655 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1656 unsigned long address;
@@ -812,6 +1658,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1658 pte_t pte;
813 struct page *page; 1659 struct page *page;
814 struct page *tmp; 1660 struct page *tmp;
1661 struct hstate *h = hstate_vma(vma);
1662 unsigned long sz = huge_page_size(h);
1663
815 /* 1664 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1665 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1666 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1669,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1669 LIST_HEAD(page_list);
821 1670
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1671 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1672 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1673 BUG_ON(end & ~huge_page_mask(h));
825 1674
826 spin_lock(&mm->page_table_lock); 1675 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1676 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1677 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1678 if (!ptep)
830 continue; 1679 continue;
@@ -832,6 +1681,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1681 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1682 continue;
834 1683
1684 /*
1685 * If a reference page is supplied, it is because a specific
1686 * page is being unmapped, not a range. Ensure the page we
1687 * are about to unmap is the actual page of interest.
1688 */
1689 if (ref_page) {
1690 pte = huge_ptep_get(ptep);
1691 if (huge_pte_none(pte))
1692 continue;
1693 page = pte_page(pte);
1694 if (page != ref_page)
1695 continue;
1696
1697 /*
1698 * Mark the VMA as having unmapped its page so that
1699 * future faults in this VMA will fail rather than
1700 * looking like data was lost
1701 */
1702 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1703 }
1704
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1705 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1706 if (huge_pte_none(pte))
837 continue; 1707 continue;
@@ -850,31 +1720,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1720}
851 1721
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1722void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1723 unsigned long end, struct page *ref_page)
854{ 1724{
1725 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1726 __unmap_hugepage_range(vma, start, end, ref_page);
1727 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1728}
1729
1730/*
1731 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1732 * mapping it owns the reserve page for. The intention is to unmap the page
1733 * from other VMAs and let the children be SIGKILLed if they are faulting the
1734 * same region.
1735 */
1736int unmap_ref_private(struct mm_struct *mm,
1737 struct vm_area_struct *vma,
1738 struct page *page,
1739 unsigned long address)
1740{
1741 struct vm_area_struct *iter_vma;
1742 struct address_space *mapping;
1743 struct prio_tree_iter iter;
1744 pgoff_t pgoff;
1745
855 /* 1746 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1747 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1748 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1749 */
863 if (vma->vm_file) { 1750 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1751 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1752 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1753 mapping = (struct address_space *)page_private(page);
1754
1755 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1756 /* Do not unmap the current VMA */
1757 if (iter_vma == vma)
1758 continue;
1759
1760 /*
1761 * Unmap the page from other VMAs without their own reserves.
1762 * They get marked to be SIGKILLed if they fault in these
1763 * areas. This is because a future no-page fault on this VMA
1764 * could insert a zeroed page instead of the data existing
1765 * from the time of fork. This would look like data corruption
1766 */
1767 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1768 unmap_hugepage_range(iter_vma,
1769 address, address + HPAGE_SIZE,
1770 page);
867 } 1771 }
1772
1773 return 1;
868} 1774}
869 1775
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1776static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1777 unsigned long address, pte_t *ptep, pte_t pte,
1778 struct page *pagecache_page)
872{ 1779{
1780 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1781 struct page *old_page, *new_page;
874 int avoidcopy; 1782 int avoidcopy;
1783 int outside_reserve = 0;
875 1784
876 old_page = pte_page(pte); 1785 old_page = pte_page(pte);
877 1786
1787retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1788 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1789 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1790 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1793,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1793 return 0;
884 } 1794 }
885 1795
1796 /*
1797 * If the process that created a MAP_PRIVATE mapping is about to
1798 * perform a COW due to a shared page count, attempt to satisfy
1799 * the allocation without using the existing reserves. The pagecache
1800 * page is used to determine if the reserve at this address was
 1801 * consumed or not. If reserves were used, a partially faulted mapping
1802 * at the time of fork() could consume its reserves on COW instead
1803 * of the full address range.
1804 */
1805 if (!(vma->vm_flags & VM_SHARED) &&
1806 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1807 old_page != pagecache_page)
1808 outside_reserve = 1;
1809
886 page_cache_get(old_page); 1810 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1811 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1812
889 if (IS_ERR(new_page)) { 1813 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1814 page_cache_release(old_page);
1815
1816 /*
1817 * If a process owning a MAP_PRIVATE mapping fails to COW,
1818 * it is due to references held by a child and an insufficient
 1819 * huge page pool. To guarantee the original mapper's
 1820 * reliability, unmap the page from child processes. The child
1821 * may get SIGKILLed if it later faults.
1822 */
1823 if (outside_reserve) {
1824 BUG_ON(huge_pte_none(pte));
1825 if (unmap_ref_private(mm, vma, old_page, address)) {
1826 BUG_ON(page_count(old_page) != 1);
1827 BUG_ON(huge_pte_none(pte));
1828 goto retry_avoidcopy;
1829 }
1830 WARN_ON_ONCE(1);
1831 }
1832
891 return -PTR_ERR(new_page); 1833 return -PTR_ERR(new_page);
892 } 1834 }
893 1835
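
Aside for readers of the COW hunk above: outside_reserve is derived from three facts about the faulting VMA before alloc_huge_page() is called. The fragment below is a standalone userspace sketch of just that predicate, not code from the patch; struct cow_state and its field names are invented for illustration.

#include <stdbool.h>

/* Hypothetical container for the three inputs the COW path consults. */
struct cow_state {
        bool shared;            /* vma->vm_flags & VM_SHARED              */
        bool reserve_owner;     /* is_vma_resv_set(vma, HPAGE_RESV_OWNER) */
        bool old_is_pagecache;  /* old_page == pagecache_page             */
};

/* Only the private-mapping owner, faulting on a page other than the one
 * backing the file offset, may allocate outside its reserve. */
static bool cow_outside_reserve(const struct cow_state *s)
{
        return !s->shared && s->reserve_owner && !s->old_is_pagecache;
}

int main(void)
{
        struct cow_state s = { .reserve_owner = true };
        return cow_outside_reserve(&s) ? 0 : 1;   /* exits 0: outside reserve */
}
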
@@ -896,7 +1838,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1838 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1839 spin_lock(&mm->page_table_lock);
898 1840
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1841 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1842 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1843 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1844 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1852,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1852 return 0;
911} 1853}
912 1854
1855/* Return the pagecache page at a given address within a VMA */
1856static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1857 struct vm_area_struct *vma, unsigned long address)
1858{
1859 struct address_space *mapping;
1860 pgoff_t idx;
1861
1862 mapping = vma->vm_file->f_mapping;
1863 idx = vma_hugecache_offset(h, vma, address);
1864
1865 return find_lock_page(mapping, idx);
1866}
1867
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1868static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1869 unsigned long address, pte_t *ptep, int write_access)
915{ 1870{
1871 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1872 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1873 pgoff_t idx;
918 unsigned long size; 1874 unsigned long size;
919 struct page *page; 1875 struct page *page;
920 struct address_space *mapping; 1876 struct address_space *mapping;
921 pte_t new_pte; 1877 pte_t new_pte;
922 1878
1879 /*
1880 * Currently, we are forced to kill the process in the event the
1881 * original mapper has unmapped pages from the child due to a failed
 1882 * COW. Warn that such a situation has occurred, as it may not be obvious
1883 */
1884 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1885 printk(KERN_WARNING
1886 "PID %d killed due to inadequate hugepage pool\n",
1887 current->pid);
1888 return ret;
1889 }
1890
923 mapping = vma->vm_file->f_mapping; 1891 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1892 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1893
927 /* 1894 /*
928 * Use page lock to guard against racing truncation 1895 * Use page lock to guard against racing truncation
@@ -931,15 +1898,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1898retry:
932 page = find_lock_page(mapping, idx); 1899 page = find_lock_page(mapping, idx);
933 if (!page) { 1900 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1901 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1902 if (idx >= size)
936 goto out; 1903 goto out;
937 page = alloc_huge_page(vma, address); 1904 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1905 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1906 ret = -PTR_ERR(page);
940 goto out; 1907 goto out;
941 } 1908 }
942 clear_huge_page(page, address); 1909 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1910 __SetPageUptodate(page);
944 1911
945 if (vma->vm_flags & VM_SHARED) { 1912 if (vma->vm_flags & VM_SHARED) {
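
The hugetlb_no_page() hunk above swaps the fixed HPAGE_SHIFT for the per-hstate huge_page_shift(h) when bounding the fault index against i_size. Below is a minimal userspace sketch of that arithmetic; the 21-bit shift simply assumes 2 MB huge pages and is not taken from the patch.

#include <stdbool.h>
#include <stdio.h>

/* Is huge-page index idx still inside a file of i_size bytes, given the
 * huge page size expressed as a shift? Mirrors the "idx >= size" check. */
static bool index_in_file(unsigned long long i_size, unsigned int hshift,
                          unsigned long idx)
{
        unsigned long long size = i_size >> hshift;   /* length in huge pages */
        return idx < size;
}

int main(void)
{
        unsigned long long i_size = 5ULL << 21;       /* five 2 MB pages */
        printf("%d %d\n", index_in_file(i_size, 21, 4),
                          index_in_file(i_size, 21, 5));   /* prints "1 0" */
        return 0;
}
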
@@ -955,14 +1922,14 @@ retry:
955 } 1922 }
956 1923
957 spin_lock(&inode->i_lock); 1924 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1925 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1926 spin_unlock(&inode->i_lock);
960 } else 1927 } else
961 lock_page(page); 1928 lock_page(page);
962 } 1929 }
963 1930
964 spin_lock(&mm->page_table_lock); 1931 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1932 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1933 if (idx >= size)
967 goto backout; 1934 goto backout;
968 1935
@@ -976,7 +1943,7 @@ retry:
976 1943
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1944 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1945 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1946 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1947 }
981 1948
982 spin_unlock(&mm->page_table_lock); 1949 spin_unlock(&mm->page_table_lock);
@@ -998,8 +1965,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
998 pte_t entry; 1965 pte_t entry;
999 int ret; 1966 int ret;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1967 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1968 struct hstate *h = hstate_vma(vma);
1001 1969
1002 ptep = huge_pte_alloc(mm, address); 1970 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1971 if (!ptep)
1004 return VM_FAULT_OOM; 1972 return VM_FAULT_OOM;
1005 1973
@@ -1021,14 +1989,30 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1021 spin_lock(&mm->page_table_lock); 1989 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 1990 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 1991 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 1992 if (write_access && !pte_write(entry)) {
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 1993 struct page *page;
1994 page = hugetlbfs_pagecache_page(h, vma, address);
1995 ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
1996 if (page) {
1997 unlock_page(page);
1998 put_page(page);
1999 }
2000 }
1026 spin_unlock(&mm->page_table_lock); 2001 spin_unlock(&mm->page_table_lock);
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2002 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2003
1029 return ret; 2004 return ret;
1030} 2005}
1031 2006
2007/* Can be overriden by architectures */
2008__attribute__((weak)) struct page *
2009follow_huge_pud(struct mm_struct *mm, unsigned long address,
2010 pud_t *pud, int write)
2011{
2012 BUG();
2013 return NULL;
2014}
2015
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2016int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2017 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2018 unsigned long *position, int *length, int i,
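
The follow_huge_pud() stub added above is marked __attribute__((weak)) so an architecture that really has huge PUD entries can link in a strong definition of its own. A self-contained illustration of that link-time behaviour follows; arch_hook() is a made-up name, not a kernel symbol.

#include <stdio.h>

/* Generic fallback: any object linked in that defines a non-weak
 * arch_hook() with the same signature overrides this one. */
__attribute__((weak)) int arch_hook(void)
{
        return 0;
}

int main(void)
{
        printf("arch_hook() = %d\n", arch_hook());
        return 0;
}
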
@@ -1037,6 +2021,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2021 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2022 unsigned long vaddr = *position;
1039 int remainder = *length; 2023 int remainder = *length;
2024 struct hstate *h = hstate_vma(vma);
1040 2025
1041 spin_lock(&mm->page_table_lock); 2026 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2027 while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2033,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1048 * each hugepage. We have to make * sure we get the 2033 * each hugepage. We have to make * sure we get the
1049 * first, for the page indexing below to work. 2034 * first, for the page indexing below to work.
1050 */ 2035 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2036 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1052 2037
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2038 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2039 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2051,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2051 break;
1067 } 2052 }
1068 2053
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2054 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2055 page = pte_page(huge_ptep_get(pte));
1071same_page: 2056same_page:
1072 if (pages) { 2057 if (pages) {
@@ -1082,7 +2067,7 @@ same_page:
1082 --remainder; 2067 --remainder;
1083 ++i; 2068 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2069 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2070 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2071 /*
1087 * We use pfn_offset to avoid touching the pageframes 2072 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2073 * of this compound page.
@@ -1104,13 +2089,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2089 unsigned long start = address;
1105 pte_t *ptep; 2090 pte_t *ptep;
1106 pte_t pte; 2091 pte_t pte;
2092 struct hstate *h = hstate_vma(vma);
1107 2093
1108 BUG_ON(address >= end); 2094 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2095 flush_cache_range(vma, address, end);
1110 2096
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2097 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2098 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2099 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2100 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2101 if (!ptep)
1116 continue; 2102 continue;
@@ -1128,195 +2114,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2114 flush_tlb_range(vma, start, end);
1129} 2115}
1130 2116
1131struct file_region { 2117int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2118 long from, long to,
1133 long from; 2119 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{
1175 struct file_region *rg, *nrg;
1176 long chg = 0;
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
1141 1210 /* We overlap with this area; if it extends further than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{ 2120{
1224 struct file_region *rg, *trg; 2121 long ret, chg;
1225 long chg = 0; 2122 struct hstate *h = hstate_inode(inode);
1226 2123
1227 /* Locate the region we are either in or before. */ 2124 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2125 return 0;
1233 2126
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2127 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2128 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2129 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2130 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2131 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2132 */
1274 if (delta > 0) { 2133 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2134 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2135 else {
1277 2136 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2137 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2138 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283
1284 ret = 0;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2139
1288out: 2140 chg = to - from;
1289 spin_unlock(&hugetlb_lock);
1290 return ret;
1291}
1292 2141
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to) 2142 set_vma_resv_map(vma, resv_map);
1294{ 2143 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1295 long ret, chg; 2144 }
1296 2145
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2146 if (chg < 0)
1299 return chg; 2147 return chg;
1300 2148
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2149 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2150 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2151 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2152 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2153 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2154 return ret;
1307 } 2155 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2156 if (!vma || vma->vm_flags & VM_SHARED)
2157 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2158 return 0;
1310} 2159}
1311 2160
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2161void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2162{
2163 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2164 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2165
1316 spin_lock(&inode->i_lock); 2166 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2167 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2168 spin_unlock(&inode->i_lock);
1319 2169
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2170 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2171 hugetlb_acct_memory(h, -(chg - freed));
1322} 2172}
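
To summarize the tail of the hugetlb.c diff: the rewritten hugetlb_reserve_pages() charges a shared mapping only for the pages region_chg() finds missing from the file's reservation map, while a private mapping reserves the whole range and is tagged HPAGE_RESV_OWNER. The sketch below is a standalone userspace model of the gap computation only; it does none of the list surgery the kernel's region code performs, and struct region is illustrative.

#include <stdio.h>

struct region { long from, to; };   /* sorted, non-overlapping intervals */

/* How many pages in [f, t) are not yet covered by the existing regions?
 * This is the quantity region_chg() reports for a shared mapping. */
static long region_gap(const struct region *rg, int nr, long f, long t)
{
        long chg = 0, pos = f;

        for (int i = 0; i < nr && pos < t; i++) {
                if (rg[i].to <= pos)
                        continue;                 /* region before our cursor */
                if (rg[i].from >= t)
                        break;                    /* region after our range   */
                if (rg[i].from > pos)
                        chg += rg[i].from - pos;  /* uncovered gap            */
                if (rg[i].to > pos)
                        pos = rg[i].to;
        }
        if (pos < t)
                chg += t - pos;                   /* tail past the last region */
        return chg;
}

int main(void)
{
        struct region existing[] = { { 0, 4 }, { 10, 12 } };

        /* Reserving [2, 11) needs 6 new pages: indexes 4..9. */
        printf("%ld\n", region_gap(existing, 2, 2, 11));
        return 0;
}
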
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..1f43f7416972 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 64#define __paginginit __init
60#endif 65#endif
61 66
67/* Memory initialisation debug and verification */
68enum mminit_level {
69 MMINIT_WARNING,
70 MMINIT_VERIFY,
71 MMINIT_TRACE
72};
73
74#ifdef CONFIG_DEBUG_MEMORY_INIT
75
76extern int mminit_loglevel;
77
78#define mminit_dprintk(level, prefix, fmt, arg...) \
79do { \
80 if (level < mminit_loglevel) { \
81 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
82 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
83 } \
84} while (0)
85
86extern void mminit_verify_pageflags_layout(void);
87extern void mminit_verify_page_links(struct page *page,
88 enum zone_type zone, unsigned long nid, unsigned long pfn);
89extern void mminit_verify_zonelist(void);
90
91#else
92
93static inline void mminit_dprintk(enum mminit_level level,
94 const char *prefix, const char *fmt, ...)
95{
96}
97
98static inline void mminit_verify_pageflags_layout(void)
99{
100}
101
102static inline void mminit_verify_page_links(struct page *page,
103 enum zone_type zone, unsigned long nid, unsigned long pfn)
104{
105}
106
107static inline void mminit_verify_zonelist(void)
108{
109}
110#endif /* CONFIG_DEBUG_MEMORY_INIT */
111
112/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
113#if defined(CONFIG_SPARSEMEM)
114extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
115 unsigned long *end_pfn);
116#else
117static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
118 unsigned long *end_pfn)
119{
120}
121#endif /* CONFIG_SPARSEMEM */
122
62#endif 123#endif
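
The mminit_dprintk() macro introduced above prints only when the message level is below the runtime mminit_loglevel, choosing KERN_WARNING or KERN_DEBUG and continuing the line with KERN_CONT. Below is a rough userspace model of the same gating idea, not the kernel macro itself; dbg_printk and dbg_loglevel are invented names and fprintf stands in for printk.

#include <stdio.h>

enum dbg_level { DBG_WARNING, DBG_VERIFY, DBG_TRACE };

static int dbg_loglevel = DBG_VERIFY + 1;     /* print WARNING and VERIFY */

/* gcc-style ##__VA_ARGS__ swallows the comma when no arguments follow. */
#define dbg_printk(level, prefix, fmt, ...) \
do { \
        if ((level) < dbg_loglevel) \
                fprintf(stderr, "%s mminit::%s " fmt, \
                        (level) <= DBG_WARNING ? "<warn>" : "<debug>", \
                        prefix, ##__VA_ARGS__); \
} while (0)

int main(void)
{
        dbg_printk(DBG_VERIFY, "zonelist", "checking node %d\n", 0);
        dbg_printk(DBG_TRACE, "zonelist", "suppressed at this loglevel\n");
        return 0;
}
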
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..fba566c51322 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37 37
38struct cgroup_subsys mem_cgroup_subsys; 38struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 39static struct kmem_cache *page_cgroup_cache __read_mostly;
40static struct kmem_cache *page_cgroup_cache; 40#define MEM_CGROUP_RECLAIM_RETRIES 5
41 41
42/* 42/*
43 * Statistics for memory cgroup. 43 * Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */ 166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page; 167 struct page *page;
168 struct mem_cgroup *mem_cgroup; 168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags; 169 int flags;
171}; 170};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
185enum charge_type { 184enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 186 MEM_CGROUP_CHARGE_TYPE_MAPPED,
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
188}; 188};
189 189
190/* 190/*
@@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 297
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
299 list_del_init(&pc->lru); 299 list_del(&pc->lru);
300} 300}
301 301
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
354 struct mem_cgroup_per_zone *mz; 354 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 355 unsigned long flags;
356 356
357 if (mem_cgroup_subsys.disabled)
358 return;
359
357 /* 360 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 361 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 362 * because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 527 * < 0 if the cgroup is over its limit
525 */ 528 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 529static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 530 gfp_t gfp_mask, enum charge_type ctype,
531 struct mem_cgroup *memcg)
528{ 532{
529 struct mem_cgroup *mem; 533 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 534 struct page_cgroup *pc;
@@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 536 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 537 struct mem_cgroup_per_zone *mz;
534 538
535 if (mem_cgroup_subsys.disabled) 539 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
536 return 0; 540 if (unlikely(pc == NULL))
537
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err; 541 goto err;
565 542
566 /* 543 /*
@@ -569,16 +546,18 @@ retry:
569 * thread group leader migrates. It's possible that mm is not 546 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 547 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 548 */
572 if (!mm) 549 if (likely(!memcg)) {
573 mm = &init_mm; 550 rcu_read_lock();
574 551 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
575 rcu_read_lock(); 552 /*
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 553 * For every charge from the cgroup, increment reference count
577 /* 554 */
578 * For every charge from the cgroup, increment reference count 555 css_get(&mem->css);
579 */ 556 rcu_read_unlock();
580 css_get(&mem->css); 557 } else {
581 rcu_read_unlock(); 558 mem = memcg;
559 css_get(&memcg->css);
560 }
582 561
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 562 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
584 if (!(gfp_mask & __GFP_WAIT)) 563 if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +582,24 @@ retry:
603 } 582 }
604 } 583 }
605 584
606 pc->ref_cnt = 1;
607 pc->mem_cgroup = mem; 585 pc->mem_cgroup = mem;
608 pc->page = page; 586 pc->page = page;
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 587 /*
 588 * If a page is accounted as page cache, insert it into the inactive list.
 589 * If anon, insert it into the active list.
590 */
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 591 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 592 pc->flags = PAGE_CGROUP_FLAG_CACHE;
593 else
594 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 595
613 lock_page_cgroup(page); 596 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) { 597 if (unlikely(page_get_page_cgroup(page))) {
615 unlock_page_cgroup(page); 598 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 599 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 600 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 601 kmem_cache_free(page_cgroup_cache, pc);
624 goto retry; 602 goto done;
625 } 603 }
626 page_assign_page_cgroup(page, pc); 604 page_assign_page_cgroup(page, pc);
627 605
@@ -642,24 +620,65 @@ err:
642 620
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 621int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 622{
623 if (mem_cgroup_subsys.disabled)
624 return 0;
625
626 /*
627 * If already mapped, we don't have to account.
628 * If page cache, page->mapping has address_space.
 629 * But page->mapping may hold a stale anon_vma pointer;
 630 * detect it with the PageAnon() check. A newly-mapped anon page's
 631 * page->mapping is NULL.
632 */
633 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
634 return 0;
635 if (unlikely(!mm))
636 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 637 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 638 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 639}
648 640
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 641int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 642 gfp_t gfp_mask)
651{ 643{
652 if (!mm) 644 if (mem_cgroup_subsys.disabled)
645 return 0;
646
647 /*
 648 * Corner case handling. This is usually called from add_to_page_cache(),
 649 * but some filesystems (shmem) precharge the page before calling it
 650 * and then call add_to_page_cache() with GFP_NOWAIT.
 651 *
 652 * In the GFP_NOWAIT case, the page may already be pre-charged before
 653 * add_to_page_cache() runs (see shmem.c). Check for that here to avoid
 654 * charging twice. (It works, but pays a slightly larger cost.)
655 */
656 if (!(gfp_mask & __GFP_WAIT)) {
657 struct page_cgroup *pc;
658
659 lock_page_cgroup(page);
660 pc = page_get_page_cgroup(page);
661 if (pc) {
662 VM_BUG_ON(pc->page != page);
663 VM_BUG_ON(!pc->mem_cgroup);
664 unlock_page_cgroup(page);
665 return 0;
666 }
667 unlock_page_cgroup(page);
668 }
669
670 if (unlikely(!mm))
653 mm = &init_mm; 671 mm = &init_mm;
672
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 673 return mem_cgroup_charge_common(page, mm, gfp_mask,
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 674 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
656} 675}
657 676
658/* 677/*
659 * Uncharging is always a welcome operation, we never complain, simply 678 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 679 */
662void mem_cgroup_uncharge_page(struct page *page) 680static void
681__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 682{
664 struct page_cgroup *pc; 683 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 684 struct mem_cgroup *mem;
@@ -674,98 +693,151 @@ void mem_cgroup_uncharge_page(struct page *page)
674 */ 693 */
675 lock_page_cgroup(page); 694 lock_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 695 pc = page_get_page_cgroup(page);
677 if (!pc) 696 if (unlikely(!pc))
678 goto unlock; 697 goto unlock;
679 698
680 VM_BUG_ON(pc->page != page); 699 VM_BUG_ON(pc->page != page);
681 VM_BUG_ON(pc->ref_cnt <= 0);
682 700
683 if (--(pc->ref_cnt) == 0) { 701 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
684 mz = page_cgroup_zoneinfo(pc); 702 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
685 spin_lock_irqsave(&mz->lru_lock, flags); 703 || page_mapped(page)))
686 __mem_cgroup_remove_list(mz, pc); 704 goto unlock;
687 spin_unlock_irqrestore(&mz->lru_lock, flags);
688 705
689 page_assign_page_cgroup(page, NULL); 706 mz = page_cgroup_zoneinfo(pc);
690 unlock_page_cgroup(page); 707 spin_lock_irqsave(&mz->lru_lock, flags);
708 __mem_cgroup_remove_list(mz, pc);
709 spin_unlock_irqrestore(&mz->lru_lock, flags);
691 710
692 mem = pc->mem_cgroup; 711 page_assign_page_cgroup(page, NULL);
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 712 unlock_page_cgroup(page);
694 css_put(&mem->css);
695 713
696 kmem_cache_free(page_cgroup_cache, pc); 714 mem = pc->mem_cgroup;
697 return; 715 res_counter_uncharge(&mem->res, PAGE_SIZE);
698 } 716 css_put(&mem->css);
699 717
718 kmem_cache_free(page_cgroup_cache, pc);
719 return;
700unlock: 720unlock:
701 unlock_page_cgroup(page); 721 unlock_page_cgroup(page);
702} 722}
703 723
724void mem_cgroup_uncharge_page(struct page *page)
725{
726 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
727}
728
729void mem_cgroup_uncharge_cache_page(struct page *page)
730{
731 VM_BUG_ON(page_mapped(page));
732 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
733}
734
704/* 735/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 736 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 737 */
708int mem_cgroup_prepare_migration(struct page *page) 738int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 739{
710 struct page_cgroup *pc; 740 struct page_cgroup *pc;
741 struct mem_cgroup *mem = NULL;
742 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
743 int ret = 0;
711 744
712 if (mem_cgroup_subsys.disabled) 745 if (mem_cgroup_subsys.disabled)
713 return 0; 746 return 0;
714 747
715 lock_page_cgroup(page); 748 lock_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 749 pc = page_get_page_cgroup(page);
717 if (pc) 750 if (pc) {
718 pc->ref_cnt++; 751 mem = pc->mem_cgroup;
752 css_get(&mem->css);
753 if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
754 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
755 }
719 unlock_page_cgroup(page); 756 unlock_page_cgroup(page);
720 return pc != NULL; 757 if (mem) {
758 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
759 ctype, mem);
760 css_put(&mem->css);
761 }
762 return ret;
721} 763}
722 764
723void mem_cgroup_end_migration(struct page *page) 765/* remove redundant charge if migration failed*/
766void mem_cgroup_end_migration(struct page *newpage)
724{ 767{
725 mem_cgroup_uncharge_page(page); 768 /*
 769 * On success, page->mapping is not NULL.
 770 * Special rollback care is necessary when
 771 * 1. migration fails (newpage->mapping is cleared in this case), or
 772 * 2. the newpage was moved but not remapped again because the task
 773 * exits and the newpage is obsolete. In this case, the new page
 774 * may be a swapcache. So we always call mem_cgroup_uncharge_page()
 775 * to avoid a mess. The page_cgroup will be removed if it is
 776 * unnecessary. File cache pages are still on the radix tree; don't
 777 * worry about them.
778 */
779 if (!newpage->mapping)
780 __mem_cgroup_uncharge_common(newpage,
781 MEM_CGROUP_CHARGE_TYPE_FORCE);
782 else if (PageAnon(newpage))
783 mem_cgroup_uncharge_page(newpage);
726} 784}
727 785
728/* 786/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 787 * A call to try to shrink memory usage under specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 788 * This is typically used for page reclaiming for shmem for reducing side
731 * has extra one reference by mem_cgroup_prepare_migration. 789 * effect of page allocation from shmem, which is used by some mem_cgroup.
732 */ 790 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 791int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 792{
735 struct page_cgroup *pc; 793 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 794 int progress = 0;
737 unsigned long flags; 795 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 796
739 lock_page_cgroup(page); 797 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 798 return 0;
741 if (!pc) {
742 unlock_page_cgroup(page);
743 return;
744 }
745 799
746 mz = page_cgroup_zoneinfo(pc); 800 rcu_read_lock();
747 spin_lock_irqsave(&mz->lru_lock, flags); 801 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
748 __mem_cgroup_remove_list(mz, pc); 802 css_get(&mem->css);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 803 rcu_read_unlock();
750 804
751 page_assign_page_cgroup(page, NULL); 805 do {
752 unlock_page_cgroup(page); 806 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
807 } while (!progress && --retry);
753 808
754 pc->page = newpage; 809 css_put(&mem->css);
755 lock_page_cgroup(newpage); 810 if (!retry)
756 page_assign_page_cgroup(newpage, pc); 811 return -ENOMEM;
812 return 0;
813}
757 814
758 mz = page_cgroup_zoneinfo(pc); 815int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
759 spin_lock_irqsave(&mz->lru_lock, flags); 816{
760 __mem_cgroup_add_list(mz, pc); 817
761 spin_unlock_irqrestore(&mz->lru_lock, flags); 818 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
819 int progress;
820 int ret = 0;
762 821
763 unlock_page_cgroup(newpage); 822 while (res_counter_set_limit(&memcg->res, val)) {
823 if (signal_pending(current)) {
824 ret = -EINTR;
825 break;
826 }
827 if (!retry_count) {
828 ret = -EBUSY;
829 break;
830 }
831 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
832 if (!progress)
833 retry_count--;
834 }
835 return ret;
764} 836}
765 837
838
766/* 839/*
767 * This routine traverse page_cgroup in given list and drop them all. 840 * This routine traverse page_cgroup in given list and drop them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 841 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
770 */ 842 */
771#define FORCE_UNCHARGE_BATCH (128) 843#define FORCE_UNCHARGE_BATCH (128)
@@ -790,12 +862,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
790 page = pc->page; 862 page = pc->page;
791 get_page(page); 863 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 864 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 865 /*
794 put_page(page); 866 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 867 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 868 */
869 if (PageLRU(page)) {
870 __mem_cgroup_uncharge_common(page,
871 MEM_CGROUP_CHARGE_TYPE_FORCE);
872 put_page(page);
873 if (--count <= 0) {
874 count = FORCE_UNCHARGE_BATCH;
875 cond_resched();
876 }
877 } else
797 cond_resched(); 878 cond_resched();
798 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 879 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 880 }
801 spin_unlock_irqrestore(&mz->lru_lock, flags); 881 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +890,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 890 int ret = -EBUSY;
811 int node, zid; 891 int node, zid;
812 892
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 893 css_get(&mem->css);
817 /* 894 /*
818 * page reclaim code (kswapd etc..) will move pages between 895 * page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +915,34 @@ out:
838 return ret; 915 return ret;
839} 916}
840 917
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 918static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 919{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 920 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 921 cft->private);
858} 922}
859 923/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 924 * The user of this function is...
861 struct file *file, const char __user *userbuf, 925 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 926 */
927static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
928 const char *buffer)
863{ 929{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 930 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 931 unsigned long long val;
866 mem_cgroup_write_strategy); 932 int ret;
933
934 switch (cft->private) {
935 case RES_LIMIT:
936 /* This function does all necessary parse...reuse it */
937 ret = res_counter_memparse_write_strategy(buffer, &val);
938 if (!ret)
939 ret = mem_cgroup_resize_limit(memcg, val);
940 break;
941 default:
942 ret = -EINVAL; /* should be BUG() ? */
943 break;
944 }
945 return ret;
867} 946}
868 947
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 948static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -940,7 +1019,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1019 {
941 .name = "limit_in_bytes", 1020 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1021 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1022 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1023 .read_u64 = mem_cgroup_read,
945 }, 1024 },
946 { 1025 {
@@ -1070,8 +1149,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1149static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1150 struct cgroup *cont)
1072{ 1151{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1152 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1153 ARRAY_SIZE(mem_cgroup_files));
1077} 1154}
@@ -1084,9 +1161,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1161 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1162 struct mem_cgroup *mem, *old_mem;
1086 1163
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1164 mm = get_task_mm(p);
1091 if (mm == NULL) 1165 if (mm == NULL)
1092 return; 1166 return;
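
One pattern worth pulling out of the memcontrol.c changes: mem_cgroup_resize_limit() keeps trying to install the new limit, reclaiming pages between attempts, and only counts rounds that made no progress against its retry budget. The fragment below is a standalone sketch of that loop shape; try_set_limit and reclaim_some are function-pointer stand-ins for res_counter_set_limit() and try_to_free_mem_cgroup_pages(), and the signal_pending() bail-out is omitted.

#include <errno.h>
#include <stdbool.h>

#define RECLAIM_RETRIES 5    /* mirrors MEM_CGROUP_RECLAIM_RETRIES */

static int resize_limit(bool (*try_set_limit)(unsigned long long),
                        bool (*reclaim_some)(void),
                        unsigned long long val)
{
        int retries = RECLAIM_RETRIES;

        while (!try_set_limit(val)) {
                if (!retries)
                        return -EBUSY;    /* usage never dropped below val */
                if (!reclaim_some())
                        retries--;        /* only fruitless rounds count */
        }
        return 0;
}

static bool always_fail(unsigned long long v) { (void)v; return false; }
static bool no_progress(void) { return false; }

int main(void)
{
        /* An unreachable limit plus reclaim that never progresses gives up
         * after RECLAIM_RETRIES fruitless rounds. */
        return resize_limit(always_fail, no_progress, 1ULL << 20) == -EBUSY ? 0 : 1;
}
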
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..262e3eb6601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
61#include <linux/swapops.h> 61#include <linux/swapops.h>
62#include <linux/elf.h> 62#include <linux/elf.h>
63 63
64#include "internal.h"
65
64#ifndef CONFIG_NEED_MULTIPLE_NODES 66#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 67/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 68unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 213 *
212 * Must be called with pagetable lock held. 214 * Must be called with pagetable lock held.
213 */ 215 */
214void free_pgd_range(struct mmu_gather **tlb, 216void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 217 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 218 unsigned long floor, unsigned long ceiling)
217{ 219{
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 264 return;
263 265
264 start = addr; 266 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 267 pgd = pgd_offset(tlb->mm, addr);
266 do { 268 do {
267 next = pgd_addr_end(addr, end); 269 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 270 if (pgd_none_or_clear_bad(pgd))
269 continue; 271 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 272 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 273 } while (pgd++, addr = next, addr != end);
272} 274}
273 275
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 276void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 277 unsigned long floor, unsigned long ceiling)
276{ 278{
277 while (vma) { 279 while (vma) {
@@ -899,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 901 }
900 902
901 if (unlikely(is_vm_hugetlb_page(vma))) { 903 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 904 /*
903 zap_work -= (end - start) / 905 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 906 * should be non-null for valid hugetlb area.
907 * However, vm_file will be NULL in the error
908 * cleanup path of do_mmap_pgoff. When
909 * hugetlbfs ->mmap method fails,
910 * do_mmap_pgoff() nullifies vma->vm_file
911 * before calling this function to clean up.
912 * Since no pte has actually been setup, it is
913 * safe to do nothing in this case.
914 */
915 if (vma->vm_file) {
916 unmap_hugepage_range(vma, start, end, NULL);
917 zap_work -= (end - start) /
918 pages_per_huge_page(hstate_vma(vma));
919 }
920
905 start = end; 921 start = end;
906 } else 922 } else
907 start = unmap_page_range(*tlbp, vma, 923 start = unmap_page_range(*tlbp, vma,
@@ -982,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 998 goto no_page_table;
983 999
984 pud = pud_offset(pgd, address); 1000 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1001 if (pud_none(*pud))
986 goto no_page_table; 1002 goto no_page_table;
987 1003 if (pud_huge(*pud)) {
1004 BUG_ON(flags & FOLL_GET);
1005 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1006 goto out;
1007 }
1008 if (unlikely(pud_bad(*pud)))
1009 goto no_page_table;
1010
988 pmd = pmd_offset(pud, address); 1011 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1012 if (pmd_none(*pmd))
990 goto no_page_table; 1013 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1014 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1015 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1016 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1017 goto out;
996 } 1018 }
997
998 if (unlikely(pmd_bad(*pmd))) 1019 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1020 goto no_page_table;
1000 1021
@@ -1058,11 +1079,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1079 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1080 return 0;
1060 /* 1081 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1082 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1083 */
1064 return !vma->vm_ops || 1084 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1085}
1067 1086
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1087int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1357,11 @@ out:
1338 * 1357 *
1339 * This function should only be called from a vm_ops->fault handler, and 1358 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1359 * in that case the handler should return NULL.
1360 *
1361 * vma cannot be a COW mapping.
1362 *
1363 * As this is called only for pages that do not currently exist, we
1364 * do not need to flush old virtual caches or the TLB.
1341 */ 1365 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1366int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1367 unsigned long pfn)
@@ -1548,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1572 unsigned long next;
1549 int err; 1573 int err;
1550 1574
1575 BUG_ON(pud_huge(*pud));
1576
1551 pmd = pmd_alloc(mm, pud, addr); 1577 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1578 if (!pmd)
1553 return -ENOMEM; 1579 return -ENOMEM;
@@ -2501,59 +2527,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2527 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2528}
2503 2529
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2530/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2531 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2532 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2587,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2587 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2588 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2589 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2590 }
2621 return do_anonymous_page(mm, vma, address, 2591 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2592 pte, pmd, write_access);
@@ -2804,6 +2774,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2774
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2775#endif /* __HAVE_ARCH_GATE_AREA */
2806 2776
2777#ifdef CONFIG_HAVE_IOREMAP_PROT
2778static resource_size_t follow_phys(struct vm_area_struct *vma,
2779 unsigned long address, unsigned int flags,
2780 unsigned long *prot)
2781{
2782 pgd_t *pgd;
2783 pud_t *pud;
2784 pmd_t *pmd;
2785 pte_t *ptep, pte;
2786 spinlock_t *ptl;
2787 resource_size_t phys_addr = 0;
2788 struct mm_struct *mm = vma->vm_mm;
2789
2790 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2791
2792 pgd = pgd_offset(mm, address);
2793 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2794 goto no_page_table;
2795
2796 pud = pud_offset(pgd, address);
2797 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2798 goto no_page_table;
2799
2800 pmd = pmd_offset(pud, address);
2801 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2802 goto no_page_table;
2803
2804 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2805 if (pmd_huge(*pmd))
2806 goto no_page_table;
2807
2808 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2809 if (!ptep)
2810 goto out;
2811
2812 pte = *ptep;
2813 if (!pte_present(pte))
2814 goto unlock;
2815 if ((flags & FOLL_WRITE) && !pte_write(pte))
2816 goto unlock;
2817 phys_addr = pte_pfn(pte);
2818 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2819
2820 *prot = pgprot_val(pte_pgprot(pte));
2821
2822unlock:
2823 pte_unmap_unlock(ptep, ptl);
2824out:
2825 return phys_addr;
2826no_page_table:
2827 return 0;
2828}
2829
2830int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2831 void *buf, int len, int write)
2832{
2833 resource_size_t phys_addr;
2834 unsigned long prot = 0;
2835 void *maddr;
2836 int offset = addr & (PAGE_SIZE-1);
2837
2838 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2839 return -EINVAL;
2840
2841 phys_addr = follow_phys(vma, addr, write, &prot);
2842
2843 if (!phys_addr)
2844 return -EINVAL;
2845
2846 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2847 if (write)
2848 memcpy_toio(maddr + offset, buf, len);
2849 else
2850 memcpy_fromio(buf, maddr + offset, len);
2851 iounmap(maddr);
2852
2853 return len;
2854}
2855#endif
2856
2807/* 2857/*
2808 * Access another process' address space. 2858 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2859 * Source/target buffer must be kernel space,
@@ -2813,7 +2863,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2863{
2814 struct mm_struct *mm; 2864 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2865 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2866 void *old_buf = buf;
2818 2867
2819 mm = get_task_mm(tsk); 2868 mm = get_task_mm(tsk);
@@ -2825,28 +2874,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2874 while (len) {
2826 int bytes, ret, offset; 2875 int bytes, ret, offset;
2827 void *maddr; 2876 void *maddr;
2877 struct page *page = NULL;
2828 2878
2829 ret = get_user_pages(tsk, mm, addr, 1, 2879 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2880 write, 1, &page, &vma);
2831 if (ret <= 0) 2881 if (ret <= 0) {
2832 break; 2882 /*
2833 2883 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2884 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2885 */
2836 if (bytes > PAGE_SIZE-offset) 2886#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2887 vma = find_vma(mm, addr);
2838 2888 if (!vma)
2839 maddr = kmap(page); 2889 break;
2840 if (write) { 2890 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2891 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2892 len, write);
2843 set_page_dirty_lock(page); 2893 if (ret <= 0)
2894#endif
2895 break;
2896 bytes = ret;
2844 } else { 2897 } else {
2845 copy_from_user_page(vma, page, addr, 2898 bytes = len;
2846 buf, maddr + offset, bytes); 2899 offset = addr & (PAGE_SIZE-1);
2900 if (bytes > PAGE_SIZE-offset)
2901 bytes = PAGE_SIZE-offset;
2902
2903 maddr = kmap(page);
2904 if (write) {
2905 copy_to_user_page(vma, page, addr,
2906 maddr + offset, buf, bytes);
2907 set_page_dirty_lock(page);
2908 } else {
2909 copy_from_user_page(vma, page, addr,
2910 buf, maddr + offset, bytes);
2911 }
2912 kunmap(page);
2913 page_cache_release(page);
2847 } 2914 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 2915 len -= bytes;
2851 buf += bytes; 2916 buf += bytes;
2852 addr += bytes; 2917 addr += bytes;
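
The reworked access_process_vm() loop above still copies in chunks that never cross a page boundary, whether it goes through get_user_pages() or the new vm_ops->access fallback. A tiny standalone demonstration of that chunking arithmetic, with a stand-in PG_SIZE instead of the kernel's PAGE_SIZE:

#include <stdio.h>

#define PG_SIZE 4096UL    /* stand-in for PAGE_SIZE */

int main(void)
{
        unsigned long addr = 0x1ff0, len = 40;

        while (len) {
                unsigned long offset = addr & (PG_SIZE - 1);
                unsigned long bytes = len;

                if (bytes > PG_SIZE - offset)
                        bytes = PG_SIZE - offset;   /* stop at the boundary */

                printf("copy %lu bytes at page offset %lu\n", bytes, offset);
                addr += bytes;
                len -= bytes;
        }
        return 0;
}
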
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..89fee2dcb039 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
62 62
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 65static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 66{
67 atomic_set(&page->_mapcount, magic); 67 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 68 SetPagePrivate(page);
69 set_page_private(page, info); 69 set_page_private(page, info);
70 atomic_inc(&page->_count); 70 atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 72
73void put_page_bootmem(struct page *page) 73void put_page_bootmem(struct page *page)
74{ 74{
75 int magic; 75 int type;
76 76
77 magic = atomic_read(&page->_mapcount); 77 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 78 BUG_ON(type >= -1);
79 79
80 if (atomic_dec_return(&page->_count) == 1) { 80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 81 ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
86 86
87} 87}
88 88
89void register_page_bootmem_info_section(unsigned long start_pfn) 89static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 90{
91 unsigned long *usemap, mapsize, section_nr, i; 91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 92 struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 120
121 for (i = 0; i < mapsize; i++, page++) 121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 122 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 123
124} 124}
125 125
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 429
430 if (need_zonelists_rebuild) 430 if (need_zonelists_rebuild)
431 build_all_zonelists(); 431 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 432 else
433 vm_total_pages = nr_free_pagecache_pages();
434
433 writeback_set_ratelimit(); 435 writeback_set_ratelimit();
434 436
435 if (onlined_pages) 437 if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 457 /* we can use NODE_DATA(nid) from here */
456 458
457 /* init node's zones as empty zones, we don't have any present pages.*/ 459 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 460 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 461
460 return pgdat; 462 return pgdat;
461} 463}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 523
522#ifdef CONFIG_MEMORY_HOTREMOVE 524#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 525/*
526 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
527 * set and the size of the free page is given by page_order(). Using this,
528 * the function determines if the pageblock contains only free pages.
529 * Due to buddy constraints, a free page at least the size of a pageblock will
530 * be located at the start of the pageblock
531 */
532static inline int pageblock_free(struct page *page)
533{
534 return PageBuddy(page) && page_order(page) >= pageblock_order;
535}
536
537/* Return the start of the next active pageblock after a given page */
538static struct page *next_active_pageblock(struct page *page)
539{
540 int pageblocks_stride;
541
542 /* Ensure the starting page is pageblock-aligned */
543 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
544
545 /* Move forward by at least 1 * pageblock_nr_pages */
546 pageblocks_stride = 1;
547
548 /* If the entire pageblock is free, move to the end of free page */
549 if (pageblock_free(page))
550 pageblocks_stride += page_order(page) - pageblock_order;
551
552 return page + (pageblocks_stride * pageblock_nr_pages);
553}
554
555/* Checks if this range of memory is likely to be hot-removable. */
556int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
557{
558 int type;
559 struct page *page = pfn_to_page(start_pfn);
560 struct page *end_page = page + nr_pages;
561
562 /* Check the starting page of each pageblock within the range */
563 for (; page < end_page; page = next_active_pageblock(page)) {
564 type = get_pageblock_migratetype(page);
565
566 /*
567 * A pageblock containing MOVABLE or free pages is considered
568 * removable
569 */
570 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
571 return 0;
572
573 /*
574 * A pageblock starting with a PageReserved page is not
575 * considered removable.
576 */
577 if (PageReserved(page))
578 return 0;
579 }
580
581 /* All pageblocks in the memory block are likely to be hot-removable */
582 return 1;
583}
584
585/*
524 * Confirm all pages in a range [start, end) is belongs to the same zone. 586 * Confirm all pages in a range [start, end) is belongs to the same zone.
525 */ 587 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 588static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
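The removability scan added above walks a memory section one pageblock at a time and jumps further ahead when a block begins with a free buddy page larger than a pageblock. A minimal userspace sketch of that stride arithmetic, with PAGEBLOCK_ORDER assumed to be 9 and PageBuddy()/page_order() reduced to a plain free_order argument:

#include <stdio.h>

#define PAGEBLOCK_ORDER    9                    /* assumption: 2MB blocks of 4KB pages */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

/*
 * Stride used by the removability scan: always at least one pageblock,
 * plus extra blocks when the block starts with a free buddy larger than
 * a pageblock (mirrors next_active_pageblock() in the hunk above).
 */
static unsigned long scan_stride(int free_order)
{
        unsigned long stride = 1;

        if (free_order >= PAGEBLOCK_ORDER)
                stride += free_order - PAGEBLOCK_ORDER;

        return stride * PAGEBLOCK_NR_PAGES;
}

int main(void)
{
        printf("block not free:      advance %lu pages\n", scan_stride(-1));
        printf("order-9 free buddy:  advance %lu pages\n", scan_stride(9));
        printf("order-10 free buddy: advance %lu pages\n", scan_stride(10));
        return 0;
}

scan_stride(-1) models a block that is not a single free buddy; larger orders skip the whole free area in one step.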
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..e550bec20582 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1481
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1484 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1485 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1486 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1487 if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2220{
2221 unsigned long addr; 2221 unsigned long addr;
2222 struct page *page; 2222 struct page *page;
2223 struct hstate *h = hstate_vma(vma);
2224 unsigned long sz = huge_page_size(h);
2223 2225
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2226 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2227 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2228 addr & huge_page_mask(h));
2226 pte_t pte; 2229 pte_t pte;
2227 2230
2228 if (!ptep) 2231 if (!ptep)
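The mempolicy hunks replace the fixed HPAGE_SHIFT/HPAGE_SIZE constants with the per-VMA hstate, so the step and mask for the walk come from the mapping at run time. A userspace sketch of that parameterised walk; the 2MB and 1GB sizes are illustrative and the pte lookup is reduced to a printf:

#include <stdio.h>

static void walk_range(unsigned long start, unsigned long end, unsigned long sz)
{
        unsigned long mask = ~(sz - 1);         /* huge_page_mask() equivalent */

        for (unsigned long addr = start; addr < end; addr += sz)
                printf("lookup huge pte at %#lx (page base %#lx)\n",
                       addr, addr & mask);
}

int main(void)
{
        walk_range(0x40200000UL, 0x40800000UL, 2UL << 20);      /* 2MB pages */
        walk_range(0x40000000UL, 0x80000000UL, 1UL << 30);      /* 1GB pages */
        return 0;
}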
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..d8c65a65c61d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -357,6 +358,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 358 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 359
359 write_unlock_irq(&mapping->tree_lock); 360 write_unlock_irq(&mapping->tree_lock);
361 if (!PageSwapCache(newpage)) {
362 mem_cgroup_uncharge_cache_page(page);
363 }
360 364
361 return 0; 365 return 0;
362} 366}
@@ -610,7 +614,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 614 rc = fallback_migrate_page(mapping, newpage, page);
611 615
612 if (!rc) { 616 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 617 remove_migration_ptes(page, newpage);
615 } else 618 } else
616 newpage->mapping = NULL; 619 newpage->mapping = NULL;
@@ -640,6 +643,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
640 /* page was freed from under us. So we are done. */ 643 /* page was freed from under us. So we are done. */
641 goto move_newpage; 644 goto move_newpage;
642 645
646 charge = mem_cgroup_prepare_migration(page, newpage);
647 if (charge == -ENOMEM) {
648 rc = -ENOMEM;
649 goto move_newpage;
650 }
651 /* prepare cgroup just returns 0 or -ENOMEM */
652 BUG_ON(charge);
653
643 rc = -EAGAIN; 654 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 655 if (TestSetPageLocked(page)) {
645 if (!force) 656 if (!force)
@@ -691,19 +702,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 702 goto rcu_unlock;
692 } 703 }
693 704
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 705 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 706 try_to_unmap(page, 1);
697 707
698 if (!page_mapped(page)) 708 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 709 rc = move_to_new_page(newpage, page);
700 710
701 if (rc) { 711 if (rc)
702 remove_migration_ptes(page, page); 712 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 713rcu_unlock:
708 if (rcu_locked) 714 if (rcu_locked)
709 rcu_read_unlock(); 715 rcu_read_unlock();
@@ -724,6 +730,8 @@ unlock:
724 } 730 }
725 731
726move_newpage: 732move_newpage:
733 if (!charge)
734 mem_cgroup_end_migration(newpage);
727 /* 735 /*
728 * Move the new page to the LRU. If migration was not successful 736 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 737 * then this will free the page.
@@ -1070,7 +1078,6 @@ out2:
1070 mmput(mm); 1078 mmput(mm);
1071 return err; 1079 return err;
1072} 1080}
1073#endif
1074 1081
1075/* 1082/*
1076 * Call migration functions in the vma_ops that may prepare 1083 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1099,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1099 }
1093 return err; 1100 return err;
1094} 1101}
1102#endif
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..c6af41ea9994
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel;
16
17/* The zonelists are simply reported, validation is manual. */
18void mminit_verify_zonelist(void)
19{
20 int nid;
21
22 if (mminit_loglevel < MMINIT_VERIFY)
23 return;
24
25 for_each_online_node(nid) {
26 pg_data_t *pgdat = NODE_DATA(nid);
27 struct zone *zone;
28 struct zoneref *z;
29 struct zonelist *zonelist;
30 int i, listid, zoneid;
31
32 BUG_ON(MAX_ZONELISTS > 2);
33 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
34
35 /* Identify the zone and nodelist */
36 zoneid = i % MAX_NR_ZONES;
37 listid = i / MAX_NR_ZONES;
38 zonelist = &pgdat->node_zonelists[listid];
39 zone = &pgdat->node_zones[zoneid];
40 if (!populated_zone(zone))
41 continue;
42
43 /* Print information about the zonelist */
44 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
45 listid > 0 ? "thisnode" : "general", nid,
46 zone->name);
47
48 /* Iterate the zonelist */
49 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
50#ifdef CONFIG_NUMA
51 printk(KERN_CONT "%d:%s ",
52 zone->node, zone->name);
53#else
54 printk(KERN_CONT "0:%s ", zone->name);
55#endif /* CONFIG_NUMA */
56 }
57 printk(KERN_CONT "\n");
58 }
59 }
60}
61
62void __init mminit_verify_pageflags_layout(void)
63{
64 int shift, width;
65 unsigned long or_mask, add_mask;
66
67 shift = 8 * sizeof(unsigned long);
68 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
69 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
70 "Section %d Node %d Zone %d Flags %d\n",
71 SECTIONS_WIDTH,
72 NODES_WIDTH,
73 ZONES_WIDTH,
74 NR_PAGEFLAGS);
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
76 "Section %d Node %d Zone %d\n",
77#ifdef SECTIONS_SHIFT
78 SECTIONS_SHIFT,
79#else
80 0,
81#endif
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
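mminit_verify_pageflags_layout() checks that the section, node and zone fields carved out of page->flags do not overlap by comparing an OR of the shifted masks with their sum: the two agree exactly when the masks are pairwise disjoint. A self-contained demo of that identity with made-up field widths:

#include <stdio.h>

static int fields_overlap(unsigned long long a, unsigned long long b,
                          unsigned long long c)
{
        /* (a | b | c) == (a + b + c) iff no bit is set in two masks */
        return (a | b | c) != (a + b + c);
}

int main(void)
{
        unsigned long long zones = 0x3ULL << 30;        /* bits 30-31 */
        unsigned long long nodes = 0x3ffULL << 20;      /* bits 20-29 */
        unsigned long long sects = 0xfffULL << 8;       /* bits 8-19  */

        printf("disjoint layout overlaps? %d\n", fields_overlap(zones, nodes, sects));
        printf("broken layout overlaps?   %d\n", fields_overlap(zones, nodes, nodes));
        return 0;
}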
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..5e0cc99e9cd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,8 @@
32#include <asm/tlb.h> 32#include <asm/tlb.h>
33#include <asm/mmu_context.h> 33#include <asm/mmu_context.h>
34 34
35#include "internal.h"
36
35#ifndef arch_mmap_check 37#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 38#define arch_mmap_check(addr, len, flags) (0)
37#endif 39#endif
@@ -1108,6 +1110,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1110 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1111 return -ENOMEM;
1110 1112
1113 if (flags & MAP_NORESERVE)
1114 vm_flags |= VM_NORESERVE;
1115
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1116 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1117 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1118 if (vm_flags & VM_SHARED) {
@@ -1763,7 +1768,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1768 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1769 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1770 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1771 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1772 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1773 tlb_finish_mmu(tlb, start, end);
1769} 1774}
@@ -1807,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1812 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1813 struct vm_area_struct *new;
1809 1814
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1815 if (is_vm_hugetlb_page(vma) && (addr &
1816 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1817 return -EINVAL;
1812 1818
1813 if (mm->map_count >= sysctl_max_map_count) 1819 if (mm->map_count >= sysctl_max_map_count)
@@ -2063,7 +2069,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2069 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2070 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2071 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2072 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2073 tlb_finish_mmu(tlb, 0, end);
2068 2074
2069 /* 2075 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..abd645a3b0a0 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -153,12 +153,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 153 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 154 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 155 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 156 */
160 if (newflags & VM_WRITE) { 157 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 158 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
159 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 160 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 161 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 162 return -ENOMEM;
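Together with the mmap.c hunk that records MAP_NORESERVE as VM_NORESERVE, the condition above decides whether making a private mapping writable must charge commit. A small model of that decision; the flag values are illustrative, not the kernel's:

#include <stdio.h>

#define VM_WRITE     0x00000002UL
#define VM_SHARED    0x00000008UL
#define VM_ACCOUNT   0x00100000UL       /* illustrative bit values */
#define VM_NORESERVE 0x00200000UL

/* Does turning on write permission have to charge commit? */
static int needs_charge(unsigned long oldflags, unsigned long newflags)
{
        return (newflags & VM_WRITE) &&
               !(oldflags & (VM_ACCOUNT | VM_WRITE | VM_SHARED | VM_NORESERVE));
}

int main(void)
{
        printf("plain private ro->rw:  %d\n", needs_charge(0, VM_WRITE));
        printf("MAP_NORESERVE mapping: %d\n",
               needs_charge(VM_NORESERVE, VM_NORESERVE | VM_WRITE));
        printf("shared mapping:        %d\n",
               needs_charge(VM_SHARED, VM_SHARED | VM_WRITE));
        return 0;
}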
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..6da667274df5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -264,7 +264,7 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265} 265}
266 266
267static void prep_compound_page(struct page *page, unsigned long order) 267void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
@@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page,
432 432
433 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 434 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 435 break;
436 436
437 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 438 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 439 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 440 rmv_page_order(buddy);
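The new comment spells out the loop invariant: each pass merges the block with its free buddy and moves up one order. The buddy of the 2^order block at index i is i ^ (1 << order), and the merged block starts at i & buddy; a small demo of just that index arithmetic (no struct page involved):

#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long idx = 8;          /* merging upward from page index 8 */

        for (unsigned int order = 0; order < 4; order++) {
                unsigned long buddy = buddy_index(idx, order);

                printf("order %u: block %lu + buddy %lu -> block %lu\n",
                       order, idx, buddy, idx & buddy);
                idx &= buddy;           /* combined block starts at the lower index */
        }
        return 0;
}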
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -673,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 674 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 675 * boundary. If alignment is required, use move_freepages_block()
675 */ 676 */
676int move_freepages(struct zone *zone, 677static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 678 struct page *start_page, struct page *end_page,
678 int migratetype) 679 int migratetype)
679{ 680{
680 struct page *page; 681 struct page *page;
681 unsigned long order; 682 unsigned long order;
@@ -714,7 +715,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 715 return pages_moved;
715} 716}
716 717
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 718static int move_freepages_block(struct zone *zone, struct page *page,
719 int migratetype)
718{ 720{
719 unsigned long start_pfn, end_pfn; 721 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 722 struct page *start_page, *end_page;
@@ -1429,7 +1431,7 @@ try_next_zone:
1429/* 1431/*
1430 * This is the 'heart' of the zoned buddy allocator. 1432 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1433 */
1432static struct page * 1434struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1435__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1436 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1437{
@@ -1632,22 +1634,7 @@ nopage:
1632got_pg: 1634got_pg:
1633 return page; 1635 return page;
1634} 1636}
1635 1637EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1638
1652/* 1639/*
1653 * Common helper functions. 1640 * Common helper functions.
@@ -1711,6 +1698,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1698
1712EXPORT_SYMBOL(free_pages); 1699EXPORT_SYMBOL(free_pages);
1713 1700
1701/**
1702 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
1703 * @size: the number of bytes to allocate
1704 * @gfp_mask: GFP flags for the allocation
1705 *
1706 * This function is similar to alloc_pages(), except that it allocates the
1707 * minimum number of pages to satisfy the request. alloc_pages() can only
1708 * allocate memory in power-of-two pages.
1709 *
1710 * This function is also limited by MAX_ORDER.
1711 *
1712 * Memory allocated by this function must be released by free_pages_exact().
1713 */
1714void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1715{
1716 unsigned int order = get_order(size);
1717 unsigned long addr;
1718
1719 addr = __get_free_pages(gfp_mask, order);
1720 if (addr) {
1721 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1722 unsigned long used = addr + PAGE_ALIGN(size);
1723
1724 split_page(virt_to_page(addr), order);
1725 while (used < alloc_end) {
1726 free_page(used);
1727 used += PAGE_SIZE;
1728 }
1729 }
1730
1731 return (void *)addr;
1732}
1733EXPORT_SYMBOL(alloc_pages_exact);
1734
1735/**
1736 * free_pages_exact - release memory allocated via alloc_pages_exact()
1737 * @virt: the value returned by alloc_pages_exact.
1738 * @size: size of allocation, same value as passed to alloc_pages_exact().
1739 *
1740 * Release the memory allocated by a previous call to alloc_pages_exact.
1741 */
1742void free_pages_exact(void *virt, size_t size)
1743{
1744 unsigned long addr = (unsigned long)virt;
1745 unsigned long end = addr + PAGE_ALIGN(size);
1746
1747 while (addr < end) {
1748 free_page(addr);
1749 addr += PAGE_SIZE;
1750 }
1751}
1752EXPORT_SYMBOL(free_pages_exact);
1753
1714static unsigned int nr_free_zone_pages(int offset) 1754static unsigned int nr_free_zone_pages(int offset)
1715{ 1755{
1716 struct zoneref *z; 1756 struct zoneref *z;
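alloc_pages_exact() still allocates a power-of-two block, then splits it and hands the tail pages straight back, so only PAGE_ALIGN(size) worth of pages stays allocated. A userspace sketch of that arithmetic, assuming 4KB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumption: 4KB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order whose block covers size bytes (get_order()-style). */
static unsigned int order_for(size_t size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;
        return order;
}

/* How many pages the exact allocation keeps and how many it frees back. */
static void exact_alloc(size_t size)
{
        unsigned int order = order_for(size);
        unsigned long total = 1UL << order;
        unsigned long used = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        printf("size %zu: order-%u block (%lu pages), %lu tail pages freed\n",
               size, order, total, total - used);
}

int main(void)
{
        exact_alloc(5 * PAGE_SIZE);     /* order-3 block, 3 pages handed back */
        exact_alloc(PAGE_SIZE);         /* order-0, nothing to free */
        return 0;
}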
@@ -2352,6 +2392,7 @@ void build_all_zonelists(void)
2352 2392
2353 if (system_state == SYSTEM_BOOTING) { 2393 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2394 __build_all_zonelists(NULL);
2395 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2396 cpuset_init_current_mems_allowed();
2356 } else { 2397 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2398 /* we have to stop all cpus to guarantee there is no user
@@ -2534,6 +2575,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2575 }
2535 page = pfn_to_page(pfn); 2576 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2577 set_page_links(page, zone, nid, pfn);
2578 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2579 init_page_count(page);
2538 reset_page_mapcount(page); 2580 reset_page_mapcount(page);
2539 SetPageReserved(page); 2581 SetPageReserved(page);
@@ -2611,7 +2653,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2653 return batch;
2612} 2654}
2613 2655
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2656static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2657{
2616 struct per_cpu_pages *pcp; 2658 struct per_cpu_pages *pcp;
2617 2659
@@ -2836,6 +2878,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2878
2837 zone->zone_start_pfn = zone_start_pfn; 2879 zone->zone_start_pfn = zone_start_pfn;
2838 2880
2881 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2882 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2883 pgdat->node_id,
2884 (unsigned long)zone_idx(zone),
2885 zone_start_pfn, (zone_start_pfn + size));
2886
2839 zone_init_free_lists(zone); 2887 zone_init_free_lists(zone);
2840 2888
2841 return 0; 2889 return 0;
@@ -2975,7 +3023,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3023void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3024 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3025{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3026 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3027 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3028 nid, start_pfn, end_pfn);
2980 3029
2981 /* Initialise the boundary for this node if necessary */ 3030 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3042,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3042static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3043 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3044{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3045 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3046 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3047 nid, *start_pfn, *end_pfn);
2998 3048
2999 /* Return if boundary information has not been provided */ 3049 /* Return if boundary information has not been provided */
@@ -3050,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3100 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3101 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3102 */
3053void __init find_usable_zone_for_movable(void) 3103static void __init find_usable_zone_for_movable(void)
3054{ 3104{
3055 int zone_index; 3105 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3106 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3126,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3126 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3127 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3128 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3129static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3130 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3131 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3132 unsigned long node_end_pfn,
@@ -3137,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3188 * then all holes in the requested range will be accounted for.
3139 */ 3189 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3190static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3191 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3192 unsigned long range_end_pfn)
3143{ 3193{
@@ -3368,8 +3418,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3368 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3418 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3369 if (realsize >= memmap_pages) { 3419 if (realsize >= memmap_pages) {
3370 realsize -= memmap_pages; 3420 realsize -= memmap_pages;
3371 printk(KERN_DEBUG 3421 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3372 " %s zone: %lu pages used for memmap\n", 3422 "%s zone: %lu pages used for memmap\n",
3373 zone_names[j], memmap_pages); 3423 zone_names[j], memmap_pages);
3374 } else 3424 } else
3375 printk(KERN_WARNING 3425 printk(KERN_WARNING
@@ -3379,7 +3429,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3379 /* Account for reserved pages */ 3429 /* Account for reserved pages */
3380 if (j == 0 && realsize > dma_reserve) { 3430 if (j == 0 && realsize > dma_reserve) {
3381 realsize -= dma_reserve; 3431 realsize -= dma_reserve;
3382 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3432 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3433 "%s zone: %lu pages reserved\n",
3383 zone_names[0], dma_reserve); 3434 zone_names[0], dma_reserve);
3384 } 3435 }
3385 3436
@@ -3464,10 +3515,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3515#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3516}
3466 3517
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3518void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3519 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3520{
3521 pg_data_t *pgdat = NODE_DATA(nid);
3522
3471 pgdat->node_id = nid; 3523 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3524 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3525 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3572,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3572{
3521 int i; 3573 int i;
3522 3574
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3575 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3576 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3577 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3578 nid, start_pfn, end_pfn,
3579 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3580
3581 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3582
3528 /* Merge with existing active regions if possible */ 3583 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3584 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3724,7 @@ static void __init sort_node_map(void)
3669} 3724}
3670 3725
3671/* Find the lowest pfn for a node */ 3726/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3727static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3728{
3674 int i; 3729 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3730 unsigned long min_pfn = ULONG_MAX;
@@ -3741,7 +3796,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3796 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3797 * others
3743 */ 3798 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3799static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3800{
3746 int i, nid; 3801 int i, nid;
3747 unsigned long usable_startpfn; 3802 unsigned long usable_startpfn;
@@ -3957,10 +4012,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 4012 early_node_map[i].end_pfn);
3958 4013
3959 /* Initialise every node */ 4014 /* Initialise every node */
4015 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 4016 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4017 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4018 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4019 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4020 find_min_pfn_for_node(nid), NULL);
3965 4021
3966 /* Any memory on that node */ 4022 /* Any memory on that node */
@@ -4025,15 +4081,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4081}
4026 4082
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4083#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4084struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4085EXPORT_SYMBOL(contig_page_data);
4032#endif 4086#endif
4033 4087
4034void __init free_area_init(unsigned long *zones_size) 4088void __init free_area_init(unsigned long *zones_size)
4035{ 4089{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4090 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4091 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4092}
4039 4093
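Many of the page_alloc.c hunks convert unconditional printk(KERN_DEBUG ...) calls to mminit_dprintk(), which is gated on a boot-time log level. A rough userspace imitation of that pattern; the level comparison and message prefixing here are simplified assumptions, not the kernel macro:

#include <stdio.h>

enum { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = MMINIT_VERIFY;     /* would come from mminit_loglevel= */

#define mminit_dprintk(level, prefix, fmt, ...)                           \
        do {                                                              \
                if ((level) <= mminit_loglevel)                           \
                        printf("mminit::%s " fmt, prefix, ##__VA_ARGS__); \
        } while (0)

int main(void)
{
        mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
                       "range clamped to %lu pfns\n", 1048576UL);
        mminit_dprintk(MMINIT_TRACE, "memmap_init",
                       "suppressed at this log level\n");
        return 0;
}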
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..0cbe0c60c6bf 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
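time_after() compares two timestamps by the sign of their signed difference, so the result stays correct across a counter wrap; the open-coded jiffies subtractions above are converted to it mainly for clarity and consistency. A userspace model using 8-bit ticks so the wrap is easy to provoke (the cast relies on two's complement, as the kernel macro does):

#include <stdio.h>

typedef unsigned char tick_t;

static int time_after_model(tick_t a, tick_t b)
{
        return (signed char)(tick_t)(b - a) < 0;        /* true if a is later than b */
}

int main(void)
{
        tick_t deadline = (tick_t)(250 + 10);           /* 260 wraps to 4 */

        for (tick_t now = 253; now != 10; now++)
                printf("now=%3d  naive now>deadline: %d  time_after: %d\n",
                       now, now > deadline, time_after_model(now, deadline));
        return 0;
}

The naive direct comparison flips the wrong way while the counter sits just before the wrap; the signed-difference form does not.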
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..abbd29f7c43f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -576,14 +576,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 577 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 578 __page_set_anon_rmap(page, vma, address);
579 else { 579 else
580 __page_check_anon_rmap(page, vma, address); 580 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 581}
588 582
589/** 583/**
@@ -614,12 +608,6 @@ void page_add_file_rmap(struct page *page)
614{ 608{
615 if (atomic_inc_and_test(&page->_mapcount)) 609 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 610 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 611}
624 612
625#ifdef CONFIG_DEBUG_VM 613#ifdef CONFIG_DEBUG_VM
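With the memcg calls gone, page_add_file_rmap() is back to keying everything off atomic_inc_and_test(&page->_mapcount): _mapcount starts at -1, so the increment yields zero exactly for the first mapping. A plain-int model of that convention:

#include <stdio.h>

static int inc_and_test(int *v)
{
        return ++*v == 0;               /* true only when the counter reaches 0 */
}

int main(void)
{
        int mapcount = -1;              /* freshly allocated page */

        for (int i = 1; i <= 3; i++) {
                int first = inc_and_test(&mapcount);

                printf("mapping %d: first mapping? %d (mapcount now %d)\n",
                       i, first, mapcount);
        }
        return 0;
}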
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..f92fea94d037 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
922 error = 1; 922 error = 1;
923 if (!inode) 923 if (!inode)
924 goto out; 924 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 925 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 927 if (error)
928 goto out; 928 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 929 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 930 if (error) {
931 goto uncharge; 931 mem_cgroup_uncharge_cache_page(page);
932 goto out;
933 }
932 error = 1; 934 error = 1;
933 935
934 spin_lock(&info->lock); 936 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 938 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */
943 mem_cgroup_uncharge_cache_page(page);
944
939 if (error == -EEXIST) { 945 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 946 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 947 error = 1;
@@ -961,8 +967,6 @@ found:
961 shmem_swp_unmap(ptr); 967 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 968 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 969 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 970out:
967 unlock_page(page); 971 unlock_page(page);
968 page_cache_release(page); 972 page_cache_release(page);
@@ -1311,17 +1315,14 @@ repeat:
1311 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1317 unlock_page(swappage);
1318 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1319 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1320 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1321 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1322 gfp);
1318 if (error) { 1323 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1324 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1325 }
1324 page_cache_release(swappage);
1325 goto repeat; 1326 goto repeat;
1326 } 1327 }
1327 } else if (sgp == SGP_READ && !filepage) { 1328 } else if (sgp == SGP_READ && !filepage) {
@@ -1358,6 +1359,8 @@ repeat:
1358 } 1359 }
1359 1360
1360 if (!filepage) { 1361 if (!filepage) {
1362 int ret;
1363
1361 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1365 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1366 if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
1386 swap = *entry; 1389 swap = *entry;
1387 shmem_swp_unmap(entry); 1390 shmem_swp_unmap(entry);
1388 } 1391 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1392 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1393 if (ret)
1394 mem_cgroup_uncharge_cache_page(filepage);
1395 else
1396 ret = add_to_page_cache_lru(filepage, mapping,
1397 idx, GFP_NOWAIT);
1398 /*
1399 * At add_to_page_cache_lru() failure, uncharge will
1400 * be done automatically.
1401 */
1402 if (ret) {
1391 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1404 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1405 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1406 shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
1398 goto failed; 1409 goto failed;
1399 goto repeat; 1410 goto repeat;
1400 } 1411 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1412 info->flags |= SHMEM_PAGEIN;
1403 } 1413 }
1404 1414
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1700 file_accessed(filp);
1691} 1701}
1692 1702
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1703static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1704 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1705{
1695 read_descriptor_t desc; 1706 struct file *filp = iocb->ki_filp;
1707 ssize_t retval;
1708 unsigned long seg;
1709 size_t count;
1710 loff_t *ppos = &iocb->ki_pos;
1696 1711
1697 if ((ssize_t) count < 0) 1712 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1713 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1714 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1715
1704 desc.written = 0; 1716 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1717 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1718
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1719 desc.written = 0;
1710 if (desc.written) 1720 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1721 desc.count = iov[seg].iov_len;
1712 return desc.error; 1722 if (desc.count == 0)
1723 continue;
1724 desc.error = 0;
1725 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1726 retval += desc.written;
1727 if (desc.error) {
1728 retval = retval ?: desc.error;
1729 break;
1730 }
1731 if (desc.count > 0)
1732 break;
1733 }
1734 return retval;
1713} 1735}
1714 1736
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1737static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -2369,8 +2391,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2391 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2392#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2393 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2394 .read = do_sync_read,
2373 .write = do_sync_write, 2395 .write = do_sync_write,
2396 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2397 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2398 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2399 .splice_read = generic_file_splice_read,
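shmem_file_aio_read() now loops over the iovec itself, accumulating the bytes copied and stopping early on an error or a short segment. A userspace model of that per-segment loop, with the file reduced to a memory buffer and no locking or page cache:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

static ssize_t read_model(const char *file, size_t filesize, size_t *pos,
                          const struct iovec *iov, int nr_segs)
{
        ssize_t retval = 0;

        for (int seg = 0; seg < nr_segs; seg++) {
                size_t want = iov[seg].iov_len;
                size_t have = (*pos < filesize) ? filesize - *pos : 0;
                size_t n = want < have ? want : have;

                if (want == 0)
                        continue;
                memcpy(iov[seg].iov_base, file + *pos, n);
                *pos += n;
                retval += n;
                if (n < want)           /* short read: keep what was copied */
                        break;
        }
        return retval;
}

int main(void)
{
        char a[4], b[8];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        const char *data = "hello shmem";
        size_t pos = 0;

        printf("read %zd bytes\n", read_model(data, strlen(data), &pos, iov, 2));
        return 0;
}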
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..de268eb7ac70 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
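The slob conversion replaces open-coded test_bit/__set_bit/__clear_bit on page->flags with generated PageSlobPage()/PageSlobFree() style accessors. A rough userspace imitation of that macro-generated accessor pattern; the struct, names and bit numbers are made up:

#include <stdio.h>

struct page { unsigned long flags; };

/* Generate PageFoo()/SetPageFoo()/ClearPageFoo() for one flag bit. */
#define PAGEFLAG(uname, bit)                                                             \
        static int  Page##uname(struct page *p)      { return (p->flags >> (bit)) & 1; } \
        static void SetPage##uname(struct page *p)   { p->flags |= 1UL << (bit); }       \
        static void ClearPage##uname(struct page *p) { p->flags &= ~(1UL << (bit)); }

PAGEFLAG(SlobPage, 6)
PAGEFLAG(SlobFree, 7)

int main(void)
{
        struct page pg = { 0 };

        SetPageSlobPage(&pg);
        SetPageSlobFree(&pg);
        printf("slob page? %d  free? %d\n", PageSlobPage(&pg), PageSlobFree(&pg));

        ClearPageSlobFree(&pg);
        ClearPageSlobPage(&pg);
        printf("slob page? %d  free? %d\n", PageSlobPage(&pg), PageSlobFree(&pg));
        return 0;
}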
diff --git a/mm/slub.c b/mm/slub.c
index 6d4a49c1ff2f..77c21cf53ff9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -102,44 +102,12 @@
102 * the fast path and disables lockless freelists. 102 * the fast path and disables lockless freelists.
103 */ 103 */
104 104
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 105#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 106#define SLABDEBUG 1
109#else 107#else
110#define SLABDEBUG 0 108#define SLABDEBUG 0
111#endif 109#endif
112 110
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 111/*
144 * Issues still to be resolved: 112 * Issues still to be resolved:
145 * 113 *
@@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 939 }
972 940
973 /* Special debug activities for freeing objects */ 941 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 942 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 943 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 944 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 945 set_track(s, object, TRACK_FREE, addr);
@@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1125 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1126 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1127 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1128 __SetPageSlubDebug(page);
1161 1129
1162 start = page_address(page); 1130 start = page_address(page);
1163 1131
@@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1152 int order = compound_order(page);
1185 int pages = 1 << order; 1153 int pages = 1 << order;
1186 1154
1187 if (unlikely(SlabDebug(page))) { 1155 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1156 void *p;
1189 1157
1190 slab_pad_check(s, page); 1158 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1159 for_each_object(p, s, page_address(page),
1192 page->objects) 1160 page->objects)
1193 check_object(s, page, p, 0); 1161 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1162 __ClearPageSlubDebug(page);
1195 } 1163 }
1196 1164
1197 mod_zone_page_state(page_zone(page), 1165 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1256 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1257 list_del(&page->lru);
1290 n->nr_partial--; 1258 n->nr_partial--;
1291 SetSlabFrozen(page); 1259 __SetPageSlubFrozen(page);
1292 return 1; 1260 return 1;
1293 } 1261 }
1294 return 0; 1262 return 0;
@@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1366 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1367 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1368
1401 ClearSlabFrozen(page); 1369 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1370 if (page->inuse) {
1403 1371
1404 if (page->freelist) { 1372 if (page->freelist) {
@@ -1406,7 +1374,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1374 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1375 } else {
1408 stat(c, DEACTIVATE_FULL); 1376 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1377 if (SLABDEBUG && PageSlubDebug(page) &&
1378 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1379 add_full(n, page);
1411 } 1380 }
1412 slab_unlock(page); 1381 slab_unlock(page);
@@ -1551,7 +1520,7 @@ load_freelist:
1551 object = c->page->freelist; 1520 object = c->page->freelist;
1552 if (unlikely(!object)) 1521 if (unlikely(!object))
1553 goto another_slab; 1522 goto another_slab;
1554 if (unlikely(SlabDebug(c->page))) 1523 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1555 goto debug; 1524 goto debug;
1556 1525
1557 c->freelist = object[c->offset]; 1526 c->freelist = object[c->offset];
@@ -1588,7 +1557,7 @@ new_slab:
1588 if (c->page) 1557 if (c->page)
1589 flush_slab(s, c); 1558 flush_slab(s, c);
1590 slab_lock(new); 1559 slab_lock(new);
1591 SetSlabFrozen(new); 1560 __SetPageSlubFrozen(new);
1592 c->page = new; 1561 c->page = new;
1593 goto load_freelist; 1562 goto load_freelist;
1594 } 1563 }
@@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1674 stat(c, FREE_SLOWPATH); 1643 stat(c, FREE_SLOWPATH);
1675 slab_lock(page); 1644 slab_lock(page);
1676 1645
1677 if (unlikely(SlabDebug(page))) 1646 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1678 goto debug; 1647 goto debug;
1679 1648
1680checks_ok: 1649checks_ok:
@@ -1682,7 +1651,7 @@ checks_ok:
1682 page->freelist = object; 1651 page->freelist = object;
1683 page->inuse--; 1652 page->inuse--;
1684 1653
1685 if (unlikely(SlabFrozen(page))) { 1654 if (unlikely(PageSlubFrozen(page))) {
1686 stat(c, FREE_FROZEN); 1655 stat(c, FREE_FROZEN);
1687 goto out_unlock; 1656 goto out_unlock;
1688 } 1657 }
@@ -3317,12 +3286,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3317 s->name, page); 3286 s->name, page);
3318 3287
3319 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3288 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3320 if (!SlabDebug(page)) 3289 if (!PageSlubDebug(page))
3321 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3290 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3322 "on slab 0x%p\n", s->name, page); 3291 "on slab 0x%p\n", s->name, page);
3323 } else { 3292 } else {
3324 if (SlabDebug(page)) 3293 if (PageSlubDebug(page))
3325 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3294 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3326 "slab 0x%p\n", s->name, page); 3295 "slab 0x%p\n", s->name, page);
3327 } 3296 }
3328} 3297}
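The slub hunks guard the debug-only paths with "SLABDEBUG && PageSlubDebug(page)", where SLABDEBUG is a compile-time 0 or 1, so a !CONFIG_SLUB_DEBUG build can drop the whole branch. A minimal illustration of that idiom:

#include <stdio.h>

#ifdef SLUB_DEBUG_MODEL
#define SLABDEBUG 1
#else
#define SLABDEBUG 0
#endif

static int page_debug_flag = 1;         /* stands in for PageSlubDebug(page) */

static void free_slab_model(void)
{
        if (SLABDEBUG && page_debug_flag)
                puts("running slab_pad_check()/check_object()");
        puts("freeing slab pages");
}

int main(void)
{
        free_slab_model();              /* build with -DSLUB_DEBUG_MODEL to enable checks */
        return 0;
}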
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..8ffc08990008 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,6 +12,7 @@
12#include <asm/dma.h> 12#include <asm/dma.h>
13#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include "internal.h"
15 16
16/* 17/*
17 * Permanent SPARSEMEM data: 18 * Permanent SPARSEMEM data:
@@ -147,22 +148,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 148 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 149}
149 150
150/* Record a memory area against a node. */ 151/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 152void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
153 unsigned long *end_pfn)
152{ 154{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 155 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 156
156 /* 157 /*
157 * Sanity checks - do not allow an architecture to pass 158 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 159 * in larger pfns than the maximum scope of sparsemem:
159 */ 160 */
160 if (start >= max_arch_pfn) 161 if (*start_pfn > max_sparsemem_pfn) {
161 return; 162 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 163 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 164 *start_pfn, *end_pfn, max_sparsemem_pfn);
165 WARN_ON_ONCE(1);
166 *start_pfn = max_sparsemem_pfn;
167 *end_pfn = max_sparsemem_pfn;
168 }
169
170 if (*end_pfn > max_sparsemem_pfn) {
171 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
172 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
173 *start_pfn, *end_pfn, max_sparsemem_pfn);
174 WARN_ON_ONCE(1);
175 *end_pfn = max_sparsemem_pfn;
176 }
177}
178
179/* Record a memory area against a node. */
180void __init memory_present(int nid, unsigned long start, unsigned long end)
181{
182 unsigned long pfn;
164 183
165 start &= PAGE_SECTION_MASK; 184 start &= PAGE_SECTION_MASK;
185 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 186 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 187 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 188 struct mem_section *ms;
@@ -187,6 +207,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 207 unsigned long pfn;
188 unsigned long nr_pages = 0; 208 unsigned long nr_pages = 0;
189 209
210 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 211 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 212 if (nid != early_pfn_to_nid(pfn))
192 continue; 213 continue;
@@ -248,16 +269,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 269}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 270#endif /* CONFIG_MEMORY_HOTPLUG */
250 271
272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
275{
276 unsigned long section_nr;
277
278 /*
279 * A page may contain usemaps for other sections preventing the
280 * page being freed and making a section unremovable while
281 * other sections referencing the usemap remain active. Similarly,
282 * a pgdat can prevent a section being removed. If section A
283 * contains a pgdat and section B contains the usemap, both
284 * sections become inter-dependent. This allocates usemaps
285 * from the same section as the pgdat where possible to avoid
286 * this problem.
287 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr);
290}
291
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
293{
294 unsigned long usemap_snr, pgdat_snr;
295 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
296 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
297 struct pglist_data *pgdat = NODE_DATA(nid);
298 int usemap_nid;
299
300 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
301 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
302 if (usemap_snr == pgdat_snr)
303 return;
304
305 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
306 /* skip redundant message */
307 return;
308
309 old_usemap_snr = usemap_snr;
310 old_pgdat_snr = pgdat_snr;
311
312 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
313 if (usemap_nid != nid) {
314 printk(KERN_INFO
315 "node %d must be removed before remove section %ld\n",
316 nid, usemap_snr);
317 return;
318 }
319 /*
320 * There is a circular dependency.
321 * Some platforms allow un-removable section because they will just
322 * gather other removable sections for dynamic partitioning.
323 * Just notify un-removable section's number here.
324 */
325 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
326 pgdat_snr, nid);
327 printk(KERN_CONT
328 " have a circular dependency on usemap and pgdat allocations\n");
329}
330#else
331static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
333{
334 return NULL;
335}
336
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
338{
339}
340#endif /* CONFIG_MEMORY_HOTREMOVE */
341
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 343{
253 unsigned long *usemap; 344 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 345 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 346 int nid = sparse_early_nid(ms);
256 347
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 349 if (usemap)
259 return usemap; 350 return usemap;
260 351
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
353 if (usemap) {
354 check_usemap_section_nr(nid, usemap);
355 return usemap;
356 }
357
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 359 nid = 0;
263 360
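
The sparse.c hunk above makes sparse_early_usemap_alloc() first try to place a section's usemap in the memory section that already holds the node's pgdat, and only then fall back to ordinary node-local bootmem (warning about the cross-section dependency) and, as a last resort, node 0. Below is a minimal userspace sketch of that fallback order; the two stub allocators are hypothetical stand-ins for alloc_bootmem_section() and alloc_bootmem_node(), not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for alloc_bootmem_section(). */
static void *alloc_from_pgdat_section(int nid)
{
        (void)nid;              /* pretend the pgdat's section has no room left */
        return NULL;
}

/* Hypothetical stand-in for alloc_bootmem_node(). */
static void *alloc_from_node(int nid)
{
        (void)nid;              /* node-local bootmem fallback */
        return malloc(64);
}

/* Models the fallback order in sparse_early_usemap_alloc(). */
static void *usemap_alloc(int nid)
{
        void *usemap;

        /* 1. Try the section holding the node's pgdat, so usemap and
         *    pgdat do not pin two different sections. */
        usemap = alloc_from_pgdat_section(nid);
        if (usemap)
                return usemap;

        /* 2. Fall back to node-local boot memory (the kernel also warns
         *    about the cross-section dependency here, via
         *    check_usemap_section_nr()). */
        usemap = alloc_from_node(nid);
        if (usemap)
                return usemap;

        /* 3. Last resort in the kernel: retry on node 0. */
        return alloc_from_node(0);
}

int main(void)
{
        void *map = usemap_alloc(1);

        printf("usemap %s\n", map ? "allocated" : "missing");
        free(map);
        return 0;
}
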
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..dd89234ee51f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 40
41/* 41/*
42 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
@@ -493,7 +493,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 493 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 494#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 495
496static DEFINE_PER_CPU(long, committed_space) = 0; 496static DEFINE_PER_CPU(long, committed_space);
497 497
498void vm_acct_memory(long pages) 498void vm_acct_memory(long pages)
499{ 499{
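
The swap.c hunk only drops redundant initializers: per-CPU and static objects start out zeroed anyway, so the explicit "= { 0, }" / "= 0" adds noise and, depending on the toolchain, can keep the object out of zero-filled storage. A tiny, non-kernel illustration of the same rule for plain static variables:

#include <assert.h>
#include <stdio.h>

/* Objects with static storage duration are zero-initialized by the C
 * standard, so an explicit "= 0" adds nothing. */
static long committed_space;            /* implicitly 0 */
static int  page_cluster_like = 0;      /* explicit 0: same value, noisier */

int main(void)
{
        assert(committed_space == 0);
        assert(page_cluster_like == 0);
        printf("both start at zero\n");
        return 0;
}
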
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..2f33edb8bee9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,6 +37,7 @@ DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1261 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1262 swap_list.next = swap_list.head;
1262 } 1263 }
1264 if (p->prio < 0) {
1265 for (i = p->next; i >= 0; i = swap_info[i].next)
1266 swap_info[i].prio = p->prio--;
1267 least_priority++;
1268 }
1263 nr_swap_pages -= p->pages; 1269 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1270 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1271 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1278 if (err) {
1273 /* re-insert swap space back into swap_list */ 1279 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1280 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1281 if (p->prio < 0)
1282 p->prio = --least_priority;
1283 prev = -1;
1284 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1285 if (p->prio >= swap_info[i].prio)
1277 break; 1286 break;
1287 prev = i;
1288 }
1278 p->next = i; 1289 p->next = i;
1279 if (prev < 0) 1290 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1291 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1458 unsigned int type;
1448 int i, prev; 1459 int i, prev;
1449 int error; 1460 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1461 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1462 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1463 unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1465 sector_t span;
1456 unsigned long maxpages = 1; 1466 unsigned long maxpages = 1;
1457 int swapfilesize; 1467 int swapfilesize;
1458 unsigned short *swap_map; 1468 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1469 struct page *page = NULL;
1460 struct inode *inode = NULL; 1470 struct inode *inode = NULL;
1461 int did_down = 0; 1471 int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1484 }
1475 if (type >= nr_swapfiles) 1485 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1486 nr_swapfiles = type+1;
1487 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1488 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1489 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1490 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1491 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1492 name = getname(specialfile);
1495 error = PTR_ERR(name); 1493 error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1630 goto bad_swap;
1633 1631
1634 /* OK, set up the swap map and apply the bad block list */ 1632 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1633 swap_map = vmalloc(maxpages * sizeof(short));
1634 if (!swap_map) {
1636 error = -ENOMEM; 1635 error = -ENOMEM;
1637 goto bad_swap; 1636 goto bad_swap;
1638 } 1637 }
1639 1638
1640 error = 0; 1639 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1640 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1641 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1642 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1643 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1644 error = -EINVAL;
1646 else 1645 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1646 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1647 }
1649 nr_good_pages = swap_header->info.last_page - 1648 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1649 swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1653 }
1655 1654
1656 if (nr_good_pages) { 1655 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1656 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1657 p->max = maxpages;
1659 p->pages = nr_good_pages; 1658 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1659 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1671
1673 mutex_lock(&swapon_mutex); 1672 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1673 spin_lock(&swap_lock);
1674 if (swap_flags & SWAP_FLAG_PREFER)
1675 p->prio =
1676 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1677 else
1678 p->prio = --least_priority;
1679 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1680 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1681 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1682 total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1712 destroy_swap_extents(p);
1708bad_swap_2: 1713bad_swap_2:
1709 spin_lock(&swap_lock); 1714 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1715 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1716 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1717 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1718 vfree(swap_map);
1718 if (swap_file) 1719 if (swap_file)
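
The swapfile.c changes move least_priority from a function-local static in sys_swapon() to file scope so that sys_swapoff() can hand back auto-assigned (negative) priorities, and they defer setting p->prio and p->swap_map until the swap area is fully set up under swap_lock. A rough userspace model of just the default-priority bookkeeping, assuming (as the diff does) that user-preferred priorities are non-negative and defaults count down from -1:

#include <stdio.h>

/* File-scope counter, as in the diff: auto-assigned priorities are
 * -1, -2, -3, ... in swapon order. */
static int least_priority;

/* swapon: take the user-supplied priority or the next default. */
static int assign_prio(int has_pref, int pref)
{
        return has_pref ? pref : --least_priority;
}

/* swapoff of an auto-prioritized area: hand its slot back so later
 * swapons can reuse it (the kernel additionally renumbers the
 * remaining lower-priority areas, which this sketch omits). */
static void release_prio(int prio)
{
        if (prio < 0)
                least_priority++;
}

int main(void)
{
        int a = assign_prio(0, 0);      /* -1 */
        int b = assign_prio(0, 0);      /* -2 */
        int c = assign_prio(1, 5);      /*  5, user-preferred */

        release_prio(b);                /* least_priority back to -1 */
        printf("a=%d b(freed)=%d c=%d next default=%d\n",
               a, b, c, least_priority - 1);
        return 0;
}
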
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..35f293816294 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -931,6 +931,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 931 read_unlock(&vmlist_lock);
932} 932}
933 933
934static void show_numa_info(struct seq_file *m, struct vm_struct *v)
935{
936 if (NUMA_BUILD) {
937 unsigned int nr, *counters = m->private;
938
939 if (!counters)
940 return;
941
942 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
943
944 for (nr = 0; nr < v->nr_pages; nr++)
945 counters[page_to_nid(v->pages[nr])]++;
946
947 for_each_node_state(nr, N_HIGH_MEMORY)
948 if (counters[nr])
949 seq_printf(m, " N%u=%u", nr, counters[nr]);
950 }
951}
952
934static int s_show(struct seq_file *m, void *p) 953static int s_show(struct seq_file *m, void *p)
935{ 954{
936 struct vm_struct *v = p; 955 struct vm_struct *v = p;
@@ -967,6 +986,7 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 986 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 987 seq_printf(m, " vpages");
969 988
989 show_numa_info(m, v);
970 seq_putc(m, '\n'); 990 seq_putc(m, '\n');
971 return 0; 991 return 0;
972} 992}
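
show_numa_info(), added above, annotates each /proc/vmallocinfo line with how many of an area's pages came from each NUMA node: it clears a per-node counter array, walks v->pages incrementing the counter for page_to_nid() of each page, and prints only the non-zero entries as " N<node>=<count>". A standalone sketch of the same counting pattern (the node IDs and the NR_NODES bound are made up for illustration):

#include <stdio.h>
#include <string.h>

#define NR_NODES 4

/* Print " N<node>=<count>" for every node that contributed pages,
 * mirroring the loop structure of show_numa_info(). */
static void show_numa_counts(const int *page_nids, unsigned int nr_pages)
{
        unsigned int counters[NR_NODES];
        unsigned int nr;

        memset(counters, 0, sizeof(counters));

        for (nr = 0; nr < nr_pages; nr++)
                counters[page_nids[nr]]++;

        for (nr = 0; nr < NR_NODES; nr++)
                if (counters[nr])
                        printf(" N%u=%u", nr, counters[nr]);
        printf("\n");
}

int main(void)
{
        /* Pretend a vmalloc area got 6 pages: 4 from node 0, 2 from node 2. */
        int nids[] = { 0, 0, 2, 0, 2, 0 };

        show_numa_counts(nids, sizeof(nids) / sizeof(nids[0]));
        return 0;
}
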
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..26672c6cd3ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -1316,6 +1317,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1317 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1318 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1319
1320 delayacct_freepages_start();
1321
1319 if (scan_global_lru(sc)) 1322 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1323 count_vm_event(ALLOCSTALL);
1321 /* 1324 /*
@@ -1396,6 +1399,8 @@ out:
1396 } else 1399 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1400 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1401
1402 delayacct_freepages_end();
1403
1399 return ret; 1404 return ret;
1400} 1405}
1401 1406
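
The vmscan.c hunks bracket do_try_to_free_pages() with delayacct_freepages_start()/delayacct_freepages_end() so per-task delay accounting can report how long a task spent in direct reclaim. The pattern is simply a start/stop pair around the expensive region; the userspace analogue below uses clock_gettime() purely as an illustration and is not how the kernel's delayacct code measures the delay.

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static struct timespec reclaim_start;

static void freepages_start(void)       /* cf. delayacct_freepages_start() */
{
        clock_gettime(CLOCK_MONOTONIC, &reclaim_start);
}

static void freepages_end(void)         /* cf. delayacct_freepages_end() */
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        printf("spent %ld us reclaiming\n",
               (long)((now.tv_sec - reclaim_start.tv_sec) * 1000000L +
                      (now.tv_nsec - reclaim_start.tv_nsec) / 1000L));
}

int main(void)
{
        freepages_start();
        usleep(10000);                  /* stand-in for the actual reclaim work */
        freepages_end();
        return 0;
}
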
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3d4a781802f..b0d08e667ece 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS