Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 12
-rw-r--r-- | mm/bootmem.c | 196
-rw-r--r-- | mm/dmapool.c | 12
-rw-r--r-- | mm/fadvise.c | 2
-rw-r--r-- | mm/filemap.c | 10
-rw-r--r-- | mm/filemap_xip.c | 200
-rw-r--r-- | mm/hugetlb.c | 78
-rw-r--r-- | mm/internal.h | 3
-rw-r--r-- | mm/madvise.c | 2
-rw-r--r-- | mm/memory.c | 228
-rw-r--r-- | mm/memory_hotplug.c | 186
-rw-r--r-- | mm/mempolicy.c | 1051
-rw-r--r-- | mm/mincore.c | 2
-rw-r--r-- | mm/mmap.c | 33
-rw-r--r-- | mm/mmzone.c | 30
-rw-r--r-- | mm/nommu.c | 6
-rw-r--r-- | mm/oom_kill.c | 58
-rw-r--r-- | mm/page_alloc.c | 274
-rw-r--r-- | mm/pagewalk.c | 8
-rw-r--r-- | mm/rmap.c | 8
-rw-r--r-- | mm/shmem.c | 144
-rw-r--r-- | mm/slab.c | 17
-rw-r--r-- | mm/slub.c | 18
-rw-r--r-- | mm/sparse.c | 145
-rw-r--r-- | mm/swap.c | 37
-rw-r--r-- | mm/swapfile.c | 8
-rw-r--r-- | mm/truncate.c | 11
-rw-r--r-- | mm/vmalloc.c | 141
-rw-r--r-- | mm/vmscan.c | 46
-rw-r--r-- | mm/vmstat.c | 11
30 files changed, 1919 insertions(+), 1058 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 0016ebd4dcba..3aa819d628c1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE | |||
143 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | 143 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE |
144 | depends on MIGRATION | 144 | depends on MIGRATION |
145 | 145 | ||
146 | # | ||
147 | # If we have space for more page flags then we can enable additional | ||
148 | # optimizations and functionality. | ||
149 | # | ||
150 | # Regular Sparsemem takes page flag bits for the sectionid if it does not | ||
151 | # use a virtual memmap. Disable extended page flags for 32 bit platforms | ||
152 | # that require the use of a sectionid in the page flags. | ||
153 | # | ||
154 | config PAGEFLAGS_EXTENDED | ||
155 | def_bool y | ||
156 | depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM | ||
157 | |||
146 | # Heavily threaded applications may benefit from splitting the mm-wide | 158 | # Heavily threaded applications may benefit from splitting the mm-wide |
147 | # page_table_lock, so that faults on different parts of the user address | 159 | # page_table_lock, so that faults on different parts of the user address |
148 | # space can be handled with less contention: split it at this NR_CPUS. | 160 | # space can be handled with less contention: split it at this NR_CPUS. |
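A side note on the new option above: the def_bool expression only evaluates to 'n' on 32-bit NUMA SPARSEMEM configurations without a virtual memmap, i.e. exactly the configurations that still need page-flag bits for the section id. A small stand-alone illustration of the condition (not kernel code; the helper name is made up):

#include <stdio.h>
#include <stdbool.h>

/* mirrors "def_bool y; depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM" */
static bool pageflags_extended(bool is_64bit, bool sparsemem_vmemmap,
                               bool numa, bool sparsemem)
{
        return is_64bit || sparsemem_vmemmap || !numa || !sparsemem;
}

int main(void)
{
        /* 32-bit NUMA SPARSEMEM without vmemmap: section id lives in page->flags */
        printf("%d\n", pageflags_extended(false, false, true, true));  /* prints 0 */
        /* a typical 64-bit configuration */
        printf("%d\n", pageflags_extended(true, true, true, true));    /* prints 1 */
        return 0;
}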
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2ccea700968f..e8fb927392b9 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,44 +111,74 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, | |||
111 | * might be used for boot-time allocations - or it might get added | 111 | * might be used for boot-time allocations - or it might get added |
112 | * to the free page pool later on. | 112 | * to the free page pool later on. |
113 | */ | 113 | */ |
114 | static int __init reserve_bootmem_core(bootmem_data_t *bdata, | 114 | static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, |
115 | unsigned long addr, unsigned long size, int flags) | 115 | unsigned long addr, unsigned long size, int flags) |
116 | { | 116 | { |
117 | unsigned long sidx, eidx; | 117 | unsigned long sidx, eidx; |
118 | unsigned long i; | 118 | unsigned long i; |
119 | int ret; | 119 | |
120 | BUG_ON(!size); | ||
121 | |||
122 | /* out of range, don't hold other */ | ||
123 | if (addr + size < bdata->node_boot_start || | ||
124 | PFN_DOWN(addr) > bdata->node_low_pfn) | ||
125 | return 0; | ||
120 | 126 | ||
121 | /* | 127 | /* |
122 | * round up, partially reserved pages are considered | 128 | * Round up to index to the range. |
123 | * fully reserved. | ||
124 | */ | 129 | */ |
130 | if (addr > bdata->node_boot_start) | ||
131 | sidx= PFN_DOWN(addr - bdata->node_boot_start); | ||
132 | else | ||
133 | sidx = 0; | ||
134 | |||
135 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | ||
136 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | ||
137 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
138 | |||
139 | for (i = sidx; i < eidx; i++) { | ||
140 | if (test_bit(i, bdata->node_bootmem_map)) { | ||
141 | if (flags & BOOTMEM_EXCLUSIVE) | ||
142 | return -EBUSY; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | return 0; | ||
147 | |||
148 | } | ||
149 | |||
150 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, | ||
151 | unsigned long addr, unsigned long size, int flags) | ||
152 | { | ||
153 | unsigned long sidx, eidx; | ||
154 | unsigned long i; | ||
155 | |||
125 | BUG_ON(!size); | 156 | BUG_ON(!size); |
126 | BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); | ||
127 | BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); | ||
128 | BUG_ON(addr < bdata->node_boot_start); | ||
129 | 157 | ||
130 | sidx = PFN_DOWN(addr - bdata->node_boot_start); | 158 | /* out of range */ |
159 | if (addr + size < bdata->node_boot_start || | ||
160 | PFN_DOWN(addr) > bdata->node_low_pfn) | ||
161 | return; | ||
162 | |||
163 | /* | ||
164 | * Round up to index to the range. | ||
165 | */ | ||
166 | if (addr > bdata->node_boot_start) | ||
167 | sidx= PFN_DOWN(addr - bdata->node_boot_start); | ||
168 | else | ||
169 | sidx = 0; | ||
170 | |||
131 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | 171 | eidx = PFN_UP(addr + size - bdata->node_boot_start); |
172 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | ||
173 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
132 | 174 | ||
133 | for (i = sidx; i < eidx; i++) | 175 | for (i = sidx; i < eidx; i++) { |
134 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | 176 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { |
135 | #ifdef CONFIG_DEBUG_BOOTMEM | 177 | #ifdef CONFIG_DEBUG_BOOTMEM |
136 | printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); | 178 | printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); |
137 | #endif | 179 | #endif |
138 | if (flags & BOOTMEM_EXCLUSIVE) { | ||
139 | ret = -EBUSY; | ||
140 | goto err; | ||
141 | } | ||
142 | } | 180 | } |
143 | 181 | } | |
144 | return 0; | ||
145 | |||
146 | err: | ||
147 | /* unreserve memory we accidentally reserved */ | ||
148 | for (i--; i >= sidx; i--) | ||
149 | clear_bit(i, bdata->node_bootmem_map); | ||
150 | |||
151 | return ret; | ||
152 | } | 182 | } |
153 | 183 | ||
154 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | 184 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, |
@@ -206,9 +236,11 @@ void * __init | |||
206 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | 236 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, |
207 | unsigned long align, unsigned long goal, unsigned long limit) | 237 | unsigned long align, unsigned long goal, unsigned long limit) |
208 | { | 238 | { |
209 | unsigned long offset, remaining_size, areasize, preferred; | 239 | unsigned long areasize, preferred; |
210 | unsigned long i, start = 0, incr, eidx, end_pfn; | 240 | unsigned long i, start = 0, incr, eidx, end_pfn; |
211 | void *ret; | 241 | void *ret; |
242 | unsigned long node_boot_start; | ||
243 | void *node_bootmem_map; | ||
212 | 244 | ||
213 | if (!size) { | 245 | if (!size) { |
214 | printk("__alloc_bootmem_core(): zero-sized request\n"); | 246 | printk("__alloc_bootmem_core(): zero-sized request\n"); |
@@ -216,70 +248,83 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
216 | } | 248 | } |
217 | BUG_ON(align & (align-1)); | 249 | BUG_ON(align & (align-1)); |
218 | 250 | ||
219 | if (limit && bdata->node_boot_start >= limit) | ||
220 | return NULL; | ||
221 | |||
222 | /* on nodes without memory - bootmem_map is NULL */ | 251 | /* on nodes without memory - bootmem_map is NULL */ |
223 | if (!bdata->node_bootmem_map) | 252 | if (!bdata->node_bootmem_map) |
224 | return NULL; | 253 | return NULL; |
225 | 254 | ||
255 | /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ | ||
256 | node_boot_start = bdata->node_boot_start; | ||
257 | node_bootmem_map = bdata->node_bootmem_map; | ||
258 | if (align) { | ||
259 | node_boot_start = ALIGN(bdata->node_boot_start, align); | ||
260 | if (node_boot_start > bdata->node_boot_start) | ||
261 | node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + | ||
262 | PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; | ||
263 | } | ||
264 | |||
265 | if (limit && node_boot_start >= limit) | ||
266 | return NULL; | ||
267 | |||
226 | end_pfn = bdata->node_low_pfn; | 268 | end_pfn = bdata->node_low_pfn; |
227 | limit = PFN_DOWN(limit); | 269 | limit = PFN_DOWN(limit); |
228 | if (limit && end_pfn > limit) | 270 | if (limit && end_pfn > limit) |
229 | end_pfn = limit; | 271 | end_pfn = limit; |
230 | 272 | ||
231 | eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); | 273 | eidx = end_pfn - PFN_DOWN(node_boot_start); |
232 | offset = 0; | ||
233 | if (align && (bdata->node_boot_start & (align - 1UL)) != 0) | ||
234 | offset = align - (bdata->node_boot_start & (align - 1UL)); | ||
235 | offset = PFN_DOWN(offset); | ||
236 | 274 | ||
237 | /* | 275 | /* |
238 | * We try to allocate bootmem pages above 'goal' | 276 | * We try to allocate bootmem pages above 'goal' |
239 | * first, then we try to allocate lower pages. | 277 | * first, then we try to allocate lower pages. |
240 | */ | 278 | */ |
241 | if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { | 279 | preferred = 0; |
242 | preferred = goal - bdata->node_boot_start; | 280 | if (goal && PFN_DOWN(goal) < end_pfn) { |
281 | if (goal > node_boot_start) | ||
282 | preferred = goal - node_boot_start; | ||
243 | 283 | ||
244 | if (bdata->last_success >= preferred) | 284 | if (bdata->last_success > node_boot_start && |
285 | bdata->last_success - node_boot_start >= preferred) | ||
245 | if (!limit || (limit && limit > bdata->last_success)) | 286 | if (!limit || (limit && limit > bdata->last_success)) |
246 | preferred = bdata->last_success; | 287 | preferred = bdata->last_success - node_boot_start; |
247 | } else | 288 | } |
248 | preferred = 0; | ||
249 | 289 | ||
250 | preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; | 290 | preferred = PFN_DOWN(ALIGN(preferred, align)); |
251 | areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; | 291 | areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; |
252 | incr = align >> PAGE_SHIFT ? : 1; | 292 | incr = align >> PAGE_SHIFT ? : 1; |
253 | 293 | ||
254 | restart_scan: | 294 | restart_scan: |
255 | for (i = preferred; i < eidx; i += incr) { | 295 | for (i = preferred; i < eidx;) { |
256 | unsigned long j; | 296 | unsigned long j; |
257 | i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); | 297 | |
298 | i = find_next_zero_bit(node_bootmem_map, eidx, i); | ||
258 | i = ALIGN(i, incr); | 299 | i = ALIGN(i, incr); |
259 | if (i >= eidx) | 300 | if (i >= eidx) |
260 | break; | 301 | break; |
261 | if (test_bit(i, bdata->node_bootmem_map)) | 302 | if (test_bit(i, node_bootmem_map)) { |
303 | i += incr; | ||
262 | continue; | 304 | continue; |
305 | } | ||
263 | for (j = i + 1; j < i + areasize; ++j) { | 306 | for (j = i + 1; j < i + areasize; ++j) { |
264 | if (j >= eidx) | 307 | if (j >= eidx) |
265 | goto fail_block; | 308 | goto fail_block; |
266 | if (test_bit(j, bdata->node_bootmem_map)) | 309 | if (test_bit(j, node_bootmem_map)) |
267 | goto fail_block; | 310 | goto fail_block; |
268 | } | 311 | } |
269 | start = i; | 312 | start = i; |
270 | goto found; | 313 | goto found; |
271 | fail_block: | 314 | fail_block: |
272 | i = ALIGN(j, incr); | 315 | i = ALIGN(j, incr); |
316 | if (i == j) | ||
317 | i += incr; | ||
273 | } | 318 | } |
274 | 319 | ||
275 | if (preferred > offset) { | 320 | if (preferred > 0) { |
276 | preferred = offset; | 321 | preferred = 0; |
277 | goto restart_scan; | 322 | goto restart_scan; |
278 | } | 323 | } |
279 | return NULL; | 324 | return NULL; |
280 | 325 | ||
281 | found: | 326 | found: |
282 | bdata->last_success = PFN_PHYS(start); | 327 | bdata->last_success = PFN_PHYS(start) + node_boot_start; |
283 | BUG_ON(start >= eidx); | 328 | BUG_ON(start >= eidx); |
284 | 329 | ||
285 | /* | 330 | /* |
@@ -289,6 +334,7 @@ found: | |||
289 | */ | 334 | */ |
290 | if (align < PAGE_SIZE && | 335 | if (align < PAGE_SIZE && |
291 | bdata->last_offset && bdata->last_pos+1 == start) { | 336 | bdata->last_offset && bdata->last_pos+1 == start) { |
337 | unsigned long offset, remaining_size; | ||
292 | offset = ALIGN(bdata->last_offset, align); | 338 | offset = ALIGN(bdata->last_offset, align); |
293 | BUG_ON(offset > PAGE_SIZE); | 339 | BUG_ON(offset > PAGE_SIZE); |
294 | remaining_size = PAGE_SIZE - offset; | 340 | remaining_size = PAGE_SIZE - offset; |
@@ -297,14 +343,12 @@ found: | |||
297 | /* last_pos unchanged */ | 343 | /* last_pos unchanged */ |
298 | bdata->last_offset = offset + size; | 344 | bdata->last_offset = offset + size; |
299 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | 345 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + |
300 | offset + | 346 | offset + node_boot_start); |
301 | bdata->node_boot_start); | ||
302 | } else { | 347 | } else { |
303 | remaining_size = size - remaining_size; | 348 | remaining_size = size - remaining_size; |
304 | areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; | 349 | areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; |
305 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | 350 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + |
306 | offset + | 351 | offset + node_boot_start); |
307 | bdata->node_boot_start); | ||
308 | bdata->last_pos = start + areasize - 1; | 352 | bdata->last_pos = start + areasize - 1; |
309 | bdata->last_offset = remaining_size; | 353 | bdata->last_offset = remaining_size; |
310 | } | 354 | } |
@@ -312,14 +356,14 @@ found: | |||
312 | } else { | 356 | } else { |
313 | bdata->last_pos = start + areasize - 1; | 357 | bdata->last_pos = start + areasize - 1; |
314 | bdata->last_offset = size & ~PAGE_MASK; | 358 | bdata->last_offset = size & ~PAGE_MASK; |
315 | ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); | 359 | ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); |
316 | } | 360 | } |
317 | 361 | ||
318 | /* | 362 | /* |
319 | * Reserve the area now: | 363 | * Reserve the area now: |
320 | */ | 364 | */ |
321 | for (i = start; i < start + areasize; i++) | 365 | for (i = start; i < start + areasize; i++) |
322 | if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) | 366 | if (unlikely(test_and_set_bit(i, node_bootmem_map))) |
323 | BUG(); | 367 | BUG(); |
324 | memset(ret, 0, size); | 368 | memset(ret, 0, size); |
325 | return ret; | 369 | return ret; |
@@ -401,6 +445,11 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, | |||
401 | void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 445 | void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
402 | unsigned long size, int flags) | 446 | unsigned long size, int flags) |
403 | { | 447 | { |
448 | int ret; | ||
449 | |||
450 | ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); | ||
451 | if (ret < 0) | ||
452 | return; | ||
404 | reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); | 453 | reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); |
405 | } | 454 | } |
406 | 455 | ||
@@ -412,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
412 | 461 | ||
413 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 462 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
414 | { | 463 | { |
464 | register_page_bootmem_info_node(pgdat); | ||
415 | return free_all_bootmem_core(pgdat); | 465 | return free_all_bootmem_core(pgdat); |
416 | } | 466 | } |
417 | 467 | ||
@@ -426,7 +476,18 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
426 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 476 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
427 | int flags) | 477 | int flags) |
428 | { | 478 | { |
429 | return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags); | 479 | bootmem_data_t *bdata; |
480 | int ret; | ||
481 | |||
482 | list_for_each_entry(bdata, &bdata_list, list) { | ||
483 | ret = can_reserve_bootmem_core(bdata, addr, size, flags); | ||
484 | if (ret < 0) | ||
485 | return ret; | ||
486 | } | ||
487 | list_for_each_entry(bdata, &bdata_list, list) | ||
488 | reserve_bootmem_core(bdata, addr, size, flags); | ||
489 | |||
490 | return 0; | ||
430 | } | 491 | } |
431 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 492 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ |
432 | 493 | ||
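The hunk above is the point of the earlier split in this file: reserve_bootmem() now validates the request against every node's bitmap before committing to any of them, so a BOOTMEM_EXCLUSIVE collision can return -EBUSY without leaving half-reserved ranges behind. A stand-alone sketch of the same check-then-commit pattern (illustrative names and data layout, not kernel code):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGES_PER_NODE 64

struct node { unsigned char map[PAGES_PER_NODE]; };    /* 1 byte per page for clarity */

/* phase 1: may fail, must not modify anything */
static int can_reserve(const struct node *n, unsigned s, unsigned e, bool exclusive)
{
        for (unsigned i = s; i < e; i++)
                if (exclusive && n->map[i])
                        return -1;      /* -EBUSY in the kernel */
        return 0;
}

/* phase 2: cannot fail */
static void commit_reserve(struct node *n, unsigned s, unsigned e)
{
        for (unsigned i = s; i < e; i++)
                n->map[i] = 1;
}

int main(void)
{
        struct node nodes[2];

        memset(nodes, 0, sizeof(nodes));
        nodes[1].map[3] = 1;            /* page 3 already reserved on node 1 */

        for (int i = 0; i < 2; i++)
                if (can_reserve(&nodes[i], 2, 5, true)) {
                        puts("busy: nothing was modified on any node");
                        return 0;
                }
        for (int i = 0; i < 2; i++)
                commit_reserve(&nodes[i], 2, 5);
        return 0;
}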
@@ -484,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
484 | return __alloc_bootmem(size, align, goal); | 545 | return __alloc_bootmem(size, align, goal); |
485 | } | 546 | } |
486 | 547 | ||
548 | #ifdef CONFIG_SPARSEMEM | ||
549 | void * __init alloc_bootmem_section(unsigned long size, | ||
550 | unsigned long section_nr) | ||
551 | { | ||
552 | void *ptr; | ||
553 | unsigned long limit, goal, start_nr, end_nr, pfn; | ||
554 | struct pglist_data *pgdat; | ||
555 | |||
556 | pfn = section_nr_to_pfn(section_nr); | ||
557 | goal = PFN_PHYS(pfn); | ||
558 | limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; | ||
559 | pgdat = NODE_DATA(early_pfn_to_nid(pfn)); | ||
560 | ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, | ||
561 | limit); | ||
562 | |||
563 | if (!ptr) | ||
564 | return NULL; | ||
565 | |||
566 | start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); | ||
567 | end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); | ||
568 | if (start_nr != section_nr || end_nr != section_nr) { | ||
569 | printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", | ||
570 | section_nr); | ||
571 | free_bootmem_core(pgdat->bdata, __pa(ptr), size); | ||
572 | ptr = NULL; | ||
573 | } | ||
574 | |||
575 | return ptr; | ||
576 | } | ||
577 | #endif | ||
578 | |||
487 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 579 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
488 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 580 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
489 | #endif | 581 | #endif |
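alloc_bootmem_section() is new with this series and is guarded by CONFIG_SPARSEMEM, presumably so that per-section metadata can live inside the section it describes. A rough caller-side sketch of how it might be used, with the fallback and wrapper names being assumptions rather than part of this patch:

/* kernel-style sketch, not compilable on its own */
static void * __init alloc_section_metadata(int nid, unsigned long section_nr,
                                            unsigned long size)
{
        void *p;

        /* try to place the metadata inside the section itself */
        p = alloc_bootmem_section(size, section_nr);
        if (p)
                return p;

        /* otherwise fall back to any boot memory on that node */
        return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES,
                                    __pa(MAX_DMA_ADDRESS));
}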
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 34aaac451a96..b1f0885dda22 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -37,6 +37,10 @@ | |||
37 | #include <linux/types.h> | 37 | #include <linux/types.h> |
38 | #include <linux/wait.h> | 38 | #include <linux/wait.h> |
39 | 39 | ||
40 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) | ||
41 | #define DMAPOOL_DEBUG 1 | ||
42 | #endif | ||
43 | |||
40 | struct dma_pool { /* the pool */ | 44 | struct dma_pool { /* the pool */ |
41 | struct list_head page_list; | 45 | struct list_head page_list; |
42 | spinlock_t lock; | 46 | spinlock_t lock; |
@@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) | |||
216 | page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, | 220 | page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, |
217 | &page->dma, mem_flags); | 221 | &page->dma, mem_flags); |
218 | if (page->vaddr) { | 222 | if (page->vaddr) { |
219 | #ifdef CONFIG_DEBUG_SLAB | 223 | #ifdef DMAPOOL_DEBUG |
220 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); | 224 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); |
221 | #endif | 225 | #endif |
222 | pool_initialise_page(pool, page); | 226 | pool_initialise_page(pool, page); |
@@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) | |||
239 | { | 243 | { |
240 | dma_addr_t dma = page->dma; | 244 | dma_addr_t dma = page->dma; |
241 | 245 | ||
242 | #ifdef CONFIG_DEBUG_SLAB | 246 | #ifdef DMAPOOL_DEBUG |
243 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); | 247 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); |
244 | #endif | 248 | #endif |
245 | dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); | 249 | dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); |
@@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
336 | page->offset = *(int *)(page->vaddr + offset); | 340 | page->offset = *(int *)(page->vaddr + offset); |
337 | retval = offset + page->vaddr; | 341 | retval = offset + page->vaddr; |
338 | *handle = offset + page->dma; | 342 | *handle = offset + page->dma; |
339 | #ifdef CONFIG_DEBUG_SLAB | 343 | #ifdef DMAPOOL_DEBUG |
340 | memset(retval, POOL_POISON_ALLOCATED, pool->size); | 344 | memset(retval, POOL_POISON_ALLOCATED, pool->size); |
341 | #endif | 345 | #endif |
342 | done: | 346 | done: |
@@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
391 | } | 395 | } |
392 | 396 | ||
393 | offset = vaddr - page->vaddr; | 397 | offset = vaddr - page->vaddr; |
394 | #ifdef CONFIG_DEBUG_SLAB | 398 | #ifdef DMAPOOL_DEBUG |
395 | if ((dma - page->dma) != offset) { | 399 | if ((dma - page->dma) != offset) { |
396 | if (pool->dev) | 400 | if (pool->dev) |
397 | dev_err(pool->dev, | 401 | dev_err(pool->dev, |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3c0f1e99f5e4..343cfdfebd9e 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
49 | goto out; | 49 | goto out; |
50 | } | 50 | } |
51 | 51 | ||
52 | if (mapping->a_ops->get_xip_page) { | 52 | if (mapping->a_ops->get_xip_mem) { |
53 | switch (advice) { | 53 | switch (advice) { |
54 | case POSIX_FADV_NORMAL: | 54 | case POSIX_FADV_NORMAL: |
55 | case POSIX_FADV_RANDOM: | 55 | case POSIX_FADV_RANDOM: |
diff --git a/mm/filemap.c b/mm/filemap.c
index 07e9d9258b48..239d36163bbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page); | |||
576 | */ | 576 | */ |
577 | void end_page_writeback(struct page *page) | 577 | void end_page_writeback(struct page *page) |
578 | { | 578 | { |
579 | if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { | 579 | if (TestClearPageReclaim(page)) |
580 | if (!test_clear_page_writeback(page)) | 580 | rotate_reclaimable_page(page); |
581 | BUG(); | 581 | |
582 | } | 582 | if (!test_clear_page_writeback(page)) |
583 | BUG(); | ||
584 | |||
583 | smp_mb__after_clear_bit(); | 585 | smp_mb__after_clear_bit(); |
584 | wake_up_page(page, PG_writeback); | 586 | wake_up_page(page, PG_writeback); |
585 | } | 587 | } |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 5e598c42afd7..3e744abcce9d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
18 | #include <asm/io.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * We do use our own empty page to avoid interference with other users | 21 | * We do use our own empty page to avoid interference with other users |
@@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void) | |||
42 | 43 | ||
43 | /* | 44 | /* |
44 | * This is a file read routine for execute in place files, and uses | 45 | * This is a file read routine for execute in place files, and uses |
45 | * the mapping->a_ops->get_xip_page() function for the actual low-level | 46 | * the mapping->a_ops->get_xip_mem() function for the actual low-level |
46 | * stuff. | 47 | * stuff. |
47 | * | 48 | * |
48 | * Note the struct file* is not used at all. It may be NULL. | 49 | * Note the struct file* is not used at all. It may be NULL. |
49 | */ | 50 | */ |
50 | static void | 51 | static ssize_t |
51 | do_xip_mapping_read(struct address_space *mapping, | 52 | do_xip_mapping_read(struct address_space *mapping, |
52 | struct file_ra_state *_ra, | 53 | struct file_ra_state *_ra, |
53 | struct file *filp, | 54 | struct file *filp, |
54 | loff_t *ppos, | 55 | char __user *buf, |
55 | read_descriptor_t *desc, | 56 | size_t len, |
56 | read_actor_t actor) | 57 | loff_t *ppos) |
57 | { | 58 | { |
58 | struct inode *inode = mapping->host; | 59 | struct inode *inode = mapping->host; |
59 | pgoff_t index, end_index; | 60 | pgoff_t index, end_index; |
60 | unsigned long offset; | 61 | unsigned long offset; |
61 | loff_t isize; | 62 | loff_t isize, pos; |
63 | size_t copied = 0, error = 0; | ||
62 | 64 | ||
63 | BUG_ON(!mapping->a_ops->get_xip_page); | 65 | BUG_ON(!mapping->a_ops->get_xip_mem); |
64 | 66 | ||
65 | index = *ppos >> PAGE_CACHE_SHIFT; | 67 | pos = *ppos; |
66 | offset = *ppos & ~PAGE_CACHE_MASK; | 68 | index = pos >> PAGE_CACHE_SHIFT; |
69 | offset = pos & ~PAGE_CACHE_MASK; | ||
67 | 70 | ||
68 | isize = i_size_read(inode); | 71 | isize = i_size_read(inode); |
69 | if (!isize) | 72 | if (!isize) |
70 | goto out; | 73 | goto out; |
71 | 74 | ||
72 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | 75 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
73 | for (;;) { | 76 | do { |
74 | struct page *page; | 77 | unsigned long nr, left; |
75 | unsigned long nr, ret; | 78 | void *xip_mem; |
79 | unsigned long xip_pfn; | ||
80 | int zero = 0; | ||
76 | 81 | ||
77 | /* nr is the maximum number of bytes to copy from this page */ | 82 | /* nr is the maximum number of bytes to copy from this page */ |
78 | nr = PAGE_CACHE_SIZE; | 83 | nr = PAGE_CACHE_SIZE; |
@@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping, | |||
85 | } | 90 | } |
86 | } | 91 | } |
87 | nr = nr - offset; | 92 | nr = nr - offset; |
93 | if (nr > len) | ||
94 | nr = len; | ||
88 | 95 | ||
89 | page = mapping->a_ops->get_xip_page(mapping, | 96 | error = mapping->a_ops->get_xip_mem(mapping, index, 0, |
90 | index*(PAGE_SIZE/512), 0); | 97 | &xip_mem, &xip_pfn); |
91 | if (!page) | 98 | if (unlikely(error)) { |
92 | goto no_xip_page; | 99 | if (error == -ENODATA) { |
93 | if (unlikely(IS_ERR(page))) { | ||
94 | if (PTR_ERR(page) == -ENODATA) { | ||
95 | /* sparse */ | 100 | /* sparse */ |
96 | page = ZERO_PAGE(0); | 101 | zero = 1; |
97 | } else { | 102 | } else |
98 | desc->error = PTR_ERR(page); | ||
99 | goto out; | 103 | goto out; |
100 | } | ||
101 | } | 104 | } |
102 | 105 | ||
103 | /* If users can be writing to this page using arbitrary | 106 | /* If users can be writing to this page using arbitrary |
@@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping, | |||
105 | * before reading the page on the kernel side. | 108 | * before reading the page on the kernel side. |
106 | */ | 109 | */ |
107 | if (mapping_writably_mapped(mapping)) | 110 | if (mapping_writably_mapped(mapping)) |
108 | flush_dcache_page(page); | 111 | /* address based flush */ ; |
109 | 112 | ||
110 | /* | 113 | /* |
111 | * Ok, we have the page, so now we can copy it to user space... | 114 | * Ok, we have the mem, so now we can copy it to user space... |
112 | * | 115 | * |
113 | * The actor routine returns how many bytes were actually used.. | 116 | * The actor routine returns how many bytes were actually used.. |
114 | * NOTE! This may not be the same as how much of a user buffer | 117 | * NOTE! This may not be the same as how much of a user buffer |
@@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping, | |||
116 | * "pos" here (the actor routine has to update the user buffer | 119 | * "pos" here (the actor routine has to update the user buffer |
117 | * pointers and the remaining count). | 120 | * pointers and the remaining count). |
118 | */ | 121 | */ |
119 | ret = actor(desc, page, offset, nr); | 122 | if (!zero) |
120 | offset += ret; | 123 | left = __copy_to_user(buf+copied, xip_mem+offset, nr); |
121 | index += offset >> PAGE_CACHE_SHIFT; | 124 | else |
122 | offset &= ~PAGE_CACHE_MASK; | 125 | left = __clear_user(buf + copied, nr); |
123 | 126 | ||
124 | if (ret == nr && desc->count) | 127 | if (left) { |
125 | continue; | 128 | error = -EFAULT; |
126 | goto out; | 129 | goto out; |
130 | } | ||
127 | 131 | ||
128 | no_xip_page: | 132 | copied += (nr - left); |
129 | /* Did not get the page. Report it */ | 133 | offset += (nr - left); |
130 | desc->error = -EIO; | 134 | index += offset >> PAGE_CACHE_SHIFT; |
131 | goto out; | 135 | offset &= ~PAGE_CACHE_MASK; |
132 | } | 136 | } while (copied < len); |
133 | 137 | ||
134 | out: | 138 | out: |
135 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; | 139 | *ppos = pos + copied; |
136 | if (filp) | 140 | if (filp) |
137 | file_accessed(filp); | 141 | file_accessed(filp); |
142 | |||
143 | return (copied ? copied : error); | ||
138 | } | 144 | } |
139 | 145 | ||
140 | ssize_t | 146 | ssize_t |
141 | xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | 147 | xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) |
142 | { | 148 | { |
143 | read_descriptor_t desc; | ||
144 | |||
145 | if (!access_ok(VERIFY_WRITE, buf, len)) | 149 | if (!access_ok(VERIFY_WRITE, buf, len)) |
146 | return -EFAULT; | 150 | return -EFAULT; |
147 | 151 | ||
148 | desc.written = 0; | 152 | return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, |
149 | desc.arg.buf = buf; | 153 | buf, len, ppos); |
150 | desc.count = len; | ||
151 | desc.error = 0; | ||
152 | |||
153 | do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, | ||
154 | ppos, &desc, file_read_actor); | ||
155 | |||
156 | if (desc.written) | ||
157 | return desc.written; | ||
158 | else | ||
159 | return desc.error; | ||
160 | } | 154 | } |
161 | EXPORT_SYMBOL_GPL(xip_file_read); | 155 | EXPORT_SYMBOL_GPL(xip_file_read); |
162 | 156 | ||
@@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping, | |||
211 | * | 205 | * |
212 | * This function is derived from filemap_fault, but used for execute in place | 206 | * This function is derived from filemap_fault, but used for execute in place |
213 | */ | 207 | */ |
214 | static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) | 208 | static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
215 | { | 209 | { |
216 | struct file *file = area->vm_file; | 210 | struct file *file = vma->vm_file; |
217 | struct address_space *mapping = file->f_mapping; | 211 | struct address_space *mapping = file->f_mapping; |
218 | struct inode *inode = mapping->host; | 212 | struct inode *inode = mapping->host; |
219 | struct page *page; | ||
220 | pgoff_t size; | 213 | pgoff_t size; |
214 | void *xip_mem; | ||
215 | unsigned long xip_pfn; | ||
216 | struct page *page; | ||
217 | int error; | ||
221 | 218 | ||
222 | /* XXX: are VM_FAULT_ codes OK? */ | 219 | /* XXX: are VM_FAULT_ codes OK? */ |
223 | 220 | ||
@@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) | |||
225 | if (vmf->pgoff >= size) | 222 | if (vmf->pgoff >= size) |
226 | return VM_FAULT_SIGBUS; | 223 | return VM_FAULT_SIGBUS; |
227 | 224 | ||
228 | page = mapping->a_ops->get_xip_page(mapping, | 225 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, |
229 | vmf->pgoff*(PAGE_SIZE/512), 0); | 226 | &xip_mem, &xip_pfn); |
230 | if (!IS_ERR(page)) | 227 | if (likely(!error)) |
231 | goto out; | 228 | goto found; |
232 | if (PTR_ERR(page) != -ENODATA) | 229 | if (error != -ENODATA) |
233 | return VM_FAULT_OOM; | 230 | return VM_FAULT_OOM; |
234 | 231 | ||
235 | /* sparse block */ | 232 | /* sparse block */ |
236 | if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && | 233 | if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && |
237 | (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && | 234 | (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && |
238 | (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { | 235 | (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { |
236 | int err; | ||
237 | |||
239 | /* maybe shared writable, allocate new block */ | 238 | /* maybe shared writable, allocate new block */ |
240 | page = mapping->a_ops->get_xip_page(mapping, | 239 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, |
241 | vmf->pgoff*(PAGE_SIZE/512), 1); | 240 | &xip_mem, &xip_pfn); |
242 | if (IS_ERR(page)) | 241 | if (error) |
243 | return VM_FAULT_SIGBUS; | 242 | return VM_FAULT_SIGBUS; |
244 | /* unmap page at pgoff from all other vmas */ | 243 | /* unmap sparse mappings at pgoff from all other vmas */ |
245 | __xip_unmap(mapping, vmf->pgoff); | 244 | __xip_unmap(mapping, vmf->pgoff); |
245 | |||
246 | found: | ||
247 | err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | ||
248 | xip_pfn); | ||
249 | if (err == -ENOMEM) | ||
250 | return VM_FAULT_OOM; | ||
251 | BUG_ON(err); | ||
252 | return VM_FAULT_NOPAGE; | ||
246 | } else { | 253 | } else { |
247 | /* not shared and writable, use xip_sparse_page() */ | 254 | /* not shared and writable, use xip_sparse_page() */ |
248 | page = xip_sparse_page(); | 255 | page = xip_sparse_page(); |
249 | if (!page) | 256 | if (!page) |
250 | return VM_FAULT_OOM; | 257 | return VM_FAULT_OOM; |
251 | } | ||
252 | 258 | ||
253 | out: | 259 | page_cache_get(page); |
254 | page_cache_get(page); | 260 | vmf->page = page; |
255 | vmf->page = page; | 261 | return 0; |
256 | return 0; | 262 | } |
257 | } | 263 | } |
258 | 264 | ||
259 | static struct vm_operations_struct xip_file_vm_ops = { | 265 | static struct vm_operations_struct xip_file_vm_ops = { |
@@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = { | |||
262 | 268 | ||
263 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 269 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
264 | { | 270 | { |
265 | BUG_ON(!file->f_mapping->a_ops->get_xip_page); | 271 | BUG_ON(!file->f_mapping->a_ops->get_xip_mem); |
266 | 272 | ||
267 | file_accessed(file); | 273 | file_accessed(file); |
268 | vma->vm_ops = &xip_file_vm_ops; | 274 | vma->vm_ops = &xip_file_vm_ops; |
269 | vma->vm_flags |= VM_CAN_NONLINEAR; | 275 | vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; |
270 | return 0; | 276 | return 0; |
271 | } | 277 | } |
272 | EXPORT_SYMBOL_GPL(xip_file_mmap); | 278 | EXPORT_SYMBOL_GPL(xip_file_mmap); |
@@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
279 | const struct address_space_operations *a_ops = mapping->a_ops; | 285 | const struct address_space_operations *a_ops = mapping->a_ops; |
280 | struct inode *inode = mapping->host; | 286 | struct inode *inode = mapping->host; |
281 | long status = 0; | 287 | long status = 0; |
282 | struct page *page; | ||
283 | size_t bytes; | 288 | size_t bytes; |
284 | ssize_t written = 0; | 289 | ssize_t written = 0; |
285 | 290 | ||
286 | BUG_ON(!mapping->a_ops->get_xip_page); | 291 | BUG_ON(!mapping->a_ops->get_xip_mem); |
287 | 292 | ||
288 | do { | 293 | do { |
289 | unsigned long index; | 294 | unsigned long index; |
290 | unsigned long offset; | 295 | unsigned long offset; |
291 | size_t copied; | 296 | size_t copied; |
292 | char *kaddr; | 297 | void *xip_mem; |
298 | unsigned long xip_pfn; | ||
293 | 299 | ||
294 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 300 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
295 | index = pos >> PAGE_CACHE_SHIFT; | 301 | index = pos >> PAGE_CACHE_SHIFT; |
@@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
297 | if (bytes > count) | 303 | if (bytes > count) |
298 | bytes = count; | 304 | bytes = count; |
299 | 305 | ||
300 | page = a_ops->get_xip_page(mapping, | 306 | status = a_ops->get_xip_mem(mapping, index, 0, |
301 | index*(PAGE_SIZE/512), 0); | 307 | &xip_mem, &xip_pfn); |
302 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { | 308 | if (status == -ENODATA) { |
303 | /* we allocate a new page unmap it */ | 309 | /* we allocate a new page unmap it */ |
304 | page = a_ops->get_xip_page(mapping, | 310 | status = a_ops->get_xip_mem(mapping, index, 1, |
305 | index*(PAGE_SIZE/512), 1); | 311 | &xip_mem, &xip_pfn); |
306 | if (!IS_ERR(page)) | 312 | if (!status) |
307 | /* unmap page at pgoff from all other vmas */ | 313 | /* unmap page at pgoff from all other vmas */ |
308 | __xip_unmap(mapping, index); | 314 | __xip_unmap(mapping, index); |
309 | } | 315 | } |
310 | 316 | ||
311 | if (IS_ERR(page)) { | 317 | if (status) |
312 | status = PTR_ERR(page); | ||
313 | break; | 318 | break; |
314 | } | ||
315 | 319 | ||
316 | fault_in_pages_readable(buf, bytes); | ||
317 | kaddr = kmap_atomic(page, KM_USER0); | ||
318 | copied = bytes - | 320 | copied = bytes - |
319 | __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); | 321 | __copy_from_user_nocache(xip_mem + offset, buf, bytes); |
320 | kunmap_atomic(kaddr, KM_USER0); | ||
321 | flush_dcache_page(page); | ||
322 | 322 | ||
323 | if (likely(copied > 0)) { | 323 | if (likely(copied > 0)) { |
324 | status = copied; | 324 | status = copied; |
@@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write); | |||
398 | 398 | ||
399 | /* | 399 | /* |
400 | * truncate a page used for execute in place | 400 | * truncate a page used for execute in place |
401 | * functionality is analog to block_truncate_page but does use get_xip_page | 401 | * functionality is analog to block_truncate_page but does use get_xip_mem |
402 | * to get the page instead of page cache | 402 | * to get the page instead of page cache |
403 | */ | 403 | */ |
404 | int | 404 | int |
@@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from) | |||
408 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 408 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
409 | unsigned blocksize; | 409 | unsigned blocksize; |
410 | unsigned length; | 410 | unsigned length; |
411 | struct page *page; | 411 | void *xip_mem; |
412 | unsigned long xip_pfn; | ||
413 | int err; | ||
412 | 414 | ||
413 | BUG_ON(!mapping->a_ops->get_xip_page); | 415 | BUG_ON(!mapping->a_ops->get_xip_mem); |
414 | 416 | ||
415 | blocksize = 1 << mapping->host->i_blkbits; | 417 | blocksize = 1 << mapping->host->i_blkbits; |
416 | length = offset & (blocksize - 1); | 418 | length = offset & (blocksize - 1); |
@@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from) | |||
421 | 423 | ||
422 | length = blocksize - length; | 424 | length = blocksize - length; |
423 | 425 | ||
424 | page = mapping->a_ops->get_xip_page(mapping, | 426 | err = mapping->a_ops->get_xip_mem(mapping, index, 0, |
425 | index*(PAGE_SIZE/512), 0); | 427 | &xip_mem, &xip_pfn); |
426 | if (!page) | 428 | if (unlikely(err)) { |
427 | return -ENOMEM; | 429 | if (err == -ENODATA) |
428 | if (unlikely(IS_ERR(page))) { | ||
429 | if (PTR_ERR(page) == -ENODATA) | ||
430 | /* Hole? No need to truncate */ | 430 | /* Hole? No need to truncate */ |
431 | return 0; | 431 | return 0; |
432 | else | 432 | else |
433 | return PTR_ERR(page); | 433 | return err; |
434 | } | 434 | } |
435 | zero_user(page, offset, length); | 435 | memset(xip_mem + offset, 0, length); |
436 | return 0; | 436 | return 0; |
437 | } | 437 | } |
438 | EXPORT_SYMBOL_GPL(xip_truncate_page); | 438 | EXPORT_SYMBOL_GPL(xip_truncate_page); |
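The whole file above is converted from the page-based get_xip_page() call to the new get_xip_mem() address_space operation, which hands back a kernel address plus a pfn and reports holes with -ENODATA. For reference, a sketch of what a backing filesystem has to provide under the new interface (the block-lookup helper is hypothetical, only the operation's signature is taken from this diff):

/* kernel-style sketch, not a real filesystem */
static int examplefs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff,
                                 int create, void **kmem, unsigned long *pfn)
{
        unsigned long phys;
        int err;

        /* translate the page offset to directly addressable backing storage,
         * allocating a block only when 'create' is set */
        err = examplefs_pgoff_to_phys(mapping->host, pgoff, create, &phys);
        if (err)
                return err;             /* -ENODATA signals a hole */

        *kmem = __va(phys);             /* used by the read/write/truncate paths */
        *pfn = phys >> PAGE_SHIFT;      /* used by xip_file_fault via vm_insert_mixed() */
        return 0;
}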
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51c9e2c01640..df28c1773fb2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,13 +95,16 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
95 | int nid; | 95 | int nid; |
96 | struct page *page = NULL; | 96 | struct page *page = NULL; |
97 | struct mempolicy *mpol; | 97 | struct mempolicy *mpol; |
98 | nodemask_t *nodemask; | ||
98 | struct zonelist *zonelist = huge_zonelist(vma, address, | 99 | struct zonelist *zonelist = huge_zonelist(vma, address, |
99 | htlb_alloc_mask, &mpol); | 100 | htlb_alloc_mask, &mpol, &nodemask); |
100 | struct zone **z; | 101 | struct zone *zone; |
101 | 102 | struct zoneref *z; | |
102 | for (z = zonelist->zones; *z; z++) { | 103 | |
103 | nid = zone_to_nid(*z); | 104 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
104 | if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && | 105 | MAX_NR_ZONES - 1, nodemask) { |
106 | nid = zone_to_nid(zone); | ||
107 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | ||
105 | !list_empty(&hugepage_freelists[nid])) { | 108 | !list_empty(&hugepage_freelists[nid])) { |
106 | page = list_entry(hugepage_freelists[nid].next, | 109 | page = list_entry(hugepage_freelists[nid].next, |
107 | struct page, lru); | 110 | struct page, lru); |
@@ -113,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
113 | break; | 116 | break; |
114 | } | 117 | } |
115 | } | 118 | } |
116 | mpol_free(mpol); /* unref if mpol !NULL */ | 119 | mpol_cond_put(mpol); |
117 | return page; | 120 | return page; |
118 | } | 121 | } |
119 | 122 | ||
@@ -129,6 +132,7 @@ static void update_and_free_page(struct page *page) | |||
129 | } | 132 | } |
130 | set_compound_page_dtor(page, NULL); | 133 | set_compound_page_dtor(page, NULL); |
131 | set_page_refcounted(page); | 134 | set_page_refcounted(page); |
135 | arch_release_hugepage(page); | ||
132 | __free_pages(page, HUGETLB_PAGE_ORDER); | 136 | __free_pages(page, HUGETLB_PAGE_ORDER); |
133 | } | 137 | } |
134 | 138 | ||
@@ -198,6 +202,10 @@ static struct page *alloc_fresh_huge_page_node(int nid) | |||
198 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, | 202 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, |
199 | HUGETLB_PAGE_ORDER); | 203 | HUGETLB_PAGE_ORDER); |
200 | if (page) { | 204 | if (page) { |
205 | if (arch_prepare_hugepage(page)) { | ||
206 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
207 | return 0; | ||
208 | } | ||
201 | set_compound_page_dtor(page, free_huge_page); | 209 | set_compound_page_dtor(page, free_huge_page); |
202 | spin_lock(&hugetlb_lock); | 210 | spin_lock(&hugetlb_lock); |
203 | nr_huge_pages++; | 211 | nr_huge_pages++; |
@@ -239,6 +247,11 @@ static int alloc_fresh_huge_page(void) | |||
239 | hugetlb_next_nid = next_nid; | 247 | hugetlb_next_nid = next_nid; |
240 | } while (!page && hugetlb_next_nid != start_nid); | 248 | } while (!page && hugetlb_next_nid != start_nid); |
241 | 249 | ||
250 | if (ret) | ||
251 | count_vm_event(HTLB_BUDDY_PGALLOC); | ||
252 | else | ||
253 | count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | ||
254 | |||
242 | return ret; | 255 | return ret; |
243 | } | 256 | } |
244 | 257 | ||
@@ -299,9 +312,11 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
299 | */ | 312 | */ |
300 | nr_huge_pages_node[nid]++; | 313 | nr_huge_pages_node[nid]++; |
301 | surplus_huge_pages_node[nid]++; | 314 | surplus_huge_pages_node[nid]++; |
315 | __count_vm_event(HTLB_BUDDY_PGALLOC); | ||
302 | } else { | 316 | } else { |
303 | nr_huge_pages--; | 317 | nr_huge_pages--; |
304 | surplus_huge_pages--; | 318 | surplus_huge_pages--; |
319 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | ||
305 | } | 320 | } |
306 | spin_unlock(&hugetlb_lock); | 321 | spin_unlock(&hugetlb_lock); |
307 | 322 | ||
@@ -369,11 +384,19 @@ retry: | |||
369 | resv_huge_pages += delta; | 384 | resv_huge_pages += delta; |
370 | ret = 0; | 385 | ret = 0; |
371 | free: | 386 | free: |
387 | /* Free the needed pages to the hugetlb pool */ | ||
372 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 388 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
389 | if ((--needed) < 0) | ||
390 | break; | ||
373 | list_del(&page->lru); | 391 | list_del(&page->lru); |
374 | if ((--needed) >= 0) | 392 | enqueue_huge_page(page); |
375 | enqueue_huge_page(page); | 393 | } |
376 | else { | 394 | |
395 | /* Free unnecessary surplus pages to the buddy allocator */ | ||
396 | if (!list_empty(&surplus_list)) { | ||
397 | spin_unlock(&hugetlb_lock); | ||
398 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | ||
399 | list_del(&page->lru); | ||
377 | /* | 400 | /* |
378 | * The page has a reference count of zero already, so | 401 | * The page has a reference count of zero already, so |
379 | * call free_huge_page directly instead of using | 402 | * call free_huge_page directly instead of using |
@@ -381,10 +404,9 @@ free: | |||
381 | * unlocked which is safe because free_huge_page takes | 404 | * unlocked which is safe because free_huge_page takes |
382 | * hugetlb_lock before deciding how to free the page. | 405 | * hugetlb_lock before deciding how to free the page. |
383 | */ | 406 | */ |
384 | spin_unlock(&hugetlb_lock); | ||
385 | free_huge_page(page); | 407 | free_huge_page(page); |
386 | spin_lock(&hugetlb_lock); | ||
387 | } | 408 | } |
409 | spin_lock(&hugetlb_lock); | ||
388 | } | 410 | } |
389 | 411 | ||
390 | return ret; | 412 | return ret; |
@@ -718,7 +740,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, | |||
718 | entry = | 740 | entry = |
719 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | 741 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); |
720 | } else { | 742 | } else { |
721 | entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); | 743 | entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); |
722 | } | 744 | } |
723 | entry = pte_mkyoung(entry); | 745 | entry = pte_mkyoung(entry); |
724 | entry = pte_mkhuge(entry); | 746 | entry = pte_mkhuge(entry); |
@@ -731,8 +753,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
731 | { | 753 | { |
732 | pte_t entry; | 754 | pte_t entry; |
733 | 755 | ||
734 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | 756 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
735 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 757 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
736 | update_mmu_cache(vma, address, entry); | 758 | update_mmu_cache(vma, address, entry); |
737 | } | 759 | } |
738 | } | 760 | } |
@@ -762,10 +784,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
762 | 784 | ||
763 | spin_lock(&dst->page_table_lock); | 785 | spin_lock(&dst->page_table_lock); |
764 | spin_lock(&src->page_table_lock); | 786 | spin_lock(&src->page_table_lock); |
765 | if (!pte_none(*src_pte)) { | 787 | if (!huge_pte_none(huge_ptep_get(src_pte))) { |
766 | if (cow) | 788 | if (cow) |
767 | ptep_set_wrprotect(src, addr, src_pte); | 789 | huge_ptep_set_wrprotect(src, addr, src_pte); |
768 | entry = *src_pte; | 790 | entry = huge_ptep_get(src_pte); |
769 | ptepage = pte_page(entry); | 791 | ptepage = pte_page(entry); |
770 | get_page(ptepage); | 792 | get_page(ptepage); |
771 | set_huge_pte_at(dst, addr, dst_pte, entry); | 793 | set_huge_pte_at(dst, addr, dst_pte, entry); |
@@ -809,7 +831,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
809 | continue; | 831 | continue; |
810 | 832 | ||
811 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 833 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
812 | if (pte_none(pte)) | 834 | if (huge_pte_none(pte)) |
813 | continue; | 835 | continue; |
814 | 836 | ||
815 | page = pte_page(pte); | 837 | page = pte_page(pte); |
@@ -873,8 +895,9 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
873 | spin_lock(&mm->page_table_lock); | 895 | spin_lock(&mm->page_table_lock); |
874 | 896 | ||
875 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 897 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
876 | if (likely(pte_same(*ptep, pte))) { | 898 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
877 | /* Break COW */ | 899 | /* Break COW */ |
900 | huge_ptep_clear_flush(vma, address, ptep); | ||
878 | set_huge_pte_at(mm, address, ptep, | 901 | set_huge_pte_at(mm, address, ptep, |
879 | make_huge_pte(vma, new_page, 1)); | 902 | make_huge_pte(vma, new_page, 1)); |
880 | /* Make the old page be freed below */ | 903 | /* Make the old page be freed below */ |
@@ -942,7 +965,7 @@ retry: | |||
942 | goto backout; | 965 | goto backout; |
943 | 966 | ||
944 | ret = 0; | 967 | ret = 0; |
945 | if (!pte_none(*ptep)) | 968 | if (!huge_pte_none(huge_ptep_get(ptep))) |
946 | goto backout; | 969 | goto backout; |
947 | 970 | ||
948 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 971 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
@@ -984,8 +1007,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
984 | * the same page in the page cache. | 1007 | * the same page in the page cache. |
985 | */ | 1008 | */ |
986 | mutex_lock(&hugetlb_instantiation_mutex); | 1009 | mutex_lock(&hugetlb_instantiation_mutex); |
987 | entry = *ptep; | 1010 | entry = huge_ptep_get(ptep); |
988 | if (pte_none(entry)) { | 1011 | if (huge_pte_none(entry)) { |
989 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 1012 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
990 | mutex_unlock(&hugetlb_instantiation_mutex); | 1013 | mutex_unlock(&hugetlb_instantiation_mutex); |
991 | return ret; | 1014 | return ret; |
@@ -995,7 +1018,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
995 | 1018 | ||
996 | spin_lock(&mm->page_table_lock); | 1019 | spin_lock(&mm->page_table_lock); |
997 | /* Check for a racing update before calling hugetlb_cow */ | 1020 | /* Check for a racing update before calling hugetlb_cow */ |
998 | if (likely(pte_same(entry, *ptep))) | 1021 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) |
999 | if (write_access && !pte_write(entry)) | 1022 | if (write_access && !pte_write(entry)) |
1000 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | 1023 | ret = hugetlb_cow(mm, vma, address, ptep, entry); |
1001 | spin_unlock(&mm->page_table_lock); | 1024 | spin_unlock(&mm->page_table_lock); |
@@ -1025,7 +1048,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1025 | */ | 1048 | */ |
1026 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | 1049 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); |
1027 | 1050 | ||
1028 | if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { | 1051 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || |
1052 | (write && !pte_write(huge_ptep_get(pte)))) { | ||
1029 | int ret; | 1053 | int ret; |
1030 | 1054 | ||
1031 | spin_unlock(&mm->page_table_lock); | 1055 | spin_unlock(&mm->page_table_lock); |
@@ -1041,7 +1065,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1041 | } | 1065 | } |
1042 | 1066 | ||
1043 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; | 1067 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; |
1044 | page = pte_page(*pte); | 1068 | page = pte_page(huge_ptep_get(pte)); |
1045 | same_page: | 1069 | same_page: |
1046 | if (pages) { | 1070 | if (pages) { |
1047 | get_page(page); | 1071 | get_page(page); |
@@ -1090,7 +1114,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
1090 | continue; | 1114 | continue; |
1091 | if (huge_pmd_unshare(mm, &address, ptep)) | 1115 | if (huge_pmd_unshare(mm, &address, ptep)) |
1092 | continue; | 1116 | continue; |
1093 | if (!pte_none(*ptep)) { | 1117 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
1094 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 1118 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
1095 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 1119 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
1096 | set_huge_pte_at(mm, address, ptep, pte); | 1120 | set_huge_pte_at(mm, address, ptep, pte); |
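Throughout the hugetlb.c changes above, direct *ptep dereferences become huge_ptep_get(), huge_pte_none(), huge_ptep_set_wrprotect() and friends, so that an architecture whose huge ptes are not ordinary ptes can override them. On architectures that need nothing special, these accessors can collapse to the old behaviour; the trivial fallbacks below are a sketch of what such a per-arch header might supply, not definitions taken from this patch:

/* kernel-style sketch of no-op fallbacks an architecture might provide */
static inline pte_t huge_ptep_get(pte_t *ptep)
{
        return *ptep;
}

static inline int huge_pte_none(pte_t pte)
{
        return pte_none(pte);
}

static inline pte_t huge_pte_wrprotect(pte_t pte)
{
        return pte_wrprotect(pte);
}

static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
                                           unsigned long addr, pte_t *ptep)
{
        ptep_set_wrprotect(mm, addr, ptep);
}

static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                                             unsigned long addr, pte_t *ptep,
                                             pte_t pte, int dirty)
{
        return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
}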
diff --git a/mm/internal.h b/mm/internal.h
index 789727309f4d..0034e947e4bc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,8 +34,7 @@ static inline void __put_page(struct page *page) | |||
34 | atomic_dec(&page->_count); | 34 | atomic_dec(&page->_count); |
35 | } | 35 | } |
36 | 36 | ||
37 | extern void __init __free_pages_bootmem(struct page *page, | 37 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
38 | unsigned int order); | ||
39 | 38 | ||
40 | /* | 39 | /* |
41 | * function for dealing with page's order in buddy system. | 40 | * function for dealing with page's order in buddy system. |
diff --git a/mm/madvise.c b/mm/madvise.c
index 93ee375b38e7..23a0ec3e0ea0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,7 +112,7 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
112 | if (!file) | 112 | if (!file) |
113 | return -EBADF; | 113 | return -EBADF; |
114 | 114 | ||
115 | if (file->f_mapping->a_ops->get_xip_page) { | 115 | if (file->f_mapping->a_ops->get_xip_mem) { |
116 | /* no bad return value, but ignore advice */ | 116 | /* no bad return value, but ignore advice */ |
117 | return 0; | 117 | return 0; |
118 | } | 118 | } |
diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e58a5f..bbab1e37055e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags) | |||
371 | } | 371 | } |
372 | 372 | ||
373 | /* | 373 | /* |
374 | * This function gets the "struct page" associated with a pte. | 374 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
375 | * | 375 | * |
376 | * NOTE! Some mappings do not have "struct pages". A raw PFN mapping | 376 | * "Special" mappings do not wish to be associated with a "struct page" (either |
377 | * will have each page table entry just pointing to a raw page frame | 377 | * it doesn't exist, or it exists but they don't want to touch it). In this |
378 | * number, and as far as the VM layer is concerned, those do not have | 378 | * case, NULL is returned here. "Normal" mappings do have a struct page. |
379 | * pages associated with them - even if the PFN might point to memory | ||
380 | * that otherwise is perfectly fine and has a "struct page". | ||
381 | * | 379 | * |
382 | * The way we recognize those mappings is through the rules set up | 380 | * There are 2 broad cases. Firstly, an architecture may define a pte_special() |
383 | * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, | 381 | * pte bit, in which case this function is trivial. Secondly, an architecture |
384 | * and the vm_pgoff will point to the first PFN mapped: thus every | 382 | * may not have a spare pte bit, which requires a more complicated scheme, |
385 | * page that is a raw mapping will always honor the rule | 383 | * described below. |
384 | * | ||
385 | * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a | ||
386 | * special mapping (even if there are underlying and valid "struct pages"). | ||
387 | * COWed pages of a VM_PFNMAP are always normal. | ||
388 | * | ||
389 | * The way we recognize COWed pages within VM_PFNMAP mappings is through the | ||
390 | * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit | ||
391 | * set, and the vm_pgoff will point to the first PFN mapped: thus every special | ||
392 | * mapping will always honor the rule | ||
386 | * | 393 | * |
387 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) | 394 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) |
388 | * | 395 | * |
389 | * and if that isn't true, the page has been COW'ed (in which case it | 396 | * And for normal mappings this is false. |
390 | * _does_ have a "struct page" associated with it even if it is in a | 397 | * |
391 | * VM_PFNMAP range). | 398 | * This restricts such mappings to be a linear translation from virtual address |
399 | * to pfn. To get around this restriction, we allow arbitrary mappings so long | ||
400 | * as the vma is not a COW mapping; in that case, we know that all ptes are | ||
401 | * special (because none can have been COWed). | ||
402 | * | ||
403 | * | ||
404 | * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. | ||
405 | * | ||
406 | * VM_MIXEDMAP mappings can likewise contain memory with or without "struct | ||
407 | * page" backing, however the difference is that _all_ pages with a struct | ||
408 | * page (that is, those where pfn_valid is true) are refcounted and considered | ||
409 | * normal pages by the VM. The disadvantage is that pages are refcounted | ||
410 | * (which can be slower and simply not an option for some PFNMAP users). The | ||
411 | * advantage is that we don't have to follow the strict linearity rule of | ||
412 | * PFNMAP mappings in order to support COWable mappings. | ||
413 | * | ||
392 | */ | 414 | */ |
393 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) | 415 | #ifdef __HAVE_ARCH_PTE_SPECIAL |
416 | # define HAVE_PTE_SPECIAL 1 | ||
417 | #else | ||
418 | # define HAVE_PTE_SPECIAL 0 | ||
419 | #endif | ||
420 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | ||
421 | pte_t pte) | ||
394 | { | 422 | { |
395 | unsigned long pfn = pte_pfn(pte); | 423 | unsigned long pfn; |
396 | 424 | ||
397 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { | 425 | if (HAVE_PTE_SPECIAL) { |
398 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; | 426 | if (likely(!pte_special(pte))) { |
399 | if (pfn == vma->vm_pgoff + off) | 427 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
400 | return NULL; | 428 | return pte_page(pte); |
401 | if (!is_cow_mapping(vma->vm_flags)) | 429 | } |
402 | return NULL; | 430 | VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); |
431 | return NULL; | ||
403 | } | 432 | } |
404 | 433 | ||
405 | #ifdef CONFIG_DEBUG_VM | 434 | /* !HAVE_PTE_SPECIAL case follows: */ |
406 | /* | 435 | |
407 | * Add some anal sanity checks for now. Eventually, | 436 | pfn = pte_pfn(pte); |
408 | * we should just do "return pfn_to_page(pfn)", but | 437 | |
409 | * in the meantime we check that we get a valid pfn, | 438 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { |
410 | * and that the resulting page looks ok. | 439 | if (vma->vm_flags & VM_MIXEDMAP) { |
411 | */ | 440 | if (!pfn_valid(pfn)) |
412 | if (unlikely(!pfn_valid(pfn))) { | 441 | return NULL; |
413 | print_bad_pte(vma, pte, addr); | 442 | goto out; |
414 | return NULL; | 443 | } else { |
444 | unsigned long off; | ||
445 | off = (addr - vma->vm_start) >> PAGE_SHIFT; | ||
446 | if (pfn == vma->vm_pgoff + off) | ||
447 | return NULL; | ||
448 | if (!is_cow_mapping(vma->vm_flags)) | ||
449 | return NULL; | ||
450 | } | ||
415 | } | 451 | } |
416 | #endif | 452 | |
453 | VM_BUG_ON(!pfn_valid(pfn)); | ||
417 | 454 | ||
418 | /* | 455 | /* |
419 | * NOTE! We still have PageReserved() pages in the page | 456 | * NOTE! We still have PageReserved() pages in the page tables. |
420 | * tables. | ||
421 | * | 457 | * |
422 | * The PAGE_ZERO() pages and various VDSO mappings can | 458 | * eg. VDSO mappings can cause them to exist. |
423 | * cause them to exist. | ||
424 | */ | 459 | */ |
460 | out: | ||
425 | return pfn_to_page(pfn); | 461 | return pfn_to_page(pfn); |
426 | } | 462 | } |
427 | 463 | ||
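The two VM_PFNMAP tests in the !HAVE_PTE_SPECIAL branch above can be read as a single predicate. A restatement for illustration only (this helper is hypothetical, not part of the patch): a pte in a raw VM_PFNMAP vma is special unless the mapping is COWable and the pfn has left the linear vm_pgoff translation, which is exactly the COW case.

	/* Hypothetical helper restating the rule documented above. */
	static inline int pfnmap_pte_is_special(struct vm_area_struct *vma,
						unsigned long addr, unsigned long pfn)
	{
		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

		if (pfn == vma->vm_pgoff + off)
			return 1;	/* still the raw linear mapping: special */
		if (!is_cow_mapping(vma->vm_flags))
			return 1;	/* not COWable, so every pte is special */
		return 0;		/* a COWed page: normal, has a struct page */
	}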
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1057 | if (pages) | 1093 | if (pages) |
1058 | foll_flags |= FOLL_GET; | 1094 | foll_flags |= FOLL_GET; |
1059 | if (!write && !(vma->vm_flags & VM_LOCKED) && | 1095 | if (!write && !(vma->vm_flags & VM_LOCKED) && |
1060 | (!vma->vm_ops || (!vma->vm_ops->nopage && | 1096 | (!vma->vm_ops || !vma->vm_ops->fault)) |
1061 | !vma->vm_ops->fault))) | ||
1062 | foll_flags |= FOLL_ANON; | 1097 | foll_flags |= FOLL_ANON; |
1063 | 1098 | ||
1064 | do { | 1099 | do { |
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, | |||
1141 | * old drivers should use this, and they needed to mark their | 1176 | * old drivers should use this, and they needed to mark their |
1142 | * pages reserved for the old functions anyway. | 1177 | * pages reserved for the old functions anyway. |
1143 | */ | 1178 | */ |
1144 | static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) | 1179 | static int insert_page(struct vm_area_struct *vma, unsigned long addr, |
1180 | struct page *page, pgprot_t prot) | ||
1145 | { | 1181 | { |
1182 | struct mm_struct *mm = vma->vm_mm; | ||
1146 | int retval; | 1183 | int retval; |
1147 | pte_t *pte; | 1184 | pte_t *pte; |
1148 | spinlock_t *ptl; | 1185 | spinlock_t *ptl; |
@@ -1202,40 +1239,26 @@ out: | |||
1202 | * | 1239 | * |
1203 | * The page does not need to be reserved. | 1240 | * The page does not need to be reserved. |
1204 | */ | 1241 | */ |
1205 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) | 1242 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
1243 | struct page *page) | ||
1206 | { | 1244 | { |
1207 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1245 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1208 | return -EFAULT; | 1246 | return -EFAULT; |
1209 | if (!page_count(page)) | 1247 | if (!page_count(page)) |
1210 | return -EINVAL; | 1248 | return -EINVAL; |
1211 | vma->vm_flags |= VM_INSERTPAGE; | 1249 | vma->vm_flags |= VM_INSERTPAGE; |
1212 | return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); | 1250 | return insert_page(vma, addr, page, vma->vm_page_prot); |
1213 | } | 1251 | } |
1214 | EXPORT_SYMBOL(vm_insert_page); | 1252 | EXPORT_SYMBOL(vm_insert_page); |
1215 | 1253 | ||
1216 | /** | 1254 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1217 | * vm_insert_pfn - insert single pfn into user vma | 1255 | unsigned long pfn, pgprot_t prot) |
1218 | * @vma: user vma to map to | ||
1219 | * @addr: target user address of this page | ||
1220 | * @pfn: source kernel pfn | ||
1221 | * | ||
1222 | * Similar to vm_inert_page, this allows drivers to insert individual pages | ||
1223 | * they've allocated into a user vma. Same comments apply. | ||
1224 | * | ||
1225 | * This function should only be called from a vm_ops->fault handler, and | ||
1226 | * in that case the handler should return NULL. | ||
1227 | */ | ||
1228 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | ||
1229 | unsigned long pfn) | ||
1230 | { | 1256 | { |
1231 | struct mm_struct *mm = vma->vm_mm; | 1257 | struct mm_struct *mm = vma->vm_mm; |
1232 | int retval; | 1258 | int retval; |
1233 | pte_t *pte, entry; | 1259 | pte_t *pte, entry; |
1234 | spinlock_t *ptl; | 1260 | spinlock_t *ptl; |
1235 | 1261 | ||
1236 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | ||
1237 | BUG_ON(is_cow_mapping(vma->vm_flags)); | ||
1238 | |||
1239 | retval = -ENOMEM; | 1262 | retval = -ENOMEM; |
1240 | pte = get_locked_pte(mm, addr, &ptl); | 1263 | pte = get_locked_pte(mm, addr, &ptl); |
1241 | if (!pte) | 1264 | if (!pte) |
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1245 | goto out_unlock; | 1268 | goto out_unlock; |
1246 | 1269 | ||
1247 | /* Ok, finally just insert the thing.. */ | 1270 | /* Ok, finally just insert the thing.. */ |
1248 | entry = pfn_pte(pfn, vma->vm_page_prot); | 1271 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
1249 | set_pte_at(mm, addr, pte, entry); | 1272 | set_pte_at(mm, addr, pte, entry); |
1250 | update_mmu_cache(vma, addr, entry); | 1273 | update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ |
1251 | 1274 | ||
1252 | retval = 0; | 1275 | retval = 0; |
1253 | out_unlock: | 1276 | out_unlock: |
1254 | pte_unmap_unlock(pte, ptl); | 1277 | pte_unmap_unlock(pte, ptl); |
1255 | |||
1256 | out: | 1278 | out: |
1257 | return retval; | 1279 | return retval; |
1258 | } | 1280 | } |
1281 | |||
1282 | /** | ||
1283 | * vm_insert_pfn - insert single pfn into user vma | ||
1284 | * @vma: user vma to map to | ||
1285 | * @addr: target user address of this page | ||
1286 | * @pfn: source kernel pfn | ||
1287 | * | ||
1288 | * Similar to vm_insert_page, this allows drivers to insert individual pages | ||
1289 | * they've allocated into a user vma. Same comments apply. | ||
1290 | * | ||
1291 | * This function should only be called from a vm_ops->fault handler, and | ||
1292 | * in that case the handler should return NULL. | ||
1293 | */ | ||
1294 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | ||
1295 | unsigned long pfn) | ||
1296 | { | ||
1297 | /* | ||
1298 | * Technically, architectures with pte_special can avoid all these | ||
1299 | * restrictions (same for remap_pfn_range). However we would like | ||
1300 | * consistency in testing and feature parity among all, so we should | ||
1301 | * try to keep these invariants in place for everybody. | ||
1302 | */ | ||
1303 | BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); | ||
1304 | BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == | ||
1305 | (VM_PFNMAP|VM_MIXEDMAP)); | ||
1306 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); | ||
1307 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | ||
1308 | |||
1309 | if (addr < vma->vm_start || addr >= vma->vm_end) | ||
1310 | return -EFAULT; | ||
1311 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); | ||
1312 | } | ||
1259 | EXPORT_SYMBOL(vm_insert_pfn); | 1313 | EXPORT_SYMBOL(vm_insert_pfn); |
1260 | 1314 | ||
1315 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | ||
1316 | unsigned long pfn) | ||
1317 | { | ||
1318 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); | ||
1319 | |||
1320 | if (addr < vma->vm_start || addr >= vma->vm_end) | ||
1321 | return -EFAULT; | ||
1322 | |||
1323 | /* | ||
1324 | * If we don't have pte special, then we have to use the pfn_valid() | ||
1325 | * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* | ||
1326 | * refcount the page if pfn_valid is true (hence insert_page rather | ||
1327 | * than insert_pfn). | ||
1328 | */ | ||
1329 | if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { | ||
1330 | struct page *page; | ||
1331 | |||
1332 | page = pfn_to_page(pfn); | ||
1333 | return insert_page(vma, addr, page, vma->vm_page_prot); | ||
1334 | } | ||
1335 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); | ||
1336 | } | ||
1337 | EXPORT_SYMBOL(vm_insert_mixed); | ||
1338 | |||
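A driver-side sketch of the intended use of the new helper (all "mydrv" names are hypothetical): the driver marks its vma VM_MIXEDMAP at mmap time and installs pfns from its fault handler, letting vm_insert_mixed() choose between the refcounted insert_page() path and the raw insert_pfn() path as appropriate for the architecture.

	static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		unsigned long addr = (unsigned long)vmf->virtual_address;
		unsigned long pfn;
		int err;

		/* mydrv_pgoff_to_pfn() is a made-up lookup for illustration */
		pfn = mydrv_pgoff_to_pfn(vma->vm_private_data, vmf->pgoff);

		err = vm_insert_mixed(vma, addr, pfn);
		if (err == -ENOMEM)
			return VM_FAULT_OOM;
		if (err)
			return VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;		/* pte installed here, no vmf->page */
	}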
1261 | /* | 1339 | /* |
1262 | * maps a range of physical memory into the requested pages. the old | 1340 | * maps a range of physical memory into the requested pages. the old |
1263 | * mappings are removed. any references to nonexistent pages results | 1341 | * mappings are removed. any references to nonexistent pages results |
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1276 | arch_enter_lazy_mmu_mode(); | 1354 | arch_enter_lazy_mmu_mode(); |
1277 | do { | 1355 | do { |
1278 | BUG_ON(!pte_none(*pte)); | 1356 | BUG_ON(!pte_none(*pte)); |
1279 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); | 1357 | set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); |
1280 | pfn++; | 1358 | pfn++; |
1281 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1359 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1282 | arch_leave_lazy_mmu_mode(); | 1360 | arch_leave_lazy_mmu_mode(); |
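With the hunk above, every pte installed by remap_pfn_range() is marked special, so on pte_special architectures vm_normal_page() no longer needs the linearity test for these mappings. Callers are unchanged; a typical one for reference (names and the device address are assumptions, not from the patch):

	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* mydrv_phys is an assumed device bus address, for illustration */
		return remap_pfn_range(vma, vma->vm_start, mydrv_phys >> PAGE_SHIFT,
				       vma->vm_end - vma->vm_start,
				       vma->vm_page_prot);
	}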
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2199 | 2277 | ||
2200 | BUG_ON(vma->vm_flags & VM_PFNMAP); | 2278 | BUG_ON(vma->vm_flags & VM_PFNMAP); |
2201 | 2279 | ||
2202 | if (likely(vma->vm_ops->fault)) { | 2280 | ret = vma->vm_ops->fault(vma, &vmf); |
2203 | ret = vma->vm_ops->fault(vma, &vmf); | 2281 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
2204 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2282 | return ret; |
2205 | return ret; | ||
2206 | } else { | ||
2207 | /* Legacy ->nopage path */ | ||
2208 | ret = 0; | ||
2209 | vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); | ||
2210 | /* no page was available -- either SIGBUS or OOM */ | ||
2211 | if (unlikely(vmf.page == NOPAGE_SIGBUS)) | ||
2212 | return VM_FAULT_SIGBUS; | ||
2213 | else if (unlikely(vmf.page == NOPAGE_OOM)) | ||
2214 | return VM_FAULT_OOM; | ||
2215 | } | ||
2216 | 2283 | ||
2217 | /* | 2284 | /* |
2218 | * For consistency in subsequent calls, make the faulted page always | 2285 | * For consistency in subsequent calls, make the faulted page always |
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2377 | unsigned long pfn; | 2444 | unsigned long pfn; |
2378 | 2445 | ||
2379 | pte_unmap(page_table); | 2446 | pte_unmap(page_table); |
2380 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | 2447 | BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); |
2381 | BUG_ON(is_cow_mapping(vma->vm_flags)); | 2448 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); |
2382 | 2449 | ||
2383 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | 2450 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); |
2451 | |||
2452 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | ||
2453 | |||
2384 | if (unlikely(pfn == NOPFN_OOM)) | 2454 | if (unlikely(pfn == NOPFN_OOM)) |
2385 | return VM_FAULT_OOM; | 2455 | return VM_FAULT_OOM; |
2386 | else if (unlikely(pfn == NOPFN_SIGBUS)) | 2456 | else if (unlikely(pfn == NOPFN_SIGBUS)) |
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2458 | if (!pte_present(entry)) { | 2528 | if (!pte_present(entry)) { |
2459 | if (pte_none(entry)) { | 2529 | if (pte_none(entry)) { |
2460 | if (vma->vm_ops) { | 2530 | if (vma->vm_ops) { |
2461 | if (vma->vm_ops->fault || vma->vm_ops->nopage) | 2531 | if (likely(vma->vm_ops->fault)) |
2462 | return do_linear_fault(mm, vma, address, | 2532 | return do_linear_fault(mm, vma, address, |
2463 | pte, pmd, write_access, entry); | 2533 | pte, pmd, write_access, entry); |
2464 | if (unlikely(vma->vm_ops->nopfn)) | 2534 | if (unlikely(vma->vm_ops->nopfn)) |
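The hunks above drop the legacy ->nopage path from get_user_pages() and the fault code, so any remaining driver has to provide ->fault instead. The conversion is mechanical; a sketch with hypothetical "mydrv" names:

	/* before: struct page *mydrv_nopage(struct vm_area_struct *, unsigned long, int *); */
	static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page;

		/* mydrv_find_page() stands in for the driver's own lookup */
		page = mydrv_find_page(vma->vm_private_data, vmf->pgoff);
		if (!page)
			return VM_FAULT_SIGBUS;	/* was "return NOPAGE_SIGBUS" */

		get_page(page);			/* ->fault hands back the page with a ref */
		vmf->page = page;
		return 0;
	}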
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0fb330271271..b17dca7249f8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -29,6 +29,8 @@ | |||
29 | 29 | ||
30 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
31 | 31 | ||
32 | #include "internal.h" | ||
33 | |||
32 | /* add this memory to iomem resource */ | 34 | /* add this memory to iomem resource */ |
33 | static struct resource *register_memory_resource(u64 start, u64 size) | 35 | static struct resource *register_memory_resource(u64 start, u64 size) |
34 | { | 36 | { |
@@ -58,8 +60,105 @@ static void release_memory_resource(struct resource *res) | |||
58 | return; | 60 | return; |
59 | } | 61 | } |
60 | 62 | ||
61 | |||
62 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 63 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
64 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
65 | static void get_page_bootmem(unsigned long info, struct page *page, int magic) | ||
66 | { | ||
67 | atomic_set(&page->_mapcount, magic); | ||
68 | SetPagePrivate(page); | ||
69 | set_page_private(page, info); | ||
70 | atomic_inc(&page->_count); | ||
71 | } | ||
72 | |||
73 | void put_page_bootmem(struct page *page) | ||
74 | { | ||
75 | int magic; | ||
76 | |||
77 | magic = atomic_read(&page->_mapcount); | ||
78 | BUG_ON(magic >= -1); | ||
79 | |||
80 | if (atomic_dec_return(&page->_count) == 1) { | ||
81 | ClearPagePrivate(page); | ||
82 | set_page_private(page, 0); | ||
83 | reset_page_mapcount(page); | ||
84 | __free_pages_bootmem(page, 0); | ||
85 | } | ||
86 | |||
87 | } | ||
88 | |||
89 | void register_page_bootmem_info_section(unsigned long start_pfn) | ||
90 | { | ||
91 | unsigned long *usemap, mapsize, section_nr, i; | ||
92 | struct mem_section *ms; | ||
93 | struct page *page, *memmap; | ||
94 | |||
95 | if (!pfn_valid(start_pfn)) | ||
96 | return; | ||
97 | |||
98 | section_nr = pfn_to_section_nr(start_pfn); | ||
99 | ms = __nr_to_section(section_nr); | ||
100 | |||
101 | /* Get section's memmap address */ | ||
102 | memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); | ||
103 | |||
104 | /* | ||
105 | * Get page for the memmap's phys address | ||
106 | * XXX: need more consideration for sparse_vmemmap... | ||
107 | */ | ||
108 | page = virt_to_page(memmap); | ||
109 | mapsize = sizeof(struct page) * PAGES_PER_SECTION; | ||
110 | mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; | ||
111 | |||
112 | /* remember memmap's page */ | ||
113 | for (i = 0; i < mapsize; i++, page++) | ||
114 | get_page_bootmem(section_nr, page, SECTION_INFO); | ||
115 | |||
116 | usemap = __nr_to_section(section_nr)->pageblock_flags; | ||
117 | page = virt_to_page(usemap); | ||
118 | |||
119 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; | ||
120 | |||
121 | for (i = 0; i < mapsize; i++, page++) | ||
122 | get_page_bootmem(section_nr, page, MIX_INFO); | ||
123 | |||
124 | } | ||
125 | |||
126 | void register_page_bootmem_info_node(struct pglist_data *pgdat) | ||
127 | { | ||
128 | unsigned long i, pfn, end_pfn, nr_pages; | ||
129 | int node = pgdat->node_id; | ||
130 | struct page *page; | ||
131 | struct zone *zone; | ||
132 | |||
133 | nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; | ||
134 | page = virt_to_page(pgdat); | ||
135 | |||
136 | for (i = 0; i < nr_pages; i++, page++) | ||
137 | get_page_bootmem(node, page, NODE_INFO); | ||
138 | |||
139 | zone = &pgdat->node_zones[0]; | ||
140 | for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { | ||
141 | if (zone->wait_table) { | ||
142 | nr_pages = zone->wait_table_hash_nr_entries | ||
143 | * sizeof(wait_queue_head_t); | ||
144 | nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; | ||
145 | page = virt_to_page(zone->wait_table); | ||
146 | |||
147 | for (i = 0; i < nr_pages; i++, page++) | ||
148 | get_page_bootmem(node, page, NODE_INFO); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | pfn = pgdat->node_start_pfn; | ||
153 | end_pfn = pfn + pgdat->node_spanned_pages; | ||
154 | |||
155 | /* register_section info */ | ||
156 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) | ||
157 | register_page_bootmem_info_section(pfn); | ||
158 | |||
159 | } | ||
160 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
161 | |||
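register_page_bootmem_info_section() takes one reference per bootmem-allocated memmap/usemap page; the removal side is expected to walk the same pages and drop those references, so the final put_page_bootmem() hands each page to __free_pages_bootmem(). A sketch of that counterpart (hypothetical, not part of this hunk):

	static void mydrv_release_section_memmap(struct page *memmap)
	{
		unsigned long i, nr_pages;
		struct page *page = virt_to_page(memmap);

		nr_pages = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)
								>> PAGE_SHIFT;
		for (i = 0; i < nr_pages; i++, page++)
			put_page_bootmem(page);
	}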
63 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | 162 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) |
64 | { | 163 | { |
65 | struct pglist_data *pgdat = zone->zone_pgdat; | 164 | struct pglist_data *pgdat = zone->zone_pgdat; |
@@ -101,6 +200,36 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | |||
101 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 200 | return register_new_memory(__pfn_to_section(phys_start_pfn)); |
102 | } | 201 | } |
103 | 202 | ||
203 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
204 | static int __remove_section(struct zone *zone, struct mem_section *ms) | ||
205 | { | ||
206 | /* | ||
207 | * XXX: Freeing memmap with vmemmap is not implemented yet. | ||
208 | * This should be removed later. | ||
209 | */ | ||
210 | return -EBUSY; | ||
211 | } | ||
212 | #else | ||
213 | static int __remove_section(struct zone *zone, struct mem_section *ms) | ||
214 | { | ||
215 | unsigned long flags; | ||
216 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
217 | int ret = -EINVAL; | ||
218 | |||
219 | if (!valid_section(ms)) | ||
220 | return ret; | ||
221 | |||
222 | ret = unregister_memory_section(ms); | ||
223 | if (ret) | ||
224 | return ret; | ||
225 | |||
226 | pgdat_resize_lock(pgdat, &flags); | ||
227 | sparse_remove_one_section(zone, ms); | ||
228 | pgdat_resize_unlock(pgdat, &flags); | ||
229 | return 0; | ||
230 | } | ||
231 | #endif | ||
232 | |||
104 | /* | 233 | /* |
105 | * Reasonably generic function for adding memory. It is | 234 | * Reasonably generic function for adding memory. It is |
106 | * expected that archs that support memory hotplug will | 235 | * expected that archs that support memory hotplug will |
@@ -134,6 +263,42 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
134 | } | 263 | } |
135 | EXPORT_SYMBOL_GPL(__add_pages); | 264 | EXPORT_SYMBOL_GPL(__add_pages); |
136 | 265 | ||
266 | /** | ||
267 | * __remove_pages() - remove sections of pages from a zone | ||
268 | * @zone: zone from which pages need to be removed | ||
269 | * @phys_start_pfn: starting pageframe (must be aligned to start of a section) | ||
270 | * @nr_pages: number of pages to remove (must be multiple of section size) | ||
271 | * | ||
272 | * Generic helper function to remove section mappings and sysfs entries | ||
273 | * for the sections of memory we are removing. Caller needs to make | ||
274 | * sure that pages are marked reserved and zones are adjusted properly by | ||
275 | * calling offline_pages(). | ||
276 | */ | ||
277 | int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | ||
278 | unsigned long nr_pages) | ||
279 | { | ||
280 | unsigned long i, ret = 0; | ||
281 | int sections_to_remove; | ||
282 | |||
283 | /* | ||
284 | * We can only remove entire sections | ||
285 | */ | ||
286 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | ||
287 | BUG_ON(nr_pages % PAGES_PER_SECTION); | ||
288 | |||
289 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
290 | |||
291 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | ||
292 | for (i = 0; i < sections_to_remove; i++) { | ||
293 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | ||
294 | ret = __remove_section(zone, __pfn_to_section(pfn)); | ||
295 | if (ret) | ||
296 | break; | ||
297 | } | ||
298 | return ret; | ||
299 | } | ||
300 | EXPORT_SYMBOL_GPL(__remove_pages); | ||
301 | |||
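A sketch of the expected architecture-side caller (names hypothetical; the arch hooks themselves are not in this patch): once offline_pages() has taken the range out of service, the architecture tears down its own mappings and then calls __remove_pages() for the sparsemem and sysfs teardown.

	static int mydrv_arch_remove_memory(u64 start, u64 size)
	{
		unsigned long start_pfn = start >> PAGE_SHIFT;
		unsigned long nr_pages = size >> PAGE_SHIFT;
		struct zone *zone = page_zone(pfn_to_page(start_pfn));

		return __remove_pages(zone, start_pfn, nr_pages);
	}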
137 | static void grow_zone_span(struct zone *zone, | 302 | static void grow_zone_span(struct zone *zone, |
138 | unsigned long start_pfn, unsigned long end_pfn) | 303 | unsigned long start_pfn, unsigned long end_pfn) |
139 | { | 304 | { |
@@ -164,6 +329,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat, | |||
164 | pgdat->node_start_pfn; | 329 | pgdat->node_start_pfn; |
165 | } | 330 | } |
166 | 331 | ||
332 | void online_page(struct page *page) | ||
333 | { | ||
334 | totalram_pages++; | ||
335 | num_physpages++; | ||
336 | |||
337 | #ifdef CONFIG_HIGHMEM | ||
338 | if (PageHighMem(page)) | ||
339 | totalhigh_pages++; | ||
340 | #endif | ||
341 | |||
342 | #ifdef CONFIG_FLATMEM | ||
343 | max_mapnr = max(page_to_pfn(page), max_mapnr); | ||
344 | #endif | ||
345 | |||
346 | ClearPageReserved(page); | ||
347 | init_page_count(page); | ||
348 | __free_page(page); | ||
349 | } | ||
350 | |||
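online_page() handles exactly one page frame; onlining a hot-added range is just a walk over its pfns. An illustrative loop for a single section (this helper is hypothetical; the generic walker, online_pages_range(), appears as context just below):

	static void example_online_one_section(unsigned long start_pfn)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < start_pfn + PAGES_PER_SECTION; pfn++) {
			struct page *page = pfn_to_page(pfn);

			if (PageReserved(page))		/* still in hot-added state */
				online_page(page);
		}
	}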
167 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | 351 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
168 | void *arg) | 352 | void *arg) |
169 | { | 353 | { |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3c3601121509..a37a5034f63d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -63,7 +63,6 @@ | |||
63 | grows down? | 63 | grows down? |
64 | make bind policy root only? It can trigger oom much faster and the | 64 | make bind policy root only? It can trigger oom much faster and the |
65 | kernel is not always grateful with that. | 65 | kernel is not always grateful with that. |
66 | could replace all the switch()es with a mempolicy_ops structure. | ||
67 | */ | 66 | */ |
68 | 67 | ||
69 | #include <linux/mempolicy.h> | 68 | #include <linux/mempolicy.h> |
@@ -89,6 +88,7 @@ | |||
89 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
90 | #include <linux/security.h> | 89 | #include <linux/security.h> |
91 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | ||
92 | 92 | ||
93 | #include <asm/tlbflush.h> | 93 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 94 | #include <asm/uaccess.h> |
@@ -105,142 +105,264 @@ static struct kmem_cache *sn_cache; | |||
105 | policied. */ | 105 | policied. */ |
106 | enum zone_type policy_zone = 0; | 106 | enum zone_type policy_zone = 0; |
107 | 107 | ||
108 | /* | ||
109 | * run-time system-wide default policy => local allocation | ||
110 | */ | ||
108 | struct mempolicy default_policy = { | 111 | struct mempolicy default_policy = { |
109 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 112 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
110 | .policy = MPOL_DEFAULT, | 113 | .mode = MPOL_PREFERRED, |
114 | .flags = MPOL_F_LOCAL, | ||
111 | }; | 115 | }; |
112 | 116 | ||
113 | static void mpol_rebind_policy(struct mempolicy *pol, | 117 | static const struct mempolicy_operations { |
114 | const nodemask_t *newmask); | 118 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
119 | void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); | ||
120 | } mpol_ops[MPOL_MAX]; | ||
115 | 121 | ||
116 | /* Do sanity checking on a policy */ | 122 | /* Check that the nodemask contains at least one populated zone */ |
117 | static int mpol_check_policy(int mode, nodemask_t *nodes) | 123 | static int is_valid_nodemask(const nodemask_t *nodemask) |
118 | { | 124 | { |
119 | int was_empty, is_empty; | 125 | int nd, k; |
120 | 126 | ||
121 | if (!nodes) | 127 | /* Check that there is something useful in this mask */ |
122 | return 0; | 128 | k = policy_zone; |
123 | 129 | ||
124 | /* | 130 | for_each_node_mask(nd, *nodemask) { |
125 | * "Contextualize" the in-coming nodemast for cpusets: | 131 | struct zone *z; |
126 | * Remember whether in-coming nodemask was empty, If not, | ||
127 | * restrict the nodes to the allowed nodes in the cpuset. | ||
128 | * This is guaranteed to be a subset of nodes with memory. | ||
129 | */ | ||
130 | cpuset_update_task_memory_state(); | ||
131 | is_empty = was_empty = nodes_empty(*nodes); | ||
132 | if (!was_empty) { | ||
133 | nodes_and(*nodes, *nodes, cpuset_current_mems_allowed); | ||
134 | is_empty = nodes_empty(*nodes); /* after "contextualization" */ | ||
135 | } | ||
136 | 132 | ||
137 | switch (mode) { | 133 | for (k = 0; k <= policy_zone; k++) { |
138 | case MPOL_DEFAULT: | 134 | z = &NODE_DATA(nd)->node_zones[k]; |
139 | /* | 135 | if (z->present_pages > 0) |
140 | * require caller to specify an empty nodemask | 136 | return 1; |
141 | * before "contextualization" | 137 | } |
142 | */ | ||
143 | if (!was_empty) | ||
144 | return -EINVAL; | ||
145 | break; | ||
146 | case MPOL_BIND: | ||
147 | case MPOL_INTERLEAVE: | ||
148 | /* | ||
149 | * require at least 1 valid node after "contextualization" | ||
150 | */ | ||
151 | if (is_empty) | ||
152 | return -EINVAL; | ||
153 | break; | ||
154 | case MPOL_PREFERRED: | ||
155 | /* | ||
156 | * Did caller specify invalid nodes? | ||
157 | * Don't silently accept this as "local allocation". | ||
158 | */ | ||
159 | if (!was_empty && is_empty) | ||
160 | return -EINVAL; | ||
161 | break; | ||
162 | } | 138 | } |
139 | |||
163 | return 0; | 140 | return 0; |
164 | } | 141 | } |
165 | 142 | ||
166 | /* Generate a custom zonelist for the BIND policy. */ | 143 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) |
167 | static struct zonelist *bind_zonelist(nodemask_t *nodes) | ||
168 | { | 144 | { |
169 | struct zonelist *zl; | 145 | return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); |
170 | int num, max, nd; | 146 | } |
171 | enum zone_type k; | ||
172 | 147 | ||
173 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 148 | static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, |
174 | max++; /* space for zlcache_ptr (see mmzone.h) */ | 149 | const nodemask_t *rel) |
175 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 150 | { |
176 | if (!zl) | 151 | nodemask_t tmp; |
177 | return ERR_PTR(-ENOMEM); | 152 | nodes_fold(tmp, *orig, nodes_weight(*rel)); |
178 | zl->zlcache_ptr = NULL; | 153 | nodes_onto(*ret, tmp, *rel); |
179 | num = 0; | 154 | } |
180 | /* First put in the highest zones from all nodes, then all the next | 155 | |
181 | lower zones etc. Avoid empty zones because the memory allocator | 156 | static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) |
182 | doesn't like them. If you implement node hot removal you | 157 | { |
183 | have to fix that. */ | 158 | if (nodes_empty(*nodes)) |
184 | k = MAX_NR_ZONES - 1; | 159 | return -EINVAL; |
185 | while (1) { | 160 | pol->v.nodes = *nodes; |
186 | for_each_node_mask(nd, *nodes) { | 161 | return 0; |
187 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 162 | } |
188 | if (z->present_pages > 0) | 163 | |
189 | zl->zones[num++] = z; | 164 | static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) |
190 | } | 165 | { |
191 | if (k == 0) | 166 | if (!nodes) |
192 | break; | 167 | pol->flags |= MPOL_F_LOCAL; /* local allocation */ |
193 | k--; | 168 | else if (nodes_empty(*nodes)) |
194 | } | 169 | return -EINVAL; /* no allowed nodes */ |
195 | if (num == 0) { | 170 | else |
196 | kfree(zl); | 171 | pol->v.preferred_node = first_node(*nodes); |
197 | return ERR_PTR(-EINVAL); | 172 | return 0; |
198 | } | 173 | } |
199 | zl->zones[num] = NULL; | 174 | |
200 | return zl; | 175 | static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) |
176 | { | ||
177 | if (!is_valid_nodemask(nodes)) | ||
178 | return -EINVAL; | ||
179 | pol->v.nodes = *nodes; | ||
180 | return 0; | ||
201 | } | 181 | } |
202 | 182 | ||
203 | /* Create a new policy */ | 183 | /* Create a new policy */ |
204 | static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | 184 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
185 | nodemask_t *nodes) | ||
205 | { | 186 | { |
206 | struct mempolicy *policy; | 187 | struct mempolicy *policy; |
188 | nodemask_t cpuset_context_nmask; | ||
189 | int ret; | ||
207 | 190 | ||
208 | pr_debug("setting mode %d nodes[0] %lx\n", | 191 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
209 | mode, nodes ? nodes_addr(*nodes)[0] : -1); | 192 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); |
210 | 193 | ||
211 | if (mode == MPOL_DEFAULT) | 194 | if (mode == MPOL_DEFAULT) { |
212 | return NULL; | 195 | if (nodes && !nodes_empty(*nodes)) |
196 | return ERR_PTR(-EINVAL); | ||
197 | return NULL; /* simply delete any existing policy */ | ||
198 | } | ||
199 | VM_BUG_ON(!nodes); | ||
200 | |||
201 | /* | ||
202 | * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or | ||
203 | * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). | ||
204 | * All other modes require a valid pointer to a non-empty nodemask. | ||
205 | */ | ||
206 | if (mode == MPOL_PREFERRED) { | ||
207 | if (nodes_empty(*nodes)) { | ||
208 | if (((flags & MPOL_F_STATIC_NODES) || | ||
209 | (flags & MPOL_F_RELATIVE_NODES))) | ||
210 | return ERR_PTR(-EINVAL); | ||
211 | nodes = NULL; /* flag local alloc */ | ||
212 | } | ||
213 | } else if (nodes_empty(*nodes)) | ||
214 | return ERR_PTR(-EINVAL); | ||
213 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 215 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
214 | if (!policy) | 216 | if (!policy) |
215 | return ERR_PTR(-ENOMEM); | 217 | return ERR_PTR(-ENOMEM); |
216 | atomic_set(&policy->refcnt, 1); | 218 | atomic_set(&policy->refcnt, 1); |
217 | switch (mode) { | 219 | policy->mode = mode; |
218 | case MPOL_INTERLEAVE: | 220 | policy->flags = flags; |
219 | policy->v.nodes = *nodes; | 221 | |
220 | if (nodes_weight(policy->v.nodes) == 0) { | 222 | if (nodes) { |
221 | kmem_cache_free(policy_cache, policy); | 223 | /* |
222 | return ERR_PTR(-EINVAL); | 224 | * cpuset related setup doesn't apply to local allocation |
223 | } | 225 | */ |
224 | break; | 226 | cpuset_update_task_memory_state(); |
225 | case MPOL_PREFERRED: | 227 | if (flags & MPOL_F_RELATIVE_NODES) |
226 | policy->v.preferred_node = first_node(*nodes); | 228 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, |
227 | if (policy->v.preferred_node >= MAX_NUMNODES) | 229 | &cpuset_current_mems_allowed); |
228 | policy->v.preferred_node = -1; | 230 | else |
229 | break; | 231 | nodes_and(cpuset_context_nmask, *nodes, |
230 | case MPOL_BIND: | 232 | cpuset_current_mems_allowed); |
231 | policy->v.zonelist = bind_zonelist(nodes); | 233 | if (mpol_store_user_nodemask(policy)) |
232 | if (IS_ERR(policy->v.zonelist)) { | 234 | policy->w.user_nodemask = *nodes; |
233 | void *error_code = policy->v.zonelist; | 235 | else |
234 | kmem_cache_free(policy_cache, policy); | 236 | policy->w.cpuset_mems_allowed = |
235 | return error_code; | 237 | cpuset_mems_allowed(current); |
236 | } | 238 | } |
237 | break; | 239 | |
240 | ret = mpol_ops[mode].create(policy, | ||
241 | nodes ? &cpuset_context_nmask : NULL); | ||
242 | if (ret < 0) { | ||
243 | kmem_cache_free(policy_cache, policy); | ||
244 | return ERR_PTR(ret); | ||
238 | } | 245 | } |
239 | policy->policy = mode; | ||
240 | policy->cpuset_mems_allowed = cpuset_mems_allowed(current); | ||
241 | return policy; | 246 | return policy; |
242 | } | 247 | } |
243 | 248 | ||
249 | /* Slow path of a mpol destructor. */ | ||
250 | void __mpol_put(struct mempolicy *p) | ||
251 | { | ||
252 | if (!atomic_dec_and_test(&p->refcnt)) | ||
253 | return; | ||
254 | kmem_cache_free(policy_cache, p); | ||
255 | } | ||
256 | |||
257 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) | ||
258 | { | ||
259 | } | ||
260 | |||
261 | static void mpol_rebind_nodemask(struct mempolicy *pol, | ||
262 | const nodemask_t *nodes) | ||
263 | { | ||
264 | nodemask_t tmp; | ||
265 | |||
266 | if (pol->flags & MPOL_F_STATIC_NODES) | ||
267 | nodes_and(tmp, pol->w.user_nodemask, *nodes); | ||
268 | else if (pol->flags & MPOL_F_RELATIVE_NODES) | ||
269 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); | ||
270 | else { | ||
271 | nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, | ||
272 | *nodes); | ||
273 | pol->w.cpuset_mems_allowed = *nodes; | ||
274 | } | ||
275 | |||
276 | pol->v.nodes = tmp; | ||
277 | if (!node_isset(current->il_next, tmp)) { | ||
278 | current->il_next = next_node(current->il_next, tmp); | ||
279 | if (current->il_next >= MAX_NUMNODES) | ||
280 | current->il_next = first_node(tmp); | ||
281 | if (current->il_next >= MAX_NUMNODES) | ||
282 | current->il_next = numa_node_id(); | ||
283 | } | ||
284 | } | ||
285 | |||
286 | static void mpol_rebind_preferred(struct mempolicy *pol, | ||
287 | const nodemask_t *nodes) | ||
288 | { | ||
289 | nodemask_t tmp; | ||
290 | |||
291 | if (pol->flags & MPOL_F_STATIC_NODES) { | ||
292 | int node = first_node(pol->w.user_nodemask); | ||
293 | |||
294 | if (node_isset(node, *nodes)) { | ||
295 | pol->v.preferred_node = node; | ||
296 | pol->flags &= ~MPOL_F_LOCAL; | ||
297 | } else | ||
298 | pol->flags |= MPOL_F_LOCAL; | ||
299 | } else if (pol->flags & MPOL_F_RELATIVE_NODES) { | ||
300 | mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); | ||
301 | pol->v.preferred_node = first_node(tmp); | ||
302 | } else if (!(pol->flags & MPOL_F_LOCAL)) { | ||
303 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | ||
304 | pol->w.cpuset_mems_allowed, | ||
305 | *nodes); | ||
306 | pol->w.cpuset_mems_allowed = *nodes; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | /* Migrate a policy to a different set of nodes */ | ||
311 | static void mpol_rebind_policy(struct mempolicy *pol, | ||
312 | const nodemask_t *newmask) | ||
313 | { | ||
314 | if (!pol) | ||
315 | return; | ||
316 | if (!mpol_store_user_nodemask(pol) && | ||
317 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | ||
318 | return; | ||
319 | mpol_ops[pol->mode].rebind(pol, newmask); | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Wrapper for mpol_rebind_policy() that just requires task | ||
324 | * pointer, and updates task mempolicy. | ||
325 | */ | ||
326 | |||
327 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | ||
328 | { | ||
329 | mpol_rebind_policy(tsk->mempolicy, new); | ||
330 | } | ||
331 | |||
332 | /* | ||
333 | * Rebind each vma in mm to new nodemask. | ||
334 | * | ||
335 | * Call holding a reference to mm. Takes mm->mmap_sem during call. | ||
336 | */ | ||
337 | |||
338 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | ||
339 | { | ||
340 | struct vm_area_struct *vma; | ||
341 | |||
342 | down_write(&mm->mmap_sem); | ||
343 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
344 | mpol_rebind_policy(vma->vm_policy, new); | ||
345 | up_write(&mm->mmap_sem); | ||
346 | } | ||
347 | |||
348 | static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | ||
349 | [MPOL_DEFAULT] = { | ||
350 | .rebind = mpol_rebind_default, | ||
351 | }, | ||
352 | [MPOL_INTERLEAVE] = { | ||
353 | .create = mpol_new_interleave, | ||
354 | .rebind = mpol_rebind_nodemask, | ||
355 | }, | ||
356 | [MPOL_PREFERRED] = { | ||
357 | .create = mpol_new_preferred, | ||
358 | .rebind = mpol_rebind_preferred, | ||
359 | }, | ||
360 | [MPOL_BIND] = { | ||
361 | .create = mpol_new_bind, | ||
362 | .rebind = mpol_rebind_nodemask, | ||
363 | }, | ||
364 | }; | ||
365 | |||
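The three rebind flavours dispatched through mpol_ops are easiest to see on concrete masks. A worked sketch with assumed node numbers (purely for illustration): the user asked for nodes {0,2}, the cpuset originally allowed exactly {0,2}, and is later rebound to {4,5}.

	static void example_rebind_flavours(void)
	{
		nodemask_t user, oldmems, newmems;
		nodemask_t static_nodes, folded, relative_nodes, remapped;

		nodes_clear(user);    node_set(0, user);    node_set(2, user);
		oldmems = user;				/* old cpuset_mems_allowed */
		nodes_clear(newmems); node_set(4, newmems); node_set(5, newmems);

		/* MPOL_F_STATIC_NODES: plain intersection -> empty, nothing usable
		 * until nodes 0 or 2 become allowed again */
		nodes_and(static_nodes, user, newmems);

		/* MPOL_F_RELATIVE_NODES: fold {0,2} onto a 2-node space -> {0},
		 * then map that onto {4,5} -> {4} */
		nodes_fold(folded, user, nodes_weight(newmems));
		nodes_onto(relative_nodes, folded, newmems);

		/* neither flag: positional remap, {0,2} over {0,2}->{4,5} gives {4,5} */
		nodes_remap(remapped, user, oldmems, newmems);
	}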
244 | static void gather_stats(struct page *, void *, int pte_dirty); | 366 | static void gather_stats(struct page *, void *, int pte_dirty); |
245 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 367 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
246 | unsigned long flags); | 368 | unsigned long flags); |
@@ -421,7 +543,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
421 | if (!err) { | 543 | if (!err) { |
422 | mpol_get(new); | 544 | mpol_get(new); |
423 | vma->vm_policy = new; | 545 | vma->vm_policy = new; |
424 | mpol_free(old); | 546 | mpol_put(old); |
425 | } | 547 | } |
426 | return err; | 548 | return err; |
427 | } | 549 | } |
@@ -479,46 +601,55 @@ static void mpol_set_task_struct_flag(void) | |||
479 | } | 601 | } |
480 | 602 | ||
481 | /* Set the process memory policy */ | 603 | /* Set the process memory policy */ |
482 | static long do_set_mempolicy(int mode, nodemask_t *nodes) | 604 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
605 | nodemask_t *nodes) | ||
483 | { | 606 | { |
484 | struct mempolicy *new; | 607 | struct mempolicy *new; |
608 | struct mm_struct *mm = current->mm; | ||
485 | 609 | ||
486 | if (mpol_check_policy(mode, nodes)) | 610 | new = mpol_new(mode, flags, nodes); |
487 | return -EINVAL; | ||
488 | new = mpol_new(mode, nodes); | ||
489 | if (IS_ERR(new)) | 611 | if (IS_ERR(new)) |
490 | return PTR_ERR(new); | 612 | return PTR_ERR(new); |
491 | mpol_free(current->mempolicy); | 613 | |
614 | /* | ||
615 | * prevent changing our mempolicy while show_numa_maps() | ||
616 | * is using it. | ||
617 | * Note: do_set_mempolicy() can be called at init time | ||
618 | * with no 'mm'. | ||
619 | */ | ||
620 | if (mm) | ||
621 | down_write(&mm->mmap_sem); | ||
622 | mpol_put(current->mempolicy); | ||
492 | current->mempolicy = new; | 623 | current->mempolicy = new; |
493 | mpol_set_task_struct_flag(); | 624 | mpol_set_task_struct_flag(); |
494 | if (new && new->policy == MPOL_INTERLEAVE) | 625 | if (new && new->mode == MPOL_INTERLEAVE && |
626 | nodes_weight(new->v.nodes)) | ||
495 | current->il_next = first_node(new->v.nodes); | 627 | current->il_next = first_node(new->v.nodes); |
628 | if (mm) | ||
629 | up_write(&mm->mmap_sem); | ||
630 | |||
496 | return 0; | 631 | return 0; |
497 | } | 632 | } |
498 | 633 | ||
499 | /* Fill a zone bitmap for a policy */ | 634 | /* |
500 | static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | 635 | * Return nodemask for policy for get_mempolicy() query |
636 | */ | ||
637 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | ||
501 | { | 638 | { |
502 | int i; | ||
503 | |||
504 | nodes_clear(*nodes); | 639 | nodes_clear(*nodes); |
505 | switch (p->policy) { | 640 | if (p == &default_policy) |
641 | return; | ||
642 | |||
643 | switch (p->mode) { | ||
506 | case MPOL_BIND: | 644 | case MPOL_BIND: |
507 | for (i = 0; p->v.zonelist->zones[i]; i++) | 645 | /* Fall through */ |
508 | node_set(zone_to_nid(p->v.zonelist->zones[i]), | ||
509 | *nodes); | ||
510 | break; | ||
511 | case MPOL_DEFAULT: | ||
512 | break; | ||
513 | case MPOL_INTERLEAVE: | 646 | case MPOL_INTERLEAVE: |
514 | *nodes = p->v.nodes; | 647 | *nodes = p->v.nodes; |
515 | break; | 648 | break; |
516 | case MPOL_PREFERRED: | 649 | case MPOL_PREFERRED: |
517 | /* or use current node instead of memory_map? */ | 650 | if (!(p->flags & MPOL_F_LOCAL)) |
518 | if (p->v.preferred_node < 0) | ||
519 | *nodes = node_states[N_HIGH_MEMORY]; | ||
520 | else | ||
521 | node_set(p->v.preferred_node, *nodes); | 651 | node_set(p->v.preferred_node, *nodes); |
652 | /* else return empty node mask for local allocation */ | ||
522 | break; | 653 | break; |
523 | default: | 654 | default: |
524 | BUG(); | 655 | BUG(); |
@@ -561,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
561 | } | 692 | } |
562 | 693 | ||
563 | if (flags & MPOL_F_ADDR) { | 694 | if (flags & MPOL_F_ADDR) { |
695 | /* | ||
696 | * Do NOT fall back to task policy if the | ||
697 | * vma/shared policy at addr is NULL. We | ||
698 | * want to return MPOL_DEFAULT in this case. | ||
699 | */ | ||
564 | down_read(&mm->mmap_sem); | 700 | down_read(&mm->mmap_sem); |
565 | vma = find_vma_intersection(mm, addr, addr+1); | 701 | vma = find_vma_intersection(mm, addr, addr+1); |
566 | if (!vma) { | 702 | if (!vma) { |
@@ -575,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
575 | return -EINVAL; | 711 | return -EINVAL; |
576 | 712 | ||
577 | if (!pol) | 713 | if (!pol) |
578 | pol = &default_policy; | 714 | pol = &default_policy; /* indicates default behavior */ |
579 | 715 | ||
580 | if (flags & MPOL_F_NODE) { | 716 | if (flags & MPOL_F_NODE) { |
581 | if (flags & MPOL_F_ADDR) { | 717 | if (flags & MPOL_F_ADDR) { |
@@ -584,14 +720,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
584 | goto out; | 720 | goto out; |
585 | *policy = err; | 721 | *policy = err; |
586 | } else if (pol == current->mempolicy && | 722 | } else if (pol == current->mempolicy && |
587 | pol->policy == MPOL_INTERLEAVE) { | 723 | pol->mode == MPOL_INTERLEAVE) { |
588 | *policy = current->il_next; | 724 | *policy = current->il_next; |
589 | } else { | 725 | } else { |
590 | err = -EINVAL; | 726 | err = -EINVAL; |
591 | goto out; | 727 | goto out; |
592 | } | 728 | } |
593 | } else | 729 | } else { |
594 | *policy = pol->policy; | 730 | *policy = pol == &default_policy ? MPOL_DEFAULT : |
731 | pol->mode; | ||
732 | *policy |= pol->flags; | ||
733 | } | ||
595 | 734 | ||
596 | if (vma) { | 735 | if (vma) { |
597 | up_read(¤t->mm->mmap_sem); | 736 | up_read(¤t->mm->mmap_sem); |
@@ -600,9 +739,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
600 | 739 | ||
601 | err = 0; | 740 | err = 0; |
602 | if (nmask) | 741 | if (nmask) |
603 | get_zonemask(pol, nmask); | 742 | get_policy_nodemask(pol, nmask); |
604 | 743 | ||
605 | out: | 744 | out: |
745 | mpol_cond_put(pol); | ||
606 | if (vma) | 746 | if (vma) |
607 | up_read(¤t->mm->mmap_sem); | 747 | up_read(¤t->mm->mmap_sem); |
608 | return err; | 748 | return err; |
@@ -664,7 +804,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
664 | int err = 0; | 804 | int err = 0; |
665 | nodemask_t tmp; | 805 | nodemask_t tmp; |
666 | 806 | ||
667 | down_read(&mm->mmap_sem); | 807 | down_read(&mm->mmap_sem); |
668 | 808 | ||
669 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | 809 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); |
670 | if (err) | 810 | if (err) |
@@ -781,8 +921,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
781 | #endif | 921 | #endif |
782 | 922 | ||
783 | static long do_mbind(unsigned long start, unsigned long len, | 923 | static long do_mbind(unsigned long start, unsigned long len, |
784 | unsigned long mode, nodemask_t *nmask, | 924 | unsigned short mode, unsigned short mode_flags, |
785 | unsigned long flags) | 925 | nodemask_t *nmask, unsigned long flags) |
786 | { | 926 | { |
787 | struct vm_area_struct *vma; | 927 | struct vm_area_struct *vma; |
788 | struct mm_struct *mm = current->mm; | 928 | struct mm_struct *mm = current->mm; |
@@ -791,9 +931,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
791 | int err; | 931 | int err; |
792 | LIST_HEAD(pagelist); | 932 | LIST_HEAD(pagelist); |
793 | 933 | ||
794 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | 934 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | |
795 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 935 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
796 | || mode > MPOL_MAX) | ||
797 | return -EINVAL; | 936 | return -EINVAL; |
798 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 937 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
799 | return -EPERM; | 938 | return -EPERM; |
@@ -812,10 +951,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
812 | if (end == start) | 951 | if (end == start) |
813 | return 0; | 952 | return 0; |
814 | 953 | ||
815 | if (mpol_check_policy(mode, nmask)) | 954 | new = mpol_new(mode, mode_flags, nmask); |
816 | return -EINVAL; | ||
817 | |||
818 | new = mpol_new(mode, nmask); | ||
819 | if (IS_ERR(new)) | 955 | if (IS_ERR(new)) |
820 | return PTR_ERR(new); | 956 | return PTR_ERR(new); |
821 | 957 | ||
@@ -826,8 +962,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
826 | if (!new) | 962 | if (!new) |
827 | flags |= MPOL_MF_DISCONTIG_OK; | 963 | flags |= MPOL_MF_DISCONTIG_OK; |
828 | 964 | ||
829 | pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | 965 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", |
830 | mode, nmask ? nodes_addr(*nmask)[0] : -1); | 966 | start, start + len, mode, mode_flags, |
967 | nmask ? nodes_addr(*nmask)[0] : -1); | ||
831 | 968 | ||
832 | down_write(&mm->mmap_sem); | 969 | down_write(&mm->mmap_sem); |
833 | vma = check_range(mm, start, end, nmask, | 970 | vma = check_range(mm, start, end, nmask, |
@@ -848,7 +985,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
848 | } | 985 | } |
849 | 986 | ||
850 | up_write(&mm->mmap_sem); | 987 | up_write(&mm->mmap_sem); |
851 | mpol_free(new); | 988 | mpol_put(new); |
852 | return err; | 989 | return err; |
853 | } | 990 | } |
854 | 991 | ||
@@ -926,11 +1063,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
926 | { | 1063 | { |
927 | nodemask_t nodes; | 1064 | nodemask_t nodes; |
928 | int err; | 1065 | int err; |
1066 | unsigned short mode_flags; | ||
929 | 1067 | ||
1068 | mode_flags = mode & MPOL_MODE_FLAGS; | ||
1069 | mode &= ~MPOL_MODE_FLAGS; | ||
1070 | if (mode >= MPOL_MAX) | ||
1071 | return -EINVAL; | ||
1072 | if ((mode_flags & MPOL_F_STATIC_NODES) && | ||
1073 | (mode_flags & MPOL_F_RELATIVE_NODES)) | ||
1074 | return -EINVAL; | ||
930 | err = get_nodes(&nodes, nmask, maxnode); | 1075 | err = get_nodes(&nodes, nmask, maxnode); |
931 | if (err) | 1076 | if (err) |
932 | return err; | 1077 | return err; |
933 | return do_mbind(start, len, mode, &nodes, flags); | 1078 | return do_mbind(start, len, mode, mode_flags, &nodes, flags); |
934 | } | 1079 | } |
935 | 1080 | ||
936 | /* Set the process memory policy */ | 1081 | /* Set the process memory policy */ |
@@ -939,13 +1084,18 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |||
939 | { | 1084 | { |
940 | int err; | 1085 | int err; |
941 | nodemask_t nodes; | 1086 | nodemask_t nodes; |
1087 | unsigned short flags; | ||
942 | 1088 | ||
943 | if (mode < 0 || mode > MPOL_MAX) | 1089 | flags = mode & MPOL_MODE_FLAGS; |
1090 | mode &= ~MPOL_MODE_FLAGS; | ||
1091 | if ((unsigned int)mode >= MPOL_MAX) | ||
1092 | return -EINVAL; | ||
1093 | if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) | ||
944 | return -EINVAL; | 1094 | return -EINVAL; |
945 | err = get_nodes(&nodes, nmask, maxnode); | 1095 | err = get_nodes(&nodes, nmask, maxnode); |
946 | if (err) | 1096 | if (err) |
947 | return err; | 1097 | return err; |
948 | return do_set_mempolicy(mode, &nodes); | 1098 | return do_set_mempolicy(mode, flags, &nodes); |
949 | } | 1099 | } |
950 | 1100 | ||
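From userspace the new flags ride in the upper bits of the mode argument, exactly as the decode above expects. A hedged sketch (assumes libnuma's numaif.h and linking with -lnuma; the numeric flag values mirror the kernel header added by this series and may be missing from older userspace headers):

	#include <numaif.h>		/* set_mempolicy(), MPOL_INTERLEAVE */

	#ifndef MPOL_F_RELATIVE_NODES
	#define MPOL_F_STATIC_NODES	(1 << 15)
	#define MPOL_F_RELATIVE_NODES	(1 << 14)
	#endif

	/* Interleave over "the first two nodes of whatever cpuset I am in". */
	int example_set_relative_interleave(void)
	{
		unsigned long mask = 0x3;	/* relative nodes 0 and 1 */

		return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES,
				     &mask, sizeof(mask) * 8);
	}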
951 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | 1101 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, |
@@ -1131,59 +1281,75 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1131 | * | 1281 | * |
1132 | * Returns effective policy for a VMA at specified address. | 1282 | * Returns effective policy for a VMA at specified address. |
1133 | * Falls back to @task or system default policy, as necessary. | 1283 | * Falls back to @task or system default policy, as necessary. |
1134 | * Returned policy has extra reference count if shared, vma, | 1284 | * Current or other task's task mempolicy and non-shared vma policies |
1135 | * or some other task's policy [show_numa_maps() can pass | 1285 | * are protected by the task's mmap_sem, which must be held for read by |
1136 | * @task != current]. It is the caller's responsibility to | 1286 | * the caller. |
1137 | * free the reference in these cases. | 1287 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference |
1288 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1289 | * freeing by another task. It is the caller's responsibility to free the | ||
1290 | * extra reference for shared policies. | ||
1138 | */ | 1291 | */ |
1139 | static struct mempolicy * get_vma_policy(struct task_struct *task, | 1292 | static struct mempolicy *get_vma_policy(struct task_struct *task, |
1140 | struct vm_area_struct *vma, unsigned long addr) | 1293 | struct vm_area_struct *vma, unsigned long addr) |
1141 | { | 1294 | { |
1142 | struct mempolicy *pol = task->mempolicy; | 1295 | struct mempolicy *pol = task->mempolicy; |
1143 | int shared_pol = 0; | ||
1144 | 1296 | ||
1145 | if (vma) { | 1297 | if (vma) { |
1146 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1298 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1147 | pol = vma->vm_ops->get_policy(vma, addr); | 1299 | struct mempolicy *vpol = vma->vm_ops->get_policy(vma, |
1148 | shared_pol = 1; /* if pol non-NULL, add ref below */ | 1300 | addr); |
1149 | } else if (vma->vm_policy && | 1301 | if (vpol) |
1150 | vma->vm_policy->policy != MPOL_DEFAULT) | 1302 | pol = vpol; |
1303 | } else if (vma->vm_policy) | ||
1151 | pol = vma->vm_policy; | 1304 | pol = vma->vm_policy; |
1152 | } | 1305 | } |
1153 | if (!pol) | 1306 | if (!pol) |
1154 | pol = &default_policy; | 1307 | pol = &default_policy; |
1155 | else if (!shared_pol && pol != current->mempolicy) | ||
1156 | mpol_get(pol); /* vma or other task's policy */ | ||
1157 | return pol; | 1308 | return pol; |
1158 | } | 1309 | } |
1159 | 1310 | ||
1160 | /* Return a zonelist representing a mempolicy */ | 1311 | /* |
1161 | static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) | 1312 | * Return a nodemask representing a mempolicy for filtering nodes for |
1313 | * page allocation | ||
1314 | */ | ||
1315 | static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | ||
1162 | { | 1316 | { |
1163 | int nd; | 1317 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
1318 | if (unlikely(policy->mode == MPOL_BIND) && | ||
1319 | gfp_zone(gfp) >= policy_zone && | ||
1320 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) | ||
1321 | return &policy->v.nodes; | ||
1164 | 1322 | ||
1165 | switch (policy->policy) { | 1323 | return NULL; |
1324 | } | ||
1325 | |||
1326 | /* Return a zonelist indicated by gfp for node representing a mempolicy */ | ||
1327 | static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) | ||
1328 | { | ||
1329 | int nd = numa_node_id(); | ||
1330 | |||
1331 | switch (policy->mode) { | ||
1166 | case MPOL_PREFERRED: | 1332 | case MPOL_PREFERRED: |
1167 | nd = policy->v.preferred_node; | 1333 | if (!(policy->flags & MPOL_F_LOCAL)) |
1168 | if (nd < 0) | 1334 | nd = policy->v.preferred_node; |
1169 | nd = numa_node_id(); | ||
1170 | break; | 1335 | break; |
1171 | case MPOL_BIND: | 1336 | case MPOL_BIND: |
1172 | /* Lower zones don't get a policy applied */ | 1337 | /* |
1173 | /* Careful: current->mems_allowed might have moved */ | 1338 | * Normally, MPOL_BIND allocations are node-local within the |
1174 | if (gfp_zone(gfp) >= policy_zone) | 1339 | * allowed nodemask. However, if __GFP_THISNODE is set and the |
1175 | if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) | 1340 | * current node is part of the mask, we use the zonelist for |
1176 | return policy->v.zonelist; | 1341 | * the first node in the mask instead. |
1177 | /*FALL THROUGH*/ | 1342 | */ |
1343 | if (unlikely(gfp & __GFP_THISNODE) && | ||
1344 | unlikely(!node_isset(nd, policy->v.nodes))) | ||
1345 | nd = first_node(policy->v.nodes); | ||
1346 | break; | ||
1178 | case MPOL_INTERLEAVE: /* should not happen */ | 1347 | case MPOL_INTERLEAVE: /* should not happen */ |
1179 | case MPOL_DEFAULT: | ||
1180 | nd = numa_node_id(); | ||
1181 | break; | 1348 | break; |
1182 | default: | 1349 | default: |
1183 | nd = 0; | ||
1184 | BUG(); | 1350 | BUG(); |
1185 | } | 1351 | } |
1186 | return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); | 1352 | return node_zonelist(nd, gfp); |
1187 | } | 1353 | } |
1188 | 1354 | ||
1189 | /* Do dynamic interleaving for a process */ | 1355 | /* Do dynamic interleaving for a process */ |
@@ -1196,36 +1362,51 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1196 | next = next_node(nid, policy->v.nodes); | 1362 | next = next_node(nid, policy->v.nodes); |
1197 | if (next >= MAX_NUMNODES) | 1363 | if (next >= MAX_NUMNODES) |
1198 | next = first_node(policy->v.nodes); | 1364 | next = first_node(policy->v.nodes); |
1199 | me->il_next = next; | 1365 | if (next < MAX_NUMNODES) |
1366 | me->il_next = next; | ||
1200 | return nid; | 1367 | return nid; |
1201 | } | 1368 | } |
1202 | 1369 | ||
1203 | /* | 1370 | /* |
1204 | * Depending on the memory policy provide a node from which to allocate the | 1371 | * Depending on the memory policy provide a node from which to allocate the |
1205 | * next slab entry. | 1372 | * next slab entry. |
1373 | * @policy must be protected from freeing by the caller. If @policy is | ||
1374 | * the current task's mempolicy, this protection is implicit, as only the | ||
1375 | * task can change its policy. The system default policy requires no | ||
1376 | * such protection. | ||
1206 | */ | 1377 | */ |
1207 | unsigned slab_node(struct mempolicy *policy) | 1378 | unsigned slab_node(struct mempolicy *policy) |
1208 | { | 1379 | { |
1209 | int pol = policy ? policy->policy : MPOL_DEFAULT; | 1380 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1381 | return numa_node_id(); | ||
1382 | |||
1383 | switch (policy->mode) { | ||
1384 | case MPOL_PREFERRED: | ||
1385 | /* | ||
1386 | * handled MPOL_F_LOCAL above | ||
1387 | */ | ||
1388 | return policy->v.preferred_node; | ||
1210 | 1389 | ||
1211 | switch (pol) { | ||
1212 | case MPOL_INTERLEAVE: | 1390 | case MPOL_INTERLEAVE: |
1213 | return interleave_nodes(policy); | 1391 | return interleave_nodes(policy); |
1214 | 1392 | ||
1215 | case MPOL_BIND: | 1393 | case MPOL_BIND: { |
1216 | /* | 1394 | /* |
1217 | * Follow bind policy behavior and start allocation at the | 1395 | * Follow bind policy behavior and start allocation at the |
1218 | * first node. | 1396 | * first node. |
1219 | */ | 1397 | */ |
1220 | return zone_to_nid(policy->v.zonelist->zones[0]); | 1398 | struct zonelist *zonelist; |
1221 | 1399 | struct zone *zone; | |
1222 | case MPOL_PREFERRED: | 1400 | enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); |
1223 | if (policy->v.preferred_node >= 0) | 1401 | zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; |
1224 | return policy->v.preferred_node; | 1402 | (void)first_zones_zonelist(zonelist, highest_zoneidx, |
1225 | /* Fall through */ | 1403 | &policy->v.nodes, |
1404 | &zone); | ||
1405 | return zone->node; | ||
1406 | } | ||
1226 | 1407 | ||
1227 | default: | 1408 | default: |
1228 | return numa_node_id(); | 1409 | BUG(); |
1229 | } | 1410 | } |
1230 | } | 1411 | } |
1231 | 1412 | ||
@@ -1234,10 +1415,13 @@ static unsigned offset_il_node(struct mempolicy *pol, | |||
1234 | struct vm_area_struct *vma, unsigned long off) | 1415 | struct vm_area_struct *vma, unsigned long off) |
1235 | { | 1416 | { |
1236 | unsigned nnodes = nodes_weight(pol->v.nodes); | 1417 | unsigned nnodes = nodes_weight(pol->v.nodes); |
1237 | unsigned target = (unsigned)off % nnodes; | 1418 | unsigned target; |
1238 | int c; | 1419 | int c; |
1239 | int nid = -1; | 1420 | int nid = -1; |
1240 | 1421 | ||
1422 | if (!nnodes) | ||
1423 | return numa_node_id(); | ||
1424 | target = (unsigned int)off % nnodes; | ||
1241 | c = 0; | 1425 | c = 0; |
1242 | do { | 1426 | do { |
1243 | nid = next_node(nid, pol->v.nodes); | 1427 | nid = next_node(nid, pol->v.nodes); |
@@ -1274,40 +1458,30 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1274 | * @vma = virtual memory area whose policy is sought | 1458 | * @vma = virtual memory area whose policy is sought |
1275 | * @addr = address in @vma for shared policy lookup and interleave policy | 1459 | * @addr = address in @vma for shared policy lookup and interleave policy |
1276 | * @gfp_flags = for requested zone | 1460 | * @gfp_flags = for requested zone |
1277 | * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy | 1461 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy |
1462 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | ||
1278 | * | 1463 | * |
1279 | * Returns a zonelist suitable for a huge page allocation. | 1464 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1280 | * If the effective policy is 'BIND, returns pointer to policy's zonelist. | 1465 | * to the struct mempolicy for conditional unref after allocation. |
1281 | * If it is also a policy for which get_vma_policy() returns an extra | 1466 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's |
1282 | * reference, we must hold that reference until after allocation. | 1467 | * @nodemask for filtering the zonelist. |
1283 | * In that case, return policy via @mpol so hugetlb allocation can drop | ||
1284 | * the reference. For non-'BIND referenced policies, we can/do drop the | ||
1285 | * reference here, so the caller doesn't need to know about the special case | ||
1286 | * for default and current task policy. | ||
1287 | */ | 1468 | */ |
1288 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | 1469 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
1289 | gfp_t gfp_flags, struct mempolicy **mpol) | 1470 | gfp_t gfp_flags, struct mempolicy **mpol, |
1471 | nodemask_t **nodemask) | ||
1290 | { | 1472 | { |
1291 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
1292 | struct zonelist *zl; | 1473 | struct zonelist *zl; |
1293 | 1474 | ||
1294 | *mpol = NULL; /* probably no unref needed */ | 1475 | *mpol = get_vma_policy(current, vma, addr); |
1295 | if (pol->policy == MPOL_INTERLEAVE) { | 1476 | *nodemask = NULL; /* assume !MPOL_BIND */ |
1296 | unsigned nid; | ||
1297 | |||
1298 | nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | ||
1299 | if (unlikely(pol != &default_policy && | ||
1300 | pol != current->mempolicy)) | ||
1301 | __mpol_free(pol); /* finished with pol */ | ||
1302 | return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); | ||
1303 | } | ||
1304 | 1477 | ||
1305 | zl = zonelist_policy(GFP_HIGHUSER, pol); | 1478 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
1306 | if (unlikely(pol != &default_policy && pol != current->mempolicy)) { | 1479 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, |
1307 | if (pol->policy != MPOL_BIND) | 1480 | HPAGE_SHIFT), gfp_flags); |
1308 | __mpol_free(pol); /* finished with pol */ | 1481 | } else { |
1309 | else | 1482 | zl = policy_zonelist(gfp_flags, *mpol); |
1310 | *mpol = pol; /* unref needed after allocation */ | 1483 | if ((*mpol)->mode == MPOL_BIND) |
1484 | *nodemask = &(*mpol)->v.nodes; | ||
1311 | } | 1485 | } |
1312 | return zl; | 1486 | return zl; |
1313 | } | 1487 | } |
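A sketch of how a hugetlb-style caller might consume the new three-value interface; the gfp mask and the free-list scan are placeholders, and the real call site lives in mm/hugetlb.c:

static struct page *dequeue_huge_page_sketch(struct vm_area_struct *vma,
					     unsigned long addr)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;
	struct zoneref *z;
	struct zone *zone;
	struct page *page = NULL;

	zl = huge_zonelist(vma, addr, GFP_HIGHUSER, &mpol, &nodemask);
	for_each_zone_zonelist_nodemask(zone, z, zl,
					gfp_zone(GFP_HIGHUSER), nodemask) {
		/* ... try to pull a huge page off this zone's free list ... */
	}
	mpol_cond_put(mpol);	/* unrefs only if the policy was shared */
	return page;
}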
@@ -1321,9 +1495,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1321 | struct zonelist *zl; | 1495 | struct zonelist *zl; |
1322 | struct page *page; | 1496 | struct page *page; |
1323 | 1497 | ||
1324 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 1498 | zl = node_zonelist(nid, gfp); |
1325 | page = __alloc_pages(gfp, order, zl); | 1499 | page = __alloc_pages(gfp, order, zl); |
1326 | if (page && page_zone(page) == zl->zones[0]) | 1500 | if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) |
1327 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); | 1501 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); |
1328 | return page; | 1502 | return page; |
1329 | } | 1503 | } |
@@ -1358,28 +1532,27 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1358 | 1532 | ||
1359 | cpuset_update_task_memory_state(); | 1533 | cpuset_update_task_memory_state(); |
1360 | 1534 | ||
1361 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | 1535 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1362 | unsigned nid; | 1536 | unsigned nid; |
1363 | 1537 | ||
1364 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1538 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
1365 | if (unlikely(pol != &default_policy && | 1539 | mpol_cond_put(pol); |
1366 | pol != current->mempolicy)) | ||
1367 | __mpol_free(pol); /* finished with pol */ | ||
1368 | return alloc_page_interleave(gfp, 0, nid); | 1540 | return alloc_page_interleave(gfp, 0, nid); |
1369 | } | 1541 | } |
1370 | zl = zonelist_policy(gfp, pol); | 1542 | zl = policy_zonelist(gfp, pol); |
1371 | if (pol != &default_policy && pol != current->mempolicy) { | 1543 | if (unlikely(mpol_needs_cond_ref(pol))) { |
1372 | /* | 1544 | /* |
1373 | * slow path: ref counted policy -- shared or vma | 1545 | * slow path: ref counted shared policy |
1374 | */ | 1546 | */ |
1375 | struct page *page = __alloc_pages(gfp, 0, zl); | 1547 | struct page *page = __alloc_pages_nodemask(gfp, 0, |
1376 | __mpol_free(pol); | 1548 | zl, policy_nodemask(gfp, pol)); |
1549 | __mpol_put(pol); | ||
1377 | return page; | 1550 | return page; |
1378 | } | 1551 | } |
1379 | /* | 1552 | /* |
1380 | * fast path: default or task policy | 1553 | * fast path: default or task policy |
1381 | */ | 1554 | */ |
1382 | return __alloc_pages(gfp, 0, zl); | 1555 | return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); |
1383 | } | 1556 | } |
1384 | 1557 | ||
1385 | /** | 1558 | /** |
@@ -1409,22 +1582,28 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1409 | cpuset_update_task_memory_state(); | 1582 | cpuset_update_task_memory_state(); |
1410 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1583 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1411 | pol = &default_policy; | 1584 | pol = &default_policy; |
1412 | if (pol->policy == MPOL_INTERLEAVE) | 1585 | |
1586 | /* | ||
1587 | * No reference counting needed for current->mempolicy | ||
1588 | * nor system default_policy | ||
1589 | */ | ||
1590 | if (pol->mode == MPOL_INTERLEAVE) | ||
1413 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 1591 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); |
1414 | return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); | 1592 | return __alloc_pages_nodemask(gfp, order, |
1593 | policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); | ||
1415 | } | 1594 | } |
1416 | EXPORT_SYMBOL(alloc_pages_current); | 1595 | EXPORT_SYMBOL(alloc_pages_current); |
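On NUMA kernels the plain alloc_pages() wrapper resolves to alloc_pages_current(), so an ordinary allocation now flows through the policy zonelist/nodemask pair; a trivial sketch:

	struct page *page = alloc_pages(GFP_KERNEL, 0);

	if (page)
		__free_pages(page, 0);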
1417 | 1596 | ||
1418 | /* | 1597 | /* |
1419 | * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it | 1598 | * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it |
1420 | * rebinds the mempolicy it is copying by calling mpol_rebind_policy() | 1599 | * rebinds the mempolicy it is copying by calling mpol_rebind_policy() |
1421 | * with the mems_allowed returned by cpuset_mems_allowed(). This | 1600 | * with the mems_allowed returned by cpuset_mems_allowed(). This |
1422 | * keeps mempolicies cpuset relative after its cpuset moves. See | 1601 | * keeps mempolicies cpuset relative after its cpuset moves. See |
1423 | * further kernel/cpuset.c update_nodemask(). | 1602 | * further kernel/cpuset.c update_nodemask(). |
1424 | */ | 1603 | */ |
1425 | 1604 | ||
1426 | /* Slow path of a mempolicy copy */ | 1605 | /* Slow path of a mempolicy duplicate */ |
1427 | struct mempolicy *__mpol_copy(struct mempolicy *old) | 1606 | struct mempolicy *__mpol_dup(struct mempolicy *old) |
1428 | { | 1607 | { |
1429 | struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 1608 | struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
1430 | 1609 | ||
@@ -1436,55 +1615,64 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
1436 | } | 1615 | } |
1437 | *new = *old; | 1616 | *new = *old; |
1438 | atomic_set(&new->refcnt, 1); | 1617 | atomic_set(&new->refcnt, 1); |
1439 | if (new->policy == MPOL_BIND) { | ||
1440 | int sz = ksize(old->v.zonelist); | ||
1441 | new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); | ||
1442 | if (!new->v.zonelist) { | ||
1443 | kmem_cache_free(policy_cache, new); | ||
1444 | return ERR_PTR(-ENOMEM); | ||
1445 | } | ||
1446 | } | ||
1447 | return new; | 1618 | return new; |
1448 | } | 1619 | } |
1449 | 1620 | ||
1621 | /* | ||
1622 | * If *frompol needs [has] an extra ref, copy *frompol to *tompol, | ||
1623 | * eliminate the MPOL_F_* flags that require conditional ref and | ||
1624 | * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly | ||
1625 | * after return. Use the returned value. | ||
1626 | * | ||
1627 | * Allows use of a mempolicy for, e.g., multiple allocations with a single | ||
1628 | * policy lookup, even if the policy needs/has extra ref on lookup. | ||
1629 | * shmem_readahead needs this. | ||
1630 | */ | ||
1631 | struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | ||
1632 | struct mempolicy *frompol) | ||
1633 | { | ||
1634 | if (!mpol_needs_cond_ref(frompol)) | ||
1635 | return frompol; | ||
1636 | |||
1637 | *tompol = *frompol; | ||
1638 | tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ | ||
1639 | __mpol_put(frompol); | ||
1640 | return tompol; | ||
1641 | } | ||
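A sketch of the intended use of __mpol_cond_copy(), modeled on the shmem readahead case mentioned above; vma and addr are assumed to be in scope:

	struct mempolicy mpol_buf, *mpol;

	/* one policy lookup, then any number of allocations from 'mpol' */
	mpol = __mpol_cond_copy(&mpol_buf, get_vma_policy(current, vma, addr));
	/* no mpol_put() needed afterwards: any extra ref was dropped above */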
1642 | |||
1643 | static int mpol_match_intent(const struct mempolicy *a, | ||
1644 | const struct mempolicy *b) | ||
1645 | { | ||
1646 | if (a->flags != b->flags) | ||
1647 | return 0; | ||
1648 | if (!mpol_store_user_nodemask(a)) | ||
1649 | return 1; | ||
1650 | return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); | ||
1651 | } | ||
1652 | |||
1450 | /* Slow path of a mempolicy comparison */ | 1653 | /* Slow path of a mempolicy comparison */ |
1451 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | 1654 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) |
1452 | { | 1655 | { |
1453 | if (!a || !b) | 1656 | if (!a || !b) |
1454 | return 0; | 1657 | return 0; |
1455 | if (a->policy != b->policy) | 1658 | if (a->mode != b->mode) |
1456 | return 0; | 1659 | return 0; |
1457 | switch (a->policy) { | 1660 | if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) |
1458 | case MPOL_DEFAULT: | 1661 | return 0; |
1459 | return 1; | 1662 | switch (a->mode) { |
1663 | case MPOL_BIND: | ||
1664 | /* Fall through */ | ||
1460 | case MPOL_INTERLEAVE: | 1665 | case MPOL_INTERLEAVE: |
1461 | return nodes_equal(a->v.nodes, b->v.nodes); | 1666 | return nodes_equal(a->v.nodes, b->v.nodes); |
1462 | case MPOL_PREFERRED: | 1667 | case MPOL_PREFERRED: |
1463 | return a->v.preferred_node == b->v.preferred_node; | 1668 | return a->v.preferred_node == b->v.preferred_node && |
1464 | case MPOL_BIND: { | 1669 | a->flags == b->flags; |
1465 | int i; | ||
1466 | for (i = 0; a->v.zonelist->zones[i]; i++) | ||
1467 | if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) | ||
1468 | return 0; | ||
1469 | return b->v.zonelist->zones[i] == NULL; | ||
1470 | } | ||
1471 | default: | 1670 | default: |
1472 | BUG(); | 1671 | BUG(); |
1473 | return 0; | 1672 | return 0; |
1474 | } | 1673 | } |
1475 | } | 1674 | } |
1476 | 1675 | ||
1477 | /* Slow path of a mpol destructor. */ | ||
1478 | void __mpol_free(struct mempolicy *p) | ||
1479 | { | ||
1480 | if (!atomic_dec_and_test(&p->refcnt)) | ||
1481 | return; | ||
1482 | if (p->policy == MPOL_BIND) | ||
1483 | kfree(p->v.zonelist); | ||
1484 | p->policy = MPOL_DEFAULT; | ||
1485 | kmem_cache_free(policy_cache, p); | ||
1486 | } | ||
1487 | |||
1488 | /* | 1676 | /* |
1489 | * Shared memory backing store policy support. | 1677 | * Shared memory backing store policy support. |
1490 | * | 1678 | * |
@@ -1547,7 +1735,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) | |||
1547 | rb_link_node(&new->nd, parent, p); | 1735 | rb_link_node(&new->nd, parent, p); |
1548 | rb_insert_color(&new->nd, &sp->root); | 1736 | rb_insert_color(&new->nd, &sp->root); |
1549 | pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, | 1737 | pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, |
1550 | new->policy ? new->policy->policy : 0); | 1738 | new->policy ? new->policy->mode : 0); |
1551 | } | 1739 | } |
1552 | 1740 | ||
1553 | /* Find shared policy intersecting idx */ | 1741 | /* Find shared policy intersecting idx */ |
@@ -1573,7 +1761,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) | |||
1573 | { | 1761 | { |
1574 | pr_debug("deleting %lx-%lx\n", n->start, n->end); | 1762 | pr_debug("deleting %lx-%lx\n", n->start, n->end); |
1575 | rb_erase(&n->nd, &sp->root); | 1763 | rb_erase(&n->nd, &sp->root); |
1576 | mpol_free(n->policy); | 1764 | mpol_put(n->policy); |
1577 | kmem_cache_free(sn_cache, n); | 1765 | kmem_cache_free(sn_cache, n); |
1578 | } | 1766 | } |
1579 | 1767 | ||
@@ -1587,6 +1775,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | |||
1587 | n->start = start; | 1775 | n->start = start; |
1588 | n->end = end; | 1776 | n->end = end; |
1589 | mpol_get(pol); | 1777 | mpol_get(pol); |
1778 | pol->flags |= MPOL_F_SHARED; /* for unref */ | ||
1590 | n->policy = pol; | 1779 | n->policy = pol; |
1591 | return n; | 1780 | return n; |
1592 | } | 1781 | } |
@@ -1633,33 +1822,41 @@ restart: | |||
1633 | sp_insert(sp, new); | 1822 | sp_insert(sp, new); |
1634 | spin_unlock(&sp->lock); | 1823 | spin_unlock(&sp->lock); |
1635 | if (new2) { | 1824 | if (new2) { |
1636 | mpol_free(new2->policy); | 1825 | mpol_put(new2->policy); |
1637 | kmem_cache_free(sn_cache, new2); | 1826 | kmem_cache_free(sn_cache, new2); |
1638 | } | 1827 | } |
1639 | return 0; | 1828 | return 0; |
1640 | } | 1829 | } |
1641 | 1830 | ||
1642 | void mpol_shared_policy_init(struct shared_policy *info, int policy, | 1831 | /** |
1643 | nodemask_t *policy_nodes) | 1832 | * mpol_shared_policy_init - initialize shared policy for inode |
1644 | { | 1833 | * @sp: pointer to inode shared policy |
1645 | info->root = RB_ROOT; | 1834 | * @mpol: struct mempolicy to install |
1646 | spin_lock_init(&info->lock); | 1835 | * |
1647 | 1836 | * Install non-NULL @mpol in inode's shared policy rb-tree. | |
1648 | if (policy != MPOL_DEFAULT) { | 1837 | * On entry, the current task has a reference on a non-NULL @mpol. |
1649 | struct mempolicy *newpol; | 1838 | * This must be released on exit. |
1650 | 1839 | */ | |
1651 | /* Falls back to MPOL_DEFAULT on any error */ | 1840 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
1652 | newpol = mpol_new(policy, policy_nodes); | 1841 | { |
1653 | if (!IS_ERR(newpol)) { | 1842 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
1654 | /* Create pseudo-vma that contains just the policy */ | 1843 | spin_lock_init(&sp->lock); |
1655 | struct vm_area_struct pvma; | 1844 | |
1656 | 1845 | if (mpol) { | |
1657 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1846 | struct vm_area_struct pvma; |
1658 | /* Policy covers entire file */ | 1847 | struct mempolicy *new; |
1659 | pvma.vm_end = TASK_SIZE; | 1848 | |
1660 | mpol_set_shared_policy(info, &pvma, newpol); | 1849 | /* contextualize the tmpfs mount point mempolicy */ |
1661 | mpol_free(newpol); | 1850 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
1662 | } | 1851 | mpol_put(mpol); /* drop our ref on sb mpol */ |
1852 | if (IS_ERR(new)) | ||
1853 | return; /* no valid nodemask intersection */ | ||
1854 | |||
1855 | /* Create pseudo-vma that contains just the policy */ | ||
1856 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | ||
1857 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ | ||
1858 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ | ||
1859 | mpol_put(new); /* drop initial ref */ | ||
1663 | } | 1860 | } |
1664 | } | 1861 | } |
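A hypothetical caller shape for mpol_shared_policy_init(), tmpfs-like; 'info' and 'sbinfo' stand in for per-inode and per-superblock structures and are not names from this patch:

	/* hand the init routine its own reference; it is consumed inside */
	mpol_get(sbinfo->mpol);
	mpol_shared_policy_init(&info->policy, sbinfo->mpol);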
1665 | 1862 | ||
@@ -1670,9 +1867,10 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
1670 | struct sp_node *new = NULL; | 1867 | struct sp_node *new = NULL; |
1671 | unsigned long sz = vma_pages(vma); | 1868 | unsigned long sz = vma_pages(vma); |
1672 | 1869 | ||
1673 | pr_debug("set_shared_policy %lx sz %lu %d %lx\n", | 1870 | pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", |
1674 | vma->vm_pgoff, | 1871 | vma->vm_pgoff, |
1675 | sz, npol? npol->policy : -1, | 1872 | sz, npol ? npol->mode : -1, |
1873 | npol ? npol->flags : -1, | ||
1676 | npol ? nodes_addr(npol->v.nodes)[0] : -1); | 1874 | npol ? nodes_addr(npol->v.nodes)[0] : -1); |
1677 | 1875 | ||
1678 | if (npol) { | 1876 | if (npol) { |
@@ -1700,7 +1898,7 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
1700 | n = rb_entry(next, struct sp_node, nd); | 1898 | n = rb_entry(next, struct sp_node, nd); |
1701 | next = rb_next(&n->nd); | 1899 | next = rb_next(&n->nd); |
1702 | rb_erase(&n->nd, &p->root); | 1900 | rb_erase(&n->nd, &p->root); |
1703 | mpol_free(n->policy); | 1901 | mpol_put(n->policy); |
1704 | kmem_cache_free(sn_cache, n); | 1902 | kmem_cache_free(sn_cache, n); |
1705 | } | 1903 | } |
1706 | spin_unlock(&p->lock); | 1904 | spin_unlock(&p->lock); |
@@ -1745,120 +1943,177 @@ void __init numa_policy_init(void) | |||
1745 | if (unlikely(nodes_empty(interleave_nodes))) | 1943 | if (unlikely(nodes_empty(interleave_nodes))) |
1746 | node_set(prefer, interleave_nodes); | 1944 | node_set(prefer, interleave_nodes); |
1747 | 1945 | ||
1748 | if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) | 1946 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
1749 | printk("numa_policy_init: interleaving failed\n"); | 1947 | printk("numa_policy_init: interleaving failed\n"); |
1750 | } | 1948 | } |
1751 | 1949 | ||
1752 | /* Reset policy of current process to default */ | 1950 | /* Reset policy of current process to default */ |
1753 | void numa_default_policy(void) | 1951 | void numa_default_policy(void) |
1754 | { | 1952 | { |
1755 | do_set_mempolicy(MPOL_DEFAULT, NULL); | 1953 | do_set_mempolicy(MPOL_DEFAULT, 0, NULL); |
1756 | } | 1954 | } |
1757 | 1955 | ||
1758 | /* Migrate a policy to a different set of nodes */ | 1956 | /* |
1759 | static void mpol_rebind_policy(struct mempolicy *pol, | 1957 | * Parse and format mempolicy from/to strings |
1760 | const nodemask_t *newmask) | 1958 | */ |
1761 | { | ||
1762 | nodemask_t *mpolmask; | ||
1763 | nodemask_t tmp; | ||
1764 | 1959 | ||
1765 | if (!pol) | 1960 | /* |
1766 | return; | 1961 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
1767 | mpolmask = &pol->cpuset_mems_allowed; | 1962 | * Used only for mpol_parse_str() and mpol_to_str() |
1768 | if (nodes_equal(*mpolmask, *newmask)) | 1963 | */ |
1769 | return; | 1964 | #define MPOL_LOCAL (MPOL_INTERLEAVE + 1) |
1965 | static const char * const policy_types[] = | ||
1966 | { "default", "prefer", "bind", "interleave", "local" }; | ||
1770 | 1967 | ||
1771 | switch (pol->policy) { | ||
1772 | case MPOL_DEFAULT: | ||
1773 | break; | ||
1774 | case MPOL_INTERLEAVE: | ||
1775 | nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); | ||
1776 | pol->v.nodes = tmp; | ||
1777 | *mpolmask = *newmask; | ||
1778 | current->il_next = node_remap(current->il_next, | ||
1779 | *mpolmask, *newmask); | ||
1780 | break; | ||
1781 | case MPOL_PREFERRED: | ||
1782 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | ||
1783 | *mpolmask, *newmask); | ||
1784 | *mpolmask = *newmask; | ||
1785 | break; | ||
1786 | case MPOL_BIND: { | ||
1787 | nodemask_t nodes; | ||
1788 | struct zone **z; | ||
1789 | struct zonelist *zonelist; | ||
1790 | 1968 | ||
1969 | #ifdef CONFIG_TMPFS | ||
1970 | /** | ||
1971 | * mpol_parse_str - parse string to mempolicy | ||
1972 | * @str: string containing mempolicy to parse | ||
1973 | * @mpol: pointer to struct mempolicy pointer, returned on success. | ||
1974 | * @no_context: flag whether to "contextualize" the mempolicy | ||
1975 | * | ||
1976 | * Format of input: | ||
1977 | * <mode>[=<flags>][:<nodelist>] | ||
1978 | * | ||
1979 | * if @no_context is true, save the input nodemask in w.user_nodemask in | ||
1980 | * the returned mempolicy. This will be used to "clone" the mempolicy in | ||
1981 | * a specific context [cpuset] at a later time. Used to parse tmpfs mpol | ||
1982 | * mount option. Note that if 'static' or 'relative' mode flags were | ||
1983 | * specified, the input nodemask will already have been saved. Saving | ||
1984 | * it again is redundant, but safe. | ||
1985 | * | ||
1986 | * On success, returns 0, else 1 | ||
1987 | */ | ||
1988 | int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | ||
1989 | { | ||
1990 | struct mempolicy *new = NULL; | ||
1991 | unsigned short uninitialized_var(mode); | ||
1992 | unsigned short uninitialized_var(mode_flags); | ||
1993 | nodemask_t nodes; | ||
1994 | char *nodelist = strchr(str, ':'); | ||
1995 | char *flags = strchr(str, '='); | ||
1996 | int i; | ||
1997 | int err = 1; | ||
1998 | |||
1999 | if (nodelist) { | ||
2000 | /* NUL-terminate mode or flags string */ | ||
2001 | *nodelist++ = '\0'; | ||
2002 | if (nodelist_parse(nodelist, nodes)) | ||
2003 | goto out; | ||
2004 | if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) | ||
2005 | goto out; | ||
2006 | } else | ||
1791 | nodes_clear(nodes); | 2007 | nodes_clear(nodes); |
1792 | for (z = pol->v.zonelist->zones; *z; z++) | ||
1793 | node_set(zone_to_nid(*z), nodes); | ||
1794 | nodes_remap(tmp, nodes, *mpolmask, *newmask); | ||
1795 | nodes = tmp; | ||
1796 | 2008 | ||
1797 | zonelist = bind_zonelist(&nodes); | 2009 | if (flags) |
2010 | *flags++ = '\0'; /* terminate mode string */ | ||
1798 | 2011 | ||
1799 | /* If no mem, then zonelist is NULL and we keep old zonelist. | 2012 | for (i = 0; i <= MPOL_LOCAL; i++) { |
1800 | * If that old zonelist has no remaining mems_allowed nodes, | 2013 | if (!strcmp(str, policy_types[i])) { |
1801 | * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. | 2014 | mode = i; |
1802 | */ | 2015 | break; |
2016 | } | ||
2017 | } | ||
2018 | if (i > MPOL_LOCAL) | ||
2019 | goto out; | ||
1803 | 2020 | ||
1804 | if (!IS_ERR(zonelist)) { | 2021 | switch (mode) { |
1805 | /* Good - got mem - substitute new zonelist */ | 2022 | case MPOL_PREFERRED: |
1806 | kfree(pol->v.zonelist); | 2023 | /* |
1807 | pol->v.zonelist = zonelist; | 2024 | * Insist on a nodelist of one node only |
2025 | */ | ||
2026 | if (nodelist) { | ||
2027 | char *rest = nodelist; | ||
2028 | while (isdigit(*rest)) | ||
2029 | rest++; | ||
2030 | if (!*rest) | ||
2031 | err = 0; | ||
1808 | } | 2032 | } |
1809 | *mpolmask = *newmask; | ||
1810 | break; | 2033 | break; |
1811 | } | 2034 | case MPOL_INTERLEAVE: |
1812 | default: | 2035 | /* |
1813 | BUG(); | 2036 | * Default to online nodes with memory if no nodelist |
2037 | */ | ||
2038 | if (!nodelist) | ||
2039 | nodes = node_states[N_HIGH_MEMORY]; | ||
2040 | err = 0; | ||
2041 | break; | ||
2042 | case MPOL_LOCAL: | ||
2043 | /* | ||
2044 | * Don't allow a nodelist; mpol_new() checks flags | ||
2045 | */ | ||
2046 | if (nodelist) | ||
2047 | goto out; | ||
2048 | mode = MPOL_PREFERRED; | ||
1814 | break; | 2049 | break; |
1815 | } | ||
1816 | } | ||
1817 | |||
1818 | /* | ||
1819 | * Wrapper for mpol_rebind_policy() that just requires task | ||
1820 | * pointer, and updates task mempolicy. | ||
1821 | */ | ||
1822 | 2050 | ||
1823 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 2051 | /* |
1824 | { | 2052 | * case MPOL_BIND: mpol_new() enforces non-empty nodemask. |
1825 | mpol_rebind_policy(tsk->mempolicy, new); | 2053 | * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. |
1826 | } | 2054 | */ |
2055 | } | ||
1827 | 2056 | ||
1828 | /* | 2057 | mode_flags = 0; |
1829 | * Rebind each vma in mm to new nodemask. | 2058 | if (flags) { |
1830 | * | 2059 | /* |
1831 | * Call holding a reference to mm. Takes mm->mmap_sem during call. | 2060 | * Currently, we only support two mutually exclusive |
1832 | */ | 2061 | * mode flags. |
2062 | */ | ||
2063 | if (!strcmp(flags, "static")) | ||
2064 | mode_flags |= MPOL_F_STATIC_NODES; | ||
2065 | else if (!strcmp(flags, "relative")) | ||
2066 | mode_flags |= MPOL_F_RELATIVE_NODES; | ||
2067 | else | ||
2068 | err = 1; | ||
2069 | } | ||
1833 | 2070 | ||
1834 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | 2071 | new = mpol_new(mode, mode_flags, &nodes); |
1835 | { | 2072 | if (IS_ERR(new)) |
1836 | struct vm_area_struct *vma; | 2073 | err = 1; |
2074 | else if (no_context) | ||
2075 | new->w.user_nodemask = nodes; /* save for contextualization */ | ||
1837 | 2076 | ||
1838 | down_write(&mm->mmap_sem); | 2077 | out: |
1839 | for (vma = mm->mmap; vma; vma = vma->vm_next) | 2078 | /* Restore string for error message */ |
1840 | mpol_rebind_policy(vma->vm_policy, new); | 2079 | if (nodelist) |
1841 | up_write(&mm->mmap_sem); | 2080 | *--nodelist = ':'; |
2081 | if (flags) | ||
2082 | *--flags = '='; | ||
2083 | if (!err) | ||
2084 | *mpol = new; | ||
2085 | return err; | ||
1842 | } | 2086 | } |
2087 | #endif /* CONFIG_TMPFS */ | ||
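For illustration, parsing a tmpfs-style mount-option string with the new helper; the string value and node range are made up:

	char str[] = "interleave=static:0-3";	/* <mode>[=<flags>][:<nodelist>] */
	struct mempolicy *new;

	if (!mpol_parse_str(str, &new, 1)) {	/* no_context: keep user nodemask */
		/* new->mode == MPOL_INTERLEAVE with MPOL_F_STATIC_NODES, nodes 0-3 */
		mpol_put(new);			/* drop the reference when done */
	}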
1843 | 2088 | ||
1844 | /* | 2089 | /** |
1845 | * Display pages allocated per node and memory policy via /proc. | 2090 | * mpol_to_str - format a mempolicy structure for printing |
1846 | */ | 2091 | * @buffer: to contain formatted mempolicy string |
1847 | 2092 | * @maxlen: length of @buffer | |
1848 | static const char * const policy_types[] = | 2093 | * @pol: pointer to mempolicy to be formatted |
1849 | { "default", "prefer", "bind", "interleave" }; | 2094 | * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask |
1850 | 2095 | * | |
1851 | /* | ||
1852 | * Convert a mempolicy into a string. | 2096 | * Convert a mempolicy into a string. |
1853 | * Returns the number of characters in buffer (if positive) | 2097 | * Returns the number of characters in buffer (if positive) |
1854 | * or an error (negative) | 2098 | * or an error (negative) |
1855 | */ | 2099 | */ |
1856 | static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | 2100 | int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) |
1857 | { | 2101 | { |
1858 | char *p = buffer; | 2102 | char *p = buffer; |
1859 | int l; | 2103 | int l; |
1860 | nodemask_t nodes; | 2104 | nodemask_t nodes; |
1861 | int mode = pol ? pol->policy : MPOL_DEFAULT; | 2105 | unsigned short mode; |
2106 | unsigned short flags = pol ? pol->flags : 0; | ||
2107 | |||
2108 | /* | ||
2109 | * Sanity check: room for longest mode, flag and some nodes | ||
2110 | */ | ||
2111 | VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); | ||
2112 | |||
2113 | if (!pol || pol == &default_policy) | ||
2114 | mode = MPOL_DEFAULT; | ||
2115 | else | ||
2116 | mode = pol->mode; | ||
1862 | 2117 | ||
1863 | switch (mode) { | 2118 | switch (mode) { |
1864 | case MPOL_DEFAULT: | 2119 | case MPOL_DEFAULT: |
@@ -1867,33 +2122,50 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
1867 | 2122 | ||
1868 | case MPOL_PREFERRED: | 2123 | case MPOL_PREFERRED: |
1869 | nodes_clear(nodes); | 2124 | nodes_clear(nodes); |
1870 | node_set(pol->v.preferred_node, nodes); | 2125 | if (flags & MPOL_F_LOCAL) |
2126 | mode = MPOL_LOCAL; /* pseudo-policy */ | ||
2127 | else | ||
2128 | node_set(pol->v.preferred_node, nodes); | ||
1871 | break; | 2129 | break; |
1872 | 2130 | ||
1873 | case MPOL_BIND: | 2131 | case MPOL_BIND: |
1874 | get_zonemask(pol, &nodes); | 2132 | /* Fall through */ |
1875 | break; | ||
1876 | |||
1877 | case MPOL_INTERLEAVE: | 2133 | case MPOL_INTERLEAVE: |
1878 | nodes = pol->v.nodes; | 2134 | if (no_context) |
2135 | nodes = pol->w.user_nodemask; | ||
2136 | else | ||
2137 | nodes = pol->v.nodes; | ||
1879 | break; | 2138 | break; |
1880 | 2139 | ||
1881 | default: | 2140 | default: |
1882 | BUG(); | 2141 | BUG(); |
1883 | return -EFAULT; | ||
1884 | } | 2142 | } |
1885 | 2143 | ||
1886 | l = strlen(policy_types[mode]); | 2144 | l = strlen(policy_types[mode]); |
1887 | if (buffer + maxlen < p + l + 1) | 2145 | if (buffer + maxlen < p + l + 1) |
1888 | return -ENOSPC; | 2146 | return -ENOSPC; |
1889 | 2147 | ||
1890 | strcpy(p, policy_types[mode]); | 2148 | strcpy(p, policy_types[mode]); |
1891 | p += l; | 2149 | p += l; |
1892 | 2150 | ||
1893 | if (!nodes_empty(nodes)) { | 2151 | if (flags & MPOL_MODE_FLAGS) { |
1894 | if (buffer + maxlen < p + 2) | 2152 | if (buffer + maxlen < p + 2) |
1895 | return -ENOSPC; | 2153 | return -ENOSPC; |
1896 | *p++ = '='; | 2154 | *p++ = '='; |
2155 | |||
2156 | /* | ||
2157 | * Currently, the only defined flags are mutually exclusive | ||
2158 | */ | ||
2159 | if (flags & MPOL_F_STATIC_NODES) | ||
2160 | p += snprintf(p, buffer + maxlen - p, "static"); | ||
2161 | else if (flags & MPOL_F_RELATIVE_NODES) | ||
2162 | p += snprintf(p, buffer + maxlen - p, "relative"); | ||
2163 | } | ||
2164 | |||
2165 | if (!nodes_empty(nodes)) { | ||
2166 | if (buffer + maxlen < p + 2) | ||
2167 | return -ENOSPC; | ||
2168 | *p++ = ':'; | ||
1897 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2169 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); |
1898 | } | 2170 | } |
1899 | return p - buffer; | 2171 | return p - buffer; |
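Conversely, a minimal sketch of formatting a policy back into a string; the buffer size is chosen to satisfy the sanity check above:

	char buf[64];

	if (mpol_to_str(buf, sizeof(buf), current->mempolicy, 0) > 0)
		printk(KERN_DEBUG "mempolicy: %s\n", buf);	/* e.g. "interleave:0-3" */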
@@ -1971,6 +2243,9 @@ static inline void check_huge_range(struct vm_area_struct *vma, | |||
1971 | } | 2243 | } |
1972 | #endif | 2244 | #endif |
1973 | 2245 | ||
2246 | /* | ||
2247 | * Display pages allocated per node and memory policy via /proc. | ||
2248 | */ | ||
1974 | int show_numa_map(struct seq_file *m, void *v) | 2249 | int show_numa_map(struct seq_file *m, void *v) |
1975 | { | 2250 | { |
1976 | struct proc_maps_private *priv = m->private; | 2251 | struct proc_maps_private *priv = m->private; |
@@ -1990,12 +2265,8 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1990 | return 0; | 2265 | return 0; |
1991 | 2266 | ||
1992 | pol = get_vma_policy(priv->task, vma, vma->vm_start); | 2267 | pol = get_vma_policy(priv->task, vma, vma->vm_start); |
1993 | mpol_to_str(buffer, sizeof(buffer), pol); | 2268 | mpol_to_str(buffer, sizeof(buffer), pol, 0); |
1994 | /* | 2269 | mpol_cond_put(pol); |
1995 | * unref shared or other task's mempolicy | ||
1996 | */ | ||
1997 | if (pol != &default_policy && pol != current->mempolicy) | ||
1998 | __mpol_free(pol); | ||
1999 | 2270 | ||
2000 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 2271 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
2001 | 2272 | ||
diff --git a/mm/mincore.c b/mm/mincore.c index 5efe0ded69b1..5178800bc129 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -33,7 +33,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
33 | * When tmpfs swaps out a page from a file, any process mapping that | 33 | * When tmpfs swaps out a page from a file, any process mapping that |
34 | * file will not get a swp_entry_t in its pte, but rather it is like | 34 | * file will not get a swp_entry_t in its pte, but rather it is like |
35 | * any other file mapping (ie. marked !present and faulted in with | 35 | * any other file mapping (ie. marked !present and faulted in with |
36 | * tmpfs's .nopage). So swapped out tmpfs mappings are tested here. | 36 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. |
37 | * | 37 | * |
38 | * However when tmpfs moves the page from pagecache and into swapcache, | 38 | * However when tmpfs moves the page from pagecache and into swapcache, |
39 | * it is still in core, but the find_get_page below won't find it. | 39 | * it is still in core, but the find_get_page below won't find it. |
@@ -232,7 +232,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
232 | vma->vm_ops->close(vma); | 232 | vma->vm_ops->close(vma); |
233 | if (vma->vm_file) | 233 | if (vma->vm_file) |
234 | fput(vma->vm_file); | 234 | fput(vma->vm_file); |
235 | mpol_free(vma_policy(vma)); | 235 | mpol_put(vma_policy(vma)); |
236 | kmem_cache_free(vm_area_cachep, vma); | 236 | kmem_cache_free(vm_area_cachep, vma); |
237 | return next; | 237 | return next; |
238 | } | 238 | } |
@@ -626,7 +626,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
626 | if (file) | 626 | if (file) |
627 | fput(file); | 627 | fput(file); |
628 | mm->map_count--; | 628 | mm->map_count--; |
629 | mpol_free(vma_policy(next)); | 629 | mpol_put(vma_policy(next)); |
630 | kmem_cache_free(vm_area_cachep, next); | 630 | kmem_cache_free(vm_area_cachep, next); |
631 | /* | 631 | /* |
632 | * In mprotect's case 6 (see comments on vma_merge), | 632 | * In mprotect's case 6 (see comments on vma_merge), |
@@ -1068,7 +1068,6 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1068 | mapping_cap_account_dirty(vma->vm_file->f_mapping); | 1068 | mapping_cap_account_dirty(vma->vm_file->f_mapping); |
1069 | } | 1069 | } |
1070 | 1070 | ||
1071 | |||
1072 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1071 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1073 | unsigned long len, unsigned long flags, | 1072 | unsigned long len, unsigned long flags, |
1074 | unsigned int vm_flags, unsigned long pgoff, | 1073 | unsigned int vm_flags, unsigned long pgoff, |
@@ -1181,22 +1180,20 @@ munmap_back: | |||
1181 | if (vma_wants_writenotify(vma)) | 1180 | if (vma_wants_writenotify(vma)) |
1182 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1181 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
1183 | 1182 | ||
1184 | if (!file || !vma_merge(mm, prev, addr, vma->vm_end, | 1183 | if (file && vma_merge(mm, prev, addr, vma->vm_end, |
1185 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { | 1184 | vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { |
1186 | file = vma->vm_file; | 1185 | mpol_put(vma_policy(vma)); |
1187 | vma_link(mm, vma, prev, rb_link, rb_parent); | ||
1188 | if (correct_wcount) | ||
1189 | atomic_inc(&inode->i_writecount); | ||
1190 | } else { | ||
1191 | if (file) { | ||
1192 | if (correct_wcount) | ||
1193 | atomic_inc(&inode->i_writecount); | ||
1194 | fput(file); | ||
1195 | } | ||
1196 | mpol_free(vma_policy(vma)); | ||
1197 | kmem_cache_free(vm_area_cachep, vma); | 1186 | kmem_cache_free(vm_area_cachep, vma); |
1187 | fput(file); | ||
1188 | } else { | ||
1189 | vma_link(mm, vma, prev, rb_link, rb_parent); | ||
1190 | file = vma->vm_file; | ||
1198 | } | 1191 | } |
1199 | out: | 1192 | |
1193 | /* Once vma denies write, undo our temporary denial count */ | ||
1194 | if (correct_wcount) | ||
1195 | atomic_inc(&inode->i_writecount); | ||
1196 | out: | ||
1200 | mm->total_vm += len >> PAGE_SHIFT; | 1197 | mm->total_vm += len >> PAGE_SHIFT; |
1201 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1198 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1202 | if (vm_flags & VM_LOCKED) { | 1199 | if (vm_flags & VM_LOCKED) { |
@@ -1813,7 +1810,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1813 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); | 1810 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); |
1814 | } | 1811 | } |
1815 | 1812 | ||
1816 | pol = mpol_copy(vma_policy(vma)); | 1813 | pol = mpol_dup(vma_policy(vma)); |
1817 | if (IS_ERR(pol)) { | 1814 | if (IS_ERR(pol)) { |
1818 | kmem_cache_free(vm_area_cachep, new); | 1815 | kmem_cache_free(vm_area_cachep, new); |
1819 | return PTR_ERR(pol); | 1816 | return PTR_ERR(pol); |
@@ -2129,7 +2126,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2129 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2126 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2130 | if (new_vma) { | 2127 | if (new_vma) { |
2131 | *new_vma = *vma; | 2128 | *new_vma = *vma; |
2132 | pol = mpol_copy(vma_policy(vma)); | 2129 | pol = mpol_dup(vma_policy(vma)); |
2133 | if (IS_ERR(pol)) { | 2130 | if (IS_ERR(pol)) { |
2134 | kmem_cache_free(vm_area_cachep, new_vma); | 2131 | kmem_cache_free(vm_area_cachep, new_vma); |
2135 | return NULL; | 2132 | return NULL; |
diff --git a/mm/mmzone.c b/mm/mmzone.c index eb5838634f18..486ed595ee6f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone) | |||
42 | return zone; | 42 | return zone; |
43 | } | 43 | } |
44 | 44 | ||
45 | static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) | ||
46 | { | ||
47 | #ifdef CONFIG_NUMA | ||
48 | return node_isset(zonelist_node_idx(zref), *nodes); | ||
49 | #else | ||
50 | return 1; | ||
51 | #endif /* CONFIG_NUMA */ | ||
52 | } | ||
53 | |||
54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ | ||
55 | struct zoneref *next_zones_zonelist(struct zoneref *z, | ||
56 | enum zone_type highest_zoneidx, | ||
57 | nodemask_t *nodes, | ||
58 | struct zone **zone) | ||
59 | { | ||
60 | /* | ||
61 | * Find the next suitable zone to use for the allocation. | ||
62 | * Only filter based on nodemask if it's set | ||
63 | */ | ||
64 | if (likely(nodes == NULL)) | ||
65 | while (zonelist_zone_idx(z) > highest_zoneidx) | ||
66 | z++; | ||
67 | else | ||
68 | while (zonelist_zone_idx(z) > highest_zoneidx || | ||
69 | (z->zone && !zref_in_nodemask(z, nodes))) | ||
70 | z++; | ||
71 | |||
72 | *zone = zonelist_zone(z++); | ||
73 | return z; | ||
74 | } | ||
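next_zones_zonelist() is not meant to be called directly; the zonelist walkers used elsewhere in this series expand to roughly the following, with the zonelist variable assumed in scope:

	struct zoneref *z;
	struct zone *zone;
	nodemask_t *nodes = NULL;	/* NULL: no nodemask filtering */

	for (z = first_zones_zonelist(zonelist, gfp_zone(GFP_KERNEL), nodes, &zone);
	     zone;
	     z = next_zones_zonelist(z, gfp_zone(GFP_KERNEL), nodes, &zone))
		;	/* visits each zone at or below the requested index */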
diff --git a/mm/nommu.c b/mm/nommu.c index 5d8ae086f74e..1d32fe89d57b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -105,7 +105,11 @@ unsigned int kobjsize(const void *objp) | |||
105 | { | 105 | { |
106 | struct page *page; | 106 | struct page *page; |
107 | 107 | ||
108 | if (!objp || !((page = virt_to_page(objp)))) | 108 | /* |
109 | * If the object we have should not have ksize performed on it, | ||
110 | * return size of 0 | ||
111 | */ | ||
112 | if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp)))) | ||
109 | return 0; | 113 | return 0; |
110 | 114 | ||
111 | if (PageSlab(page)) | 115 | if (PageSlab(page)) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index beb592fe9389..8a5467ee6265 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -53,8 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex); | |||
53 | * of least surprise ... (be careful when you change it) | 53 | * of least surprise ... (be careful when you change it) |
54 | */ | 54 | */ |
55 | 55 | ||
56 | unsigned long badness(struct task_struct *p, unsigned long uptime, | 56 | unsigned long badness(struct task_struct *p, unsigned long uptime) |
57 | struct mem_cgroup *mem) | ||
58 | { | 57 | { |
59 | unsigned long points, cpu_time, run_time, s; | 58 | unsigned long points, cpu_time, run_time, s; |
60 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
@@ -175,12 +174,14 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
175 | gfp_t gfp_mask) | 174 | gfp_t gfp_mask) |
176 | { | 175 | { |
177 | #ifdef CONFIG_NUMA | 176 | #ifdef CONFIG_NUMA |
178 | struct zone **z; | 177 | struct zone *zone; |
178 | struct zoneref *z; | ||
179 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
179 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; | 180 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; |
180 | 181 | ||
181 | for (z = zonelist->zones; *z; z++) | 182 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
182 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) | 183 | if (cpuset_zone_allowed_softwall(zone, gfp_mask)) |
183 | node_clear(zone_to_nid(*z), nodes); | 184 | node_clear(zone_to_nid(zone), nodes); |
184 | else | 185 | else |
185 | return CONSTRAINT_CPUSET; | 186 | return CONSTRAINT_CPUSET; |
186 | 187 | ||
@@ -254,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
254 | if (p->oomkilladj == OOM_DISABLE) | 255 | if (p->oomkilladj == OOM_DISABLE) |
255 | continue; | 256 | continue; |
256 | 257 | ||
257 | points = badness(p, uptime.tv_sec, mem); | 258 | points = badness(p, uptime.tv_sec); |
258 | if (points > *ppoints || !chosen) { | 259 | if (points > *ppoints || !chosen) { |
259 | chosen = p; | 260 | chosen = p; |
260 | *ppoints = points; | 261 | *ppoints = points; |
@@ -460,29 +461,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); | |||
460 | * if a parallel OOM killing is already taking place that includes a zone in | 461 | * if a parallel OOM killing is already taking place that includes a zone in |
461 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. | 462 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. |
462 | */ | 463 | */ |
463 | int try_set_zone_oom(struct zonelist *zonelist) | 464 | int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) |
464 | { | 465 | { |
465 | struct zone **z; | 466 | struct zoneref *z; |
467 | struct zone *zone; | ||
466 | int ret = 1; | 468 | int ret = 1; |
467 | 469 | ||
468 | z = zonelist->zones; | ||
469 | |||
470 | spin_lock(&zone_scan_mutex); | 470 | spin_lock(&zone_scan_mutex); |
471 | do { | 471 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
472 | if (zone_is_oom_locked(*z)) { | 472 | if (zone_is_oom_locked(zone)) { |
473 | ret = 0; | 473 | ret = 0; |
474 | goto out; | 474 | goto out; |
475 | } | 475 | } |
476 | } while (*(++z) != NULL); | 476 | } |
477 | |||
478 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | ||
479 | /* | ||
480 | * Lock each zone in the zonelist under zone_scan_mutex so a | ||
481 | * parallel invocation of try_set_zone_oom() doesn't succeed | ||
482 | * when it shouldn't. | ||
483 | */ | ||
484 | zone_set_flag(zone, ZONE_OOM_LOCKED); | ||
485 | } | ||
477 | 486 | ||
478 | /* | ||
479 | * Lock each zone in the zonelist under zone_scan_mutex so a parallel | ||
480 | * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. | ||
481 | */ | ||
482 | z = zonelist->zones; | ||
483 | do { | ||
484 | zone_set_flag(*z, ZONE_OOM_LOCKED); | ||
485 | } while (*(++z) != NULL); | ||
486 | out: | 487 | out: |
487 | spin_unlock(&zone_scan_mutex); | 488 | spin_unlock(&zone_scan_mutex); |
488 | return ret; | 489 | return ret; |
@@ -493,16 +494,15 @@ out: | |||
493 | * allocation attempts with zonelists containing them may now recall the OOM | 494 | * allocation attempts with zonelists containing them may now recall the OOM |
494 | * killer, if necessary. | 495 | * killer, if necessary. |
495 | */ | 496 | */ |
496 | void clear_zonelist_oom(struct zonelist *zonelist) | 497 | void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) |
497 | { | 498 | { |
498 | struct zone **z; | 499 | struct zoneref *z; |
499 | 500 | struct zone *zone; | |
500 | z = zonelist->zones; | ||
501 | 501 | ||
502 | spin_lock(&zone_scan_mutex); | 502 | spin_lock(&zone_scan_mutex); |
503 | do { | 503 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
504 | zone_clear_flag(*z, ZONE_OOM_LOCKED); | 504 | zone_clear_flag(zone, ZONE_OOM_LOCKED); |
505 | } while (*(++z) != NULL); | 505 | } |
506 | spin_unlock(&zone_scan_mutex); | 506 | spin_unlock(&zone_scan_mutex); |
507 | } | 507 | } |
508 | 508 | ||
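The expected pairing at the allocator's OOM call site looks roughly like this, simplified from the __alloc_pages changes later in this series:

	if (try_set_zone_oom(zonelist, gfp_mask)) {
		out_of_memory(zonelist, gfp_mask, order);
		clear_zonelist_oom(zonelist, gfp_mask);
	}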
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32e796af12a1..d1cf4f05dcda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -546,7 +546,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
546 | /* | 546 | /* |
547 | * permit the bootmem allocator to evade page validation on high-order frees | 547 | * permit the bootmem allocator to evade page validation on high-order frees |
548 | */ | 548 | */ |
549 | void __init __free_pages_bootmem(struct page *page, unsigned int order) | 549 | void __free_pages_bootmem(struct page *page, unsigned int order) |
550 | { | 550 | { |
551 | if (order == 0) { | 551 | if (order == 0) { |
552 | __ClearPageReserved(page); | 552 | __ClearPageReserved(page); |
@@ -632,7 +632,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
632 | if (PageReserved(page)) | 632 | if (PageReserved(page)) |
633 | return 1; | 633 | return 1; |
634 | 634 | ||
635 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | | 635 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | |
636 | 1 << PG_referenced | 1 << PG_arch_1 | | 636 | 1 << PG_referenced | 1 << PG_arch_1 | |
637 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); | 637 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); |
638 | set_page_private(page, 0); | 638 | set_page_private(page, 0); |
@@ -1050,7 +1050,7 @@ void split_page(struct page *page, unsigned int order) | |||
1050 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1050 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1051 | * or two. | 1051 | * or two. |
1052 | */ | 1052 | */ |
1053 | static struct page *buffered_rmqueue(struct zonelist *zonelist, | 1053 | static struct page *buffered_rmqueue(struct zone *preferred_zone, |
1054 | struct zone *zone, int order, gfp_t gfp_flags) | 1054 | struct zone *zone, int order, gfp_t gfp_flags) |
1055 | { | 1055 | { |
1056 | unsigned long flags; | 1056 | unsigned long flags; |
@@ -1102,7 +1102,7 @@ again: | |||
1102 | } | 1102 | } |
1103 | 1103 | ||
1104 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1104 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1105 | zone_statistics(zonelist, zone); | 1105 | zone_statistics(preferred_zone, zone); |
1106 | local_irq_restore(flags); | 1106 | local_irq_restore(flags); |
1107 | put_cpu(); | 1107 | put_cpu(); |
1108 | 1108 | ||
@@ -1284,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1284 | if (!zlc) | 1284 | if (!zlc) |
1285 | return NULL; | 1285 | return NULL; |
1286 | 1286 | ||
1287 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { | 1287 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { |
1288 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1288 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1289 | zlc->last_full_zap = jiffies; | 1289 | zlc->last_full_zap = jiffies; |
1290 | } | 1290 | } |
@@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1317 | * We are low on memory in the second scan, and should leave no stone | 1317 | * We are low on memory in the second scan, and should leave no stone |
1318 | * unturned looking for a free page. | 1318 | * unturned looking for a free page. |
1319 | */ | 1319 | */ |
1320 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | 1320 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1321 | nodemask_t *allowednodes) | 1321 | nodemask_t *allowednodes) |
1322 | { | 1322 | { |
1323 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1323 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
@@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | |||
1328 | if (!zlc) | 1328 | if (!zlc) |
1329 | return 1; | 1329 | return 1; |
1330 | 1330 | ||
1331 | i = z - zonelist->zones; | 1331 | i = z - zonelist->_zonerefs; |
1332 | n = zlc->z_to_n[i]; | 1332 | n = zlc->z_to_n[i]; |
1333 | 1333 | ||
1334 | /* This zone is worth trying if it is allowed but not full */ | 1334 | /* This zone is worth trying if it is allowed but not full */ |
@@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | |||
1340 | * zlc->fullzones, so that subsequent attempts to allocate a page | 1340 | * zlc->fullzones, so that subsequent attempts to allocate a page |
1341 | * from that zone don't waste time re-examining it. | 1341 | * from that zone don't waste time re-examining it. |
1342 | */ | 1342 | */ |
1343 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | 1343 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1344 | { | 1344 | { |
1345 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1345 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1346 | int i; /* index of *z in zonelist zones */ | 1346 | int i; /* index of *z in zonelist zones */ |
@@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | |||
1349 | if (!zlc) | 1349 | if (!zlc) |
1350 | return; | 1350 | return; |
1351 | 1351 | ||
1352 | i = z - zonelist->zones; | 1352 | i = z - zonelist->_zonerefs; |
1353 | 1353 | ||
1354 | set_bit(i, zlc->fullzones); | 1354 | set_bit(i, zlc->fullzones); |
1355 | } | 1355 | } |
@@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1361 | return NULL; | 1361 | return NULL; |
1362 | } | 1362 | } |
1363 | 1363 | ||
1364 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | 1364 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1365 | nodemask_t *allowednodes) | 1365 | nodemask_t *allowednodes) |
1366 | { | 1366 | { |
1367 | return 1; | 1367 | return 1; |
1368 | } | 1368 | } |
1369 | 1369 | ||
1370 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | 1370 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1371 | { | 1371 | { |
1372 | } | 1372 | } |
1373 | #endif /* CONFIG_NUMA */ | 1373 | #endif /* CONFIG_NUMA */ |
@@ -1377,42 +1377,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | |||
1377 | * a page. | 1377 | * a page. |
1378 | */ | 1378 | */ |
1379 | static struct page * | 1379 | static struct page * |
1380 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1380 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1381 | struct zonelist *zonelist, int alloc_flags) | 1381 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) |
1382 | { | 1382 | { |
1383 | struct zone **z; | 1383 | struct zoneref *z; |
1384 | struct page *page = NULL; | 1384 | struct page *page = NULL; |
1385 | int classzone_idx = zone_idx(zonelist->zones[0]); | 1385 | int classzone_idx; |
1386 | struct zone *zone; | 1386 | struct zone *zone, *preferred_zone; |
1387 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1387 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1388 | int zlc_active = 0; /* set if using zonelist_cache */ | 1388 | int zlc_active = 0; /* set if using zonelist_cache */ |
1389 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1389 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1390 | enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ | 1390 | |
1391 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | ||
1392 | &preferred_zone); | ||
1393 | classzone_idx = zone_idx(preferred_zone); | ||
1391 | 1394 | ||
1392 | zonelist_scan: | 1395 | zonelist_scan: |
1393 | /* | 1396 | /* |
1394 | * Scan zonelist, looking for a zone with enough free. | 1397 | * Scan zonelist, looking for a zone with enough free. |
1395 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1398 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1396 | */ | 1399 | */ |
1397 | z = zonelist->zones; | 1400 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1398 | 1401 | high_zoneidx, nodemask) { | |
1399 | do { | ||
1400 | /* | ||
1401 | * In NUMA, this could be a policy zonelist which contains | ||
1402 | * zones that may not be allowed by the current gfp_mask. | ||
1403 | * Check the zone is allowed by the current flags | ||
1404 | */ | ||
1405 | if (unlikely(alloc_should_filter_zonelist(zonelist))) { | ||
1406 | if (highest_zoneidx == -1) | ||
1407 | highest_zoneidx = gfp_zone(gfp_mask); | ||
1408 | if (zone_idx(*z) > highest_zoneidx) | ||
1409 | continue; | ||
1410 | } | ||
1411 | |||
1412 | if (NUMA_BUILD && zlc_active && | 1402 | if (NUMA_BUILD && zlc_active && |
1413 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1403 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1414 | continue; | 1404 | continue; |
1415 | zone = *z; | ||
1416 | if ((alloc_flags & ALLOC_CPUSET) && | 1405 | if ((alloc_flags & ALLOC_CPUSET) && |
1417 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1406 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1418 | goto try_next_zone; | 1407 | goto try_next_zone; |
@@ -1433,7 +1422,7 @@ zonelist_scan: | |||
1433 | } | 1422 | } |
1434 | } | 1423 | } |
1435 | 1424 | ||
1436 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1425 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); |
1437 | if (page) | 1426 | if (page) |
1438 | break; | 1427 | break; |
1439 | this_zone_full: | 1428 | this_zone_full: |
@@ -1446,7 +1435,7 @@ try_next_zone: | |||
1446 | zlc_active = 1; | 1435 | zlc_active = 1; |
1447 | did_zlc_setup = 1; | 1436 | did_zlc_setup = 1; |
1448 | } | 1437 | } |
1449 | } while (*(++z) != NULL); | 1438 | } |
1450 | 1439 | ||
1451 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1440 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
1452 | /* Disable zlc cache for second zonelist scan */ | 1441 | /* Disable zlc cache for second zonelist scan */ |
@@ -1459,12 +1448,14 @@ try_next_zone: | |||
1459 | /* | 1448 | /* |
1460 | * This is the 'heart' of the zoned buddy allocator. | 1449 | * This is the 'heart' of the zoned buddy allocator. |
1461 | */ | 1450 | */ |
1462 | struct page * | 1451 | static struct page * |
1463 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | 1452 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, |
1464 | struct zonelist *zonelist) | 1453 | struct zonelist *zonelist, nodemask_t *nodemask) |
1465 | { | 1454 | { |
1466 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1455 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1467 | struct zone **z; | 1456 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1457 | struct zoneref *z; | ||
1458 | struct zone *zone; | ||
1468 | struct page *page; | 1459 | struct page *page; |
1469 | struct reclaim_state reclaim_state; | 1460 | struct reclaim_state reclaim_state; |
1470 | struct task_struct *p = current; | 1461 | struct task_struct *p = current; |
@@ -1478,9 +1469,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
1478 | return NULL; | 1469 | return NULL; |
1479 | 1470 | ||
1480 | restart: | 1471 | restart: |
1481 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1472 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ |
1482 | 1473 | ||
1483 | if (unlikely(*z == NULL)) { | 1474 | if (unlikely(!z->zone)) { |
1484 | /* | 1475 | /* |
1485 | * Happens if we have an empty zonelist as a result of | 1476 | * Happens if we have an empty zonelist as a result of |
1486 | * GFP_THISNODE being used on a memoryless node | 1477 | * GFP_THISNODE being used on a memoryless node |
@@ -1488,8 +1479,8 @@ restart: | |||
1488 | return NULL; | 1479 | return NULL; |
1489 | } | 1480 | } |
1490 | 1481 | ||
1491 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1482 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
1492 | zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1483 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); |
1493 | if (page) | 1484 | if (page) |
1494 | goto got_pg; | 1485 | goto got_pg; |
1495 | 1486 | ||
@@ -1504,8 +1495,8 @@ restart: | |||
1504 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1495 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1505 | goto nopage; | 1496 | goto nopage; |
1506 | 1497 | ||
1507 | for (z = zonelist->zones; *z; z++) | 1498 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1508 | wakeup_kswapd(*z, order); | 1499 | wakeup_kswapd(zone, order); |
1509 | 1500 | ||
1510 | /* | 1501 | /* |
1511 | * OK, we're below the kswapd watermark and have kicked background | 1502 | * OK, we're below the kswapd watermark and have kicked background |
@@ -1533,7 +1524,8 @@ restart: | |||
1533 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1524 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1534 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1525 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1535 | */ | 1526 | */ |
1536 | page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); | 1527 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
1528 | high_zoneidx, alloc_flags); | ||
1537 | if (page) | 1529 | if (page) |
1538 | goto got_pg; | 1530 | goto got_pg; |
1539 | 1531 | ||
@@ -1545,8 +1537,8 @@ rebalance: | |||
1545 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1537 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
1546 | nofail_alloc: | 1538 | nofail_alloc: |
1547 | /* go through the zonelist yet again, ignoring mins */ | 1539 | /* go through the zonelist yet again, ignoring mins */ |
1548 | page = get_page_from_freelist(gfp_mask, order, | 1540 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1549 | zonelist, ALLOC_NO_WATERMARKS); | 1541 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); |
1550 | if (page) | 1542 | if (page) |
1551 | goto got_pg; | 1543 | goto got_pg; |
1552 | if (gfp_mask & __GFP_NOFAIL) { | 1544 | if (gfp_mask & __GFP_NOFAIL) { |
@@ -1569,7 +1561,7 @@ nofail_alloc: | |||
1569 | reclaim_state.reclaimed_slab = 0; | 1561 | reclaim_state.reclaimed_slab = 0; |
1570 | p->reclaim_state = &reclaim_state; | 1562 | p->reclaim_state = &reclaim_state; |
1571 | 1563 | ||
1572 | did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); | 1564 | did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); |
1573 | 1565 | ||
1574 | p->reclaim_state = NULL; | 1566 | p->reclaim_state = NULL; |
1575 | p->flags &= ~PF_MEMALLOC; | 1567 | p->flags &= ~PF_MEMALLOC; |
@@ -1580,12 +1572,12 @@ nofail_alloc: | |||
1580 | drain_all_pages(); | 1572 | drain_all_pages(); |
1581 | 1573 | ||
1582 | if (likely(did_some_progress)) { | 1574 | if (likely(did_some_progress)) { |
1583 | page = get_page_from_freelist(gfp_mask, order, | 1575 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1584 | zonelist, alloc_flags); | 1576 | zonelist, high_zoneidx, alloc_flags); |
1585 | if (page) | 1577 | if (page) |
1586 | goto got_pg; | 1578 | goto got_pg; |
1587 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1579 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1588 | if (!try_set_zone_oom(zonelist)) { | 1580 | if (!try_set_zone_oom(zonelist, gfp_mask)) { |
1589 | schedule_timeout_uninterruptible(1); | 1581 | schedule_timeout_uninterruptible(1); |
1590 | goto restart; | 1582 | goto restart; |
1591 | } | 1583 | } |
@@ -1596,21 +1588,22 @@ nofail_alloc: | |||
1596 | * a parallel oom killing, we must fail if we're still | 1588 | * a parallel oom killing, we must fail if we're still |
1597 | * under heavy pressure. | 1589 | * under heavy pressure. |
1598 | */ | 1590 | */ |
1599 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1591 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
1600 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1592 | order, zonelist, high_zoneidx, |
1593 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
1601 | if (page) { | 1594 | if (page) { |
1602 | clear_zonelist_oom(zonelist); | 1595 | clear_zonelist_oom(zonelist, gfp_mask); |
1603 | goto got_pg; | 1596 | goto got_pg; |
1604 | } | 1597 | } |
1605 | 1598 | ||
1606 | /* The OOM killer will not help higher order allocs so fail */ | 1599 | /* The OOM killer will not help higher order allocs so fail */ |
1607 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | 1600 | if (order > PAGE_ALLOC_COSTLY_ORDER) { |
1608 | clear_zonelist_oom(zonelist); | 1601 | clear_zonelist_oom(zonelist, gfp_mask); |
1609 | goto nopage; | 1602 | goto nopage; |
1610 | } | 1603 | } |
1611 | 1604 | ||
1612 | out_of_memory(zonelist, gfp_mask, order); | 1605 | out_of_memory(zonelist, gfp_mask, order); |
1613 | clear_zonelist_oom(zonelist); | 1606 | clear_zonelist_oom(zonelist, gfp_mask); |
1614 | goto restart; | 1607 | goto restart; |
1615 | } | 1608 | } |
1616 | 1609 | ||
@@ -1646,6 +1639,20 @@ got_pg: | |||
1646 | return page; | 1639 | return page; |
1647 | } | 1640 | } |
1648 | 1641 | ||
1642 | struct page * | ||
1643 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | ||
1644 | struct zonelist *zonelist) | ||
1645 | { | ||
1646 | return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); | ||
1647 | } | ||
1648 | |||
1649 | struct page * | ||
1650 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1651 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1652 | { | ||
1653 | return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); | ||
1654 | } | ||
1655 | |||
1649 | EXPORT_SYMBOL(__alloc_pages); | 1656 | EXPORT_SYMBOL(__alloc_pages); |
1650 | 1657 | ||
1651 | /* | 1658 | /* |
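
For context, the new entry points in the hunk above are thin wrappers around a single internal worker that takes an optional nodemask. A minimal, self-contained sketch of that wrapper pattern in plain C; all names and types here are illustrative stand-ins, not the kernel's API:

#include <stddef.h>
#include <stdio.h>

/* stand-ins for gfp_t / struct zonelist / nodemask_t, purely illustrative */
typedef unsigned int gfp_t;
struct zonelist { const char *name; };
typedef unsigned long nodemask_t;

/* single worker: a NULL nodemask means "no node restriction" */
static void *alloc_pages_worker(gfp_t gfp, unsigned int order,
                                struct zonelist *zl, nodemask_t *nodemask)
{
        printf("alloc order=%u from %s, mask=%s\n",
               order, zl->name, nodemask ? "restricted" : "any node");
        return NULL;    /* allocation logic elided */
}

/* legacy-shaped entry point keeps its old argument list */
static void *alloc_pages_plain(gfp_t gfp, unsigned int order,
                               struct zonelist *zl)
{
        return alloc_pages_worker(gfp, order, zl, NULL);
}

/* new entry point lets callers pass an explicit node mask */
static void *alloc_pages_masked(gfp_t gfp, unsigned int order,
                                struct zonelist *zl, nodemask_t *nodemask)
{
        return alloc_pages_worker(gfp, order, zl, nodemask);
}

int main(void)
{
        struct zonelist zl = { "node0-fallback" };
        nodemask_t allowed = 0x3;

        alloc_pages_plain(0, 0, &zl);
        alloc_pages_masked(0, 2, &zl, &allowed);
        return 0;
}
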
@@ -1712,15 +1719,15 @@ EXPORT_SYMBOL(free_pages); | |||
1712 | 1719 | ||
1713 | static unsigned int nr_free_zone_pages(int offset) | 1720 | static unsigned int nr_free_zone_pages(int offset) |
1714 | { | 1721 | { |
1722 | struct zoneref *z; | ||
1723 | struct zone *zone; | ||
1724 | |||
1715 | /* Just pick one node, since fallback list is circular */ | 1725 | /* Just pick one node, since fallback list is circular */ |
1716 | pg_data_t *pgdat = NODE_DATA(numa_node_id()); | ||
1717 | unsigned int sum = 0; | 1726 | unsigned int sum = 0; |
1718 | 1727 | ||
1719 | struct zonelist *zonelist = pgdat->node_zonelists + offset; | 1728 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
1720 | struct zone **zonep = zonelist->zones; | ||
1721 | struct zone *zone; | ||
1722 | 1729 | ||
1723 | for (zone = *zonep++; zone; zone = *zonep++) { | 1730 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
1724 | unsigned long size = zone->present_pages; | 1731 | unsigned long size = zone->present_pages; |
1725 | unsigned long high = zone->pages_high; | 1732 | unsigned long high = zone->pages_high; |
1726 | if (size > high) | 1733 | if (size > high) |
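
The zonelist walk above now goes through node_zonelist() and for_each_zone_zonelist(), which filter zoneref entries against a highest allowed zone index instead of keeping one zonelist per zone type. A small userspace model of that filtered walk, with mock types standing in for struct zone and struct zoneref (the real helpers live in include/linux/mmzone.h):

#include <stdio.h>

/* illustrative stand-ins for struct zone / struct zoneref */
struct zone { const char *name; unsigned long free_pages; };
struct zoneref { struct zone *zone; int zone_idx; };

/* walk a NULL-terminated zoneref array, skipping zones above the cap */
static unsigned long count_free(struct zoneref *zrefs, int high_zoneidx)
{
        unsigned long sum = 0;
        struct zoneref *z;

        for (z = zrefs; z->zone; z++) {
                if (z->zone_idx > high_zoneidx)
                        continue;       /* too high for this allocation */
                sum += z->zone->free_pages;
        }
        return sum;
}

int main(void)
{
        struct zone dma = { "DMA", 100 }, normal = { "Normal", 4000 };
        struct zone highmem = { "HighMem", 9000 };
        struct zoneref zl[] = {
                { &highmem, 2 }, { &normal, 1 }, { &dma, 0 }, { NULL, 0 }
        };

        /* only zones with index <= 1 (DMA, Normal) are counted */
        printf("free below HighMem: %lu\n", count_free(zl, 1));
        return 0;
}
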
@@ -1889,6 +1896,12 @@ void show_free_areas(void) | |||
1889 | show_swap_cache_info(); | 1896 | show_swap_cache_info(); |
1890 | } | 1897 | } |
1891 | 1898 | ||
1899 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | ||
1900 | { | ||
1901 | zoneref->zone = zone; | ||
1902 | zoneref->zone_idx = zone_idx(zone); | ||
1903 | } | ||
1904 | |||
1892 | /* | 1905 | /* |
1893 | * Builds allocation fallback zone lists. | 1906 | * Builds allocation fallback zone lists. |
1894 | * | 1907 | * |
@@ -1906,7 +1919,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | |||
1906 | zone_type--; | 1919 | zone_type--; |
1907 | zone = pgdat->node_zones + zone_type; | 1920 | zone = pgdat->node_zones + zone_type; |
1908 | if (populated_zone(zone)) { | 1921 | if (populated_zone(zone)) { |
1909 | zonelist->zones[nr_zones++] = zone; | 1922 | zoneref_set_zone(zone, |
1923 | &zonelist->_zonerefs[nr_zones++]); | ||
1910 | check_highest_zone(zone_type); | 1924 | check_highest_zone(zone_type); |
1911 | } | 1925 | } |
1912 | 1926 | ||
@@ -2078,17 +2092,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
2078 | */ | 2092 | */ |
2079 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | 2093 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) |
2080 | { | 2094 | { |
2081 | enum zone_type i; | ||
2082 | int j; | 2095 | int j; |
2083 | struct zonelist *zonelist; | 2096 | struct zonelist *zonelist; |
2084 | 2097 | ||
2085 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2098 | zonelist = &pgdat->node_zonelists[0]; |
2086 | zonelist = pgdat->node_zonelists + i; | 2099 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) |
2087 | for (j = 0; zonelist->zones[j] != NULL; j++) | 2100 | ; |
2088 | ; | 2101 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2089 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 2102 | MAX_NR_ZONES - 1); |
2090 | zonelist->zones[j] = NULL; | 2103 | zonelist->_zonerefs[j].zone = NULL; |
2091 | } | 2104 | zonelist->_zonerefs[j].zone_idx = 0; |
2092 | } | 2105 | } |
2093 | 2106 | ||
2094 | /* | 2107 | /* |
@@ -2096,15 +2109,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
2096 | */ | 2109 | */ |
2097 | static void build_thisnode_zonelists(pg_data_t *pgdat) | 2110 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
2098 | { | 2111 | { |
2099 | enum zone_type i; | ||
2100 | int j; | 2112 | int j; |
2101 | struct zonelist *zonelist; | 2113 | struct zonelist *zonelist; |
2102 | 2114 | ||
2103 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2115 | zonelist = &pgdat->node_zonelists[1]; |
2104 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | 2116 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
2105 | j = build_zonelists_node(pgdat, zonelist, 0, i); | 2117 | zonelist->_zonerefs[j].zone = NULL; |
2106 | zonelist->zones[j] = NULL; | 2118 | zonelist->_zonerefs[j].zone_idx = 0; |
2107 | } | ||
2108 | } | 2119 | } |
2109 | 2120 | ||
2110 | /* | 2121 | /* |
@@ -2117,27 +2128,26 @@ static int node_order[MAX_NUMNODES]; | |||
2117 | 2128 | ||
2118 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | 2129 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) |
2119 | { | 2130 | { |
2120 | enum zone_type i; | ||
2121 | int pos, j, node; | 2131 | int pos, j, node; |
2122 | int zone_type; /* needs to be signed */ | 2132 | int zone_type; /* needs to be signed */ |
2123 | struct zone *z; | 2133 | struct zone *z; |
2124 | struct zonelist *zonelist; | 2134 | struct zonelist *zonelist; |
2125 | 2135 | ||
2126 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2136 | zonelist = &pgdat->node_zonelists[0]; |
2127 | zonelist = pgdat->node_zonelists + i; | 2137 | pos = 0; |
2128 | pos = 0; | 2138 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { |
2129 | for (zone_type = i; zone_type >= 0; zone_type--) { | 2139 | for (j = 0; j < nr_nodes; j++) { |
2130 | for (j = 0; j < nr_nodes; j++) { | 2140 | node = node_order[j]; |
2131 | node = node_order[j]; | 2141 | z = &NODE_DATA(node)->node_zones[zone_type]; |
2132 | z = &NODE_DATA(node)->node_zones[zone_type]; | 2142 | if (populated_zone(z)) { |
2133 | if (populated_zone(z)) { | 2143 | zoneref_set_zone(z, |
2134 | zonelist->zones[pos++] = z; | 2144 | &zonelist->_zonerefs[pos++]); |
2135 | check_highest_zone(zone_type); | 2145 | check_highest_zone(zone_type); |
2136 | } | ||
2137 | } | 2146 | } |
2138 | } | 2147 | } |
2139 | zonelist->zones[pos] = NULL; | ||
2140 | } | 2148 | } |
2149 | zonelist->_zonerefs[pos].zone = NULL; | ||
2150 | zonelist->_zonerefs[pos].zone_idx = 0; | ||
2141 | } | 2151 | } |
2142 | 2152 | ||
2143 | static int default_zonelist_order(void) | 2153 | static int default_zonelist_order(void) |
@@ -2214,7 +2224,8 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2214 | /* initialize zonelists */ | 2224 | /* initialize zonelists */ |
2215 | for (i = 0; i < MAX_ZONELISTS; i++) { | 2225 | for (i = 0; i < MAX_ZONELISTS; i++) { |
2216 | zonelist = pgdat->node_zonelists + i; | 2226 | zonelist = pgdat->node_zonelists + i; |
2217 | zonelist->zones[0] = NULL; | 2227 | zonelist->_zonerefs[0].zone = NULL; |
2228 | zonelist->_zonerefs[0].zone_idx = 0; | ||
2218 | } | 2229 | } |
2219 | 2230 | ||
2220 | /* NUMA-aware ordering of nodes */ | 2231 | /* NUMA-aware ordering of nodes */ |
@@ -2264,19 +2275,15 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2264 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2275 | /* Construct the zonelist performance cache - see further mmzone.h */ |
2265 | static void build_zonelist_cache(pg_data_t *pgdat) | 2276 | static void build_zonelist_cache(pg_data_t *pgdat) |
2266 | { | 2277 | { |
2267 | int i; | 2278 | struct zonelist *zonelist; |
2268 | 2279 | struct zonelist_cache *zlc; | |
2269 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2280 | struct zoneref *z; |
2270 | struct zonelist *zonelist; | ||
2271 | struct zonelist_cache *zlc; | ||
2272 | struct zone **z; | ||
2273 | 2281 | ||
2274 | zonelist = pgdat->node_zonelists + i; | 2282 | zonelist = &pgdat->node_zonelists[0]; |
2275 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | 2283 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; |
2276 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 2284 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
2277 | for (z = zonelist->zones; *z; z++) | 2285 | for (z = zonelist->_zonerefs; z->zone; z++) |
2278 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | 2286 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
2279 | } | ||
2280 | } | 2287 | } |
2281 | 2288 | ||
2282 | 2289 | ||
@@ -2290,45 +2297,44 @@ static void set_zonelist_order(void) | |||
2290 | static void build_zonelists(pg_data_t *pgdat) | 2297 | static void build_zonelists(pg_data_t *pgdat) |
2291 | { | 2298 | { |
2292 | int node, local_node; | 2299 | int node, local_node; |
2293 | enum zone_type i,j; | 2300 | enum zone_type j; |
2301 | struct zonelist *zonelist; | ||
2294 | 2302 | ||
2295 | local_node = pgdat->node_id; | 2303 | local_node = pgdat->node_id; |
2296 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
2297 | struct zonelist *zonelist; | ||
2298 | 2304 | ||
2299 | zonelist = pgdat->node_zonelists + i; | 2305 | zonelist = &pgdat->node_zonelists[0]; |
2306 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | ||
2300 | 2307 | ||
2301 | j = build_zonelists_node(pgdat, zonelist, 0, i); | 2308 | /* |
2302 | /* | 2309 | * Now we build the zonelist so that it contains the zones |
2303 | * Now we build the zonelist so that it contains the zones | 2310 | * of all the other nodes. |
2304 | * of all the other nodes. | 2311 | * We don't want to pressure a particular node, so when |
2305 | * We don't want to pressure a particular node, so when | 2312 | * building the zones for node N, we make sure that the |
2306 | * building the zones for node N, we make sure that the | 2313 | * zones coming right after the local ones are those from |
2307 | * zones coming right after the local ones are those from | 2314 | * node N+1 (modulo N) |
2308 | * node N+1 (modulo N) | 2315 | */ |
2309 | */ | 2316 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
2310 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 2317 | if (!node_online(node)) |
2311 | if (!node_online(node)) | 2318 | continue; |
2312 | continue; | 2319 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2313 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 2320 | MAX_NR_ZONES - 1); |
2314 | } | 2321 | } |
2315 | for (node = 0; node < local_node; node++) { | 2322 | for (node = 0; node < local_node; node++) { |
2316 | if (!node_online(node)) | 2323 | if (!node_online(node)) |
2317 | continue; | 2324 | continue; |
2318 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 2325 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2319 | } | 2326 | MAX_NR_ZONES - 1); |
2320 | |||
2321 | zonelist->zones[j] = NULL; | ||
2322 | } | 2327 | } |
2328 | |||
2329 | zonelist->_zonerefs[j].zone = NULL; | ||
2330 | zonelist->_zonerefs[j].zone_idx = 0; | ||
2323 | } | 2331 | } |
2324 | 2332 | ||
2325 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 2333 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
2326 | static void build_zonelist_cache(pg_data_t *pgdat) | 2334 | static void build_zonelist_cache(pg_data_t *pgdat) |
2327 | { | 2335 | { |
2328 | int i; | 2336 | pgdat->node_zonelists[0].zlcache_ptr = NULL; |
2329 | 2337 | pgdat->node_zonelists[1].zlcache_ptr = NULL; | |
2330 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2331 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
2332 | } | 2338 | } |
2333 | 2339 | ||
2334 | #endif /* CONFIG_NUMA */ | 2340 | #endif /* CONFIG_NUMA */ |
@@ -4339,9 +4345,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4339 | else if (hashdist) | 4345 | else if (hashdist) |
4340 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4346 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4341 | else { | 4347 | else { |
4342 | unsigned long order; | 4348 | unsigned long order = get_order(size); |
4343 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) | ||
4344 | ; | ||
4345 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | 4349 | table = (void*) __get_free_pages(GFP_ATOMIC, order); |
4346 | /* | 4350 | /* |
4347 | * If bucketsize is not a power-of-two, we may free | 4351 | * If bucketsize is not a power-of-two, we may free |
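
The open-coded loop removed above and get_order() both compute the smallest power-of-two number of pages that covers size. A self-contained check of that equivalence, using local helpers rather than the kernel's asm-generic get_order():

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4K pages for the illustration */

/* smallest order such that (1UL << order) pages cover size bytes */
static unsigned long order_by_loop(unsigned long size)
{
        unsigned long order;

        for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
                ;
        return order;
}

static unsigned long order_by_shift(unsigned long size)
{
        unsigned long order = 0;

        size = (size - 1) >> PAGE_SHIFT;        /* whole pages minus one */
        while (size) {
                order++;
                size >>= 1;
        }
        return order;
}

int main(void)
{
        unsigned long sizes[] = { 1, 4096, 4097, 65536, 1UL << 20 };
        unsigned long i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                assert(order_by_loop(sizes[i]) == order_by_shift(sizes[i]));
                printf("size %8lu -> order %lu\n", sizes[i],
                       order_by_shift(sizes[i]));
        }
        return 0;
}
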
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 1cf1417ef8b7..0afd2387e507 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -9,11 +9,15 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
9 | int err = 0; | 9 | int err = 0; |
10 | 10 | ||
11 | pte = pte_offset_map(pmd, addr); | 11 | pte = pte_offset_map(pmd, addr); |
12 | do { | 12 | for (;;) { |
13 | err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); | 13 | err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); |
14 | if (err) | 14 | if (err) |
15 | break; | 15 | break; |
16 | } while (pte++, addr += PAGE_SIZE, addr != end); | 16 | addr += PAGE_SIZE; |
17 | if (addr == end) | ||
18 | break; | ||
19 | pte++; | ||
20 | } | ||
17 | 21 | ||
18 | pte_unmap(pte); | 22 | pte_unmap(pte); |
19 | return err; | 23 | return err; |
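
The rewrite above stops advancing pte past the final entry: the old do/while bumped the pointer once more before the addr == end test, so the later pte_unmap() saw a pointer one slot past the last PTE visited. A tiny userspace model of the two loop shapes, with plain ints standing in for PTEs:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        int table[4] = { 1, 2, 3, 4 };
        int *end = table + 4, *p, *after_old, *after_new;

        /* old shape: pointer advanced before the exit test */
        p = table;
        do {
                /* visit *p here */
        } while (p++, p != end);
        after_old = p;                  /* one past the last element */

        /* new shape: exit test first, advance only if more work remains */
        p = table;
        for (;;) {
                /* visit *p here */
                if (p + 1 == end)
                        break;
                p++;
        }
        after_new = p;                  /* still the last element visited */

        assert(after_old == end);
        assert(after_new == end - 1);
        printf("old: offset %td, new: offset %td\n",
               after_old - table, after_new - table);
        return 0;
}
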
@@ -413,9 +413,6 @@ int page_referenced(struct page *page, int is_locked, | |||
413 | { | 413 | { |
414 | int referenced = 0; | 414 | int referenced = 0; |
415 | 415 | ||
416 | if (page_test_and_clear_young(page)) | ||
417 | referenced++; | ||
418 | |||
419 | if (TestClearPageReferenced(page)) | 416 | if (TestClearPageReferenced(page)) |
420 | referenced++; | 417 | referenced++; |
421 | 418 | ||
@@ -433,6 +430,10 @@ int page_referenced(struct page *page, int is_locked, | |||
433 | unlock_page(page); | 430 | unlock_page(page); |
434 | } | 431 | } |
435 | } | 432 | } |
433 | |||
434 | if (page_test_and_clear_young(page)) | ||
435 | referenced++; | ||
436 | |||
436 | return referenced; | 437 | return referenced; |
437 | } | 438 | } |
438 | 439 | ||
@@ -661,7 +662,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
661 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 662 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
662 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | 663 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); |
663 | if (vma->vm_ops) { | 664 | if (vma->vm_ops) { |
664 | print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); | ||
665 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); | 665 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); |
666 | } | 666 | } |
667 | if (vma->vm_file && vma->vm_file->f_op) | 667 | if (vma->vm_file && vma->vm_file->f_op) |
diff --git a/mm/shmem.c b/mm/shmem.c index f514dd392cd9..e6d9298aa22a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1079,104 +1079,47 @@ redirty: | |||
1079 | 1079 | ||
1080 | #ifdef CONFIG_NUMA | 1080 | #ifdef CONFIG_NUMA |
1081 | #ifdef CONFIG_TMPFS | 1081 | #ifdef CONFIG_TMPFS |
1082 | static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 1082 | static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) |
1083 | { | 1083 | { |
1084 | char *nodelist = strchr(value, ':'); | 1084 | char buffer[64]; |
1085 | int err = 1; | ||
1086 | 1085 | ||
1087 | if (nodelist) { | 1086 | if (!mpol || mpol->mode == MPOL_DEFAULT) |
1088 | /* NUL-terminate policy string */ | 1087 | return; /* show nothing */ |
1089 | *nodelist++ = '\0'; | ||
1090 | if (nodelist_parse(nodelist, *policy_nodes)) | ||
1091 | goto out; | ||
1092 | if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) | ||
1093 | goto out; | ||
1094 | } | ||
1095 | if (!strcmp(value, "default")) { | ||
1096 | *policy = MPOL_DEFAULT; | ||
1097 | /* Don't allow a nodelist */ | ||
1098 | if (!nodelist) | ||
1099 | err = 0; | ||
1100 | } else if (!strcmp(value, "prefer")) { | ||
1101 | *policy = MPOL_PREFERRED; | ||
1102 | /* Insist on a nodelist of one node only */ | ||
1103 | if (nodelist) { | ||
1104 | char *rest = nodelist; | ||
1105 | while (isdigit(*rest)) | ||
1106 | rest++; | ||
1107 | if (!*rest) | ||
1108 | err = 0; | ||
1109 | } | ||
1110 | } else if (!strcmp(value, "bind")) { | ||
1111 | *policy = MPOL_BIND; | ||
1112 | /* Insist on a nodelist */ | ||
1113 | if (nodelist) | ||
1114 | err = 0; | ||
1115 | } else if (!strcmp(value, "interleave")) { | ||
1116 | *policy = MPOL_INTERLEAVE; | ||
1117 | /* | ||
1118 | * Default to online nodes with memory if no nodelist | ||
1119 | */ | ||
1120 | if (!nodelist) | ||
1121 | *policy_nodes = node_states[N_HIGH_MEMORY]; | ||
1122 | err = 0; | ||
1123 | } | ||
1124 | out: | ||
1125 | /* Restore string for error message */ | ||
1126 | if (nodelist) | ||
1127 | *--nodelist = ':'; | ||
1128 | return err; | ||
1129 | } | ||
1130 | |||
1131 | static void shmem_show_mpol(struct seq_file *seq, int policy, | ||
1132 | const nodemask_t policy_nodes) | ||
1133 | { | ||
1134 | char *policy_string; | ||
1135 | 1088 | ||
1136 | switch (policy) { | 1089 | mpol_to_str(buffer, sizeof(buffer), mpol, 1); |
1137 | case MPOL_PREFERRED: | ||
1138 | policy_string = "prefer"; | ||
1139 | break; | ||
1140 | case MPOL_BIND: | ||
1141 | policy_string = "bind"; | ||
1142 | break; | ||
1143 | case MPOL_INTERLEAVE: | ||
1144 | policy_string = "interleave"; | ||
1145 | break; | ||
1146 | default: | ||
1147 | /* MPOL_DEFAULT */ | ||
1148 | return; | ||
1149 | } | ||
1150 | 1090 | ||
1151 | seq_printf(seq, ",mpol=%s", policy_string); | 1091 | seq_printf(seq, ",mpol=%s", buffer); |
1152 | 1092 | } | |
1153 | if (policy != MPOL_INTERLEAVE || | ||
1154 | !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) { | ||
1155 | char buffer[64]; | ||
1156 | int len; | ||
1157 | 1093 | ||
1158 | len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes); | 1094 | static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) |
1159 | if (len < sizeof(buffer)) | 1095 | { |
1160 | seq_printf(seq, ":%s", buffer); | 1096 | struct mempolicy *mpol = NULL; |
1161 | else | 1097 | if (sbinfo->mpol) { |
1162 | seq_printf(seq, ":?"); | 1098 | spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ |
1099 | mpol = sbinfo->mpol; | ||
1100 | mpol_get(mpol); | ||
1101 | spin_unlock(&sbinfo->stat_lock); | ||
1163 | } | 1102 | } |
1103 | return mpol; | ||
1164 | } | 1104 | } |
1165 | #endif /* CONFIG_TMPFS */ | 1105 | #endif /* CONFIG_TMPFS */ |
1166 | 1106 | ||
1167 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 1107 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, |
1168 | struct shmem_inode_info *info, unsigned long idx) | 1108 | struct shmem_inode_info *info, unsigned long idx) |
1169 | { | 1109 | { |
1110 | struct mempolicy mpol, *spol; | ||
1170 | struct vm_area_struct pvma; | 1111 | struct vm_area_struct pvma; |
1171 | struct page *page; | 1112 | struct page *page; |
1172 | 1113 | ||
1114 | spol = mpol_cond_copy(&mpol, | ||
1115 | mpol_shared_policy_lookup(&info->policy, idx)); | ||
1116 | |||
1173 | /* Create a pseudo vma that just contains the policy */ | 1117 | /* Create a pseudo vma that just contains the policy */ |
1174 | pvma.vm_start = 0; | 1118 | pvma.vm_start = 0; |
1175 | pvma.vm_pgoff = idx; | 1119 | pvma.vm_pgoff = idx; |
1176 | pvma.vm_ops = NULL; | 1120 | pvma.vm_ops = NULL; |
1177 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 1121 | pvma.vm_policy = spol; |
1178 | page = swapin_readahead(entry, gfp, &pvma, 0); | 1122 | page = swapin_readahead(entry, gfp, &pvma, 0); |
1179 | mpol_free(pvma.vm_policy); | ||
1180 | return page; | 1123 | return page; |
1181 | } | 1124 | } |
1182 | 1125 | ||
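
shmem_get_sbmpol() above takes its reference on the superblock's policy while holding sbinfo->stat_lock, so a concurrent remount cannot free the object between the load and the mpol_get(). A self-contained model of that grab-a-reference-under-the-lock pattern, with a pthread mutex and a plain counter standing in for the spinlock and the mempolicy refcount (names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct policy {
        int refcount;
        int mode;
};

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static struct policy *sb_policy;        /* may be swapped by a "remount" */

/* return the current policy with an extra reference, or NULL */
static struct policy *get_sb_policy(void)
{
        struct policy *p = NULL;

        pthread_mutex_lock(&stat_lock); /* prevent replace/use races */
        if (sb_policy) {
                p = sb_policy;
                p->refcount++;          /* reference taken under the lock */
        }
        pthread_mutex_unlock(&stat_lock);
        return p;
}

static void put_policy(struct policy *p)
{
        int free_it;

        if (!p)
                return;
        pthread_mutex_lock(&stat_lock);
        free_it = (--p->refcount == 0);
        pthread_mutex_unlock(&stat_lock);
        if (free_it)
                free(p);
}

int main(void)
{
        struct policy *p;

        sb_policy = calloc(1, sizeof(*sb_policy));
        sb_policy->refcount = 1;        /* the superblock's own reference */

        p = get_sb_policy();            /* caller now holds a second one */
        printf("refcount now %d\n", p->refcount);
        put_policy(p);                  /* drop the caller's reference */
        put_policy(sb_policy);          /* drop the superblock's reference */
        return 0;
}
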
@@ -1184,27 +1127,21 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1184 | struct shmem_inode_info *info, unsigned long idx) | 1127 | struct shmem_inode_info *info, unsigned long idx) |
1185 | { | 1128 | { |
1186 | struct vm_area_struct pvma; | 1129 | struct vm_area_struct pvma; |
1187 | struct page *page; | ||
1188 | 1130 | ||
1189 | /* Create a pseudo vma that just contains the policy */ | 1131 | /* Create a pseudo vma that just contains the policy */ |
1190 | pvma.vm_start = 0; | 1132 | pvma.vm_start = 0; |
1191 | pvma.vm_pgoff = idx; | 1133 | pvma.vm_pgoff = idx; |
1192 | pvma.vm_ops = NULL; | 1134 | pvma.vm_ops = NULL; |
1193 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 1135 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); |
1194 | page = alloc_page_vma(gfp, &pvma, 0); | 1136 | |
1195 | mpol_free(pvma.vm_policy); | 1137 | /* |
1196 | return page; | 1138 | * alloc_page_vma() will drop the shared policy reference |
1139 | */ | ||
1140 | return alloc_page_vma(gfp, &pvma, 0); | ||
1197 | } | 1141 | } |
1198 | #else /* !CONFIG_NUMA */ | 1142 | #else /* !CONFIG_NUMA */ |
1199 | #ifdef CONFIG_TMPFS | 1143 | #ifdef CONFIG_TMPFS |
1200 | static inline int shmem_parse_mpol(char *value, int *policy, | 1144 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) |
1201 | nodemask_t *policy_nodes) | ||
1202 | { | ||
1203 | return 1; | ||
1204 | } | ||
1205 | |||
1206 | static inline void shmem_show_mpol(struct seq_file *seq, int policy, | ||
1207 | const nodemask_t policy_nodes) | ||
1208 | { | 1145 | { |
1209 | } | 1146 | } |
1210 | #endif /* CONFIG_TMPFS */ | 1147 | #endif /* CONFIG_TMPFS */ |
@@ -1222,6 +1159,13 @@ static inline struct page *shmem_alloc_page(gfp_t gfp, | |||
1222 | } | 1159 | } |
1223 | #endif /* CONFIG_NUMA */ | 1160 | #endif /* CONFIG_NUMA */ |
1224 | 1161 | ||
1162 | #if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) | ||
1163 | static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | ||
1164 | { | ||
1165 | return NULL; | ||
1166 | } | ||
1167 | #endif | ||
1168 | |||
1225 | /* | 1169 | /* |
1226 | * shmem_getpage - either get the page from swap or allocate a new one | 1170 | * shmem_getpage - either get the page from swap or allocate a new one |
1227 | * | 1171 | * |
@@ -1576,8 +1520,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1576 | case S_IFREG: | 1520 | case S_IFREG: |
1577 | inode->i_op = &shmem_inode_operations; | 1521 | inode->i_op = &shmem_inode_operations; |
1578 | inode->i_fop = &shmem_file_operations; | 1522 | inode->i_fop = &shmem_file_operations; |
1579 | mpol_shared_policy_init(&info->policy, sbinfo->policy, | 1523 | mpol_shared_policy_init(&info->policy, |
1580 | &sbinfo->policy_nodes); | 1524 | shmem_get_sbmpol(sbinfo)); |
1581 | break; | 1525 | break; |
1582 | case S_IFDIR: | 1526 | case S_IFDIR: |
1583 | inc_nlink(inode); | 1527 | inc_nlink(inode); |
@@ -1591,8 +1535,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1591 | * Must not load anything in the rbtree, | 1535 | * Must not load anything in the rbtree, |
1592 | * mpol_free_shared_policy will not be called. | 1536 | * mpol_free_shared_policy will not be called. |
1593 | */ | 1537 | */ |
1594 | mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, | 1538 | mpol_shared_policy_init(&info->policy, NULL); |
1595 | NULL); | ||
1596 | break; | 1539 | break; |
1597 | } | 1540 | } |
1598 | } else | 1541 | } else |
@@ -2207,8 +2150,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2207 | if (*rest) | 2150 | if (*rest) |
2208 | goto bad_val; | 2151 | goto bad_val; |
2209 | } else if (!strcmp(this_char,"mpol")) { | 2152 | } else if (!strcmp(this_char,"mpol")) { |
2210 | if (shmem_parse_mpol(value, &sbinfo->policy, | 2153 | if (mpol_parse_str(value, &sbinfo->mpol, 1)) |
2211 | &sbinfo->policy_nodes)) | ||
2212 | goto bad_val; | 2154 | goto bad_val; |
2213 | } else { | 2155 | } else { |
2214 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | 2156 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", |
@@ -2259,8 +2201,9 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2259 | sbinfo->free_blocks = config.max_blocks - blocks; | 2201 | sbinfo->free_blocks = config.max_blocks - blocks; |
2260 | sbinfo->max_inodes = config.max_inodes; | 2202 | sbinfo->max_inodes = config.max_inodes; |
2261 | sbinfo->free_inodes = config.max_inodes - inodes; | 2203 | sbinfo->free_inodes = config.max_inodes - inodes; |
2262 | sbinfo->policy = config.policy; | 2204 | |
2263 | sbinfo->policy_nodes = config.policy_nodes; | 2205 | mpol_put(sbinfo->mpol); |
2206 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | ||
2264 | out: | 2207 | out: |
2265 | spin_unlock(&sbinfo->stat_lock); | 2208 | spin_unlock(&sbinfo->stat_lock); |
2266 | return error; | 2209 | return error; |
@@ -2281,7 +2224,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
2281 | seq_printf(seq, ",uid=%u", sbinfo->uid); | 2224 | seq_printf(seq, ",uid=%u", sbinfo->uid); |
2282 | if (sbinfo->gid != 0) | 2225 | if (sbinfo->gid != 0) |
2283 | seq_printf(seq, ",gid=%u", sbinfo->gid); | 2226 | seq_printf(seq, ",gid=%u", sbinfo->gid); |
2284 | shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes); | 2227 | shmem_show_mpol(seq, sbinfo->mpol); |
2285 | return 0; | 2228 | return 0; |
2286 | } | 2229 | } |
2287 | #endif /* CONFIG_TMPFS */ | 2230 | #endif /* CONFIG_TMPFS */ |
@@ -2311,8 +2254,7 @@ static int shmem_fill_super(struct super_block *sb, | |||
2311 | sbinfo->mode = S_IRWXUGO | S_ISVTX; | 2254 | sbinfo->mode = S_IRWXUGO | S_ISVTX; |
2312 | sbinfo->uid = current->fsuid; | 2255 | sbinfo->uid = current->fsuid; |
2313 | sbinfo->gid = current->fsgid; | 2256 | sbinfo->gid = current->fsgid; |
2314 | sbinfo->policy = MPOL_DEFAULT; | 2257 | sbinfo->mpol = NULL; |
2315 | sbinfo->policy_nodes = node_states[N_HIGH_MEMORY]; | ||
2316 | sb->s_fs_info = sbinfo; | 2258 | sb->s_fs_info = sbinfo; |
2317 | 2259 | ||
2318 | #ifdef CONFIG_TMPFS | 2260 | #ifdef CONFIG_TMPFS |
@@ -139,10 +139,6 @@ | |||
139 | #define BYTES_PER_WORD sizeof(void *) | 139 | #define BYTES_PER_WORD sizeof(void *) |
140 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) | 140 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) |
141 | 141 | ||
142 | #ifndef cache_line_size | ||
143 | #define cache_line_size() L1_CACHE_BYTES | ||
144 | #endif | ||
145 | |||
146 | #ifndef ARCH_KMALLOC_MINALIGN | 142 | #ifndef ARCH_KMALLOC_MINALIGN |
147 | /* | 143 | /* |
148 | * Enforce a minimum alignment for the kmalloc caches. | 144 | * Enforce a minimum alignment for the kmalloc caches. |
@@ -3242,15 +3238,16 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3242 | { | 3238 | { |
3243 | struct zonelist *zonelist; | 3239 | struct zonelist *zonelist; |
3244 | gfp_t local_flags; | 3240 | gfp_t local_flags; |
3245 | struct zone **z; | 3241 | struct zoneref *z; |
3242 | struct zone *zone; | ||
3243 | enum zone_type high_zoneidx = gfp_zone(flags); | ||
3246 | void *obj = NULL; | 3244 | void *obj = NULL; |
3247 | int nid; | 3245 | int nid; |
3248 | 3246 | ||
3249 | if (flags & __GFP_THISNODE) | 3247 | if (flags & __GFP_THISNODE) |
3250 | return NULL; | 3248 | return NULL; |
3251 | 3249 | ||
3252 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) | 3250 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
3253 | ->node_zonelists[gfp_zone(flags)]; | ||
3254 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3251 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3255 | 3252 | ||
3256 | retry: | 3253 | retry: |
@@ -3258,10 +3255,10 @@ retry: | |||
3258 | * Look through allowed nodes for objects available | 3255 | * Look through allowed nodes for objects available |
3259 | * from existing per node queues. | 3256 | * from existing per node queues. |
3260 | */ | 3257 | */ |
3261 | for (z = zonelist->zones; *z && !obj; z++) { | 3258 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
3262 | nid = zone_to_nid(*z); | 3259 | nid = zone_to_nid(zone); |
3263 | 3260 | ||
3264 | if (cpuset_zone_allowed_hardwall(*z, flags) && | 3261 | if (cpuset_zone_allowed_hardwall(zone, flags) && |
3265 | cache->nodelists[nid] && | 3262 | cache->nodelists[nid] && |
3266 | cache->nodelists[nid]->free_objects) | 3263 | cache->nodelists[nid]->free_objects) |
3267 | obj = ____cache_alloc_node(cache, | 3264 | obj = ____cache_alloc_node(cache, |
@@ -186,11 +186,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
186 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 186 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
187 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ | 187 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ |
188 | 188 | ||
189 | /* Not all arches define cache_line_size */ | ||
190 | #ifndef cache_line_size | ||
191 | #define cache_line_size() L1_CACHE_BYTES | ||
192 | #endif | ||
193 | |||
194 | static int kmem_size = sizeof(struct kmem_cache); | 189 | static int kmem_size = sizeof(struct kmem_cache); |
195 | 190 | ||
196 | #ifdef CONFIG_SMP | 191 | #ifdef CONFIG_SMP |
@@ -1330,7 +1325,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1330 | { | 1325 | { |
1331 | #ifdef CONFIG_NUMA | 1326 | #ifdef CONFIG_NUMA |
1332 | struct zonelist *zonelist; | 1327 | struct zonelist *zonelist; |
1333 | struct zone **z; | 1328 | struct zoneref *z; |
1329 | struct zone *zone; | ||
1330 | enum zone_type high_zoneidx = gfp_zone(flags); | ||
1334 | struct page *page; | 1331 | struct page *page; |
1335 | 1332 | ||
1336 | /* | 1333 | /* |
@@ -1355,14 +1352,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1355 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1352 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
1356 | return NULL; | 1353 | return NULL; |
1357 | 1354 | ||
1358 | zonelist = &NODE_DATA( | 1355 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
1359 | slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; | 1356 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1360 | for (z = zonelist->zones; *z; z++) { | ||
1361 | struct kmem_cache_node *n; | 1357 | struct kmem_cache_node *n; |
1362 | 1358 | ||
1363 | n = get_node(s, zone_to_nid(*z)); | 1359 | n = get_node(s, zone_to_nid(zone)); |
1364 | 1360 | ||
1365 | if (n && cpuset_zone_allowed_hardwall(*z, flags) && | 1361 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1366 | n->nr_partial > MIN_PARTIAL) { | 1362 | n->nr_partial > MIN_PARTIAL) { |
1367 | page = get_partial_node(n); | 1363 | page = get_partial_node(n); |
1368 | if (page) | 1364 | if (page) |
diff --git a/mm/sparse.c b/mm/sparse.c index 98d6b39c3472..dff71f173ae9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
10 | #include <linux/vmalloc.h> | 10 | #include <linux/vmalloc.h> |
11 | #include "internal.h" | ||
11 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
12 | #include <asm/pgalloc.h> | 13 | #include <asm/pgalloc.h> |
13 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
@@ -208,12 +209,12 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p | |||
208 | } | 209 | } |
209 | 210 | ||
210 | /* | 211 | /* |
211 | * We need this if we ever free the mem_maps. While not implemented yet, | 212 | * Decode mem_map from the coded memmap |
212 | * this function is included for parity with its sibling. | ||
213 | */ | 213 | */ |
214 | static __attribute((unused)) | ||
215 | struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) | 214 | struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) |
216 | { | 215 | { |
216 | /* mask off the extra low bits of information */ | ||
217 | coded_mem_map &= SECTION_MAP_MASK; | ||
217 | return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); | 218 | return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); |
218 | } | 219 | } |
219 | 220 | ||
@@ -232,7 +233,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, | |||
232 | return 1; | 233 | return 1; |
233 | } | 234 | } |
234 | 235 | ||
235 | static unsigned long usemap_size(void) | 236 | unsigned long usemap_size(void) |
236 | { | 237 | { |
237 | unsigned long size_bytes; | 238 | unsigned long size_bytes; |
238 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; | 239 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; |
@@ -249,11 +250,22 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
249 | 250 | ||
250 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 251 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) |
251 | { | 252 | { |
252 | unsigned long *usemap; | 253 | unsigned long *usemap, section_nr; |
253 | struct mem_section *ms = __nr_to_section(pnum); | 254 | struct mem_section *ms = __nr_to_section(pnum); |
254 | int nid = sparse_early_nid(ms); | 255 | int nid = sparse_early_nid(ms); |
256 | struct pglist_data *pgdat = NODE_DATA(nid); | ||
255 | 257 | ||
256 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 258 | /* |
259 | * A usemap's page can't be freed until the other sections | ||
260 | * that use it have been freed, and the same is true of the pgdat. | ||
261 | * If section A holds the pgdat and section B holds the usemap for | ||
262 | * other sections (including section A), neither section can be | ||
263 | * removed, because each depends on the other. | ||
264 | * To avoid that, collect all the usemaps on the same section | ||
265 | * that holds the pgdat. | ||
266 | */ | ||
267 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | ||
268 | usemap = alloc_bootmem_section(usemap_size(), section_nr); | ||
257 | if (usemap) | 269 | if (usemap) |
258 | return usemap; | 270 | return usemap; |
259 | 271 | ||
@@ -273,8 +285,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
273 | if (map) | 285 | if (map) |
274 | return map; | 286 | return map; |
275 | 287 | ||
276 | map = alloc_bootmem_node(NODE_DATA(nid), | 288 | map = alloc_bootmem_pages_node(NODE_DATA(nid), |
277 | sizeof(struct page) * PAGES_PER_SECTION); | 289 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); |
278 | return map; | 290 | return map; |
279 | } | 291 | } |
280 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 292 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
@@ -295,6 +307,9 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
295 | return NULL; | 307 | return NULL; |
296 | } | 308 | } |
297 | 309 | ||
310 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | ||
311 | { | ||
312 | } | ||
298 | /* | 313 | /* |
299 | * Allocate the accumulated non-linear sections, allocate a mem_map | 314 | * Allocate the accumulated non-linear sections, allocate a mem_map |
300 | * for each and record the physical to section mapping. | 315 | * for each and record the physical to section mapping. |
@@ -304,22 +319,50 @@ void __init sparse_init(void) | |||
304 | unsigned long pnum; | 319 | unsigned long pnum; |
305 | struct page *map; | 320 | struct page *map; |
306 | unsigned long *usemap; | 321 | unsigned long *usemap; |
322 | unsigned long **usemap_map; | ||
323 | int size; | ||
324 | |||
325 | /* | ||
326 | * The mem_map uses big pages (2M on 64-bit x86) while a | ||
327 | * usemap is far smaller (about 24 bytes), so allocating | ||
328 | * 2M (2M-aligned) and 24 bytes in turn pushes each following | ||
329 | * 2M block one more 2M further along; | ||
330 | * on a big system the memory then ends up full of holes... | ||
331 | * so here we try to allocate the 2M pages contiguously. | ||
332 | * | ||
333 | * powerpc needs to call sparse_init_one_section right after each | ||
334 | * sparse_early_mem_map_alloc, so allocate usemap_map first. | ||
335 | */ | ||
336 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; | ||
337 | usemap_map = alloc_bootmem(size); | ||
338 | if (!usemap_map) | ||
339 | panic("can not allocate usemap_map\n"); | ||
307 | 340 | ||
308 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 341 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
309 | if (!present_section_nr(pnum)) | 342 | if (!present_section_nr(pnum)) |
310 | continue; | 343 | continue; |
344 | usemap_map[pnum] = sparse_early_usemap_alloc(pnum); | ||
345 | } | ||
311 | 346 | ||
312 | map = sparse_early_mem_map_alloc(pnum); | 347 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
313 | if (!map) | 348 | if (!present_section_nr(pnum)) |
314 | continue; | 349 | continue; |
315 | 350 | ||
316 | usemap = sparse_early_usemap_alloc(pnum); | 351 | usemap = usemap_map[pnum]; |
317 | if (!usemap) | 352 | if (!usemap) |
318 | continue; | 353 | continue; |
319 | 354 | ||
355 | map = sparse_early_mem_map_alloc(pnum); | ||
356 | if (!map) | ||
357 | continue; | ||
358 | |||
320 | sparse_init_one_section(__nr_to_section(pnum), pnum, map, | 359 | sparse_init_one_section(__nr_to_section(pnum), pnum, map, |
321 | usemap); | 360 | usemap); |
322 | } | 361 | } |
362 | |||
363 | vmemmap_populate_print_last(); | ||
364 | |||
365 | free_bootmem(__pa(usemap_map), size); | ||
323 | } | 366 | } |
324 | 367 | ||
325 | #ifdef CONFIG_MEMORY_HOTPLUG | 368 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -334,6 +377,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
334 | { | 377 | { |
335 | return; /* XXX: Not implemented yet */ | 378 | return; /* XXX: Not implemented yet */ |
336 | } | 379 | } |
380 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | ||
381 | { | ||
382 | } | ||
337 | #else | 383 | #else |
338 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 384 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
339 | { | 385 | { |
@@ -371,8 +417,69 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
371 | free_pages((unsigned long)memmap, | 417 | free_pages((unsigned long)memmap, |
372 | get_order(sizeof(struct page) * nr_pages)); | 418 | get_order(sizeof(struct page) * nr_pages)); |
373 | } | 419 | } |
420 | |||
421 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | ||
422 | { | ||
423 | unsigned long maps_section_nr, removing_section_nr, i; | ||
424 | int magic; | ||
425 | |||
426 | for (i = 0; i < nr_pages; i++, page++) { | ||
427 | magic = atomic_read(&page->_mapcount); | ||
428 | |||
429 | BUG_ON(magic == NODE_INFO); | ||
430 | |||
431 | maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); | ||
432 | removing_section_nr = page->private; | ||
433 | |||
434 | /* | ||
435 | * When this function is called, the section being removed is | ||
436 | * in a logically offlined state, so all of its pages are | ||
437 | * isolated from the page allocator. If the removed section's | ||
438 | * memmap lives on that same section, it must not be freed: | ||
439 | * the page allocator could hand it out again, even though it | ||
440 | * is about to be removed physically. | ||
441 | */ | ||
442 | if (maps_section_nr != removing_section_nr) | ||
443 | put_page_bootmem(page); | ||
444 | } | ||
445 | } | ||
374 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 446 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
375 | 447 | ||
448 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) | ||
449 | { | ||
450 | struct page *usemap_page; | ||
451 | unsigned long nr_pages; | ||
452 | |||
453 | if (!usemap) | ||
454 | return; | ||
455 | |||
456 | usemap_page = virt_to_page(usemap); | ||
457 | /* | ||
458 | * Check to see if allocation came from hot-plug-add | ||
459 | */ | ||
460 | if (PageSlab(usemap_page)) { | ||
461 | kfree(usemap); | ||
462 | if (memmap) | ||
463 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | ||
464 | return; | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | * The usemap came from bootmem. It is packed with the other usemaps | ||
469 | * on the section that holds the pgdat at boot time; just keep it as is. | ||
470 | */ | ||
471 | |||
472 | if (memmap) { | ||
473 | struct page *memmap_page; | ||
474 | memmap_page = virt_to_page(memmap); | ||
475 | |||
476 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | ||
477 | >> PAGE_SHIFT; | ||
478 | |||
479 | free_map_bootmem(memmap_page, nr_pages); | ||
480 | } | ||
481 | } | ||
482 | |||
376 | /* | 483 | /* |
377 | * returns the number of sections whose mem_maps were properly | 484 | * returns the number of sections whose mem_maps were properly |
378 | * set. If this is <=0, then that means that the passed-in | 485 | * set. If this is <=0, then that means that the passed-in |
@@ -425,4 +532,20 @@ out: | |||
425 | } | 532 | } |
426 | return ret; | 533 | return ret; |
427 | } | 534 | } |
535 | |||
536 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | ||
537 | { | ||
538 | struct page *memmap = NULL; | ||
539 | unsigned long *usemap = NULL; | ||
540 | |||
541 | if (ms->section_mem_map) { | ||
542 | usemap = ms->pageblock_flags; | ||
543 | memmap = sparse_decode_mem_map(ms->section_mem_map, | ||
544 | __section_nr(ms)); | ||
545 | ms->section_mem_map = 0; | ||
546 | ms->pageblock_flags = NULL; | ||
547 | } | ||
548 | |||
549 | free_section_usemap(memmap, usemap); | ||
550 | } | ||
428 | #endif | 551 | #endif |
@@ -132,34 +132,21 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
132 | * Writeback is about to end against a page which has been marked for immediate | 132 | * Writeback is about to end against a page which has been marked for immediate |
133 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 133 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
134 | * inactive list. | 134 | * inactive list. |
135 | * | ||
136 | * Returns zero if it cleared PG_writeback. | ||
137 | */ | 135 | */ |
138 | int rotate_reclaimable_page(struct page *page) | 136 | void rotate_reclaimable_page(struct page *page) |
139 | { | 137 | { |
140 | struct pagevec *pvec; | 138 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && |
141 | unsigned long flags; | 139 | PageLRU(page)) { |
142 | 140 | struct pagevec *pvec; | |
143 | if (PageLocked(page)) | 141 | unsigned long flags; |
144 | return 1; | ||
145 | if (PageDirty(page)) | ||
146 | return 1; | ||
147 | if (PageActive(page)) | ||
148 | return 1; | ||
149 | if (!PageLRU(page)) | ||
150 | return 1; | ||
151 | |||
152 | page_cache_get(page); | ||
153 | local_irq_save(flags); | ||
154 | pvec = &__get_cpu_var(lru_rotate_pvecs); | ||
155 | if (!pagevec_add(pvec, page)) | ||
156 | pagevec_move_tail(pvec); | ||
157 | local_irq_restore(flags); | ||
158 | |||
159 | if (!test_clear_page_writeback(page)) | ||
160 | BUG(); | ||
161 | 142 | ||
162 | return 0; | 143 | page_cache_get(page); |
144 | local_irq_save(flags); | ||
145 | pvec = &__get_cpu_var(lru_rotate_pvecs); | ||
146 | if (!pagevec_add(pvec, page)) | ||
147 | pagevec_move_tail(pvec); | ||
148 | local_irq_restore(flags); | ||
149 | } | ||
163 | } | 150 | } |
164 | 151 | ||
165 | /* | 152 | /* |
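
rotate_reclaimable_page() above now just queues the page in the per-CPU lru_rotate_pvecs pagevec and lets pagevec_move_tail() handle a whole batch once the vector fills up. A minimal model of that fill-then-flush batching, with a fixed array of ints standing in for the pagevec (illustrative only):

#include <stdio.h>

#define BATCH   4

struct batch {
        int nr;
        int items[BATCH];
};

/* process everything queued so far in one go */
static void flush(struct batch *b)
{
        int i;

        printf("flushing %d item(s):", b->nr);
        for (i = 0; i < b->nr; i++)
                printf(" %d", b->items[i]);
        printf("\n");
        b->nr = 0;
}

/* queue one item; flush automatically when the batch is full */
static void add(struct batch *b, int item)
{
        b->items[b->nr++] = item;
        if (b->nr == BATCH)
                flush(b);
}

int main(void)
{
        struct batch b = { 0 };
        int i;

        for (i = 1; i <= 10; i++)
                add(&b, i);
        if (b.nr)
                flush(&b);      /* drain the partial final batch */
        return 0;
}
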
diff --git a/mm/swapfile.c b/mm/swapfile.c index 2da149cfc9ac..67051be7083a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1582,6 +1582,14 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1582 | error = -EINVAL; | 1582 | error = -EINVAL; |
1583 | goto bad_swap; | 1583 | goto bad_swap; |
1584 | case 2: | 1584 | case 2: |
1585 | /* swap partition endianness hack... | ||
1586 | if (swab32(swap_header->info.version) == 1) { | ||
1587 | swab32s(&swap_header->info.version); | ||
1588 | swab32s(&swap_header->info.last_page); | ||
1589 | swab32s(&swap_header->info.nr_badpages); | ||
1590 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1591 | swab32s(&swap_header->info.badpages[i]); | ||
1592 | } | ||
1585 | /* Check the swap header's sub-version and the size of | 1593 | /* Check the swap header's sub-version and the size of |
1586 | the swap file and bad block lists */ | 1594 | the swap file and bad block lists */ |
1587 | if (swap_header->info.version != 1) { | 1595 | if (swap_header->info.version != 1) { |
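
The new check above recognizes a swap header written with the opposite byte order: a version field that only reads as 1 after byte-swapping means every 32-bit header field needs swapping. A standalone illustration of that detection, using a local byte-swap helper rather than the kernel's swab32():

#include <stdint.h>
#include <stdio.h>

struct swap_info {
        uint32_t version;
        uint32_t last_page;
        uint32_t nr_badpages;
};

static uint32_t bswap32(uint32_t x)
{
        return (x >> 24) | ((x >> 8) & 0x0000ff00) |
               ((x << 8) & 0x00ff0000) | (x << 24);
}

/* if the header only makes sense byte-swapped, swap every field */
static void fix_endianness(struct swap_info *info)
{
        if (bswap32(info->version) == 1) {
                info->version = bswap32(info->version);
                info->last_page = bswap32(info->last_page);
                info->nr_badpages = bswap32(info->nr_badpages);
        }
}

int main(void)
{
        /* header as written by a big-endian machine, read on little-endian */
        struct swap_info info = {
                .version = bswap32(1),
                .last_page = bswap32(262144),
                .nr_badpages = bswap32(0),
        };

        fix_endianness(&info);
        printf("version=%u last_page=%u nr_badpages=%u\n",
               info.version, info.last_page, info.nr_badpages);
        return 0;
}
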
diff --git a/mm/truncate.c b/mm/truncate.c index 7d20ce41ecf5..b8961cb63414 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -391,6 +391,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
391 | pgoff_t next; | 391 | pgoff_t next; |
392 | int i; | 392 | int i; |
393 | int ret = 0; | 393 | int ret = 0; |
394 | int ret2 = 0; | ||
394 | int did_range_unmap = 0; | 395 | int did_range_unmap = 0; |
395 | int wrapped = 0; | 396 | int wrapped = 0; |
396 | 397 | ||
@@ -438,9 +439,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
438 | } | 439 | } |
439 | } | 440 | } |
440 | BUG_ON(page_mapped(page)); | 441 | BUG_ON(page_mapped(page)); |
441 | ret = do_launder_page(mapping, page); | 442 | ret2 = do_launder_page(mapping, page); |
442 | if (ret == 0 && !invalidate_complete_page2(mapping, page)) | 443 | if (ret2 == 0) { |
443 | ret = -EIO; | 444 | if (!invalidate_complete_page2(mapping, page)) |
445 | ret2 = -EIO; | ||
446 | } | ||
447 | if (ret2 < 0) | ||
448 | ret = ret2; | ||
444 | unlock_page(page); | 449 | unlock_page(page); |
445 | } | 450 | } |
446 | pagevec_release(&pvec); | 451 | pagevec_release(&pvec); |
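
The ret/ret2 split above ensures that a page which launders cleanly later in the loop no longer overwrites an error recorded for an earlier page: only negative per-page results are copied into the function's return value. A tiny model of that sticky-error pattern:

#include <stdio.h>

/* pretend per-item operation: fails (negative) for item 2 only */
static int process(int item)
{
        return (item == 2) ? -5 : 0;
}

int main(void)
{
        int ret = 0;    /* sticky: remembers that something failed */
        int ret2;       /* per-iteration result */
        int i;

        for (i = 0; i < 5; i++) {
                ret2 = process(i);
                if (ret2 < 0)
                        ret = ret2;     /* later successes don't clear it */
        }
        printf("final return value: %d\n", ret);    /* prints -5 */
        return 0;
}
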
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecf91f8034bf..e33e0ae69ad1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -14,8 +14,9 @@ | |||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/spinlock.h> | 15 | #include <linux/spinlock.h> |
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | 17 | #include <linux/seq_file.h> | |
18 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
19 | #include <linux/kallsyms.h> | ||
19 | 20 | ||
20 | #include <asm/uaccess.h> | 21 | #include <asm/uaccess.h> |
21 | #include <asm/tlbflush.h> | 22 | #include <asm/tlbflush.h> |
@@ -25,7 +26,7 @@ DEFINE_RWLOCK(vmlist_lock); | |||
25 | struct vm_struct *vmlist; | 26 | struct vm_struct *vmlist; |
26 | 27 | ||
27 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 28 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, |
28 | int node); | 29 | int node, void *caller); |
29 | 30 | ||
30 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 31 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
31 | { | 32 | { |
@@ -204,9 +205,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | |||
204 | } | 205 | } |
205 | EXPORT_SYMBOL(vmalloc_to_pfn); | 206 | EXPORT_SYMBOL(vmalloc_to_pfn); |
206 | 207 | ||
207 | static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, | 208 | static struct vm_struct * |
208 | unsigned long start, unsigned long end, | 209 | __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, |
209 | int node, gfp_t gfp_mask) | 210 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
210 | { | 211 | { |
211 | struct vm_struct **p, *tmp, *area; | 212 | struct vm_struct **p, *tmp, *area; |
212 | unsigned long align = 1; | 213 | unsigned long align = 1; |
@@ -269,6 +270,7 @@ found: | |||
269 | area->pages = NULL; | 270 | area->pages = NULL; |
270 | area->nr_pages = 0; | 271 | area->nr_pages = 0; |
271 | area->phys_addr = 0; | 272 | area->phys_addr = 0; |
273 | area->caller = caller; | ||
272 | write_unlock(&vmlist_lock); | 274 | write_unlock(&vmlist_lock); |
273 | 275 | ||
274 | return area; | 276 | return area; |
@@ -284,7 +286,8 @@ out: | |||
284 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 286 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
285 | unsigned long start, unsigned long end) | 287 | unsigned long start, unsigned long end) |
286 | { | 288 | { |
287 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); | 289 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, |
290 | __builtin_return_address(0)); | ||
288 | } | 291 | } |
289 | EXPORT_SYMBOL_GPL(__get_vm_area); | 292 | EXPORT_SYMBOL_GPL(__get_vm_area); |
290 | 293 | ||
@@ -299,14 +302,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area); | |||
299 | */ | 302 | */ |
300 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 303 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
301 | { | 304 | { |
302 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); | 305 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, |
306 | -1, GFP_KERNEL, __builtin_return_address(0)); | ||
307 | } | ||
308 | |||
309 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | ||
310 | void *caller) | ||
311 | { | ||
312 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | ||
313 | -1, GFP_KERNEL, caller); | ||
303 | } | 314 | } |
304 | 315 | ||
305 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | 316 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, |
306 | int node, gfp_t gfp_mask) | 317 | int node, gfp_t gfp_mask) |
307 | { | 318 | { |
308 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, | 319 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, |
309 | gfp_mask); | 320 | gfp_mask, __builtin_return_address(0)); |
310 | } | 321 | } |
311 | 322 | ||
312 | /* Caller must hold vmlist_lock */ | 323 | /* Caller must hold vmlist_lock */ |
@@ -455,9 +466,11 @@ void *vmap(struct page **pages, unsigned int count, | |||
455 | if (count > num_physpages) | 466 | if (count > num_physpages) |
456 | return NULL; | 467 | return NULL; |
457 | 468 | ||
458 | area = get_vm_area((count << PAGE_SHIFT), flags); | 469 | area = get_vm_area_caller((count << PAGE_SHIFT), flags, |
470 | __builtin_return_address(0)); | ||
459 | if (!area) | 471 | if (!area) |
460 | return NULL; | 472 | return NULL; |
473 | |||
461 | if (map_vm_area(area, prot, &pages)) { | 474 | if (map_vm_area(area, prot, &pages)) { |
462 | vunmap(area->addr); | 475 | vunmap(area->addr); |
463 | return NULL; | 476 | return NULL; |
@@ -468,7 +481,7 @@ void *vmap(struct page **pages, unsigned int count, | |||
468 | EXPORT_SYMBOL(vmap); | 481 | EXPORT_SYMBOL(vmap); |
469 | 482 | ||
470 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 483 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
471 | pgprot_t prot, int node) | 484 | pgprot_t prot, int node, void *caller) |
472 | { | 485 | { |
473 | struct page **pages; | 486 | struct page **pages; |
474 | unsigned int nr_pages, array_size, i; | 487 | unsigned int nr_pages, array_size, i; |
@@ -480,7 +493,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
480 | /* Please note that the recursion is strictly bounded. */ | 493 | /* Please note that the recursion is strictly bounded. */ |
481 | if (array_size > PAGE_SIZE) { | 494 | if (array_size > PAGE_SIZE) { |
482 | pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, | 495 | pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, |
483 | PAGE_KERNEL, node); | 496 | PAGE_KERNEL, node, caller); |
484 | area->flags |= VM_VPAGES; | 497 | area->flags |= VM_VPAGES; |
485 | } else { | 498 | } else { |
486 | pages = kmalloc_node(array_size, | 499 | pages = kmalloc_node(array_size, |
@@ -488,6 +501,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
488 | node); | 501 | node); |
489 | } | 502 | } |
490 | area->pages = pages; | 503 | area->pages = pages; |
504 | area->caller = caller; | ||
491 | if (!area->pages) { | 505 | if (!area->pages) { |
492 | remove_vm_area(area->addr); | 506 | remove_vm_area(area->addr); |
493 | kfree(area); | 507 | kfree(area); |
@@ -521,7 +535,8 @@ fail: | |||
521 | 535 | ||
522 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | 536 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) |
523 | { | 537 | { |
524 | return __vmalloc_area_node(area, gfp_mask, prot, -1); | 538 | return __vmalloc_area_node(area, gfp_mask, prot, -1, |
539 | __builtin_return_address(0)); | ||
525 | } | 540 | } |
526 | 541 | ||
527 | /** | 542 | /** |
@@ -536,7 +551,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
536 | * kernel virtual space, using a pagetable protection of @prot. | 551 | * kernel virtual space, using a pagetable protection of @prot. |
537 | */ | 552 | */ |
538 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 553 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, |
539 | int node) | 554 | int node, void *caller) |
540 | { | 555 | { |
541 | struct vm_struct *area; | 556 | struct vm_struct *area; |
542 | 557 | ||
@@ -544,16 +559,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
544 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 559 | if (!size || (size >> PAGE_SHIFT) > num_physpages) |
545 | return NULL; | 560 | return NULL; |
546 | 561 | ||
547 | area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); | 562 | area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, |
563 | node, gfp_mask, caller); | ||
564 | |||
548 | if (!area) | 565 | if (!area) |
549 | return NULL; | 566 | return NULL; |
550 | 567 | ||
551 | return __vmalloc_area_node(area, gfp_mask, prot, node); | 568 | return __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
552 | } | 569 | } |
553 | 570 | ||
554 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 571 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
555 | { | 572 | { |
556 | return __vmalloc_node(size, gfp_mask, prot, -1); | 573 | return __vmalloc_node(size, gfp_mask, prot, -1, |
574 | __builtin_return_address(0)); | ||
557 | } | 575 | } |
558 | EXPORT_SYMBOL(__vmalloc); | 576 | EXPORT_SYMBOL(__vmalloc); |
559 | 577 | ||
@@ -568,7 +586,8 @@ EXPORT_SYMBOL(__vmalloc); | |||
568 | */ | 586 | */ |
569 | void *vmalloc(unsigned long size) | 587 | void *vmalloc(unsigned long size) |
570 | { | 588 | { |
571 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 589 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
590 | -1, __builtin_return_address(0)); | ||
572 | } | 591 | } |
573 | EXPORT_SYMBOL(vmalloc); | 592 | EXPORT_SYMBOL(vmalloc); |
574 | 593 | ||
@@ -608,7 +627,8 @@ EXPORT_SYMBOL(vmalloc_user); | |||
608 | */ | 627 | */ |
609 | void *vmalloc_node(unsigned long size, int node) | 628 | void *vmalloc_node(unsigned long size, int node) |
610 | { | 629 | { |
611 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); | 630 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
631 | node, __builtin_return_address(0)); | ||
612 | } | 632 | } |
613 | EXPORT_SYMBOL(vmalloc_node); | 633 | EXPORT_SYMBOL(vmalloc_node); |
614 | 634 | ||
@@ -843,7 +863,8 @@ struct vm_struct *alloc_vm_area(size_t size) | |||
843 | { | 863 | { |
844 | struct vm_struct *area; | 864 | struct vm_struct *area; |
845 | 865 | ||
846 | area = get_vm_area(size, VM_IOREMAP); | 866 | area = get_vm_area_caller(size, VM_IOREMAP, |
867 | __builtin_return_address(0)); | ||
847 | if (area == NULL) | 868 | if (area == NULL) |
848 | return NULL; | 869 | return NULL; |
849 | 870 | ||
@@ -873,3 +894,85 @@ void free_vm_area(struct vm_struct *area) | |||
873 | kfree(area); | 894 | kfree(area); |
874 | } | 895 | } |
875 | EXPORT_SYMBOL_GPL(free_vm_area); | 896 | EXPORT_SYMBOL_GPL(free_vm_area); |
897 | |||
898 | |||
899 | #ifdef CONFIG_PROC_FS | ||
900 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
901 | { | ||
902 | loff_t n = *pos; | ||
903 | struct vm_struct *v; | ||
904 | |||
905 | read_lock(&vmlist_lock); | ||
906 | v = vmlist; | ||
907 | while (n > 0 && v) { | ||
908 | n--; | ||
909 | v = v->next; | ||
910 | } | ||
911 | if (!n) | ||
912 | return v; | ||
913 | |||
914 | return NULL; | ||
915 | |||
916 | } | ||
917 | |||
918 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
919 | { | ||
920 | struct vm_struct *v = p; | ||
921 | |||
922 | ++*pos; | ||
923 | return v->next; | ||
924 | } | ||
925 | |||
926 | static void s_stop(struct seq_file *m, void *p) | ||
927 | { | ||
928 | read_unlock(&vmlist_lock); | ||
929 | } | ||
930 | |||
931 | static int s_show(struct seq_file *m, void *p) | ||
932 | { | ||
933 | struct vm_struct *v = p; | ||
934 | |||
935 | seq_printf(m, "0x%p-0x%p %7ld", | ||
936 | v->addr, v->addr + v->size, v->size); | ||
937 | |||
938 | if (v->caller) { | ||
939 | char buff[2 * KSYM_NAME_LEN]; | ||
940 | |||
941 | seq_putc(m, ' '); | ||
942 | sprint_symbol(buff, (unsigned long)v->caller); | ||
943 | seq_puts(m, buff); | ||
944 | } | ||
945 | |||
946 | if (v->nr_pages) | ||
947 | seq_printf(m, " pages=%d", v->nr_pages); | ||
948 | |||
949 | if (v->phys_addr) | ||
950 | seq_printf(m, " phys=%lx", v->phys_addr); | ||
951 | |||
952 | if (v->flags & VM_IOREMAP) | ||
953 | seq_printf(m, " ioremap"); | ||
954 | |||
955 | if (v->flags & VM_ALLOC) | ||
956 | seq_printf(m, " vmalloc"); | ||
957 | |||
958 | if (v->flags & VM_MAP) | ||
959 | seq_printf(m, " vmap"); | ||
960 | |||
961 | if (v->flags & VM_USERMAP) | ||
962 | seq_printf(m, " user"); | ||
963 | |||
964 | if (v->flags & VM_VPAGES) | ||
965 | seq_printf(m, " vpages"); | ||
966 | |||
967 | seq_putc(m, '\n'); | ||
968 | return 0; | ||
969 | } | ||
970 | |||
971 | const struct seq_operations vmalloc_op = { | ||
972 | .start = s_start, | ||
973 | .next = s_next, | ||
974 | .stop = s_stop, | ||
975 | .show = s_show, | ||
976 | }; | ||
977 | #endif | ||
978 | |||
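The vmalloc.c hunks above do two related things: every public allocation path now records its call site via __builtin_return_address(0) in the new vm_struct caller field, and a CONFIG_PROC_FS seq_file interface (vmalloc_op) walks vmlist under vmlist_lock, printing each area's address range, size, caller symbol, page count, physical address and flags. The patch itself only defines the seq_operations table; the sketch below shows, purely as an illustration, how such a table is commonly wired into procfs. The entry name "vmallocinfo", the vmalloc_open() wrapper and the proc_create() call are assumptions for this example, not part of the hunks shown here.

	/*
	 * Illustrative glue only: expose a seq_operations table like
	 * vmalloc_op through procfs.  The names below ("vmallocinfo",
	 * vmalloc_open) are hypothetical; the real registration for this
	 * patch lives outside mm/vmalloc.c.
	 */
	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>

	static int vmalloc_open(struct inode *inode, struct file *file)
	{
		/* seq_read() will then drive vmalloc_op.start/next/show/stop */
		return seq_open(file, &vmalloc_op);
	}

	static const struct file_operations vmalloc_proc_fops = {
		.open		= vmalloc_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= seq_release,
	};

	static int __init vmalloc_proc_init(void)
	{
		proc_create("vmallocinfo", S_IRUSR, NULL, &vmalloc_proc_fops);
		return 0;
	}
	module_init(vmalloc_proc_init);

Given the format strings in s_show(), each output line would read something like "0xf8800000-0xf8802000    8192 some_driver_init+0x12/0x80 pages=1 vmalloc", which is what the caller tracking buys: allocations are attributed to the code that requested them rather than to the vmalloc internals.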
diff --git a/mm/vmscan.c b/mm/vmscan.c index f80a5b7c057f..eceac9f9032f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1246,17 +1246,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1246 | * If a zone is deemed to be full of pinned pages then just give it a light | 1246 | * If a zone is deemed to be full of pinned pages then just give it a light |
1247 | * scan then give up on it. | 1247 | * scan then give up on it. |
1248 | */ | 1248 | */ |
1249 | static unsigned long shrink_zones(int priority, struct zone **zones, | 1249 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, |
1250 | struct scan_control *sc) | 1250 | struct scan_control *sc) |
1251 | { | 1251 | { |
1252 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | ||
1252 | unsigned long nr_reclaimed = 0; | 1253 | unsigned long nr_reclaimed = 0; |
1253 | int i; | 1254 | struct zoneref *z; |
1254 | 1255 | struct zone *zone; | |
1255 | 1256 | ||
1256 | sc->all_unreclaimable = 1; | 1257 | sc->all_unreclaimable = 1; |
1257 | for (i = 0; zones[i] != NULL; i++) { | 1258 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1258 | struct zone *zone = zones[i]; | ||
1259 | |||
1260 | if (!populated_zone(zone)) | 1259 | if (!populated_zone(zone)) |
1261 | continue; | 1260 | continue; |
1262 | /* | 1261 | /* |
@@ -1301,8 +1300,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1301 | * holds filesystem locks which prevent writeout this might not work, and the | 1300 | * holds filesystem locks which prevent writeout this might not work, and the |
1302 | * allocation attempt will fail. | 1301 | * allocation attempt will fail. |
1303 | */ | 1302 | */ |
1304 | static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | 1303 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
1305 | struct scan_control *sc) | 1304 | struct scan_control *sc) |
1306 | { | 1305 | { |
1307 | int priority; | 1306 | int priority; |
1308 | int ret = 0; | 1307 | int ret = 0; |
@@ -1310,7 +1309,9 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | |||
1310 | unsigned long nr_reclaimed = 0; | 1309 | unsigned long nr_reclaimed = 0; |
1311 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1310 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1312 | unsigned long lru_pages = 0; | 1311 | unsigned long lru_pages = 0; |
1313 | int i; | 1312 | struct zoneref *z; |
1313 | struct zone *zone; | ||
1314 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | ||
1314 | 1315 | ||
1315 | if (scan_global_lru(sc)) | 1316 | if (scan_global_lru(sc)) |
1316 | count_vm_event(ALLOCSTALL); | 1317 | count_vm_event(ALLOCSTALL); |
@@ -1318,8 +1319,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | |||
1318 | * mem_cgroup will not do shrink_slab. | 1319 | * mem_cgroup will not do shrink_slab. |
1319 | */ | 1320 | */ |
1320 | if (scan_global_lru(sc)) { | 1321 | if (scan_global_lru(sc)) { |
1321 | for (i = 0; zones[i] != NULL; i++) { | 1322 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1322 | struct zone *zone = zones[i]; | ||
1323 | 1323 | ||
1324 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1324 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1325 | continue; | 1325 | continue; |
@@ -1333,13 +1333,13 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | |||
1333 | sc->nr_scanned = 0; | 1333 | sc->nr_scanned = 0; |
1334 | if (!priority) | 1334 | if (!priority) |
1335 | disable_swap_token(); | 1335 | disable_swap_token(); |
1336 | nr_reclaimed += shrink_zones(priority, zones, sc); | 1336 | nr_reclaimed += shrink_zones(priority, zonelist, sc); |
1337 | /* | 1337 | /* |
1338 | * Don't shrink slabs when reclaiming memory from | 1338 | * Don't shrink slabs when reclaiming memory from |
1339 | * over limit cgroups | 1339 | * over limit cgroups |
1340 | */ | 1340 | */ |
1341 | if (scan_global_lru(sc)) { | 1341 | if (scan_global_lru(sc)) { |
1342 | shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); | 1342 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); |
1343 | if (reclaim_state) { | 1343 | if (reclaim_state) { |
1344 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1344 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1345 | reclaim_state->reclaimed_slab = 0; | 1345 | reclaim_state->reclaimed_slab = 0; |
@@ -1383,8 +1383,7 @@ out: | |||
1383 | priority = 0; | 1383 | priority = 0; |
1384 | 1384 | ||
1385 | if (scan_global_lru(sc)) { | 1385 | if (scan_global_lru(sc)) { |
1386 | for (i = 0; zones[i] != NULL; i++) { | 1386 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1387 | struct zone *zone = zones[i]; | ||
1388 | 1387 | ||
1389 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1388 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1390 | continue; | 1389 | continue; |
@@ -1397,7 +1396,8 @@ out: | |||
1397 | return ret; | 1396 | return ret; |
1398 | } | 1397 | } |
1399 | 1398 | ||
1400 | unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | 1399 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
1400 | gfp_t gfp_mask) | ||
1401 | { | 1401 | { |
1402 | struct scan_control sc = { | 1402 | struct scan_control sc = { |
1403 | .gfp_mask = gfp_mask, | 1403 | .gfp_mask = gfp_mask, |
@@ -1410,7 +1410,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |||
1410 | .isolate_pages = isolate_pages_global, | 1410 | .isolate_pages = isolate_pages_global, |
1411 | }; | 1411 | }; |
1412 | 1412 | ||
1413 | return do_try_to_free_pages(zones, gfp_mask, &sc); | 1413 | return do_try_to_free_pages(zonelist, &sc); |
1414 | } | 1414 | } |
1415 | 1415 | ||
1416 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1416 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
@@ -1419,7 +1419,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1419 | gfp_t gfp_mask) | 1419 | gfp_t gfp_mask) |
1420 | { | 1420 | { |
1421 | struct scan_control sc = { | 1421 | struct scan_control sc = { |
1422 | .gfp_mask = gfp_mask, | ||
1423 | .may_writepage = !laptop_mode, | 1422 | .may_writepage = !laptop_mode, |
1424 | .may_swap = 1, | 1423 | .may_swap = 1, |
1425 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1424 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
@@ -1428,13 +1427,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1428 | .mem_cgroup = mem_cont, | 1427 | .mem_cgroup = mem_cont, |
1429 | .isolate_pages = mem_cgroup_isolate_pages, | 1428 | .isolate_pages = mem_cgroup_isolate_pages, |
1430 | }; | 1429 | }; |
1431 | struct zone **zones; | 1430 | struct zonelist *zonelist; |
1432 | int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); | ||
1433 | 1431 | ||
1434 | zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; | 1432 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
1435 | if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) | 1433 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
1436 | return 1; | 1434 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; |
1437 | return 0; | 1435 | return do_try_to_free_pages(zonelist, &sc); |
1438 | } | 1436 | } |
1439 | #endif | 1437 | #endif |
1440 | 1438 | ||
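The vmscan.c changes are part of the zonelist rework: callers now hand reclaim a struct zonelist instead of a NULL-terminated struct zone * array, and every walk uses for_each_zone_zonelist(), which skips zones above gfp_zone(sc->gfp_mask) via zoneref cursors. The memory-cgroup path likewise stops building its own zone array and instead sanitizes gfp_mask and reuses the node's zonelist. A minimal sketch of the new iteration pattern, with a hypothetical helper name, follows; only the for_each_zone_zonelist() usage mirrors the patch.

	/* Sketch of the iteration style these hunks adopt; the helper is made up. */
	static unsigned long count_reclaim_candidates(struct zonelist *zonelist,
							gfp_t gfp_mask)
	{
		enum zone_type high_zoneidx = gfp_zone(gfp_mask);
		struct zoneref *z;		/* cursor into the zonelist */
		struct zone *zone;
		unsigned long nr = 0;

		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			if (populated_zone(zone))
				nr++;		/* zone is eligible for scanning */
		}
		return nr;
	}

Filtering on high_zoneidx inside the iterator is what lets do_try_to_free_pages() drop its separate gfp_mask argument: the zone restriction now travels with sc->gfp_mask and the zonelist itself, as the shrink_slab(sc->nr_scanned, sc->gfp_mask, ...) hunk shows.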
diff --git a/mm/vmstat.c b/mm/vmstat.c index 7c7286e9506d..ec6035eda933 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -322,6 +322,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
322 | p->expire = 3; | 322 | p->expire = 3; |
323 | #endif | 323 | #endif |
324 | } | 324 | } |
325 | cond_resched(); | ||
325 | #ifdef CONFIG_NUMA | 326 | #ifdef CONFIG_NUMA |
326 | /* | 327 | /* |
327 | * Deal with draining the remote pageset of this | 328 | * Deal with draining the remote pageset of this |
@@ -364,13 +365,13 @@ void refresh_cpu_vm_stats(int cpu) | |||
364 | * | 365 | * |
365 | * Must be called with interrupts disabled. | 366 | * Must be called with interrupts disabled. |
366 | */ | 367 | */ |
367 | void zone_statistics(struct zonelist *zonelist, struct zone *z) | 368 | void zone_statistics(struct zone *preferred_zone, struct zone *z) |
368 | { | 369 | { |
369 | if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { | 370 | if (z->zone_pgdat == preferred_zone->zone_pgdat) { |
370 | __inc_zone_state(z, NUMA_HIT); | 371 | __inc_zone_state(z, NUMA_HIT); |
371 | } else { | 372 | } else { |
372 | __inc_zone_state(z, NUMA_MISS); | 373 | __inc_zone_state(z, NUMA_MISS); |
373 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); | 374 | __inc_zone_state(preferred_zone, NUMA_FOREIGN); |
374 | } | 375 | } |
375 | if (z->node == numa_node_id()) | 376 | if (z->node == numa_node_id()) |
376 | __inc_zone_state(z, NUMA_LOCAL); | 377 | __inc_zone_state(z, NUMA_LOCAL); |
@@ -645,6 +646,10 @@ static const char * const vmstat_text[] = { | |||
645 | "allocstall", | 646 | "allocstall", |
646 | 647 | ||
647 | "pgrotated", | 648 | "pgrotated", |
649 | #ifdef CONFIG_HUGETLB_PAGE | ||
650 | "htlb_buddy_alloc_success", | ||
651 | "htlb_buddy_alloc_fail", | ||
652 | #endif | ||
648 | #endif | 653 | #endif |
649 | }; | 654 | }; |
650 | 655 | ||
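The vmstat.c piece follows the same theme: zone_statistics() no longer peers at zonelist->zones[0] but is told the preferred zone directly, a cond_resched() is added to the per-cpu stats refresh loop, and two hugetlb buddy-allocation counters join vmstat_text under CONFIG_HUGETLB_PAGE. A hedged sketch of the new calling convention, using a hypothetical wrapper, is shown below; the real caller is the page allocator, which resolves its preferred zone when it picks the first candidate from the zonelist.

	/*
	 * Hypothetical caller sketch: whoever performed the allocation
	 * resolves the preferred zone once and passes both zones in;
	 * zone_statistics() then classifies the allocation by comparing
	 * pgdats.  Must be called with interrupts disabled.
	 */
	static void account_numa_page(struct zone *preferred_zone,
					struct zone *allocated_zone)
	{
	#ifdef CONFIG_NUMA
		zone_statistics(preferred_zone, allocated_zone);
	#endif
	}

NUMA_HIT and NUMA_MISS keep their old meaning (did the page come from the zone that was asked for first?); only the way the preferred zone reaches the statistics code changes.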