author    Ingo Molnar <mingo@elte.hu>    2011-02-14 05:55:18 -0500
committer Ingo Molnar <mingo@elte.hu>    2011-02-14 05:55:18 -0500
commit    d2137d5af4259f50c19addb8246a186c9ffac325 (patch)
tree      2f7e309f9cf8ef2f2698532c226edda38021fe69 /mm
parent    f005fe12b90c5b9fe180a09209a893e09affa8aa (diff)
parent    795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'linus' into x86/bootmem
Conflicts:
arch/x86/mm/numa_64.c
Merge reason: fix the conflict, update to latest -rc and pick up this
dependent fix from Yinghai:
e6d2e2b2b1e1: memblock: don't adjust size in memblock_find_base()
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   40
-rw-r--r--  mm/Makefile           |    3
-rw-r--r--  mm/compaction.c       |  186
-rw-r--r--  mm/dmapool.c          |   16
-rw-r--r--  mm/filemap.c          |   25
-rw-r--r--  mm/huge_memory.c      | 2354
-rw-r--r--  mm/hugetlb.c          |  114
-rw-r--r--  mm/internal.h         |    7
-rw-r--r--  mm/kmemleak-test.c    |    6
-rw-r--r--  mm/kmemleak.c         |   13
-rw-r--r--  mm/ksm.c              |   88
-rw-r--r--  mm/madvise.c          |   10
-rw-r--r--  mm/memblock.c         |   10
-rw-r--r--  mm/memcontrol.c       |  431
-rw-r--r--  mm/memory-failure.c   |  118
-rw-r--r--  mm/memory.c           |  360
-rw-r--r--  mm/memory_hotplug.c   |   52
-rw-r--r--  mm/mempolicy.c        |   26
-rw-r--r--  mm/migrate.c          |  134
-rw-r--r--  mm/mincore.c          |    7
-rw-r--r--  mm/mlock.c            |  170
-rw-r--r--  mm/mmap.c             |   33
-rw-r--r--  mm/mmu_notifier.c     |   20
-rw-r--r--  mm/mmzone.c           |   21
-rw-r--r--  mm/mprotect.c         |   20
-rw-r--r--  mm/mremap.c           |    9
-rw-r--r--  mm/nommu.c            |   35
-rw-r--r--  mm/page-writeback.c   |   11
-rw-r--r--  mm/page_alloc.c       |  216
-rw-r--r--  mm/pagewalk.c         |    6
-rw-r--r--  mm/percpu-vm.c        |    2
-rw-r--r--  mm/percpu.c           |   12
-rw-r--r--  mm/pgtable-generic.c  |  121
-rw-r--r--  mm/rmap.c             |   93
-rw-r--r--  mm/shmem.c            |    9
-rw-r--r--  mm/slab.c             |   82
-rw-r--r--  mm/slob.c             |    5
-rw-r--r--  mm/slub.c             |   89
-rw-r--r--  mm/sparse-vmemmap.c   |    2
-rw-r--r--  mm/sparse.c           |    4
-rw-r--r--  mm/swap.c             |  131
-rw-r--r--  mm/swap_state.c       |    6
-rw-r--r--  mm/swapfile.c         |    9
-rw-r--r--  mm/truncate.c         |   15
-rw-r--r--  mm/util.c             |   21
-rw-r--r--  mm/vmalloc.c          |  118
-rw-r--r--  mm/vmscan.c           |  435
-rw-r--r--  mm/vmstat.c           |  206
48 files changed, 4789 insertions, 1112 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a1189..e9c0c61f2dd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS | |||
179 | config COMPACTION | 179 | config COMPACTION |
180 | bool "Allow for memory compaction" | 180 | bool "Allow for memory compaction" |
181 | select MIGRATION | 181 | select MIGRATION |
182 | depends on EXPERIMENTAL && HUGETLB_PAGE && MMU | 182 | depends on MMU |
183 | help | 183 | help |
184 | Allows the compaction of memory for the allocation of huge pages. | 184 | Allows the compaction of memory for the allocation of huge pages. |
185 | 185 | ||
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS | |||
302 | 302 | ||
303 | See Documentation/nommu-mmap.txt for more information. | 303 | See Documentation/nommu-mmap.txt for more information. |
304 | 304 | ||
305 | config TRANSPARENT_HUGEPAGE | ||
306 | bool "Transparent Hugepage Support" | ||
307 | depends on X86 && MMU | ||
308 | select COMPACTION | ||
309 | help | ||
310 | Transparent Hugepages allows the kernel to use huge pages and | ||
311 | huge tlb transparently to the applications whenever possible. | ||
312 | This feature can improve computing performance to certain | ||
313 | applications by speeding up page faults during memory | ||
314 | allocation, by reducing the number of tlb misses and by speeding | ||
315 | up the pagetable walking. | ||
316 | |||
317 | If memory constrained on embedded, you may want to say N. | ||
318 | |||
319 | choice | ||
320 | prompt "Transparent Hugepage Support sysfs defaults" | ||
321 | depends on TRANSPARENT_HUGEPAGE | ||
322 | default TRANSPARENT_HUGEPAGE_ALWAYS | ||
323 | help | ||
324 | Selects the sysfs defaults for Transparent Hugepage Support. | ||
325 | |||
326 | config TRANSPARENT_HUGEPAGE_ALWAYS | ||
327 | bool "always" | ||
328 | help | ||
329 | Enabling Transparent Hugepage always, can increase the | ||
330 | memory footprint of applications without a guaranteed | ||
331 | benefit but it will work automatically for all applications. | ||
332 | |||
333 | config TRANSPARENT_HUGEPAGE_MADVISE | ||
334 | bool "madvise" | ||
335 | help | ||
336 | Enabling Transparent Hugepage madvise, will only provide a | ||
337 | performance improvement benefit to the applications using | ||
338 | madvise(MADV_HUGEPAGE) but it won't risk to increase the | ||
339 | memory footprint of applications without a guaranteed | ||
340 | benefit. | ||
341 | endchoice | ||
342 | |||
305 | # | 343 | # |
306 | # UP and nommu archs use km based percpu allocator | 344 | # UP and nommu archs use km based percpu allocator |
307 | # | 345 | # |
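Side note on the madvise default above: an application opts a mapping into Transparent Hugepage support by calling madvise(MADV_HUGEPAGE) on it. A minimal userspace sketch, not part of this patch (the 16MB size is an arbitrary example, and older <sys/mman.h> headers may not yet define MADV_HUGEPAGE):

#include <stdio.h>
#include <sys/mman.h>

#define SZ (16UL << 20)	/* 16MB: room for several 2MB huge pages */

int main(void)
{
	void *buf = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Ask the kernel to back this range with transparent huge pages. */
	if (madvise(buf, SZ, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	return 0;
}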
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f8..2b1b575ae71 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o \ | 11 | maccess.o page_alloc.o page-writeback.o \ |
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | |||
37 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 37 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
38 | obj-$(CONFIG_MIGRATION) += migrate.o | 38 | obj-$(CONFIG_MIGRATION) += migrate.o |
39 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 39 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
40 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | ||
40 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 41 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
41 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 42 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
42 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 43 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee5901..8be430b812d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@ | |||
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include "internal.h" | 17 | #include "internal.h" |
18 | 18 | ||
19 | #define CREATE_TRACE_POINTS | ||
20 | #include <trace/events/compaction.h> | ||
21 | |||
19 | /* | 22 | /* |
20 | * compact_control is used to track pages being migrated and the free pages | 23 | * compact_control is used to track pages being migrated and the free pages |
21 | * they are being migrated to during memory compaction. The free_pfn starts | 24 | * they are being migrated to during memory compaction. The free_pfn starts |
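The CREATE_TRACE_POINTS definition above instantiates the new compaction trace events. For orientation, an event of this kind is declared roughly as below; this is a simplified sketch, the real definitions live in include/trace/events/compaction.h and the field names here are illustrative.

#include <linux/tracepoint.h>

TRACE_EVENT(mm_compaction_isolate_freepages,

	TP_PROTO(unsigned long nr_scanned, unsigned long nr_taken),

	TP_ARGS(nr_scanned, nr_taken),

	TP_STRUCT__entry(
		__field(unsigned long, nr_scanned)
		__field(unsigned long, nr_taken)
	),

	TP_fast_assign(
		__entry->nr_scanned = nr_scanned;
		__entry->nr_taken = nr_taken;
	),

	TP_printk("nr_scanned=%lu nr_taken=%lu",
		  __entry->nr_scanned, __entry->nr_taken)
);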
@@ -30,6 +33,7 @@ struct compact_control { | |||
30 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 33 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
31 | unsigned long free_pfn; /* isolate_freepages search base */ | 34 | unsigned long free_pfn; /* isolate_freepages search base */ |
32 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | ||
33 | 37 | ||
34 | /* Account for isolated anon and file pages */ | 38 | /* Account for isolated anon and file pages */ |
35 | unsigned long nr_anon; | 39 | unsigned long nr_anon; |
@@ -38,6 +42,8 @@ struct compact_control { | |||
38 | unsigned int order; /* order a direct compactor needs */ | 42 | unsigned int order; /* order a direct compactor needs */ |
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
40 | struct zone *zone; | 44 | struct zone *zone; |
45 | |||
46 | int compact_mode; | ||
41 | }; | 47 | }; |
42 | 48 | ||
43 | static unsigned long release_freepages(struct list_head *freelist) | 49 | static unsigned long release_freepages(struct list_head *freelist) |
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
60 | struct list_head *freelist) | 66 | struct list_head *freelist) |
61 | { | 67 | { |
62 | unsigned long zone_end_pfn, end_pfn; | 68 | unsigned long zone_end_pfn, end_pfn; |
63 | int total_isolated = 0; | 69 | int nr_scanned = 0, total_isolated = 0; |
64 | struct page *cursor; | 70 | struct page *cursor; |
65 | 71 | ||
66 | /* Get the last PFN we should scan for free pages at */ | 72 | /* Get the last PFN we should scan for free pages at */ |
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
81 | 87 | ||
82 | if (!pfn_valid_within(blockpfn)) | 88 | if (!pfn_valid_within(blockpfn)) |
83 | continue; | 89 | continue; |
90 | nr_scanned++; | ||
84 | 91 | ||
85 | if (!PageBuddy(page)) | 92 | if (!PageBuddy(page)) |
86 | continue; | 93 | continue; |
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
100 | } | 107 | } |
101 | } | 108 | } |
102 | 109 | ||
110 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
103 | return total_isolated; | 111 | return total_isolated; |
104 | } | 112 | } |
105 | 113 | ||
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
234 | struct compact_control *cc) | 242 | struct compact_control *cc) |
235 | { | 243 | { |
236 | unsigned long low_pfn, end_pfn; | 244 | unsigned long low_pfn, end_pfn; |
245 | unsigned long last_pageblock_nr = 0, pageblock_nr; | ||
246 | unsigned long nr_scanned = 0, nr_isolated = 0; | ||
237 | struct list_head *migratelist = &cc->migratepages; | 247 | struct list_head *migratelist = &cc->migratepages; |
238 | 248 | ||
239 | /* Do not scan outside zone boundaries */ | 249 | /* Do not scan outside zone boundaries */ |
@@ -266,21 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
266 | struct page *page; | 276 | struct page *page; |
267 | if (!pfn_valid_within(low_pfn)) | 277 | if (!pfn_valid_within(low_pfn)) |
268 | continue; | 278 | continue; |
279 | nr_scanned++; | ||
269 | 280 | ||
270 | /* Get the page and skip if free */ | 281 | /* Get the page and skip if free */ |
271 | page = pfn_to_page(low_pfn); | 282 | page = pfn_to_page(low_pfn); |
272 | if (PageBuddy(page)) | 283 | if (PageBuddy(page)) |
273 | continue; | 284 | continue; |
274 | 285 | ||
286 | /* | ||
287 | * For async migration, also only scan in MOVABLE blocks. Async | ||
288 | * migration is optimistic to see if the minimum amount of work | ||
289 | * satisfies the allocation | ||
290 | */ | ||
291 | pageblock_nr = low_pfn >> pageblock_order; | ||
292 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | ||
293 | get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { | ||
294 | low_pfn += pageblock_nr_pages; | ||
295 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
296 | last_pageblock_nr = pageblock_nr; | ||
297 | continue; | ||
298 | } | ||
299 | |||
300 | if (!PageLRU(page)) | ||
301 | continue; | ||
302 | |||
303 | /* | ||
304 | * PageLRU is set, and lru_lock excludes isolation, | ||
305 | * splitting and collapsing (collapsing has already | ||
306 | * happened if PageLRU is set). | ||
307 | */ | ||
308 | if (PageTransHuge(page)) { | ||
309 | low_pfn += (1 << compound_order(page)) - 1; | ||
310 | continue; | ||
311 | } | ||
312 | |||
275 | /* Try isolate the page */ | 313 | /* Try isolate the page */ |
276 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | 314 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) |
277 | continue; | 315 | continue; |
278 | 316 | ||
317 | VM_BUG_ON(PageTransCompound(page)); | ||
318 | |||
279 | /* Successfully isolated */ | 319 | /* Successfully isolated */ |
280 | del_page_from_lru_list(zone, page, page_lru(page)); | 320 | del_page_from_lru_list(zone, page, page_lru(page)); |
281 | list_add(&page->lru, migratelist); | 321 | list_add(&page->lru, migratelist); |
282 | mem_cgroup_del_lru(page); | ||
283 | cc->nr_migratepages++; | 322 | cc->nr_migratepages++; |
323 | nr_isolated++; | ||
284 | 324 | ||
285 | /* Avoid isolating too much */ | 325 | /* Avoid isolating too much */ |
286 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) | 326 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) |
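The two-line pfn adjustment in the hunk above (add a pageblock, round up, subtract one) is written so that the for-loop's low_pfn++ resumes the scan on a pageblock-aligned pfn, at least one whole block further on. A standalone sketch of the arithmetic, assuming pageblock_nr_pages is 512 (2MB pageblocks with 4KB pages):

#define EXAMPLE_ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define EXAMPLE_PAGEBLOCK	512UL	/* assumed pageblock_nr_pages */

/* For a block-aligned pfn such as 1024 this returns 1535; the caller's
 * pfn++ then continues at 1536, the first pfn of the next pageblock. */
static unsigned long example_skip_pageblock(unsigned long pfn)
{
	pfn += EXAMPLE_PAGEBLOCK;				/* 1024 -> 1536 */
	return EXAMPLE_ALIGN(pfn, EXAMPLE_PAGEBLOCK) - 1;	/* 1536 -> 1535 */
}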
@@ -292,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
292 | spin_unlock_irq(&zone->lru_lock); | 332 | spin_unlock_irq(&zone->lru_lock); |
293 | cc->migrate_pfn = low_pfn; | 333 | cc->migrate_pfn = low_pfn; |
294 | 334 | ||
335 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | ||
336 | |||
295 | return cc->nr_migratepages; | 337 | return cc->nr_migratepages; |
296 | } | 338 | } |
297 | 339 | ||
@@ -342,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) | |||
342 | } | 384 | } |
343 | 385 | ||
344 | static int compact_finished(struct zone *zone, | 386 | static int compact_finished(struct zone *zone, |
345 | struct compact_control *cc) | 387 | struct compact_control *cc) |
346 | { | 388 | { |
347 | unsigned int order; | 389 | unsigned int order; |
348 | unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); | 390 | unsigned long watermark; |
349 | 391 | ||
350 | if (fatal_signal_pending(current)) | 392 | if (fatal_signal_pending(current)) |
351 | return COMPACT_PARTIAL; | 393 | return COMPACT_PARTIAL; |
@@ -355,12 +397,31 @@ static int compact_finished(struct zone *zone, | |||
355 | return COMPACT_COMPLETE; | 397 | return COMPACT_COMPLETE; |
356 | 398 | ||
357 | /* Compaction run is not finished if the watermark is not met */ | 399 | /* Compaction run is not finished if the watermark is not met */ |
400 | if (cc->compact_mode != COMPACT_MODE_KSWAPD) | ||
401 | watermark = low_wmark_pages(zone); | ||
402 | else | ||
403 | watermark = high_wmark_pages(zone); | ||
404 | watermark += (1 << cc->order); | ||
405 | |||
358 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 406 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) |
359 | return COMPACT_CONTINUE; | 407 | return COMPACT_CONTINUE; |
360 | 408 | ||
409 | /* | ||
410 | * order == -1 is expected when compacting via | ||
411 | * /proc/sys/vm/compact_memory | ||
412 | */ | ||
361 | if (cc->order == -1) | 413 | if (cc->order == -1) |
362 | return COMPACT_CONTINUE; | 414 | return COMPACT_CONTINUE; |
363 | 415 | ||
416 | /* | ||
417 | * Generating only one page of the right order is not enough | ||
418 | * for kswapd, we must continue until we're above the high | ||
419 | * watermark as a pool for high order GFP_ATOMIC allocations | ||
420 | * too. | ||
421 | */ | ||
422 | if (cc->compact_mode == COMPACT_MODE_KSWAPD) | ||
423 | return COMPACT_CONTINUE; | ||
424 | |||
364 | /* Direct compactor: Is a suitable page free? */ | 425 | /* Direct compactor: Is a suitable page free? */ |
365 | for (order = cc->order; order < MAX_ORDER; order++) { | 426 | for (order = cc->order; order < MAX_ORDER; order++) { |
366 | /* Job done if page is free of the right migratetype */ | 427 | /* Job done if page is free of the right migratetype */ |
@@ -375,10 +436,69 @@ static int compact_finished(struct zone *zone, | |||
375 | return COMPACT_CONTINUE; | 436 | return COMPACT_CONTINUE; |
376 | } | 437 | } |
377 | 438 | ||
439 | /* | ||
440 | * compaction_suitable: Is this suitable to run compaction on this zone now? | ||
441 | * Returns | ||
442 | * COMPACT_SKIPPED - If there are too few free pages for compaction | ||
443 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | ||
444 | * COMPACT_CONTINUE - If compaction should run now | ||
445 | */ | ||
446 | unsigned long compaction_suitable(struct zone *zone, int order) | ||
447 | { | ||
448 | int fragindex; | ||
449 | unsigned long watermark; | ||
450 | |||
451 | /* | ||
452 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | ||
453 | * This is because during migration, copies of pages need to be | ||
454 | * allocated and for a short time, the footprint is higher | ||
455 | */ | ||
456 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
457 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
458 | return COMPACT_SKIPPED; | ||
459 | |||
460 | /* | ||
461 | * order == -1 is expected when compacting via | ||
462 | * /proc/sys/vm/compact_memory | ||
463 | */ | ||
464 | if (order == -1) | ||
465 | return COMPACT_CONTINUE; | ||
466 | |||
467 | /* | ||
468 | * fragmentation index determines if allocation failures are due to | ||
469 | * low memory or external fragmentation | ||
470 | * | ||
471 | * index of -1 implies allocations might succeed depending on watermarks | ||
472 | * index towards 0 implies failure is due to lack of memory | ||
473 | * index towards 1000 implies failure is due to fragmentation | ||
474 | * | ||
475 | * Only compact if a failure would be due to fragmentation. | ||
476 | */ | ||
477 | fragindex = fragmentation_index(zone, order); | ||
478 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
479 | return COMPACT_SKIPPED; | ||
480 | |||
481 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) | ||
482 | return COMPACT_PARTIAL; | ||
483 | |||
484 | return COMPACT_CONTINUE; | ||
485 | } | ||
486 | |||
378 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 487 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
379 | { | 488 | { |
380 | int ret; | 489 | int ret; |
381 | 490 | ||
491 | ret = compaction_suitable(zone, cc->order); | ||
492 | switch (ret) { | ||
493 | case COMPACT_PARTIAL: | ||
494 | case COMPACT_SKIPPED: | ||
495 | /* Compaction is likely to fail */ | ||
496 | return ret; | ||
497 | case COMPACT_CONTINUE: | ||
498 | /* Fall through to compaction */ | ||
499 | ; | ||
500 | } | ||
501 | |||
382 | /* Setup to move all movable pages to the end of the zone */ | 502 | /* Setup to move all movable pages to the end of the zone */ |
383 | cc->migrate_pfn = zone->zone_start_pfn; | 503 | cc->migrate_pfn = zone->zone_start_pfn; |
384 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 504 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; |
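compaction_suitable() is not static, so callers outside compact_zone() can ask the same "is compaction worth it for this order" question. A minimal sketch of such a caller; the function name is made up for illustration (the real external consumers are the reclaim changes elsewhere in this merge):

/* Illustrative only: should the caller compact rather than keep reclaiming? */
static bool example_prefer_compaction(struct zone *zone, int order)
{
	switch (compaction_suitable(zone, order)) {
	case COMPACT_PARTIAL:
		return true;	/* allocation should already succeed */
	case COMPACT_CONTINUE:
		return true;	/* enough order-0 pages free; compaction helps now */
	case COMPACT_SKIPPED:
	default:
		return false;	/* too few free pages; reclaim first */
	}
}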
@@ -394,7 +514,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
394 | 514 | ||
395 | nr_migrate = cc->nr_migratepages; | 515 | nr_migrate = cc->nr_migratepages; |
396 | migrate_pages(&cc->migratepages, compaction_alloc, | 516 | migrate_pages(&cc->migratepages, compaction_alloc, |
397 | (unsigned long)cc, 0); | 517 | (unsigned long)cc, false, |
518 | cc->sync); | ||
398 | update_nr_listpages(cc); | 519 | update_nr_listpages(cc); |
399 | nr_remaining = cc->nr_migratepages; | 520 | nr_remaining = cc->nr_migratepages; |
400 | 521 | ||
@@ -402,6 +523,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
402 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | 523 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); |
403 | if (nr_remaining) | 524 | if (nr_remaining) |
404 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | 525 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); |
526 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | ||
527 | nr_remaining); | ||
405 | 528 | ||
406 | /* Release LRU pages not migrated */ | 529 | /* Release LRU pages not migrated */ |
407 | if (!list_empty(&cc->migratepages)) { | 530 | if (!list_empty(&cc->migratepages)) { |
@@ -418,8 +541,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
418 | return ret; | 541 | return ret; |
419 | } | 542 | } |
420 | 543 | ||
421 | static unsigned long compact_zone_order(struct zone *zone, | 544 | unsigned long compact_zone_order(struct zone *zone, |
422 | int order, gfp_t gfp_mask) | 545 | int order, gfp_t gfp_mask, |
546 | bool sync, | ||
547 | int compact_mode) | ||
423 | { | 548 | { |
424 | struct compact_control cc = { | 549 | struct compact_control cc = { |
425 | .nr_freepages = 0, | 550 | .nr_freepages = 0, |
@@ -427,6 +552,8 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
427 | .order = order, | 552 | .order = order, |
428 | .migratetype = allocflags_to_migratetype(gfp_mask), | 553 | .migratetype = allocflags_to_migratetype(gfp_mask), |
429 | .zone = zone, | 554 | .zone = zone, |
555 | .sync = sync, | ||
556 | .compact_mode = compact_mode, | ||
430 | }; | 557 | }; |
431 | INIT_LIST_HEAD(&cc.freepages); | 558 | INIT_LIST_HEAD(&cc.freepages); |
432 | INIT_LIST_HEAD(&cc.migratepages); | 559 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -442,16 +569,17 @@ int sysctl_extfrag_threshold = 500; | |||
442 | * @order: The order of the current allocation | 569 | * @order: The order of the current allocation |
443 | * @gfp_mask: The GFP mask of the current allocation | 570 | * @gfp_mask: The GFP mask of the current allocation |
444 | * @nodemask: The allowed nodes to allocate from | 571 | * @nodemask: The allowed nodes to allocate from |
572 | * @sync: Whether migration is synchronous or not | ||
445 | * | 573 | * |
446 | * This is the main entry point for direct page compaction. | 574 | * This is the main entry point for direct page compaction. |
447 | */ | 575 | */ |
448 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 576 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
449 | int order, gfp_t gfp_mask, nodemask_t *nodemask) | 577 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
578 | bool sync) | ||
450 | { | 579 | { |
451 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 580 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
452 | int may_enter_fs = gfp_mask & __GFP_FS; | 581 | int may_enter_fs = gfp_mask & __GFP_FS; |
453 | int may_perform_io = gfp_mask & __GFP_IO; | 582 | int may_perform_io = gfp_mask & __GFP_IO; |
454 | unsigned long watermark; | ||
455 | struct zoneref *z; | 583 | struct zoneref *z; |
456 | struct zone *zone; | 584 | struct zone *zone; |
457 | int rc = COMPACT_SKIPPED; | 585 | int rc = COMPACT_SKIPPED; |
@@ -461,7 +589,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
461 | * made because an assumption is made that the page allocator can satisfy | 589 | * made because an assumption is made that the page allocator can satisfy |
462 | * the "cheaper" orders without taking special steps | 590 | * the "cheaper" orders without taking special steps |
463 | */ | 591 | */ |
464 | if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) | 592 | if (!order || !may_enter_fs || !may_perform_io) |
465 | return rc; | 593 | return rc; |
466 | 594 | ||
467 | count_vm_event(COMPACTSTALL); | 595 | count_vm_event(COMPACTSTALL); |
@@ -469,43 +597,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
469 | /* Compact each zone in the list */ | 597 | /* Compact each zone in the list */ |
470 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 598 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
471 | nodemask) { | 599 | nodemask) { |
472 | int fragindex; | ||
473 | int status; | 600 | int status; |
474 | 601 | ||
475 | /* | 602 | status = compact_zone_order(zone, order, gfp_mask, sync, |
476 | * Watermarks for order-0 must be met for compaction. Note | 603 | COMPACT_MODE_DIRECT_RECLAIM); |
477 | * the 2UL. This is because during migration, copies of | ||
478 | * pages need to be allocated and for a short time, the | ||
479 | * footprint is higher | ||
480 | */ | ||
481 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
482 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
483 | continue; | ||
484 | |||
485 | /* | ||
486 | * fragmentation index determines if allocation failures are | ||
487 | * due to low memory or external fragmentation | ||
488 | * | ||
489 | * index of -1 implies allocations might succeed depending | ||
490 | * on watermarks | ||
491 | * index towards 0 implies failure is due to lack of memory | ||
492 | * index towards 1000 implies failure is due to fragmentation | ||
493 | * | ||
494 | * Only compact if a failure would be due to fragmentation. | ||
495 | */ | ||
496 | fragindex = fragmentation_index(zone, order); | ||
497 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
498 | continue; | ||
499 | |||
500 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { | ||
501 | rc = COMPACT_PARTIAL; | ||
502 | break; | ||
503 | } | ||
504 | |||
505 | status = compact_zone_order(zone, order, gfp_mask); | ||
506 | rc = max(status, rc); | 604 | rc = max(status, rc); |
507 | 605 | ||
508 | if (zone_watermark_ok(zone, order, watermark, 0, 0)) | 606 | /* If a normal allocation would succeed, stop compacting */ |
607 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
509 | break; | 608 | break; |
510 | } | 609 | } |
511 | 610 | ||
@@ -532,6 +631,7 @@ static int compact_node(int nid) | |||
532 | .nr_freepages = 0, | 631 | .nr_freepages = 0, |
533 | .nr_migratepages = 0, | 632 | .nr_migratepages = 0, |
534 | .order = -1, | 633 | .order = -1, |
634 | .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, | ||
535 | }; | 635 | }; |
536 | 636 | ||
537 | zone = &pgdat->node_zones[zoneid]; | 637 | zone = &pgdat->node_zones[zoneid]; |
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e06..03bf3bb4519 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
324 | if (mem_flags & __GFP_WAIT) { | 324 | if (mem_flags & __GFP_WAIT) { |
325 | DECLARE_WAITQUEUE(wait, current); | 325 | DECLARE_WAITQUEUE(wait, current); |
326 | 326 | ||
327 | __set_current_state(TASK_INTERRUPTIBLE); | 327 | __set_current_state(TASK_UNINTERRUPTIBLE); |
328 | __add_wait_queue(&pool->waitq, &wait); | 328 | __add_wait_queue(&pool->waitq, &wait); |
329 | spin_unlock_irqrestore(&pool->lock, flags); | 329 | spin_unlock_irqrestore(&pool->lock, flags); |
330 | 330 | ||
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); | |||
355 | 355 | ||
356 | static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) | 356 | static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) |
357 | { | 357 | { |
358 | unsigned long flags; | ||
359 | struct dma_page *page; | 358 | struct dma_page *page; |
360 | 359 | ||
361 | spin_lock_irqsave(&pool->lock, flags); | ||
362 | list_for_each_entry(page, &pool->page_list, page_list) { | 360 | list_for_each_entry(page, &pool->page_list, page_list) { |
363 | if (dma < page->dma) | 361 | if (dma < page->dma) |
364 | continue; | 362 | continue; |
365 | if (dma < (page->dma + pool->allocation)) | 363 | if (dma < (page->dma + pool->allocation)) |
366 | goto done; | 364 | return page; |
367 | } | 365 | } |
368 | page = NULL; | 366 | return NULL; |
369 | done: | ||
370 | spin_unlock_irqrestore(&pool->lock, flags); | ||
371 | return page; | ||
372 | } | 367 | } |
373 | 368 | ||
374 | /** | 369 | /** |
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
386 | unsigned long flags; | 381 | unsigned long flags; |
387 | unsigned int offset; | 382 | unsigned int offset; |
388 | 383 | ||
384 | spin_lock_irqsave(&pool->lock, flags); | ||
389 | page = pool_find_page(pool, dma); | 385 | page = pool_find_page(pool, dma); |
390 | if (!page) { | 386 | if (!page) { |
387 | spin_unlock_irqrestore(&pool->lock, flags); | ||
391 | if (pool->dev) | 388 | if (pool->dev) |
392 | dev_err(pool->dev, | 389 | dev_err(pool->dev, |
393 | "dma_pool_free %s, %p/%lx (bad dma)\n", | 390 | "dma_pool_free %s, %p/%lx (bad dma)\n", |
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
401 | offset = vaddr - page->vaddr; | 398 | offset = vaddr - page->vaddr; |
402 | #ifdef DMAPOOL_DEBUG | 399 | #ifdef DMAPOOL_DEBUG |
403 | if ((dma - page->dma) != offset) { | 400 | if ((dma - page->dma) != offset) { |
401 | spin_unlock_irqrestore(&pool->lock, flags); | ||
404 | if (pool->dev) | 402 | if (pool->dev) |
405 | dev_err(pool->dev, | 403 | dev_err(pool->dev, |
406 | "dma_pool_free %s, %p (bad vaddr)/%Lx\n", | 404 | "dma_pool_free %s, %p (bad vaddr)/%Lx\n", |
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
418 | chain = *(int *)(page->vaddr + chain); | 416 | chain = *(int *)(page->vaddr + chain); |
419 | continue; | 417 | continue; |
420 | } | 418 | } |
419 | spin_unlock_irqrestore(&pool->lock, flags); | ||
421 | if (pool->dev) | 420 | if (pool->dev) |
422 | dev_err(pool->dev, "dma_pool_free %s, dma %Lx " | 421 | dev_err(pool->dev, "dma_pool_free %s, dma %Lx " |
423 | "already free\n", pool->name, | 422 | "already free\n", pool->name, |
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
432 | memset(vaddr, POOL_POISON_FREED, pool->size); | 431 | memset(vaddr, POOL_POISON_FREED, pool->size); |
433 | #endif | 432 | #endif |
434 | 433 | ||
435 | spin_lock_irqsave(&pool->lock, flags); | ||
436 | page->in_use--; | 434 | page->in_use--; |
437 | *(int *)vaddr = page->offset; | 435 | *(int *)vaddr = page->offset; |
438 | page->offset = offset; | 436 | page->offset = offset; |
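The dmapool hunks above move the pool->lock acquisition out of pool_find_page() and into dma_pool_free(), so the lookup and the subsequent bookkeeping happen under one lock hold, with every error path dropping the lock itself. The resulting "caller holds the lock" convention, sketched in isolation (names are illustrative, not the dmapool code):

#include <linux/spinlock.h>
#include <linux/list.h>

struct example_pool {
	spinlock_t lock;
	struct list_head pages;
};

/* Must be called with pool->lock held, so the result stays valid while
 * the caller keeps working on it. */
static struct list_head *example_find_locked(struct example_pool *pool)
{
	return list_empty(&pool->pages) ? NULL : pool->pages.next;
}

static void example_free(struct example_pool *pool)
{
	unsigned long flags;
	struct list_head *entry;

	spin_lock_irqsave(&pool->lock, flags);
	entry = example_find_locked(pool);
	if (!entry) {
		spin_unlock_irqrestore(&pool->lock, flags);
		return;		/* every error path drops the lock itself */
	}
	/* ... operate on 'entry' while still holding the lock ... */
	spin_unlock_irqrestore(&pool->lock, flags);
}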
diff --git a/mm/filemap.c b/mm/filemap.c
index ea89840fc65..83a45d35468 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@ | |||
102 | * ->inode_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode_lock (zap_pte_range->set_page_dirty) |
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 104 | * |
105 | * ->task->proc_lock | ||
106 | * ->dcache_lock (proc_pid_lookup) | ||
107 | * | ||
108 | * (code doesn't rely on that order, so you could switch it around) | 105 | * (code doesn't rely on that order, so you could switch it around) |
109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 106 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
110 | * ->i_mmap_lock | 107 | * ->i_mmap_lock |
@@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page) | |||
143 | void remove_from_page_cache(struct page *page) | 140 | void remove_from_page_cache(struct page *page) |
144 | { | 141 | { |
145 | struct address_space *mapping = page->mapping; | 142 | struct address_space *mapping = page->mapping; |
143 | void (*freepage)(struct page *); | ||
146 | 144 | ||
147 | BUG_ON(!PageLocked(page)); | 145 | BUG_ON(!PageLocked(page)); |
148 | 146 | ||
147 | freepage = mapping->a_ops->freepage; | ||
149 | spin_lock_irq(&mapping->tree_lock); | 148 | spin_lock_irq(&mapping->tree_lock); |
150 | __remove_from_page_cache(page); | 149 | __remove_from_page_cache(page); |
151 | spin_unlock_irq(&mapping->tree_lock); | 150 | spin_unlock_irq(&mapping->tree_lock); |
152 | mem_cgroup_uncharge_cache_page(page); | 151 | mem_cgroup_uncharge_cache_page(page); |
152 | |||
153 | if (freepage) | ||
154 | freepage(page); | ||
153 | } | 155 | } |
154 | EXPORT_SYMBOL(remove_from_page_cache); | 156 | EXPORT_SYMBOL(remove_from_page_cache); |
155 | 157 | ||
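The freepage pointer captured above is the address_space operation that tells a filesystem when one of its pages has finally left the page cache. A filesystem would wire it up roughly as below; the ops table and example_freepage are illustrative, not taken from this diff:

#include <linux/fs.h>
#include <linux/mm.h>

/* Called after the page has been removed from the page cache; see the
 * remove_from_page_cache() hunk above for the exact calling context. */
static void example_freepage(struct page *page)
{
	/* release filesystem-private state attached to this page */
}

static const struct address_space_operations example_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	.freepage	= example_freepage,
};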
@@ -296,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | |||
296 | continue; | 298 | continue; |
297 | 299 | ||
298 | wait_on_page_writeback(page); | 300 | wait_on_page_writeback(page); |
299 | if (PageError(page)) | 301 | if (TestClearPageError(page)) |
300 | ret = -EIO; | 302 | ret = -EIO; |
301 | } | 303 | } |
302 | pagevec_release(&pvec); | 304 | pagevec_release(&pvec); |
@@ -835,9 +837,6 @@ repeat: | |||
835 | if (radix_tree_deref_retry(page)) | 837 | if (radix_tree_deref_retry(page)) |
836 | goto restart; | 838 | goto restart; |
837 | 839 | ||
838 | if (page->mapping == NULL || page->index != index) | ||
839 | break; | ||
840 | |||
841 | if (!page_cache_get_speculative(page)) | 840 | if (!page_cache_get_speculative(page)) |
842 | goto repeat; | 841 | goto repeat; |
843 | 842 | ||
@@ -847,6 +846,16 @@ repeat: | |||
847 | goto repeat; | 846 | goto repeat; |
848 | } | 847 | } |
849 | 848 | ||
849 | /* | ||
850 | * must check mapping and index after taking the ref. | ||
851 | * otherwise we can get both false positives and false | ||
852 | * negatives, which is just confusing to the caller. | ||
853 | */ | ||
854 | if (page->mapping == NULL || page->index != index) { | ||
855 | page_cache_release(page); | ||
856 | break; | ||
857 | } | ||
858 | |||
850 | pages[ret] = page; | 859 | pages[ret] = page; |
851 | ret++; | 860 | ret++; |
852 | index++; | 861 | index++; |
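The re-check added above completes the standard lockless page-cache pattern: find the page under RCU, take a speculative reference, then confirm the page still belongs to the slot it was found in, since it may have been truncated or recycled in the meantime. Stripped of the gang-lookup details, the pattern looks like this (a sketch, not the find_get_pages() code):

static struct page *example_lookup(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

repeat:
	rcu_read_lock();
	page = radix_tree_lookup(&mapping->page_tree, index);
	if (page) {
		if (!page_cache_get_speculative(page)) {
			rcu_read_unlock();
			goto repeat;	/* page was being freed, retry */
		}
		/* Re-check after taking the reference: both mapping and
		 * index may have changed while we held no lock at all. */
		if (page->mapping != mapping || page->index != index) {
			page_cache_release(page);
			rcu_read_unlock();
			goto repeat;
		}
	}
	rcu_read_unlock();
	return page;
}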
@@ -2218,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2218 | gfp_notmask = __GFP_FS; | 2227 | gfp_notmask = __GFP_FS; |
2219 | repeat: | 2228 | repeat: |
2220 | page = find_lock_page(mapping, index); | 2229 | page = find_lock_page(mapping, index); |
2221 | if (likely(page)) | 2230 | if (page) |
2222 | return page; | 2231 | return page; |
2223 | 2232 | ||
2224 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); | 2233 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 00000000000..e62ddb8f24b
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2354 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
5 | * the COPYING file in the top-level directory. | ||
6 | */ | ||
7 | |||
8 | #include <linux/mm.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/highmem.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | #include <linux/mmu_notifier.h> | ||
13 | #include <linux/rmap.h> | ||
14 | #include <linux/swap.h> | ||
15 | #include <linux/mm_inline.h> | ||
16 | #include <linux/kthread.h> | ||
17 | #include <linux/khugepaged.h> | ||
18 | #include <linux/freezer.h> | ||
19 | #include <linux/mman.h> | ||
20 | #include <asm/tlb.h> | ||
21 | #include <asm/pgalloc.h> | ||
22 | #include "internal.h" | ||
23 | |||
24 | /* | ||
25 | * By default transparent hugepage support is enabled for all mappings | ||
26 | * and khugepaged scans all mappings. Defrag is only invoked by | ||
27 | * khugepaged hugepage allocations and by page faults inside | ||
28 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | ||
29 | * allocations. | ||
30 | */ | ||
31 | unsigned long transparent_hugepage_flags __read_mostly = | ||
32 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | ||
33 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| | ||
34 | #endif | ||
35 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE | ||
36 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | ||
37 | #endif | ||
38 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | ||
39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
40 | |||
41 | /* default scan 8*512 pte (or vmas) every 30 second */ | ||
42 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | ||
43 | static unsigned int khugepaged_pages_collapsed; | ||
44 | static unsigned int khugepaged_full_scans; | ||
45 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | ||
46 | /* during fragmentation poll the hugepage allocator once every minute */ | ||
47 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | ||
48 | static struct task_struct *khugepaged_thread __read_mostly; | ||
49 | static DEFINE_MUTEX(khugepaged_mutex); | ||
50 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | ||
51 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | ||
52 | /* | ||
53 | * default collapse hugepages if there is at least one pte mapped like | ||
54 | * it would have happened if the vma was large enough during page | ||
55 | * fault. | ||
56 | */ | ||
57 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | ||
58 | |||
59 | static int khugepaged(void *none); | ||
60 | static int mm_slots_hash_init(void); | ||
61 | static int khugepaged_slab_init(void); | ||
62 | static void khugepaged_slab_free(void); | ||
63 | |||
64 | #define MM_SLOTS_HASH_HEADS 1024 | ||
65 | static struct hlist_head *mm_slots_hash __read_mostly; | ||
66 | static struct kmem_cache *mm_slot_cache __read_mostly; | ||
67 | |||
68 | /** | ||
69 | * struct mm_slot - hash lookup from mm to mm_slot | ||
70 | * @hash: hash collision list | ||
71 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | ||
72 | * @mm: the mm that this information is valid for | ||
73 | */ | ||
74 | struct mm_slot { | ||
75 | struct hlist_node hash; | ||
76 | struct list_head mm_node; | ||
77 | struct mm_struct *mm; | ||
78 | }; | ||
79 | |||
80 | /** | ||
81 | * struct khugepaged_scan - cursor for scanning | ||
82 | * @mm_head: the head of the mm list to scan | ||
83 | * @mm_slot: the current mm_slot we are scanning | ||
84 | * @address: the next address inside that to be scanned | ||
85 | * | ||
86 | * There is only the one khugepaged_scan instance of this cursor structure. | ||
87 | */ | ||
88 | struct khugepaged_scan { | ||
89 | struct list_head mm_head; | ||
90 | struct mm_slot *mm_slot; | ||
91 | unsigned long address; | ||
92 | } khugepaged_scan = { | ||
93 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | ||
94 | }; | ||
95 | |||
96 | |||
97 | static int set_recommended_min_free_kbytes(void) | ||
98 | { | ||
99 | struct zone *zone; | ||
100 | int nr_zones = 0; | ||
101 | unsigned long recommended_min; | ||
102 | extern int min_free_kbytes; | ||
103 | |||
104 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
105 | &transparent_hugepage_flags) && | ||
106 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
107 | &transparent_hugepage_flags)) | ||
108 | return 0; | ||
109 | |||
110 | for_each_populated_zone(zone) | ||
111 | nr_zones++; | ||
112 | |||
113 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ | ||
114 | recommended_min = pageblock_nr_pages * nr_zones * 2; | ||
115 | |||
116 | /* | ||
117 | * Make sure that on average at least two pageblocks are almost free | ||
118 | * of another type, one for a migratetype to fall back to and a | ||
119 | * second to avoid subsequent fallbacks of other types There are 3 | ||
120 | * MIGRATE_TYPES we care about. | ||
121 | */ | ||
122 | recommended_min += pageblock_nr_pages * nr_zones * | ||
123 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | ||
124 | |||
125 | /* don't ever allow to reserve more than 5% of the lowmem */ | ||
126 | recommended_min = min(recommended_min, | ||
127 | (unsigned long) nr_free_buffer_pages() / 20); | ||
128 | recommended_min <<= (PAGE_SHIFT-10); | ||
129 | |||
130 | if (recommended_min > min_free_kbytes) | ||
131 | min_free_kbytes = recommended_min; | ||
132 | setup_per_zone_wmarks(); | ||
133 | return 0; | ||
134 | } | ||
135 | late_initcall(set_recommended_min_free_kbytes); | ||
136 | |||
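To make the sizing in set_recommended_min_free_kbytes() concrete, a worked example of the arithmetic under assumed numbers: 4KB pages with 2MB pageblocks (pageblock_nr_pages = 512), two populated zones, and MIGRATE_PCPTYPES = 3. This is only an illustration of the formula, not output from a real system.

#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* 2MB pageblocks, 4KB pages */
	int nr_zones = 2;			/* assumed populated zones */
	int migrate_pcptypes = 3;		/* UNMOVABLE, RECLAIMABLE, MOVABLE */
	unsigned long recommended_min;

	recommended_min = pageblock_nr_pages * nr_zones * 2;		/* 2048 pages */
	recommended_min += pageblock_nr_pages * nr_zones *
			   migrate_pcptypes * migrate_pcptypes;		/* +9216 pages */
	/* 11264 pages * 4KB = 44MB, before the 5%-of-lowmem clamp */
	printf("%lu kB\n", recommended_min << (12 - 10));
	return 0;
}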
137 | static int start_khugepaged(void) | ||
138 | { | ||
139 | int err = 0; | ||
140 | if (khugepaged_enabled()) { | ||
141 | int wakeup; | ||
142 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
143 | err = -ENOMEM; | ||
144 | goto out; | ||
145 | } | ||
146 | mutex_lock(&khugepaged_mutex); | ||
147 | if (!khugepaged_thread) | ||
148 | khugepaged_thread = kthread_run(khugepaged, NULL, | ||
149 | "khugepaged"); | ||
150 | if (unlikely(IS_ERR(khugepaged_thread))) { | ||
151 | printk(KERN_ERR | ||
152 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
153 | err = PTR_ERR(khugepaged_thread); | ||
154 | khugepaged_thread = NULL; | ||
155 | } | ||
156 | wakeup = !list_empty(&khugepaged_scan.mm_head); | ||
157 | mutex_unlock(&khugepaged_mutex); | ||
158 | if (wakeup) | ||
159 | wake_up_interruptible(&khugepaged_wait); | ||
160 | |||
161 | set_recommended_min_free_kbytes(); | ||
162 | } else | ||
163 | /* wakeup to exit */ | ||
164 | wake_up_interruptible(&khugepaged_wait); | ||
165 | out: | ||
166 | return err; | ||
167 | } | ||
168 | |||
169 | #ifdef CONFIG_SYSFS | ||
170 | |||
171 | static ssize_t double_flag_show(struct kobject *kobj, | ||
172 | struct kobj_attribute *attr, char *buf, | ||
173 | enum transparent_hugepage_flag enabled, | ||
174 | enum transparent_hugepage_flag req_madv) | ||
175 | { | ||
176 | if (test_bit(enabled, &transparent_hugepage_flags)) { | ||
177 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); | ||
178 | return sprintf(buf, "[always] madvise never\n"); | ||
179 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) | ||
180 | return sprintf(buf, "always [madvise] never\n"); | ||
181 | else | ||
182 | return sprintf(buf, "always madvise [never]\n"); | ||
183 | } | ||
184 | static ssize_t double_flag_store(struct kobject *kobj, | ||
185 | struct kobj_attribute *attr, | ||
186 | const char *buf, size_t count, | ||
187 | enum transparent_hugepage_flag enabled, | ||
188 | enum transparent_hugepage_flag req_madv) | ||
189 | { | ||
190 | if (!memcmp("always", buf, | ||
191 | min(sizeof("always")-1, count))) { | ||
192 | set_bit(enabled, &transparent_hugepage_flags); | ||
193 | clear_bit(req_madv, &transparent_hugepage_flags); | ||
194 | } else if (!memcmp("madvise", buf, | ||
195 | min(sizeof("madvise")-1, count))) { | ||
196 | clear_bit(enabled, &transparent_hugepage_flags); | ||
197 | set_bit(req_madv, &transparent_hugepage_flags); | ||
198 | } else if (!memcmp("never", buf, | ||
199 | min(sizeof("never")-1, count))) { | ||
200 | clear_bit(enabled, &transparent_hugepage_flags); | ||
201 | clear_bit(req_madv, &transparent_hugepage_flags); | ||
202 | } else | ||
203 | return -EINVAL; | ||
204 | |||
205 | return count; | ||
206 | } | ||
207 | |||
208 | static ssize_t enabled_show(struct kobject *kobj, | ||
209 | struct kobj_attribute *attr, char *buf) | ||
210 | { | ||
211 | return double_flag_show(kobj, attr, buf, | ||
212 | TRANSPARENT_HUGEPAGE_FLAG, | ||
213 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
214 | } | ||
215 | static ssize_t enabled_store(struct kobject *kobj, | ||
216 | struct kobj_attribute *attr, | ||
217 | const char *buf, size_t count) | ||
218 | { | ||
219 | ssize_t ret; | ||
220 | |||
221 | ret = double_flag_store(kobj, attr, buf, count, | ||
222 | TRANSPARENT_HUGEPAGE_FLAG, | ||
223 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
224 | |||
225 | if (ret > 0) { | ||
226 | int err = start_khugepaged(); | ||
227 | if (err) | ||
228 | ret = err; | ||
229 | } | ||
230 | |||
231 | if (ret > 0 && | ||
232 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
233 | &transparent_hugepage_flags) || | ||
234 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
235 | &transparent_hugepage_flags))) | ||
236 | set_recommended_min_free_kbytes(); | ||
237 | |||
238 | return ret; | ||
239 | } | ||
240 | static struct kobj_attribute enabled_attr = | ||
241 | __ATTR(enabled, 0644, enabled_show, enabled_store); | ||
242 | |||
243 | static ssize_t single_flag_show(struct kobject *kobj, | ||
244 | struct kobj_attribute *attr, char *buf, | ||
245 | enum transparent_hugepage_flag flag) | ||
246 | { | ||
247 | if (test_bit(flag, &transparent_hugepage_flags)) | ||
248 | return sprintf(buf, "[yes] no\n"); | ||
249 | else | ||
250 | return sprintf(buf, "yes [no]\n"); | ||
251 | } | ||
252 | static ssize_t single_flag_store(struct kobject *kobj, | ||
253 | struct kobj_attribute *attr, | ||
254 | const char *buf, size_t count, | ||
255 | enum transparent_hugepage_flag flag) | ||
256 | { | ||
257 | if (!memcmp("yes", buf, | ||
258 | min(sizeof("yes")-1, count))) { | ||
259 | set_bit(flag, &transparent_hugepage_flags); | ||
260 | } else if (!memcmp("no", buf, | ||
261 | min(sizeof("no")-1, count))) { | ||
262 | clear_bit(flag, &transparent_hugepage_flags); | ||
263 | } else | ||
264 | return -EINVAL; | ||
265 | |||
266 | return count; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Currently defrag only disables __GFP_NOWAIT for allocation. A blind | ||
271 | * __GFP_REPEAT is too aggressive, it's never worth swapping tons of | ||
272 | * memory just to allocate one more hugepage. | ||
273 | */ | ||
274 | static ssize_t defrag_show(struct kobject *kobj, | ||
275 | struct kobj_attribute *attr, char *buf) | ||
276 | { | ||
277 | return double_flag_show(kobj, attr, buf, | ||
278 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | ||
279 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | ||
280 | } | ||
281 | static ssize_t defrag_store(struct kobject *kobj, | ||
282 | struct kobj_attribute *attr, | ||
283 | const char *buf, size_t count) | ||
284 | { | ||
285 | return double_flag_store(kobj, attr, buf, count, | ||
286 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | ||
287 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | ||
288 | } | ||
289 | static struct kobj_attribute defrag_attr = | ||
290 | __ATTR(defrag, 0644, defrag_show, defrag_store); | ||
291 | |||
292 | #ifdef CONFIG_DEBUG_VM | ||
293 | static ssize_t debug_cow_show(struct kobject *kobj, | ||
294 | struct kobj_attribute *attr, char *buf) | ||
295 | { | ||
296 | return single_flag_show(kobj, attr, buf, | ||
297 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | ||
298 | } | ||
299 | static ssize_t debug_cow_store(struct kobject *kobj, | ||
300 | struct kobj_attribute *attr, | ||
301 | const char *buf, size_t count) | ||
302 | { | ||
303 | return single_flag_store(kobj, attr, buf, count, | ||
304 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | ||
305 | } | ||
306 | static struct kobj_attribute debug_cow_attr = | ||
307 | __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); | ||
308 | #endif /* CONFIG_DEBUG_VM */ | ||
309 | |||
310 | static struct attribute *hugepage_attr[] = { | ||
311 | &enabled_attr.attr, | ||
312 | &defrag_attr.attr, | ||
313 | #ifdef CONFIG_DEBUG_VM | ||
314 | &debug_cow_attr.attr, | ||
315 | #endif | ||
316 | NULL, | ||
317 | }; | ||
318 | |||
319 | static struct attribute_group hugepage_attr_group = { | ||
320 | .attrs = hugepage_attr, | ||
321 | }; | ||
322 | |||
323 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | ||
324 | struct kobj_attribute *attr, | ||
325 | char *buf) | ||
326 | { | ||
327 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | ||
328 | } | ||
329 | |||
330 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | ||
331 | struct kobj_attribute *attr, | ||
332 | const char *buf, size_t count) | ||
333 | { | ||
334 | unsigned long msecs; | ||
335 | int err; | ||
336 | |||
337 | err = strict_strtoul(buf, 10, &msecs); | ||
338 | if (err || msecs > UINT_MAX) | ||
339 | return -EINVAL; | ||
340 | |||
341 | khugepaged_scan_sleep_millisecs = msecs; | ||
342 | wake_up_interruptible(&khugepaged_wait); | ||
343 | |||
344 | return count; | ||
345 | } | ||
346 | static struct kobj_attribute scan_sleep_millisecs_attr = | ||
347 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | ||
348 | scan_sleep_millisecs_store); | ||
349 | |||
350 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | ||
351 | struct kobj_attribute *attr, | ||
352 | char *buf) | ||
353 | { | ||
354 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | ||
355 | } | ||
356 | |||
357 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | ||
358 | struct kobj_attribute *attr, | ||
359 | const char *buf, size_t count) | ||
360 | { | ||
361 | unsigned long msecs; | ||
362 | int err; | ||
363 | |||
364 | err = strict_strtoul(buf, 10, &msecs); | ||
365 | if (err || msecs > UINT_MAX) | ||
366 | return -EINVAL; | ||
367 | |||
368 | khugepaged_alloc_sleep_millisecs = msecs; | ||
369 | wake_up_interruptible(&khugepaged_wait); | ||
370 | |||
371 | return count; | ||
372 | } | ||
373 | static struct kobj_attribute alloc_sleep_millisecs_attr = | ||
374 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | ||
375 | alloc_sleep_millisecs_store); | ||
376 | |||
377 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
378 | struct kobj_attribute *attr, | ||
379 | char *buf) | ||
380 | { | ||
381 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | ||
382 | } | ||
383 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
384 | struct kobj_attribute *attr, | ||
385 | const char *buf, size_t count) | ||
386 | { | ||
387 | int err; | ||
388 | unsigned long pages; | ||
389 | |||
390 | err = strict_strtoul(buf, 10, &pages); | ||
391 | if (err || !pages || pages > UINT_MAX) | ||
392 | return -EINVAL; | ||
393 | |||
394 | khugepaged_pages_to_scan = pages; | ||
395 | |||
396 | return count; | ||
397 | } | ||
398 | static struct kobj_attribute pages_to_scan_attr = | ||
399 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | ||
400 | pages_to_scan_store); | ||
401 | |||
402 | static ssize_t pages_collapsed_show(struct kobject *kobj, | ||
403 | struct kobj_attribute *attr, | ||
404 | char *buf) | ||
405 | { | ||
406 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | ||
407 | } | ||
408 | static struct kobj_attribute pages_collapsed_attr = | ||
409 | __ATTR_RO(pages_collapsed); | ||
410 | |||
411 | static ssize_t full_scans_show(struct kobject *kobj, | ||
412 | struct kobj_attribute *attr, | ||
413 | char *buf) | ||
414 | { | ||
415 | return sprintf(buf, "%u\n", khugepaged_full_scans); | ||
416 | } | ||
417 | static struct kobj_attribute full_scans_attr = | ||
418 | __ATTR_RO(full_scans); | ||
419 | |||
420 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | ||
421 | struct kobj_attribute *attr, char *buf) | ||
422 | { | ||
423 | return single_flag_show(kobj, attr, buf, | ||
424 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
425 | } | ||
426 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | ||
427 | struct kobj_attribute *attr, | ||
428 | const char *buf, size_t count) | ||
429 | { | ||
430 | return single_flag_store(kobj, attr, buf, count, | ||
431 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
432 | } | ||
433 | static struct kobj_attribute khugepaged_defrag_attr = | ||
434 | __ATTR(defrag, 0644, khugepaged_defrag_show, | ||
435 | khugepaged_defrag_store); | ||
436 | |||
437 | /* | ||
438 | * max_ptes_none controls if khugepaged should collapse hugepages over | ||
439 | * any unmapped ptes in turn potentially increasing the memory | ||
440 | * footprint of the vmas. When max_ptes_none is 0 khugepaged will not | ||
441 | * reduce the available free memory in the system as it | ||
442 | * runs. Increasing max_ptes_none will instead potentially reduce the | ||
443 | * free memory in the system during the khugepaged scan. | ||
444 | */ | ||
445 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | ||
446 | struct kobj_attribute *attr, | ||
447 | char *buf) | ||
448 | { | ||
449 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | ||
450 | } | ||
451 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | ||
452 | struct kobj_attribute *attr, | ||
453 | const char *buf, size_t count) | ||
454 | { | ||
455 | int err; | ||
456 | unsigned long max_ptes_none; | ||
457 | |||
458 | err = strict_strtoul(buf, 10, &max_ptes_none); | ||
459 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | ||
460 | return -EINVAL; | ||
461 | |||
462 | khugepaged_max_ptes_none = max_ptes_none; | ||
463 | |||
464 | return count; | ||
465 | } | ||
466 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | ||
467 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | ||
468 | khugepaged_max_ptes_none_store); | ||
469 | |||
470 | static struct attribute *khugepaged_attr[] = { | ||
471 | &khugepaged_defrag_attr.attr, | ||
472 | &khugepaged_max_ptes_none_attr.attr, | ||
473 | &pages_to_scan_attr.attr, | ||
474 | &pages_collapsed_attr.attr, | ||
475 | &full_scans_attr.attr, | ||
476 | &scan_sleep_millisecs_attr.attr, | ||
477 | &alloc_sleep_millisecs_attr.attr, | ||
478 | NULL, | ||
479 | }; | ||
480 | |||
481 | static struct attribute_group khugepaged_attr_group = { | ||
482 | .attrs = khugepaged_attr, | ||
483 | .name = "khugepaged", | ||
484 | }; | ||
485 | #endif /* CONFIG_SYSFS */ | ||
486 | |||
487 | static int __init hugepage_init(void) | ||
488 | { | ||
489 | int err; | ||
490 | #ifdef CONFIG_SYSFS | ||
491 | static struct kobject *hugepage_kobj; | ||
492 | #endif | ||
493 | |||
494 | err = -EINVAL; | ||
495 | if (!has_transparent_hugepage()) { | ||
496 | transparent_hugepage_flags = 0; | ||
497 | goto out; | ||
498 | } | ||
499 | |||
500 | #ifdef CONFIG_SYSFS | ||
501 | err = -ENOMEM; | ||
502 | hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | ||
503 | if (unlikely(!hugepage_kobj)) { | ||
504 | printk(KERN_ERR "hugepage: failed kobject create\n"); | ||
505 | goto out; | ||
506 | } | ||
507 | |||
508 | err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); | ||
509 | if (err) { | ||
510 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | ||
511 | goto out; | ||
512 | } | ||
513 | |||
514 | err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); | ||
515 | if (err) { | ||
516 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | ||
517 | goto out; | ||
518 | } | ||
519 | #endif | ||
520 | |||
521 | err = khugepaged_slab_init(); | ||
522 | if (err) | ||
523 | goto out; | ||
524 | |||
525 | err = mm_slots_hash_init(); | ||
526 | if (err) { | ||
527 | khugepaged_slab_free(); | ||
528 | goto out; | ||
529 | } | ||
530 | |||
531 | /* | ||
532 | * By default disable transparent hugepages on smaller systems, | ||
533 | * where the extra memory used could hurt more than TLB overhead | ||
534 | * is likely to save. The admin can still enable it through /sys. | ||
535 | */ | ||
536 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | ||
537 | transparent_hugepage_flags = 0; | ||
538 | |||
539 | start_khugepaged(); | ||
540 | |||
541 | set_recommended_min_free_kbytes(); | ||
542 | |||
543 | out: | ||
544 | return err; | ||
545 | } | ||
546 | module_init(hugepage_init) | ||
547 | |||
548 | static int __init setup_transparent_hugepage(char *str) | ||
549 | { | ||
550 | int ret = 0; | ||
551 | if (!str) | ||
552 | goto out; | ||
553 | if (!strcmp(str, "always")) { | ||
554 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
555 | &transparent_hugepage_flags); | ||
556 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
557 | &transparent_hugepage_flags); | ||
558 | ret = 1; | ||
559 | } else if (!strcmp(str, "madvise")) { | ||
560 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
561 | &transparent_hugepage_flags); | ||
562 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
563 | &transparent_hugepage_flags); | ||
564 | ret = 1; | ||
565 | } else if (!strcmp(str, "never")) { | ||
566 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
567 | &transparent_hugepage_flags); | ||
568 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
569 | &transparent_hugepage_flags); | ||
570 | ret = 1; | ||
571 | } | ||
572 | out: | ||
573 | if (!ret) | ||
574 | printk(KERN_WARNING | ||
575 | "transparent_hugepage= cannot parse, ignored\n"); | ||
576 | return ret; | ||
577 | } | ||
578 | __setup("transparent_hugepage=", setup_transparent_hugepage); | ||
579 | |||
580 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
581 | struct mm_struct *mm) | ||
582 | { | ||
583 | assert_spin_locked(&mm->page_table_lock); | ||
584 | |||
585 | /* FIFO */ | ||
586 | if (!mm->pmd_huge_pte) | ||
587 | INIT_LIST_HEAD(&pgtable->lru); | ||
588 | else | ||
589 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
590 | mm->pmd_huge_pte = pgtable; | ||
591 | } | ||
592 | |||
593 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | ||
594 | { | ||
595 | if (likely(vma->vm_flags & VM_WRITE)) | ||
596 | pmd = pmd_mkwrite(pmd); | ||
597 | return pmd; | ||
598 | } | ||
599 | |||
600 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | ||
601 | struct vm_area_struct *vma, | ||
602 | unsigned long haddr, pmd_t *pmd, | ||
603 | struct page *page) | ||
604 | { | ||
605 | int ret = 0; | ||
606 | pgtable_t pgtable; | ||
607 | |||
608 | VM_BUG_ON(!PageCompound(page)); | ||
609 | pgtable = pte_alloc_one(mm, haddr); | ||
610 | if (unlikely(!pgtable)) { | ||
611 | mem_cgroup_uncharge_page(page); | ||
612 | put_page(page); | ||
613 | return VM_FAULT_OOM; | ||
614 | } | ||
615 | |||
616 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | ||
617 | __SetPageUptodate(page); | ||
618 | |||
619 | spin_lock(&mm->page_table_lock); | ||
620 | if (unlikely(!pmd_none(*pmd))) { | ||
621 | spin_unlock(&mm->page_table_lock); | ||
622 | mem_cgroup_uncharge_page(page); | ||
623 | put_page(page); | ||
624 | pte_free(mm, pgtable); | ||
625 | } else { | ||
626 | pmd_t entry; | ||
627 | entry = mk_pmd(page, vma->vm_page_prot); | ||
628 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
629 | entry = pmd_mkhuge(entry); | ||
630 | /* | ||
631 | * The spinlocking to take the lru_lock inside | ||
632 | * page_add_new_anon_rmap() acts as a full memory | ||
633 | * barrier to be sure clear_huge_page writes become | ||
634 | * visible before the set_pmd_at() write. | ||
635 | */ | ||
636 | page_add_new_anon_rmap(page, vma, haddr); | ||
637 | set_pmd_at(mm, haddr, pmd, entry); | ||
638 | prepare_pmd_huge_pte(pgtable, mm); | ||
639 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
640 | spin_unlock(&mm->page_table_lock); | ||
641 | } | ||
642 | |||
643 | return ret; | ||
644 | } | ||
645 | |||
646 | static inline gfp_t alloc_hugepage_gfpmask(int defrag) | ||
647 | { | ||
648 | return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); | ||
649 | } | ||
650 | |||
651 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
652 | struct vm_area_struct *vma, | ||
653 | unsigned long haddr) | ||
654 | { | ||
655 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), | ||
656 | HPAGE_PMD_ORDER, vma, haddr); | ||
657 | } | ||
658 | |||
659 | #ifndef CONFIG_NUMA | ||
660 | static inline struct page *alloc_hugepage(int defrag) | ||
661 | { | ||
662 | return alloc_pages(alloc_hugepage_gfpmask(defrag), | ||
663 | HPAGE_PMD_ORDER); | ||
664 | } | ||
665 | #endif | ||
666 | |||
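alloc_hugepage_gfpmask() clears the may-wait bit from the hugepage gfp mask unless defrag is enabled, so a non-defrag allocation never stalls in reclaim or compaction. A minimal userspace sketch of the same masking trick, with made-up FAKE_GFP_* values standing in for the real gfp bits:

```c
/* Minimal sketch of the gfp-mask trick: clear a "may wait" bit unless
 * defrag is requested.  Mask values are stand-ins, not real gfp bits. */
#include <stdio.h>

#define FAKE_GFP_WAIT      0x10u
#define FAKE_GFP_TRANSHUGE 0xf0u	/* assumed to include FAKE_GFP_WAIT */

static unsigned int hugepage_gfp(int defrag)
{
	return FAKE_GFP_TRANSHUGE & ~(defrag ? 0u : FAKE_GFP_WAIT);
}

int main(void)
{
	printf("defrag=0 -> %#x\n", hugepage_gfp(0));	/* 0xe0: may not wait */
	printf("defrag=1 -> %#x\n", hugepage_gfp(1));	/* 0xf0: may wait/compact */
	return 0;
}
```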
667 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
668 | unsigned long address, pmd_t *pmd, | ||
669 | unsigned int flags) | ||
670 | { | ||
671 | struct page *page; | ||
672 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
673 | pte_t *pte; | ||
674 | |||
675 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | ||
676 | if (unlikely(anon_vma_prepare(vma))) | ||
677 | return VM_FAULT_OOM; | ||
678 | if (unlikely(khugepaged_enter(vma))) | ||
679 | return VM_FAULT_OOM; | ||
680 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
681 | vma, haddr); | ||
682 | if (unlikely(!page)) | ||
683 | goto out; | ||
684 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | ||
685 | put_page(page); | ||
686 | goto out; | ||
687 | } | ||
688 | |||
689 | return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); | ||
690 | } | ||
691 | out: | ||
692 | /* | ||
693 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
694 | * run pte_offset_map on the pmd, as a huge pmd could | ||
695 | * materialize from under us from a different thread. | ||
696 | */ | ||
697 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
698 | return VM_FAULT_OOM; | ||
699 | /* if a huge pmd materialized from under us just retry later */ | ||
700 | if (unlikely(pmd_trans_huge(*pmd))) | ||
701 | return 0; | ||
702 | /* | ||
703 | * A regular pmd is established and it can't morph into a huge pmd | ||
704 | * from under us anymore at this point because we hold the mmap_sem | ||
705 | * read mode and khugepaged takes it in write mode. So now it's | ||
706 | * safe to run pte_offset_map(). | ||
707 | */ | ||
708 | pte = pte_offset_map(pmd, address); | ||
709 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | ||
710 | } | ||
711 | |||
712 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
713 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | ||
714 | struct vm_area_struct *vma) | ||
715 | { | ||
716 | struct page *src_page; | ||
717 | pmd_t pmd; | ||
718 | pgtable_t pgtable; | ||
719 | int ret; | ||
720 | |||
721 | ret = -ENOMEM; | ||
722 | pgtable = pte_alloc_one(dst_mm, addr); | ||
723 | if (unlikely(!pgtable)) | ||
724 | goto out; | ||
725 | |||
726 | spin_lock(&dst_mm->page_table_lock); | ||
727 | spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); | ||
728 | |||
729 | ret = -EAGAIN; | ||
730 | pmd = *src_pmd; | ||
731 | if (unlikely(!pmd_trans_huge(pmd))) { | ||
732 | pte_free(dst_mm, pgtable); | ||
733 | goto out_unlock; | ||
734 | } | ||
735 | if (unlikely(pmd_trans_splitting(pmd))) { | ||
736 | /* split huge page running from under us */ | ||
737 | spin_unlock(&src_mm->page_table_lock); | ||
738 | spin_unlock(&dst_mm->page_table_lock); | ||
739 | pte_free(dst_mm, pgtable); | ||
740 | |||
741 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ | ||
742 | goto out; | ||
743 | } | ||
744 | src_page = pmd_page(pmd); | ||
745 | VM_BUG_ON(!PageHead(src_page)); | ||
746 | get_page(src_page); | ||
747 | page_dup_rmap(src_page); | ||
748 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
749 | |||
750 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | ||
751 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | ||
752 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | ||
753 | prepare_pmd_huge_pte(pgtable, dst_mm); | ||
754 | |||
755 | ret = 0; | ||
756 | out_unlock: | ||
757 | spin_unlock(&src_mm->page_table_lock); | ||
758 | spin_unlock(&dst_mm->page_table_lock); | ||
759 | out: | ||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | /* no "address" argument so destroys page coloring of some arch */ | ||
764 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
765 | { | ||
766 | pgtable_t pgtable; | ||
767 | |||
768 | assert_spin_locked(&mm->page_table_lock); | ||
769 | |||
770 | /* FIFO */ | ||
771 | pgtable = mm->pmd_huge_pte; | ||
772 | if (list_empty(&pgtable->lru)) | ||
773 | mm->pmd_huge_pte = NULL; | ||
774 | else { | ||
775 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
776 | struct page, lru); | ||
777 | list_del(&pgtable->lru); | ||
778 | } | ||
779 | return pgtable; | ||
780 | } | ||
781 | |||
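prepare_pmd_huge_pte() and get_pmd_huge_pte() deposit and withdraw preallocated page tables per mm, so a later huge pmd split or zap never has to allocate. The sketch below models the idea as a plain FIFO queue in userspace; struct fake_mm and the array-based queue are illustrative stand-ins, not the kernel's list_head handling.

```c
/* Simplified userspace model of the pgtable deposit/withdraw idea:
 * page tables preallocated at fault/copy time are queued per "mm" and
 * handed back out when a huge pmd is split or zapped. */
#include <assert.h>
#include <stdio.h>

#define MAX_DEPOSITS 8

struct fake_mm {
	void *deposited[MAX_DEPOSITS];
	int head, tail, count;
};

static void deposit_pgtable(struct fake_mm *mm, void *pgtable)
{
	assert(mm->count < MAX_DEPOSITS);
	mm->deposited[mm->tail] = pgtable;
	mm->tail = (mm->tail + 1) % MAX_DEPOSITS;
	mm->count++;
}

static void *withdraw_pgtable(struct fake_mm *mm)
{
	void *pgtable;

	assert(mm->count > 0);		/* caller must have deposited first */
	pgtable = mm->deposited[mm->head];
	mm->head = (mm->head + 1) % MAX_DEPOSITS;
	mm->count--;
	return pgtable;
}

int main(void)
{
	struct fake_mm mm = { { 0 } };
	int a, b;

	deposit_pgtable(&mm, &a);
	deposit_pgtable(&mm, &b);
	printf("first withdraw is &a: %d\n", withdraw_pgtable(&mm) == &a);
	printf("second withdraw is &b: %d\n", withdraw_pgtable(&mm) == &b);
	return 0;
}
```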
782 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | ||
783 | struct vm_area_struct *vma, | ||
784 | unsigned long address, | ||
785 | pmd_t *pmd, pmd_t orig_pmd, | ||
786 | struct page *page, | ||
787 | unsigned long haddr) | ||
788 | { | ||
789 | pgtable_t pgtable; | ||
790 | pmd_t _pmd; | ||
791 | int ret = 0, i; | ||
792 | struct page **pages; | ||
793 | |||
794 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | ||
795 | GFP_KERNEL); | ||
796 | if (unlikely(!pages)) { | ||
797 | ret |= VM_FAULT_OOM; | ||
798 | goto out; | ||
799 | } | ||
800 | |||
801 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
802 | pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | ||
803 | vma, address); | ||
804 | if (unlikely(!pages[i] || | ||
805 | mem_cgroup_newpage_charge(pages[i], mm, | ||
806 | GFP_KERNEL))) { | ||
807 | if (pages[i]) | ||
808 | put_page(pages[i]); | ||
809 | mem_cgroup_uncharge_start(); | ||
810 | while (--i >= 0) { | ||
811 | mem_cgroup_uncharge_page(pages[i]); | ||
812 | put_page(pages[i]); | ||
813 | } | ||
814 | mem_cgroup_uncharge_end(); | ||
815 | kfree(pages); | ||
816 | ret |= VM_FAULT_OOM; | ||
817 | goto out; | ||
818 | } | ||
819 | } | ||
820 | |||
821 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
822 | copy_user_highpage(pages[i], page + i, | ||
823 | haddr + PAGE_SIZE*i, vma); | ||
824 | __SetPageUptodate(pages[i]); | ||
825 | cond_resched(); | ||
826 | } | ||
827 | |||
828 | spin_lock(&mm->page_table_lock); | ||
829 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
830 | goto out_free_pages; | ||
831 | VM_BUG_ON(!PageHead(page)); | ||
832 | |||
833 | pmdp_clear_flush_notify(vma, haddr, pmd); | ||
834 | /* leave pmd empty until pte is filled */ | ||
835 | |||
836 | pgtable = get_pmd_huge_pte(mm); | ||
837 | pmd_populate(mm, &_pmd, pgtable); | ||
838 | |||
839 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
840 | pte_t *pte, entry; | ||
841 | entry = mk_pte(pages[i], vma->vm_page_prot); | ||
842 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
843 | page_add_new_anon_rmap(pages[i], vma, haddr); | ||
844 | pte = pte_offset_map(&_pmd, haddr); | ||
845 | VM_BUG_ON(!pte_none(*pte)); | ||
846 | set_pte_at(mm, haddr, pte, entry); | ||
847 | pte_unmap(pte); | ||
848 | } | ||
849 | kfree(pages); | ||
850 | |||
851 | mm->nr_ptes++; | ||
852 | smp_wmb(); /* make pte visible before pmd */ | ||
853 | pmd_populate(mm, pmd, pgtable); | ||
854 | page_remove_rmap(page); | ||
855 | spin_unlock(&mm->page_table_lock); | ||
856 | |||
857 | ret |= VM_FAULT_WRITE; | ||
858 | put_page(page); | ||
859 | |||
860 | out: | ||
861 | return ret; | ||
862 | |||
863 | out_free_pages: | ||
864 | spin_unlock(&mm->page_table_lock); | ||
865 | mem_cgroup_uncharge_start(); | ||
866 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
867 | mem_cgroup_uncharge_page(pages[i]); | ||
868 | put_page(pages[i]); | ||
869 | } | ||
870 | mem_cgroup_uncharge_end(); | ||
871 | kfree(pages); | ||
872 | goto out; | ||
873 | } | ||
874 | |||
875 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
876 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | ||
877 | { | ||
878 | int ret = 0; | ||
879 | struct page *page, *new_page; | ||
880 | unsigned long haddr; | ||
881 | |||
882 | VM_BUG_ON(!vma->anon_vma); | ||
883 | spin_lock(&mm->page_table_lock); | ||
884 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
885 | goto out_unlock; | ||
886 | |||
887 | page = pmd_page(orig_pmd); | ||
888 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | ||
889 | haddr = address & HPAGE_PMD_MASK; | ||
890 | if (page_mapcount(page) == 1) { | ||
891 | pmd_t entry; | ||
892 | entry = pmd_mkyoung(orig_pmd); | ||
893 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
894 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | ||
895 | update_mmu_cache(vma, address, entry); | ||
896 | ret |= VM_FAULT_WRITE; | ||
897 | goto out_unlock; | ||
898 | } | ||
899 | get_page(page); | ||
900 | spin_unlock(&mm->page_table_lock); | ||
901 | |||
902 | if (transparent_hugepage_enabled(vma) && | ||
903 | !transparent_hugepage_debug_cow()) | ||
904 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
905 | vma, haddr); | ||
906 | else | ||
907 | new_page = NULL; | ||
908 | |||
909 | if (unlikely(!new_page)) { | ||
910 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | ||
911 | pmd, orig_pmd, page, haddr); | ||
912 | put_page(page); | ||
913 | goto out; | ||
914 | } | ||
915 | |||
916 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
917 | put_page(new_page); | ||
918 | put_page(page); | ||
919 | ret |= VM_FAULT_OOM; | ||
920 | goto out; | ||
921 | } | ||
922 | |||
923 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
924 | __SetPageUptodate(new_page); | ||
925 | |||
926 | spin_lock(&mm->page_table_lock); | ||
927 | put_page(page); | ||
928 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | ||
929 | mem_cgroup_uncharge_page(new_page); | ||
930 | put_page(new_page); | ||
931 | } else { | ||
932 | pmd_t entry; | ||
933 | VM_BUG_ON(!PageHead(page)); | ||
934 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
936 | entry = pmd_mkhuge(entry); | ||
937 | pmdp_clear_flush_notify(vma, haddr, pmd); | ||
938 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
939 | set_pmd_at(mm, haddr, pmd, entry); | ||
940 | update_mmu_cache(vma, address, entry); | ||
941 | page_remove_rmap(page); | ||
942 | put_page(page); | ||
943 | ret |= VM_FAULT_WRITE; | ||
944 | } | ||
945 | out_unlock: | ||
946 | spin_unlock(&mm->page_table_lock); | ||
947 | out: | ||
948 | return ret; | ||
949 | } | ||
950 | |||
951 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | ||
952 | unsigned long addr, | ||
953 | pmd_t *pmd, | ||
954 | unsigned int flags) | ||
955 | { | ||
956 | struct page *page = NULL; | ||
957 | |||
958 | assert_spin_locked(&mm->page_table_lock); | ||
959 | |||
960 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | ||
961 | goto out; | ||
962 | |||
963 | page = pmd_page(*pmd); | ||
964 | VM_BUG_ON(!PageHead(page)); | ||
965 | if (flags & FOLL_TOUCH) { | ||
966 | pmd_t _pmd; | ||
967 | /* | ||
968 | * We should set the dirty bit only for FOLL_WRITE but | ||
969 | * for now the dirty bit in the pmd is meaningless. | ||
970 | * And if the dirty bit ever becomes meaningful and | ||
971 | * we only set it with FOLL_WRITE, an atomic | ||
972 | * set_bit will be required on the pmd to set the | ||
973 | * young bit, instead of the current set_pmd_at. | ||
974 | */ | ||
975 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | ||
976 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | ||
977 | } | ||
978 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | ||
979 | VM_BUG_ON(!PageCompound(page)); | ||
980 | if (flags & FOLL_GET) | ||
981 | get_page(page); | ||
982 | |||
983 | out: | ||
984 | return page; | ||
985 | } | ||
986 | |||
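The `page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT` line above selects the 4k subpage of the compound page that corresponds to the requested address. A small worked example, assuming x86-64 constants (4k base pages, 2M huge pmds):

```c
/* Sketch of the tail-page selection arithmetic in follow_trans_huge_pmd():
 * the low bits of the address select which 4k subpage of the 2M hugepage
 * is returned.  Constants below assume x86-64 geometry. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_SHIFT	21
#define HPAGE_PMD_MASK	(~((1UL << HPAGE_PMD_SHIFT) - 1))

int main(void)
{
	unsigned long addr = 0x7f0000203000UL;	/* arbitrary example address */
	unsigned long subpage = (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;

	/* offset 0x3000 within the 2M region -> subpage index 3 */
	printf("subpage index within hugepage: %lu\n", subpage);
	return 0;
}
```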
987 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
988 | pmd_t *pmd) | ||
989 | { | ||
990 | int ret = 0; | ||
991 | |||
992 | spin_lock(&tlb->mm->page_table_lock); | ||
993 | if (likely(pmd_trans_huge(*pmd))) { | ||
994 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
995 | spin_unlock(&tlb->mm->page_table_lock); | ||
996 | wait_split_huge_page(vma->anon_vma, | ||
997 | pmd); | ||
998 | } else { | ||
999 | struct page *page; | ||
1000 | pgtable_t pgtable; | ||
1001 | pgtable = get_pmd_huge_pte(tlb->mm); | ||
1002 | page = pmd_page(*pmd); | ||
1003 | pmd_clear(pmd); | ||
1004 | page_remove_rmap(page); | ||
1005 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1006 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1007 | VM_BUG_ON(!PageHead(page)); | ||
1008 | spin_unlock(&tlb->mm->page_table_lock); | ||
1009 | tlb_remove_page(tlb, page); | ||
1010 | pte_free(tlb->mm, pgtable); | ||
1011 | ret = 1; | ||
1012 | } | ||
1013 | } else | ||
1014 | spin_unlock(&tlb->mm->page_table_lock); | ||
1015 | |||
1016 | return ret; | ||
1017 | } | ||
1018 | |||
1019 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
1020 | unsigned long addr, unsigned long end, | ||
1021 | unsigned char *vec) | ||
1022 | { | ||
1023 | int ret = 0; | ||
1024 | |||
1025 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1026 | if (likely(pmd_trans_huge(*pmd))) { | ||
1027 | ret = !pmd_trans_splitting(*pmd); | ||
1028 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1029 | if (unlikely(!ret)) | ||
1030 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1031 | else { | ||
1032 | /* | ||
1033 | * All logical pages in the range are present | ||
1034 | * if backed by a huge page. | ||
1035 | */ | ||
1036 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1037 | } | ||
1038 | } else | ||
1039 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1040 | |||
1041 | return ret; | ||
1042 | } | ||
1043 | |||
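When the range is backed by a stable huge pmd, mincore_huge_pmd() simply reports every 4k page in [addr, end) as present. A tiny userspace sketch of that vector fill, assuming 4k pages and an arbitrary 16k sub-range:

```c
/* Sketch of the mincore vector fill for a range backed by one huge pmd:
 * every base page in [addr, end) is reported present.  4k pages assumed. */
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long addr = 0x600000, end = 0x604000;	/* 16k of the mapping */
	unsigned char vec[4];

	memset(vec, 1, (end - addr) >> PAGE_SHIFT);	/* 4 pages -> 4 bytes set */
	printf("pages reported present: %lu\n", (end - addr) >> PAGE_SHIFT);
	return 0;
}
```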
1044 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
1045 | unsigned long addr, pgprot_t newprot) | ||
1046 | { | ||
1047 | struct mm_struct *mm = vma->vm_mm; | ||
1048 | int ret = 0; | ||
1049 | |||
1050 | spin_lock(&mm->page_table_lock); | ||
1051 | if (likely(pmd_trans_huge(*pmd))) { | ||
1052 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1053 | spin_unlock(&mm->page_table_lock); | ||
1054 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1055 | } else { | ||
1056 | pmd_t entry; | ||
1057 | |||
1058 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1059 | entry = pmd_modify(entry, newprot); | ||
1060 | set_pmd_at(mm, addr, pmd, entry); | ||
1061 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1062 | flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); | ||
1063 | ret = 1; | ||
1064 | } | ||
1065 | } else | ||
1066 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1067 | |||
1068 | return ret; | ||
1069 | } | ||
1070 | |||
1071 | pmd_t *page_check_address_pmd(struct page *page, | ||
1072 | struct mm_struct *mm, | ||
1073 | unsigned long address, | ||
1074 | enum page_check_address_pmd_flag flag) | ||
1075 | { | ||
1076 | pgd_t *pgd; | ||
1077 | pud_t *pud; | ||
1078 | pmd_t *pmd, *ret = NULL; | ||
1079 | |||
1080 | if (address & ~HPAGE_PMD_MASK) | ||
1081 | goto out; | ||
1082 | |||
1083 | pgd = pgd_offset(mm, address); | ||
1084 | if (!pgd_present(*pgd)) | ||
1085 | goto out; | ||
1086 | |||
1087 | pud = pud_offset(pgd, address); | ||
1088 | if (!pud_present(*pud)) | ||
1089 | goto out; | ||
1090 | |||
1091 | pmd = pmd_offset(pud, address); | ||
1092 | if (pmd_none(*pmd)) | ||
1093 | goto out; | ||
1094 | if (pmd_page(*pmd) != page) | ||
1095 | goto out; | ||
1096 | /* | ||
1097 | * split_vma() may create temporary aliased mappings. There is | ||
1098 | * no risk as long as all huge pmds are found and have their | ||
1099 | * splitting bit set before __split_huge_page_refcount | ||
1100 | * runs. Finding the same huge pmd more than once during the | ||
1101 | * same rmap walk is not a problem. | ||
1102 | */ | ||
1103 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | ||
1104 | pmd_trans_splitting(*pmd)) | ||
1105 | goto out; | ||
1106 | if (pmd_trans_huge(*pmd)) { | ||
1107 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | ||
1108 | !pmd_trans_splitting(*pmd)); | ||
1109 | ret = pmd; | ||
1110 | } | ||
1111 | out: | ||
1112 | return ret; | ||
1113 | } | ||
1114 | |||
1115 | static int __split_huge_page_splitting(struct page *page, | ||
1116 | struct vm_area_struct *vma, | ||
1117 | unsigned long address) | ||
1118 | { | ||
1119 | struct mm_struct *mm = vma->vm_mm; | ||
1120 | pmd_t *pmd; | ||
1121 | int ret = 0; | ||
1122 | |||
1123 | spin_lock(&mm->page_table_lock); | ||
1124 | pmd = page_check_address_pmd(page, mm, address, | ||
1125 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | ||
1126 | if (pmd) { | ||
1127 | /* | ||
1128 | * We can't temporarily set the pmd to null in order | ||
1129 | * to split it, the pmd must remain marked huge at all | ||
1130 | * times or the VM won't take the pmd_trans_huge paths | ||
1131 | * and it won't wait on the anon_vma->root->lock to | ||
1132 | * serialize against split_huge_page*. | ||
1133 | */ | ||
1134 | pmdp_splitting_flush_notify(vma, address, pmd); | ||
1135 | ret = 1; | ||
1136 | } | ||
1137 | spin_unlock(&mm->page_table_lock); | ||
1138 | |||
1139 | return ret; | ||
1140 | } | ||
1141 | |||
1142 | static void __split_huge_page_refcount(struct page *page) | ||
1143 | { | ||
1144 | int i; | ||
1145 | unsigned long head_index = page->index; | ||
1146 | struct zone *zone = page_zone(page); | ||
1147 | int zonestat; | ||
1148 | |||
1149 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | ||
1150 | spin_lock_irq(&zone->lru_lock); | ||
1151 | compound_lock(page); | ||
1152 | |||
1153 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
1154 | struct page *page_tail = page + i; | ||
1155 | |||
1156 | /* tail_page->_count cannot change */ | ||
1157 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | ||
1158 | BUG_ON(page_count(page) <= 0); | ||
1159 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | ||
1160 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | ||
1161 | |||
1162 | /* after clearing PageTail the gup refcount can be released */ | ||
1163 | smp_mb(); | ||
1164 | |||
1165 | /* | ||
1166 | * retain hwpoison flag of the poisoned tail page: | ||
1167 | * otherwise memory-failure could end up wrongly killing | ||
1168 | * a process in a KVM guest. | ||
1169 | */ | ||
1170 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; | ||
1171 | page_tail->flags |= (page->flags & | ||
1172 | ((1L << PG_referenced) | | ||
1173 | (1L << PG_swapbacked) | | ||
1174 | (1L << PG_mlocked) | | ||
1175 | (1L << PG_uptodate))); | ||
1176 | page_tail->flags |= (1L << PG_dirty); | ||
1177 | |||
1178 | /* | ||
1179 | * 1) clear PageTail before overwriting first_page | ||
1180 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
1181 | */ | ||
1182 | smp_wmb(); | ||
1183 | |||
1184 | /* | ||
1185 | * __split_huge_page_splitting() already set the | ||
1186 | * splitting bit in all pmd that could map this | ||
1187 | * hugepage, that will ensure no CPU can alter the | ||
1188 | * mapcount on the head page. The mapcount is only | ||
1189 | * accounted in the head page and it has to be | ||
1190 | * transferred to all tail pages in the below code. So | ||
1191 | * for this code to be safe, during the split the mapcount | ||
1192 | * can't change. But that doesn't mean userland can't | ||
1193 | * keep changing and reading the page contents while | ||
1194 | * we transfer the mapcount, so the pmd splitting | ||
1195 | * status is achieved by setting a reserved bit in the | ||
1196 | * pmd, not by clearing the present bit. | ||
1197 | */ | ||
1198 | BUG_ON(page_mapcount(page_tail)); | ||
1199 | page_tail->_mapcount = page->_mapcount; | ||
1200 | |||
1201 | BUG_ON(page_tail->mapping); | ||
1202 | page_tail->mapping = page->mapping; | ||
1203 | |||
1204 | page_tail->index = ++head_index; | ||
1205 | |||
1206 | BUG_ON(!PageAnon(page_tail)); | ||
1207 | BUG_ON(!PageUptodate(page_tail)); | ||
1208 | BUG_ON(!PageDirty(page_tail)); | ||
1209 | BUG_ON(!PageSwapBacked(page_tail)); | ||
1210 | |||
1211 | mem_cgroup_split_huge_fixup(page, page_tail); | ||
1212 | |||
1213 | lru_add_page_tail(zone, page, page_tail); | ||
1214 | } | ||
1215 | |||
1216 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
1217 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | ||
1218 | |||
1219 | /* | ||
1220 | * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, | ||
1221 | * so adjust those appropriately if this page is on the LRU. | ||
1222 | */ | ||
1223 | if (PageLRU(page)) { | ||
1224 | zonestat = NR_LRU_BASE + page_lru(page); | ||
1225 | __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); | ||
1226 | } | ||
1227 | |||
1228 | ClearPageCompound(page); | ||
1229 | compound_unlock(page); | ||
1230 | spin_unlock_irq(&zone->lru_lock); | ||
1231 | |||
1232 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
1233 | struct page *page_tail = page + i; | ||
1234 | BUG_ON(page_count(page_tail) <= 0); | ||
1235 | /* | ||
1236 | * Tail pages may be freed if there wasn't any mapping, | ||
1237 | * e.g. if add_to_swap() is running on an lru page that | ||
1238 | * had its mapping zapped. And freeing these pages | ||
1239 | * requires taking the lru_lock so we do the put_page | ||
1240 | * of the tail pages after the split is complete. | ||
1241 | */ | ||
1242 | put_page(page_tail); | ||
1243 | } | ||
1244 | |||
1245 | /* | ||
1246 | * Only the head page (now become a regular page) is required | ||
1247 | * to be pinned by the caller. | ||
1248 | */ | ||
1249 | BUG_ON(page_count(page) <= 0); | ||
1250 | } | ||
1251 | |||
1252 | static int __split_huge_page_map(struct page *page, | ||
1253 | struct vm_area_struct *vma, | ||
1254 | unsigned long address) | ||
1255 | { | ||
1256 | struct mm_struct *mm = vma->vm_mm; | ||
1257 | pmd_t *pmd, _pmd; | ||
1258 | int ret = 0, i; | ||
1259 | pgtable_t pgtable; | ||
1260 | unsigned long haddr; | ||
1261 | |||
1262 | spin_lock(&mm->page_table_lock); | ||
1263 | pmd = page_check_address_pmd(page, mm, address, | ||
1264 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | ||
1265 | if (pmd) { | ||
1266 | pgtable = get_pmd_huge_pte(mm); | ||
1267 | pmd_populate(mm, &_pmd, pgtable); | ||
1268 | |||
1269 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | ||
1270 | i++, haddr += PAGE_SIZE) { | ||
1271 | pte_t *pte, entry; | ||
1272 | BUG_ON(PageCompound(page+i)); | ||
1273 | entry = mk_pte(page + i, vma->vm_page_prot); | ||
1274 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1275 | if (!pmd_write(*pmd)) | ||
1276 | entry = pte_wrprotect(entry); | ||
1277 | else | ||
1278 | BUG_ON(page_mapcount(page) != 1); | ||
1279 | if (!pmd_young(*pmd)) | ||
1280 | entry = pte_mkold(entry); | ||
1281 | pte = pte_offset_map(&_pmd, haddr); | ||
1282 | BUG_ON(!pte_none(*pte)); | ||
1283 | set_pte_at(mm, haddr, pte, entry); | ||
1284 | pte_unmap(pte); | ||
1285 | } | ||
1286 | |||
1287 | mm->nr_ptes++; | ||
1288 | smp_wmb(); /* make pte visible before pmd */ | ||
1289 | /* | ||
1290 | * Up to this point the pmd is present and huge and | ||
1291 | * userland has full access to the hugepage | ||
1292 | * during the split (which happens in place). If we | ||
1293 | * overwrite the pmd with the not-huge version | ||
1294 | * pointing to the pte here (which of course we could | ||
1295 | * if all CPUs were bug free), userland could trigger | ||
1296 | * a small page size TLB miss on the small sized TLB | ||
1297 | * while the hugepage TLB entry is still established | ||
1298 | * in the huge TLB. Some CPUs don't like that. See | ||
1299 | * http://support.amd.com/us/Processor_TechDocs/41322.pdf, | ||
1300 | * Erratum 383 on page 93. Intel should be safe but | ||
1301 | * also warns that it's only safe if the permission | ||
1302 | * and cache attributes of the two entries loaded in | ||
1303 | * the two TLBs are identical (which should be the case | ||
1304 | * here). But it is generally safer to never allow | ||
1305 | * small and huge TLB entries for the same virtual | ||
1306 | * address to be loaded simultaneously. So instead of | ||
1307 | * doing "pmd_populate(); flush_tlb_range();" we first | ||
1308 | * mark the current pmd notpresent (atomically because | ||
1309 | * here the pmd_trans_huge and pmd_trans_splitting | ||
1310 | * must remain set at all times on the pmd until the | ||
1311 | * split is complete for this pmd), then we flush the | ||
1312 | * SMP TLB and finally we write the non-huge version | ||
1313 | * of the pmd entry with pmd_populate. | ||
1314 | */ | ||
1315 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | ||
1316 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
1317 | pmd_populate(mm, pmd, pgtable); | ||
1318 | ret = 1; | ||
1319 | } | ||
1320 | spin_unlock(&mm->page_table_lock); | ||
1321 | |||
1322 | return ret; | ||
1323 | } | ||
1324 | |||
1325 | /* must be called with anon_vma->root->lock held */ | ||
1326 | static void __split_huge_page(struct page *page, | ||
1327 | struct anon_vma *anon_vma) | ||
1328 | { | ||
1329 | int mapcount, mapcount2; | ||
1330 | struct anon_vma_chain *avc; | ||
1331 | |||
1332 | BUG_ON(!PageHead(page)); | ||
1333 | BUG_ON(PageTail(page)); | ||
1334 | |||
1335 | mapcount = 0; | ||
1336 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1337 | struct vm_area_struct *vma = avc->vma; | ||
1338 | unsigned long addr = vma_address(page, vma); | ||
1339 | BUG_ON(is_vma_temporary_stack(vma)); | ||
1340 | if (addr == -EFAULT) | ||
1341 | continue; | ||
1342 | mapcount += __split_huge_page_splitting(page, vma, addr); | ||
1343 | } | ||
1344 | /* | ||
1345 | * It is critical that new vmas are added to the tail of the | ||
1346 | * anon_vma list. This guarantees that if copy_huge_pmd() runs | ||
1347 | * and establishes a child pmd before | ||
1348 | * __split_huge_page_splitting() freezes the parent pmd (so if | ||
1349 | * we fail to prevent copy_huge_pmd() from running until the | ||
1350 | * whole __split_huge_page() is complete), we will still see | ||
1351 | * the newly established pmd of the child later during the | ||
1352 | * walk, to be able to set it as pmd_trans_splitting too. | ||
1353 | */ | ||
1354 | if (mapcount != page_mapcount(page)) | ||
1355 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | ||
1356 | mapcount, page_mapcount(page)); | ||
1357 | BUG_ON(mapcount != page_mapcount(page)); | ||
1358 | |||
1359 | __split_huge_page_refcount(page); | ||
1360 | |||
1361 | mapcount2 = 0; | ||
1362 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1363 | struct vm_area_struct *vma = avc->vma; | ||
1364 | unsigned long addr = vma_address(page, vma); | ||
1365 | BUG_ON(is_vma_temporary_stack(vma)); | ||
1366 | if (addr == -EFAULT) | ||
1367 | continue; | ||
1368 | mapcount2 += __split_huge_page_map(page, vma, addr); | ||
1369 | } | ||
1370 | if (mapcount != mapcount2) | ||
1371 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | ||
1372 | mapcount, mapcount2, page_mapcount(page)); | ||
1373 | BUG_ON(mapcount != mapcount2); | ||
1374 | } | ||
1375 | |||
1376 | int split_huge_page(struct page *page) | ||
1377 | { | ||
1378 | struct anon_vma *anon_vma; | ||
1379 | int ret = 1; | ||
1380 | |||
1381 | BUG_ON(!PageAnon(page)); | ||
1382 | anon_vma = page_lock_anon_vma(page); | ||
1383 | if (!anon_vma) | ||
1384 | goto out; | ||
1385 | ret = 0; | ||
1386 | if (!PageCompound(page)) | ||
1387 | goto out_unlock; | ||
1388 | |||
1389 | BUG_ON(!PageSwapBacked(page)); | ||
1390 | __split_huge_page(page, anon_vma); | ||
1391 | |||
1392 | BUG_ON(PageCompound(page)); | ||
1393 | out_unlock: | ||
1394 | page_unlock_anon_vma(anon_vma); | ||
1395 | out: | ||
1396 | return ret; | ||
1397 | } | ||
1398 | |||
1399 | int hugepage_madvise(struct vm_area_struct *vma, | ||
1400 | unsigned long *vm_flags, int advice) | ||
1401 | { | ||
1402 | switch (advice) { | ||
1403 | case MADV_HUGEPAGE: | ||
1404 | /* | ||
1405 | * Be somewhat over-protective like KSM for now! | ||
1406 | */ | ||
1407 | if (*vm_flags & (VM_HUGEPAGE | | ||
1408 | VM_SHARED | VM_MAYSHARE | | ||
1409 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
1410 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
1411 | VM_MIXEDMAP | VM_SAO)) | ||
1412 | return -EINVAL; | ||
1413 | *vm_flags &= ~VM_NOHUGEPAGE; | ||
1414 | *vm_flags |= VM_HUGEPAGE; | ||
1415 | /* | ||
1416 | * If the vma becomes good for khugepaged to scan, | ||
1417 | * register it here without waiting for a page fault that | ||
1418 | * may not happen any time soon. | ||
1419 | */ | ||
1420 | if (unlikely(khugepaged_enter_vma_merge(vma))) | ||
1421 | return -ENOMEM; | ||
1422 | break; | ||
1423 | case MADV_NOHUGEPAGE: | ||
1424 | /* | ||
1425 | * Be somewhat over-protective like KSM for now! | ||
1426 | */ | ||
1427 | if (*vm_flags & (VM_NOHUGEPAGE | | ||
1428 | VM_SHARED | VM_MAYSHARE | | ||
1429 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
1430 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
1431 | VM_MIXEDMAP | VM_SAO)) | ||
1432 | return -EINVAL; | ||
1433 | *vm_flags &= ~VM_HUGEPAGE; | ||
1434 | *vm_flags |= VM_NOHUGEPAGE; | ||
1435 | /* | ||
1436 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | ||
1437 | * this vma even if we leave the mm registered in khugepaged if | ||
1438 | * it got registered before VM_NOHUGEPAGE was set. | ||
1439 | */ | ||
1440 | break; | ||
1441 | } | ||
1442 | |||
1443 | return 0; | ||
1444 | } | ||
1445 | |||
1446 | static int __init khugepaged_slab_init(void) | ||
1447 | { | ||
1448 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | ||
1449 | sizeof(struct mm_slot), | ||
1450 | __alignof__(struct mm_slot), 0, NULL); | ||
1451 | if (!mm_slot_cache) | ||
1452 | return -ENOMEM; | ||
1453 | |||
1454 | return 0; | ||
1455 | } | ||
1456 | |||
1457 | static void __init khugepaged_slab_free(void) | ||
1458 | { | ||
1459 | kmem_cache_destroy(mm_slot_cache); | ||
1460 | mm_slot_cache = NULL; | ||
1461 | } | ||
1462 | |||
1463 | static inline struct mm_slot *alloc_mm_slot(void) | ||
1464 | { | ||
1465 | if (!mm_slot_cache) /* initialization failed */ | ||
1466 | return NULL; | ||
1467 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
1468 | } | ||
1469 | |||
1470 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
1471 | { | ||
1472 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
1473 | } | ||
1474 | |||
1475 | static int __init mm_slots_hash_init(void) | ||
1476 | { | ||
1477 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
1478 | GFP_KERNEL); | ||
1479 | if (!mm_slots_hash) | ||
1480 | return -ENOMEM; | ||
1481 | return 0; | ||
1482 | } | ||
1483 | |||
1484 | #if 0 | ||
1485 | static void __init mm_slots_hash_free(void) | ||
1486 | { | ||
1487 | kfree(mm_slots_hash); | ||
1488 | mm_slots_hash = NULL; | ||
1489 | } | ||
1490 | #endif | ||
1491 | |||
1492 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
1493 | { | ||
1494 | struct mm_slot *mm_slot; | ||
1495 | struct hlist_head *bucket; | ||
1496 | struct hlist_node *node; | ||
1497 | |||
1498 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1499 | % MM_SLOTS_HASH_HEADS]; | ||
1500 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
1501 | if (mm == mm_slot->mm) | ||
1502 | return mm_slot; | ||
1503 | } | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | |||
1507 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
1508 | struct mm_slot *mm_slot) | ||
1509 | { | ||
1510 | struct hlist_head *bucket; | ||
1511 | |||
1512 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1513 | % MM_SLOTS_HASH_HEADS]; | ||
1514 | mm_slot->mm = mm; | ||
1515 | hlist_add_head(&mm_slot->hash, bucket); | ||
1516 | } | ||
1517 | |||
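The mm_slots hash indexes mm_structs by dividing the pointer value by sizeof(struct mm_struct) and reducing it modulo MM_SLOTS_HASH_HEADS, so consecutively allocated mms spread across buckets. A userspace sketch of that bucket computation; the structure size and pointer values are assumptions chosen for illustration:

```c
/* Userspace sketch of the mm_slots hash used by khugepaged: the mm_struct
 * pointer is divided by the structure size (so consecutive allocations land
 * in different buckets) and reduced modulo the number of buckets. */
#include <stdio.h>

#define FAKE_MM_STRUCT_SIZE	896UL	/* assumption, not the real sizeof */
#define MM_SLOTS_HASH_HEADS	1024UL

static unsigned long mm_hash_bucket(unsigned long mm_ptr)
{
	return (mm_ptr / FAKE_MM_STRUCT_SIZE) % MM_SLOTS_HASH_HEADS;
}

int main(void)
{
	/* two "mm_structs" allocated back to back hash to different buckets */
	unsigned long mm1 = 0x12340000UL;
	unsigned long mm2 = mm1 + FAKE_MM_STRUCT_SIZE;

	printf("bucket(mm1)=%lu bucket(mm2)=%lu\n",
	       mm_hash_bucket(mm1), mm_hash_bucket(mm2));
	return 0;
}
```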
1518 | static inline int khugepaged_test_exit(struct mm_struct *mm) | ||
1519 | { | ||
1520 | return atomic_read(&mm->mm_users) == 0; | ||
1521 | } | ||
1522 | |||
1523 | int __khugepaged_enter(struct mm_struct *mm) | ||
1524 | { | ||
1525 | struct mm_slot *mm_slot; | ||
1526 | int wakeup; | ||
1527 | |||
1528 | mm_slot = alloc_mm_slot(); | ||
1529 | if (!mm_slot) | ||
1530 | return -ENOMEM; | ||
1531 | |||
1532 | /* __khugepaged_exit() must not run from under us */ | ||
1533 | VM_BUG_ON(khugepaged_test_exit(mm)); | ||
1534 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | ||
1535 | free_mm_slot(mm_slot); | ||
1536 | return 0; | ||
1537 | } | ||
1538 | |||
1539 | spin_lock(&khugepaged_mm_lock); | ||
1540 | insert_to_mm_slots_hash(mm, mm_slot); | ||
1541 | /* | ||
1542 | * Insert just behind the scanning cursor, to let the area settle | ||
1543 | * down a little. | ||
1544 | */ | ||
1545 | wakeup = list_empty(&khugepaged_scan.mm_head); | ||
1546 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | ||
1547 | spin_unlock(&khugepaged_mm_lock); | ||
1548 | |||
1549 | atomic_inc(&mm->mm_count); | ||
1550 | if (wakeup) | ||
1551 | wake_up_interruptible(&khugepaged_wait); | ||
1552 | |||
1553 | return 0; | ||
1554 | } | ||
1555 | |||
1556 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | ||
1557 | { | ||
1558 | unsigned long hstart, hend; | ||
1559 | if (!vma->anon_vma) | ||
1560 | /* | ||
1561 | * Not yet faulted in so we will register later in the | ||
1562 | * page fault if needed. | ||
1563 | */ | ||
1564 | return 0; | ||
1565 | if (vma->vm_file || vma->vm_ops) | ||
1566 | /* khugepaged not yet working on file or special mappings */ | ||
1567 | return 0; | ||
1568 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1569 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1570 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1571 | if (hstart < hend) | ||
1572 | return khugepaged_enter(vma); | ||
1573 | return 0; | ||
1574 | } | ||
1575 | |||
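The hstart/hend computation above rounds the vma start up and the vma end down to the 2M boundary; khugepaged only registers the vma if at least one aligned hugepage fits in between. A worked example with assumed x86-64 constants and an arbitrary vma:

```c
/* Sketch of the hstart/hend computation used to decide whether a vma can
 * hold at least one aligned hugepage: round the start up and the end down
 * to the 2M boundary.  Constants assume x86-64 geometry. */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(1UL << 21)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x601000, vm_end = 0xa00000;
	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend = vm_end & HPAGE_PMD_MASK;

	/* 0x601000 rounds up to 0x800000, 0xa00000 is already aligned */
	printf("hstart=%#lx hend=%#lx worth scanning: %d\n",
	       hstart, hend, hstart < hend);
	return 0;
}
```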
1576 | void __khugepaged_exit(struct mm_struct *mm) | ||
1577 | { | ||
1578 | struct mm_slot *mm_slot; | ||
1579 | int free = 0; | ||
1580 | |||
1581 | spin_lock(&khugepaged_mm_lock); | ||
1582 | mm_slot = get_mm_slot(mm); | ||
1583 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | ||
1584 | hlist_del(&mm_slot->hash); | ||
1585 | list_del(&mm_slot->mm_node); | ||
1586 | free = 1; | ||
1587 | } | ||
1588 | |||
1589 | if (free) { | ||
1590 | spin_unlock(&khugepaged_mm_lock); | ||
1591 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1592 | free_mm_slot(mm_slot); | ||
1593 | mmdrop(mm); | ||
1594 | } else if (mm_slot) { | ||
1595 | spin_unlock(&khugepaged_mm_lock); | ||
1596 | /* | ||
1597 | * This is required to serialize against | ||
1598 | * khugepaged_test_exit() (which is guaranteed to run | ||
1599 | * under mmap sem read mode). Stop here (after we | ||
1600 | * return all pagetables will be destroyed) until | ||
1601 | * khugepaged has finished working on the pagetables | ||
1602 | * under the mmap_sem. | ||
1603 | */ | ||
1604 | down_write(&mm->mmap_sem); | ||
1605 | up_write(&mm->mmap_sem); | ||
1606 | } else | ||
1607 | spin_unlock(&khugepaged_mm_lock); | ||
1608 | } | ||
1609 | |||
1610 | static void release_pte_page(struct page *page) | ||
1611 | { | ||
1612 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1613 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1614 | unlock_page(page); | ||
1615 | putback_lru_page(page); | ||
1616 | } | ||
1617 | |||
1618 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | ||
1619 | { | ||
1620 | while (--_pte >= pte) { | ||
1621 | pte_t pteval = *_pte; | ||
1622 | if (!pte_none(pteval)) | ||
1623 | release_pte_page(pte_page(pteval)); | ||
1624 | } | ||
1625 | } | ||
1626 | |||
1627 | static void release_all_pte_pages(pte_t *pte) | ||
1628 | { | ||
1629 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
1630 | } | ||
1631 | |||
1632 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | ||
1633 | unsigned long address, | ||
1634 | pte_t *pte) | ||
1635 | { | ||
1636 | struct page *page; | ||
1637 | pte_t *_pte; | ||
1638 | int referenced = 0, isolated = 0, none = 0; | ||
1639 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1640 | _pte++, address += PAGE_SIZE) { | ||
1641 | pte_t pteval = *_pte; | ||
1642 | if (pte_none(pteval)) { | ||
1643 | if (++none <= khugepaged_max_ptes_none) | ||
1644 | continue; | ||
1645 | else { | ||
1646 | release_pte_pages(pte, _pte); | ||
1647 | goto out; | ||
1648 | } | ||
1649 | } | ||
1650 | if (!pte_present(pteval) || !pte_write(pteval)) { | ||
1651 | release_pte_pages(pte, _pte); | ||
1652 | goto out; | ||
1653 | } | ||
1654 | page = vm_normal_page(vma, address, pteval); | ||
1655 | if (unlikely(!page)) { | ||
1656 | release_pte_pages(pte, _pte); | ||
1657 | goto out; | ||
1658 | } | ||
1659 | VM_BUG_ON(PageCompound(page)); | ||
1660 | BUG_ON(!PageAnon(page)); | ||
1661 | VM_BUG_ON(!PageSwapBacked(page)); | ||
1662 | |||
1663 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
1664 | if (page_count(page) != 1) { | ||
1665 | release_pte_pages(pte, _pte); | ||
1666 | goto out; | ||
1667 | } | ||
1668 | /* | ||
1669 | * We can do it before isolate_lru_page because the | ||
1670 | * page can't be freed from under us. NOTE: PG_lock | ||
1671 | * is needed to serialize against split_huge_page | ||
1672 | * when invoked from the VM. | ||
1673 | */ | ||
1674 | if (!trylock_page(page)) { | ||
1675 | release_pte_pages(pte, _pte); | ||
1676 | goto out; | ||
1677 | } | ||
1678 | /* | ||
1679 | * Isolate the page to avoid collapsing a hugepage | ||
1680 | * currently in use by the VM. | ||
1681 | */ | ||
1682 | if (isolate_lru_page(page)) { | ||
1683 | unlock_page(page); | ||
1684 | release_pte_pages(pte, _pte); | ||
1685 | goto out; | ||
1686 | } | ||
1687 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1688 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1689 | VM_BUG_ON(!PageLocked(page)); | ||
1690 | VM_BUG_ON(PageLRU(page)); | ||
1691 | |||
1692 | /* If there is no young mapped pte, don't collapse the page */ | ||
1693 | if (pte_young(pteval) || PageReferenced(page) || | ||
1694 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
1695 | referenced = 1; | ||
1696 | } | ||
1697 | if (unlikely(!referenced)) | ||
1698 | release_all_pte_pages(pte); | ||
1699 | else | ||
1700 | isolated = 1; | ||
1701 | out: | ||
1702 | return isolated; | ||
1703 | } | ||
1704 | |||
1705 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | ||
1706 | struct vm_area_struct *vma, | ||
1707 | unsigned long address, | ||
1708 | spinlock_t *ptl) | ||
1709 | { | ||
1710 | pte_t *_pte; | ||
1711 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | ||
1712 | pte_t pteval = *_pte; | ||
1713 | struct page *src_page; | ||
1714 | |||
1715 | if (pte_none(pteval)) { | ||
1716 | clear_user_highpage(page, address); | ||
1717 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | ||
1718 | } else { | ||
1719 | src_page = pte_page(pteval); | ||
1720 | copy_user_highpage(page, src_page, address, vma); | ||
1721 | VM_BUG_ON(page_mapcount(src_page) != 1); | ||
1722 | VM_BUG_ON(page_count(src_page) != 2); | ||
1723 | release_pte_page(src_page); | ||
1724 | /* | ||
1725 | * ptl mostly unnecessary, but preempt has to | ||
1726 | * be disabled to update the per-cpu stats | ||
1727 | * inside page_remove_rmap(). | ||
1728 | */ | ||
1729 | spin_lock(ptl); | ||
1730 | /* | ||
1731 | * paravirt calls inside pte_clear here are | ||
1732 | * superfluous. | ||
1733 | */ | ||
1734 | pte_clear(vma->vm_mm, address, _pte); | ||
1735 | page_remove_rmap(src_page); | ||
1736 | spin_unlock(ptl); | ||
1737 | free_page_and_swap_cache(src_page); | ||
1738 | } | ||
1739 | |||
1740 | address += PAGE_SIZE; | ||
1741 | page++; | ||
1742 | } | ||
1743 | } | ||
1744 | |||
1745 | static void collapse_huge_page(struct mm_struct *mm, | ||
1746 | unsigned long address, | ||
1747 | struct page **hpage, | ||
1748 | struct vm_area_struct *vma) | ||
1749 | { | ||
1750 | pgd_t *pgd; | ||
1751 | pud_t *pud; | ||
1752 | pmd_t *pmd, _pmd; | ||
1753 | pte_t *pte; | ||
1754 | pgtable_t pgtable; | ||
1755 | struct page *new_page; | ||
1756 | spinlock_t *ptl; | ||
1757 | int isolated; | ||
1758 | unsigned long hstart, hend; | ||
1759 | |||
1760 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1761 | #ifndef CONFIG_NUMA | ||
1762 | VM_BUG_ON(!*hpage); | ||
1763 | new_page = *hpage; | ||
1764 | #else | ||
1765 | VM_BUG_ON(*hpage); | ||
1766 | /* | ||
1767 | * Allocate the page while the vma is still valid and under | ||
1768 | * the mmap_sem read mode so there is no memory allocation | ||
1769 | * later when we take the mmap_sem in write mode. This is | ||
1770 | * friendlier behavior (OTOH it may actually hide bugs) towards | ||
1771 | * filesystems in userland with daemons allocating memory in | ||
1772 | * the userland I/O paths. Allocating memory with the | ||
1773 | * mmap_sem in read mode is also a good idea to allow greater | ||
1774 | * scalability. | ||
1775 | */ | ||
1776 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); | ||
1777 | if (unlikely(!new_page)) { | ||
1778 | up_read(&mm->mmap_sem); | ||
1779 | *hpage = ERR_PTR(-ENOMEM); | ||
1780 | return; | ||
1781 | } | ||
1782 | #endif | ||
1783 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
1784 | up_read(&mm->mmap_sem); | ||
1785 | put_page(new_page); | ||
1786 | return; | ||
1787 | } | ||
1788 | |||
1789 | /* after allocating the hugepage upgrade to mmap_sem write mode */ | ||
1790 | up_read(&mm->mmap_sem); | ||
1791 | |||
1792 | /* | ||
1793 | * Prevent all access to pagetables with the exception of | ||
1794 | * gup_fast later handled by the ptep_clear_flush and the VM | ||
1795 | * handled by the anon_vma lock + PG_lock. | ||
1796 | */ | ||
1797 | down_write(&mm->mmap_sem); | ||
1798 | if (unlikely(khugepaged_test_exit(mm))) | ||
1799 | goto out; | ||
1800 | |||
1801 | vma = find_vma(mm, address); | ||
1802 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1803 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1804 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | ||
1805 | goto out; | ||
1806 | |||
1807 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
1808 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
1809 | goto out; | ||
1810 | |||
1811 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
1812 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) | ||
1813 | goto out; | ||
1814 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1815 | |||
1816 | pgd = pgd_offset(mm, address); | ||
1817 | if (!pgd_present(*pgd)) | ||
1818 | goto out; | ||
1819 | |||
1820 | pud = pud_offset(pgd, address); | ||
1821 | if (!pud_present(*pud)) | ||
1822 | goto out; | ||
1823 | |||
1824 | pmd = pmd_offset(pud, address); | ||
1825 | /* pmd can't go away or become huge under us */ | ||
1826 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1827 | goto out; | ||
1828 | |||
1829 | anon_vma_lock(vma->anon_vma); | ||
1830 | |||
1831 | pte = pte_offset_map(pmd, address); | ||
1832 | ptl = pte_lockptr(mm, pmd); | ||
1833 | |||
1834 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | ||
1835 | /* | ||
1836 | * After this gup_fast can't run anymore. This also removes | ||
1837 | * any huge TLB entry from the CPU so we won't allow | ||
1838 | * huge and small TLB entries for the same virtual address | ||
1839 | * to avoid the risk of CPU bugs in that area. | ||
1840 | */ | ||
1841 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | ||
1842 | spin_unlock(&mm->page_table_lock); | ||
1843 | |||
1844 | spin_lock(ptl); | ||
1845 | isolated = __collapse_huge_page_isolate(vma, address, pte); | ||
1846 | spin_unlock(ptl); | ||
1847 | |||
1848 | if (unlikely(!isolated)) { | ||
1849 | pte_unmap(pte); | ||
1850 | spin_lock(&mm->page_table_lock); | ||
1851 | BUG_ON(!pmd_none(*pmd)); | ||
1852 | set_pmd_at(mm, address, pmd, _pmd); | ||
1853 | spin_unlock(&mm->page_table_lock); | ||
1854 | anon_vma_unlock(vma->anon_vma); | ||
1855 | goto out; | ||
1856 | } | ||
1857 | |||
1858 | /* | ||
1859 | * All pages are isolated and locked so anon_vma rmap | ||
1860 | * can't run anymore. | ||
1861 | */ | ||
1862 | anon_vma_unlock(vma->anon_vma); | ||
1863 | |||
1864 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | ||
1865 | pte_unmap(pte); | ||
1866 | __SetPageUptodate(new_page); | ||
1867 | pgtable = pmd_pgtable(_pmd); | ||
1868 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1869 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1870 | |||
1871 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | ||
1872 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
1873 | _pmd = pmd_mkhuge(_pmd); | ||
1874 | |||
1875 | /* | ||
1876 | * spin_lock() below is not the equivalent of smp_wmb(), so | ||
1877 | * this is needed to avoid the copy_huge_page writes becoming | ||
1878 | * visible after the set_pmd_at() write. | ||
1879 | */ | ||
1880 | smp_wmb(); | ||
1881 | |||
1882 | spin_lock(&mm->page_table_lock); | ||
1883 | BUG_ON(!pmd_none(*pmd)); | ||
1884 | page_add_new_anon_rmap(new_page, vma, address); | ||
1885 | set_pmd_at(mm, address, pmd, _pmd); | ||
1886 | update_mmu_cache(vma, address, _pmd); | ||
1887 | prepare_pmd_huge_pte(pgtable, mm); | ||
1888 | mm->nr_ptes--; | ||
1889 | spin_unlock(&mm->page_table_lock); | ||
1890 | |||
1891 | #ifndef CONFIG_NUMA | ||
1892 | *hpage = NULL; | ||
1893 | #endif | ||
1894 | khugepaged_pages_collapsed++; | ||
1895 | out_up_write: | ||
1896 | up_write(&mm->mmap_sem); | ||
1897 | return; | ||
1898 | |||
1899 | out: | ||
1900 | mem_cgroup_uncharge_page(new_page); | ||
1901 | #ifdef CONFIG_NUMA | ||
1902 | put_page(new_page); | ||
1903 | #endif | ||
1904 | goto out_up_write; | ||
1905 | } | ||
1906 | |||
1907 | static int khugepaged_scan_pmd(struct mm_struct *mm, | ||
1908 | struct vm_area_struct *vma, | ||
1909 | unsigned long address, | ||
1910 | struct page **hpage) | ||
1911 | { | ||
1912 | pgd_t *pgd; | ||
1913 | pud_t *pud; | ||
1914 | pmd_t *pmd; | ||
1915 | pte_t *pte, *_pte; | ||
1916 | int ret = 0, referenced = 0, none = 0; | ||
1917 | struct page *page; | ||
1918 | unsigned long _address; | ||
1919 | spinlock_t *ptl; | ||
1920 | |||
1921 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1922 | |||
1923 | pgd = pgd_offset(mm, address); | ||
1924 | if (!pgd_present(*pgd)) | ||
1925 | goto out; | ||
1926 | |||
1927 | pud = pud_offset(pgd, address); | ||
1928 | if (!pud_present(*pud)) | ||
1929 | goto out; | ||
1930 | |||
1931 | pmd = pmd_offset(pud, address); | ||
1932 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1933 | goto out; | ||
1934 | |||
1935 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1936 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1937 | _pte++, _address += PAGE_SIZE) { | ||
1938 | pte_t pteval = *_pte; | ||
1939 | if (pte_none(pteval)) { | ||
1940 | if (++none <= khugepaged_max_ptes_none) | ||
1941 | continue; | ||
1942 | else | ||
1943 | goto out_unmap; | ||
1944 | } | ||
1945 | if (!pte_present(pteval) || !pte_write(pteval)) | ||
1946 | goto out_unmap; | ||
1947 | page = vm_normal_page(vma, _address, pteval); | ||
1948 | if (unlikely(!page)) | ||
1949 | goto out_unmap; | ||
1950 | VM_BUG_ON(PageCompound(page)); | ||
1951 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | ||
1952 | goto out_unmap; | ||
1953 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
1954 | if (page_count(page) != 1) | ||
1955 | goto out_unmap; | ||
1956 | if (pte_young(pteval) || PageReferenced(page) || | ||
1957 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
1958 | referenced = 1; | ||
1959 | } | ||
1960 | if (referenced) | ||
1961 | ret = 1; | ||
1962 | out_unmap: | ||
1963 | pte_unmap_unlock(pte, ptl); | ||
1964 | if (ret) | ||
1965 | /* collapse_huge_page will return with the mmap_sem released */ | ||
1966 | collapse_huge_page(mm, address, hpage, vma); | ||
1967 | out: | ||
1968 | return ret; | ||
1969 | } | ||
1970 | |||
1971 | static void collect_mm_slot(struct mm_slot *mm_slot) | ||
1972 | { | ||
1973 | struct mm_struct *mm = mm_slot->mm; | ||
1974 | |||
1975 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
1976 | |||
1977 | if (khugepaged_test_exit(mm)) { | ||
1978 | /* free mm_slot */ | ||
1979 | hlist_del(&mm_slot->hash); | ||
1980 | list_del(&mm_slot->mm_node); | ||
1981 | |||
1982 | /* | ||
1983 | * Not strictly needed because the mm exited already. | ||
1984 | * | ||
1985 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1986 | */ | ||
1987 | |||
1988 | /* khugepaged_mm_lock actually not necessary for the below */ | ||
1989 | free_mm_slot(mm_slot); | ||
1990 | mmdrop(mm); | ||
1991 | } | ||
1992 | } | ||
1993 | |||
1994 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | ||
1995 | struct page **hpage) | ||
1996 | { | ||
1997 | struct mm_slot *mm_slot; | ||
1998 | struct mm_struct *mm; | ||
1999 | struct vm_area_struct *vma; | ||
2000 | int progress = 0; | ||
2001 | |||
2002 | VM_BUG_ON(!pages); | ||
2003 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
2004 | |||
2005 | if (khugepaged_scan.mm_slot) | ||
2006 | mm_slot = khugepaged_scan.mm_slot; | ||
2007 | else { | ||
2008 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | ||
2009 | struct mm_slot, mm_node); | ||
2010 | khugepaged_scan.address = 0; | ||
2011 | khugepaged_scan.mm_slot = mm_slot; | ||
2012 | } | ||
2013 | spin_unlock(&khugepaged_mm_lock); | ||
2014 | |||
2015 | mm = mm_slot->mm; | ||
2016 | down_read(&mm->mmap_sem); | ||
2017 | if (unlikely(khugepaged_test_exit(mm))) | ||
2018 | vma = NULL; | ||
2019 | else | ||
2020 | vma = find_vma(mm, khugepaged_scan.address); | ||
2021 | |||
2022 | progress++; | ||
2023 | for (; vma; vma = vma->vm_next) { | ||
2024 | unsigned long hstart, hend; | ||
2025 | |||
2026 | cond_resched(); | ||
2027 | if (unlikely(khugepaged_test_exit(mm))) { | ||
2028 | progress++; | ||
2029 | break; | ||
2030 | } | ||
2031 | |||
2032 | if ((!(vma->vm_flags & VM_HUGEPAGE) && | ||
2033 | !khugepaged_always()) || | ||
2034 | (vma->vm_flags & VM_NOHUGEPAGE)) { | ||
2035 | progress++; | ||
2036 | continue; | ||
2037 | } | ||
2038 | |||
2039 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
2040 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { | ||
2041 | khugepaged_scan.address = vma->vm_end; | ||
2042 | progress++; | ||
2043 | continue; | ||
2044 | } | ||
2045 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
2046 | |||
2047 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
2048 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
2049 | if (hstart >= hend) { | ||
2050 | progress++; | ||
2051 | continue; | ||
2052 | } | ||
2053 | if (khugepaged_scan.address < hstart) | ||
2054 | khugepaged_scan.address = hstart; | ||
2055 | if (khugepaged_scan.address > hend) { | ||
2056 | khugepaged_scan.address = hend + HPAGE_PMD_SIZE; | ||
2057 | progress++; | ||
2058 | continue; | ||
2059 | } | ||
2060 | BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | ||
2061 | |||
2062 | while (khugepaged_scan.address < hend) { | ||
2063 | int ret; | ||
2064 | cond_resched(); | ||
2065 | if (unlikely(khugepaged_test_exit(mm))) | ||
2066 | goto breakouterloop; | ||
2067 | |||
2068 | VM_BUG_ON(khugepaged_scan.address < hstart || | ||
2069 | khugepaged_scan.address + HPAGE_PMD_SIZE > | ||
2070 | hend); | ||
2071 | ret = khugepaged_scan_pmd(mm, vma, | ||
2072 | khugepaged_scan.address, | ||
2073 | hpage); | ||
2074 | /* move to next address */ | ||
2075 | khugepaged_scan.address += HPAGE_PMD_SIZE; | ||
2076 | progress += HPAGE_PMD_NR; | ||
2077 | if (ret) | ||
2078 | /* we released mmap_sem so break loop */ | ||
2079 | goto breakouterloop_mmap_sem; | ||
2080 | if (progress >= pages) | ||
2081 | goto breakouterloop; | ||
2082 | } | ||
2083 | } | ||
2084 | breakouterloop: | ||
2085 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | ||
2086 | breakouterloop_mmap_sem: | ||
2087 | |||
2088 | spin_lock(&khugepaged_mm_lock); | ||
2089 | BUG_ON(khugepaged_scan.mm_slot != mm_slot); | ||
2090 | /* | ||
2091 | * Release the current mm_slot if this mm is about to die, or | ||
2092 | * if we scanned all vmas of this mm. | ||
2093 | */ | ||
2094 | if (khugepaged_test_exit(mm) || !vma) { | ||
2095 | /* | ||
2096 | * Make sure that if mm_users is reaching zero while | ||
2097 | * khugepaged runs here, khugepaged_exit will find | ||
2098 | * mm_slot not pointing to the exiting mm. | ||
2099 | */ | ||
2100 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | ||
2101 | khugepaged_scan.mm_slot = list_entry( | ||
2102 | mm_slot->mm_node.next, | ||
2103 | struct mm_slot, mm_node); | ||
2104 | khugepaged_scan.address = 0; | ||
2105 | } else { | ||
2106 | khugepaged_scan.mm_slot = NULL; | ||
2107 | khugepaged_full_scans++; | ||
2108 | } | ||
2109 | |||
2110 | collect_mm_slot(mm_slot); | ||
2111 | } | ||
2112 | |||
2113 | return progress; | ||
2114 | } | ||
2115 | |||
2116 | static int khugepaged_has_work(void) | ||
2117 | { | ||
2118 | return !list_empty(&khugepaged_scan.mm_head) && | ||
2119 | khugepaged_enabled(); | ||
2120 | } | ||
2121 | |||
2122 | static int khugepaged_wait_event(void) | ||
2123 | { | ||
2124 | return !list_empty(&khugepaged_scan.mm_head) || | ||
2125 | !khugepaged_enabled(); | ||
2126 | } | ||
2127 | |||
2128 | static void khugepaged_do_scan(struct page **hpage) | ||
2129 | { | ||
2130 | unsigned int progress = 0, pass_through_head = 0; | ||
2131 | unsigned int pages = khugepaged_pages_to_scan; | ||
2132 | |||
2133 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | ||
2134 | |||
2135 | while (progress < pages) { | ||
2136 | cond_resched(); | ||
2137 | |||
2138 | #ifndef CONFIG_NUMA | ||
2139 | if (!*hpage) { | ||
2140 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
2141 | if (unlikely(!*hpage)) | ||
2142 | break; | ||
2143 | } | ||
2144 | #else | ||
2145 | if (IS_ERR(*hpage)) | ||
2146 | break; | ||
2147 | #endif | ||
2148 | |||
2149 | if (unlikely(kthread_should_stop() || freezing(current))) | ||
2150 | break; | ||
2151 | |||
2152 | spin_lock(&khugepaged_mm_lock); | ||
2153 | if (!khugepaged_scan.mm_slot) | ||
2154 | pass_through_head++; | ||
2155 | if (khugepaged_has_work() && | ||
2156 | pass_through_head < 2) | ||
2157 | progress += khugepaged_scan_mm_slot(pages - progress, | ||
2158 | hpage); | ||
2159 | else | ||
2160 | progress = pages; | ||
2161 | spin_unlock(&khugepaged_mm_lock); | ||
2162 | } | ||
2163 | } | ||
2164 | |||
2165 | static void khugepaged_alloc_sleep(void) | ||
2166 | { | ||
2167 | DEFINE_WAIT(wait); | ||
2168 | add_wait_queue(&khugepaged_wait, &wait); | ||
2169 | schedule_timeout_interruptible( | ||
2170 | msecs_to_jiffies( | ||
2171 | khugepaged_alloc_sleep_millisecs)); | ||
2172 | remove_wait_queue(&khugepaged_wait, &wait); | ||
2173 | } | ||
2174 | |||
2175 | #ifndef CONFIG_NUMA | ||
2176 | static struct page *khugepaged_alloc_hugepage(void) | ||
2177 | { | ||
2178 | struct page *hpage; | ||
2179 | |||
2180 | do { | ||
2181 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
2182 | if (!hpage) | ||
2183 | khugepaged_alloc_sleep(); | ||
2184 | } while (unlikely(!hpage) && | ||
2185 | likely(khugepaged_enabled())); | ||
2186 | return hpage; | ||
2187 | } | ||
2188 | #endif | ||
2189 | |||
2190 | static void khugepaged_loop(void) | ||
2191 | { | ||
2192 | struct page *hpage; | ||
2193 | |||
2194 | #ifdef CONFIG_NUMA | ||
2195 | hpage = NULL; | ||
2196 | #endif | ||
2197 | while (likely(khugepaged_enabled())) { | ||
2198 | #ifndef CONFIG_NUMA | ||
2199 | hpage = khugepaged_alloc_hugepage(); | ||
2200 | if (unlikely(!hpage)) | ||
2201 | break; | ||
2202 | #else | ||
2203 | if (IS_ERR(hpage)) { | ||
2204 | khugepaged_alloc_sleep(); | ||
2205 | hpage = NULL; | ||
2206 | } | ||
2207 | #endif | ||
2208 | |||
2209 | khugepaged_do_scan(&hpage); | ||
2210 | #ifndef CONFIG_NUMA | ||
2211 | if (hpage) | ||
2212 | put_page(hpage); | ||
2213 | #endif | ||
2214 | try_to_freeze(); | ||
2215 | if (unlikely(kthread_should_stop())) | ||
2216 | break; | ||
2217 | if (khugepaged_has_work()) { | ||
2218 | DEFINE_WAIT(wait); | ||
2219 | if (!khugepaged_scan_sleep_millisecs) | ||
2220 | continue; | ||
2221 | add_wait_queue(&khugepaged_wait, &wait); | ||
2222 | schedule_timeout_interruptible( | ||
2223 | msecs_to_jiffies( | ||
2224 | khugepaged_scan_sleep_millisecs)); | ||
2225 | remove_wait_queue(&khugepaged_wait, &wait); | ||
2226 | } else if (khugepaged_enabled()) | ||
2227 | wait_event_freezable(khugepaged_wait, | ||
2228 | khugepaged_wait_event()); | ||
2229 | } | ||
2230 | } | ||
2231 | |||
2232 | static int khugepaged(void *none) | ||
2233 | { | ||
2234 | struct mm_slot *mm_slot; | ||
2235 | |||
2236 | set_freezable(); | ||
2237 | set_user_nice(current, 19); | ||
2238 | |||
2239 | /* serialize with start_khugepaged() */ | ||
2240 | mutex_lock(&khugepaged_mutex); | ||
2241 | |||
2242 | for (;;) { | ||
2243 | mutex_unlock(&khugepaged_mutex); | ||
2244 | BUG_ON(khugepaged_thread != current); | ||
2245 | khugepaged_loop(); | ||
2246 | BUG_ON(khugepaged_thread != current); | ||
2247 | |||
2248 | mutex_lock(&khugepaged_mutex); | ||
2249 | if (!khugepaged_enabled()) | ||
2250 | break; | ||
2251 | if (unlikely(kthread_should_stop())) | ||
2252 | break; | ||
2253 | } | ||
2254 | |||
2255 | spin_lock(&khugepaged_mm_lock); | ||
2256 | mm_slot = khugepaged_scan.mm_slot; | ||
2257 | khugepaged_scan.mm_slot = NULL; | ||
2258 | if (mm_slot) | ||
2259 | collect_mm_slot(mm_slot); | ||
2260 | spin_unlock(&khugepaged_mm_lock); | ||
2261 | |||
2262 | khugepaged_thread = NULL; | ||
2263 | mutex_unlock(&khugepaged_mutex); | ||
2264 | |||
2265 | return 0; | ||
2266 | } | ||
2267 | |||
2268 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | ||
2269 | { | ||
2270 | struct page *page; | ||
2271 | |||
2272 | spin_lock(&mm->page_table_lock); | ||
2273 | if (unlikely(!pmd_trans_huge(*pmd))) { | ||
2274 | spin_unlock(&mm->page_table_lock); | ||
2275 | return; | ||
2276 | } | ||
2277 | page = pmd_page(*pmd); | ||
2278 | VM_BUG_ON(!page_count(page)); | ||
2279 | get_page(page); | ||
2280 | spin_unlock(&mm->page_table_lock); | ||
2281 | |||
2282 | split_huge_page(page); | ||
2283 | |||
2284 | put_page(page); | ||
2285 | BUG_ON(pmd_trans_huge(*pmd)); | ||
2286 | } | ||
2287 | |||
2288 | static void split_huge_page_address(struct mm_struct *mm, | ||
2289 | unsigned long address) | ||
2290 | { | ||
2291 | pgd_t *pgd; | ||
2292 | pud_t *pud; | ||
2293 | pmd_t *pmd; | ||
2294 | |||
2295 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | ||
2296 | |||
2297 | pgd = pgd_offset(mm, address); | ||
2298 | if (!pgd_present(*pgd)) | ||
2299 | return; | ||
2300 | |||
2301 | pud = pud_offset(pgd, address); | ||
2302 | if (!pud_present(*pud)) | ||
2303 | return; | ||
2304 | |||
2305 | pmd = pmd_offset(pud, address); | ||
2306 | if (!pmd_present(*pmd)) | ||
2307 | return; | ||
2308 | /* | ||
2309 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | ||
2310 | * materialize from under us. | ||
2311 | */ | ||
2312 | split_huge_page_pmd(mm, pmd); | ||
2313 | } | ||
2314 | |||
2315 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | ||
2316 | unsigned long start, | ||
2317 | unsigned long end, | ||
2318 | long adjust_next) | ||
2319 | { | ||
2320 | /* | ||
2321 | * If the new start address isn't hpage aligned and it could | ||
2322 | * previously contain a hugepage: check if we need to split | ||
2323 | * a huge pmd. | ||
2324 | */ | ||
2325 | if (start & ~HPAGE_PMD_MASK && | ||
2326 | (start & HPAGE_PMD_MASK) >= vma->vm_start && | ||
2327 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | ||
2328 | split_huge_page_address(vma->vm_mm, start); | ||
2329 | |||
2330 | /* | ||
2331 | * If the new end address isn't hpage aligned and it could | ||
2332 | * previously contain a hugepage: check if we need to split | ||
2333 | * a huge pmd. | ||
2334 | */ | ||
2335 | if (end & ~HPAGE_PMD_MASK && | ||
2336 | (end & HPAGE_PMD_MASK) >= vma->vm_start && | ||
2337 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | ||
2338 | split_huge_page_address(vma->vm_mm, end); | ||
2339 | |||
2340 | /* | ||
2341 | * If we're also updating vma->vm_next->vm_start and the new | ||
2342 | * vm_next->vm_start isn't hpage aligned and it could previously | ||
2343 | * contain a hugepage: check if we need to split a huge pmd. | ||
2344 | */ | ||
2345 | if (adjust_next > 0) { | ||
2346 | struct vm_area_struct *next = vma->vm_next; | ||
2347 | unsigned long nstart = next->vm_start; | ||
2348 | nstart += adjust_next << PAGE_SHIFT; | ||
2349 | if (nstart & ~HPAGE_PMD_MASK && | ||
2350 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && | ||
2351 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) | ||
2352 | split_huge_page_address(next->vm_mm, nstart); | ||
2353 | } | ||
2354 | } | ||
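All three checks in __vma_adjust_trans_huge() share one predicate: an address that is not HPAGE_PMD-aligned, but whose enclosing huge-page-sized window still fits entirely inside the VMA, may sit on a huge pmd that has to be split before the VMA boundaries move. Below is a minimal stand-alone sketch of that mask arithmetic; the 2 MiB HPAGE_PMD_SIZE is only an illustrative assumption (the real value is architecture-defined).

    #include <stdbool.h>
    #include <stdio.h>

    #define HPAGE_PMD_SIZE (2UL * 1024 * 1024)      /* illustrative: 2 MiB    */
    #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))  /* rounds an address down */

    /* Mirrors the condition applied to 'start', 'end' and 'nstart' above. */
    static bool may_need_pmd_split(unsigned long addr,
                                   unsigned long vm_start, unsigned long vm_end)
    {
        return (addr & ~HPAGE_PMD_MASK) &&                         /* not hpage aligned    */
               (addr & HPAGE_PMD_MASK) >= vm_start &&              /* window starts in VMA */
               (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end; /* and ends inside it   */
    }

    int main(void)
    {
        unsigned long vm_start = 0x200000, vm_end = 0x800000;

        printf("%d\n", may_need_pmd_split(0x400000, vm_start, vm_end));   /* 0: aligned */
        printf("%d\n", may_need_pmd_split(0x401000, vm_start, vm_end));   /* 1: split   */
        printf("%d\n", may_need_pmd_split(0x201000, vm_start, 0x300000)); /* 0: window ends past vm_end */
        return 0;
    }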
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c4a3558589a..bb0b7c12801 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) | |||
394 | return 0; | 394 | return 0; |
395 | } | 395 | } |
396 | 396 | ||
397 | static void clear_gigantic_page(struct page *page, | ||
398 | unsigned long addr, unsigned long sz) | ||
399 | { | ||
400 | int i; | ||
401 | struct page *p = page; | ||
402 | |||
403 | might_sleep(); | ||
404 | for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { | ||
405 | cond_resched(); | ||
406 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
407 | } | ||
408 | } | ||
409 | static void clear_huge_page(struct page *page, | ||
410 | unsigned long addr, unsigned long sz) | ||
411 | { | ||
412 | int i; | ||
413 | |||
414 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { | ||
415 | clear_gigantic_page(page, addr, sz); | ||
416 | return; | ||
417 | } | ||
418 | |||
419 | might_sleep(); | ||
420 | for (i = 0; i < sz/PAGE_SIZE; i++) { | ||
421 | cond_resched(); | ||
422 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
423 | } | ||
424 | } | ||
425 | |||
426 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
427 | unsigned long addr, struct vm_area_struct *vma) | ||
428 | { | ||
429 | int i; | ||
430 | struct hstate *h = hstate_vma(vma); | ||
431 | struct page *dst_base = dst; | ||
432 | struct page *src_base = src; | ||
433 | |||
434 | for (i = 0; i < pages_per_huge_page(h); ) { | ||
435 | cond_resched(); | ||
436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
437 | |||
438 | i++; | ||
439 | dst = mem_map_next(dst, dst_base, i); | ||
440 | src = mem_map_next(src, src_base, i); | ||
441 | } | ||
442 | } | ||
443 | |||
444 | static void copy_user_huge_page(struct page *dst, struct page *src, | ||
445 | unsigned long addr, struct vm_area_struct *vma) | ||
446 | { | ||
447 | int i; | ||
448 | struct hstate *h = hstate_vma(vma); | ||
449 | |||
450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | ||
451 | copy_user_gigantic_page(dst, src, addr, vma); | ||
452 | return; | ||
453 | } | ||
454 | |||
455 | might_sleep(); | ||
456 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
457 | cond_resched(); | ||
458 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
459 | } | ||
460 | } | ||
461 | |||
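The four helpers removed here were hugetlb-only; the call sites further down now pass pages_per_huge_page(h) explicitly, which indicates the clear/copy loops were hoisted into common mm code so transparent huge pages can share them. A rough user-space analogue of the generalized clear helper, with cond_resched() reduced to a stub and the page array modeled as a flat buffer:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE 4096UL

    static void cond_resched_stub(void) { }   /* placeholder for cond_resched() */

    /*
     * Generalized form: the caller passes a count of base pages
     * (pages_per_huge_page(h) at the hugetlb call sites below) rather
     * than a byte size, so one loop serves both hugetlbfs and THP.
     */
    static void clear_huge_page_sketch(unsigned char *page, unsigned long nr_pages)
    {
        unsigned long i;

        for (i = 0; i < nr_pages; i++) {
            cond_resched_stub();              /* keep scheduling latency bounded */
            memset(page + i * PAGE_SIZE, 0, PAGE_SIZE);
        }
    }

    int main(void)
    {
        unsigned long nr = 512;               /* a 2 MiB huge page of 4 KiB pages */
        unsigned char *buf = calloc(nr, PAGE_SIZE);

        if (!buf)
            return 1;
        buf[123] = 0xaa;
        clear_huge_page_sketch(buf, nr);
        printf("byte 123 after clear: %u\n", buf[123]);
        free(buf);
        return 0;
    }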
462 | static void copy_gigantic_page(struct page *dst, struct page *src) | 397 | static void copy_gigantic_page(struct page *dst, struct page *src) |
463 | { | 398 | { |
464 | int i; | 399 | int i; |
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, | |||
1428 | 1363 | ||
1429 | return sprintf(buf, "%lu\n", nr_huge_pages); | 1364 | return sprintf(buf, "%lu\n", nr_huge_pages); |
1430 | } | 1365 | } |
1366 | |||
1431 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | 1367 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1432 | struct kobject *kobj, struct kobj_attribute *attr, | 1368 | struct kobject *kobj, struct kobj_attribute *attr, |
1433 | const char *buf, size_t len) | 1369 | const char *buf, size_t len) |
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1440 | 1376 | ||
1441 | err = strict_strtoul(buf, 10, &count); | 1377 | err = strict_strtoul(buf, 10, &count); |
1442 | if (err) | 1378 | if (err) |
1443 | return 0; | 1379 | goto out; |
1444 | 1380 | ||
1445 | h = kobj_to_hstate(kobj, &nid); | 1381 | h = kobj_to_hstate(kobj, &nid); |
1382 | if (h->order >= MAX_ORDER) { | ||
1383 | err = -EINVAL; | ||
1384 | goto out; | ||
1385 | } | ||
1386 | |||
1446 | if (nid == NUMA_NO_NODE) { | 1387 | if (nid == NUMA_NO_NODE) { |
1447 | /* | 1388 | /* |
1448 | * global hstate attribute | 1389 | * global hstate attribute |
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1468 | NODEMASK_FREE(nodes_allowed); | 1409 | NODEMASK_FREE(nodes_allowed); |
1469 | 1410 | ||
1470 | return len; | 1411 | return len; |
1412 | out: | ||
1413 | NODEMASK_FREE(nodes_allowed); | ||
1414 | return err; | ||
1471 | } | 1415 | } |
1472 | 1416 | ||
1473 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1417 | static ssize_t nr_hugepages_show(struct kobject *kobj, |
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | |||
1510 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1454 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1511 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1455 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1512 | } | 1456 | } |
1457 | |||
1513 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1458 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
1514 | struct kobj_attribute *attr, const char *buf, size_t count) | 1459 | struct kobj_attribute *attr, const char *buf, size_t count) |
1515 | { | 1460 | { |
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1517 | unsigned long input; | 1462 | unsigned long input; |
1518 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1463 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1519 | 1464 | ||
1465 | if (h->order >= MAX_ORDER) | ||
1466 | return -EINVAL; | ||
1467 | |||
1520 | err = strict_strtoul(buf, 10, &input); | 1468 | err = strict_strtoul(buf, 10, &input); |
1521 | if (err) | 1469 | if (err) |
1522 | return 0; | 1470 | return err; |
1523 | 1471 | ||
1524 | spin_lock(&hugetlb_lock); | 1472 | spin_lock(&hugetlb_lock); |
1525 | h->nr_overcommit_huge_pages = input; | 1473 | h->nr_overcommit_huge_pages = input; |
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
1922 | { | 1870 | { |
1923 | struct hstate *h = &default_hstate; | 1871 | struct hstate *h = &default_hstate; |
1924 | unsigned long tmp; | 1872 | unsigned long tmp; |
1873 | int ret; | ||
1925 | 1874 | ||
1926 | if (!write) | 1875 | if (!write) |
1927 | tmp = h->max_huge_pages; | 1876 | tmp = h->max_huge_pages; |
1928 | 1877 | ||
1878 | if (write && h->order >= MAX_ORDER) | ||
1879 | return -EINVAL; | ||
1880 | |||
1929 | table->data = &tmp; | 1881 | table->data = &tmp; |
1930 | table->maxlen = sizeof(unsigned long); | 1882 | table->maxlen = sizeof(unsigned long); |
1931 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1883 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1884 | if (ret) | ||
1885 | goto out; | ||
1932 | 1886 | ||
1933 | if (write) { | 1887 | if (write) { |
1934 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, | 1888 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
1943 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1897 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
1944 | NODEMASK_FREE(nodes_allowed); | 1898 | NODEMASK_FREE(nodes_allowed); |
1945 | } | 1899 | } |
1946 | 1900 | out: | |
1947 | return 0; | 1901 | return ret; |
1948 | } | 1902 | } |
1949 | 1903 | ||
1950 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1904 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1982 | { | 1936 | { |
1983 | struct hstate *h = &default_hstate; | 1937 | struct hstate *h = &default_hstate; |
1984 | unsigned long tmp; | 1938 | unsigned long tmp; |
1939 | int ret; | ||
1985 | 1940 | ||
1986 | if (!write) | 1941 | if (!write) |
1987 | tmp = h->nr_overcommit_huge_pages; | 1942 | tmp = h->nr_overcommit_huge_pages; |
1988 | 1943 | ||
1944 | if (write && h->order >= MAX_ORDER) | ||
1945 | return -EINVAL; | ||
1946 | |||
1989 | table->data = &tmp; | 1947 | table->data = &tmp; |
1990 | table->maxlen = sizeof(unsigned long); | 1948 | table->maxlen = sizeof(unsigned long); |
1991 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1949 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1950 | if (ret) | ||
1951 | goto out; | ||
1992 | 1952 | ||
1993 | if (write) { | 1953 | if (write) { |
1994 | spin_lock(&hugetlb_lock); | 1954 | spin_lock(&hugetlb_lock); |
1995 | h->nr_overcommit_huge_pages = tmp; | 1955 | h->nr_overcommit_huge_pages = tmp; |
1996 | spin_unlock(&hugetlb_lock); | 1956 | spin_unlock(&hugetlb_lock); |
1997 | } | 1957 | } |
1998 | 1958 | out: | |
1999 | return 0; | 1959 | return ret; |
2000 | } | 1960 | } |
2001 | 1961 | ||
2002 | #endif /* CONFIG_SYSCTL */ | 1962 | #endif /* CONFIG_SYSCTL */ |
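The sysfs stores and sysctl handlers above all adopt the same shape: reject the request up front for gigantic hstates (order >= MAX_ORDER cannot be grown or shrunk at runtime), and propagate parser errors instead of silently returning success. A small stand-alone sketch of that shape; parse_ulong() is a hypothetical stand-in for strict_strtoul()/proc_doulongvec_minmax():

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ORDER 11                      /* common value; illustrative only */

    /* Hypothetical stand-in for the kernel's string-to-ulong helpers. */
    static int parse_ulong(const char *buf, unsigned long *out)
    {
        char *end;
        unsigned long v = strtoul(buf, &end, 10);

        if (end == buf || *end != '\0')
            return -EINVAL;
        *out = v;
        return 0;
    }

    /* Mirrors the fixed handlers: fail early, never swallow errors. */
    static int set_nr_hugepages(unsigned int order, const char *buf,
                                unsigned long *nr_hugepages)
    {
        unsigned long count;
        int ret;

        if (order >= MAX_ORDER)               /* gigantic pages: not runtime-adjustable */
            return -EINVAL;

        ret = parse_ulong(buf, &count);
        if (ret)
            return ret;                       /* previously this path returned 0 */

        *nr_hugepages = count;
        return 0;
    }

    int main(void)
    {
        unsigned long nr = 0;

        printf("%d\n", set_nr_hugepages(9, "64", &nr));    /* 0              */
        printf("%d\n", set_nr_hugepages(9, "junk", &nr));  /* -EINVAL (-22)  */
        printf("%d\n", set_nr_hugepages(12, "64", &nr));   /* -EINVAL (-22)  */
        printf("nr = %lu\n", nr);
        return 0;
    }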
@@ -2454,7 +2414,8 @@ retry_avoidcopy: | |||
2454 | return VM_FAULT_OOM; | 2414 | return VM_FAULT_OOM; |
2455 | } | 2415 | } |
2456 | 2416 | ||
2457 | copy_user_huge_page(new_page, old_page, address, vma); | 2417 | copy_user_huge_page(new_page, old_page, address, vma, |
2418 | pages_per_huge_page(h)); | ||
2458 | __SetPageUptodate(new_page); | 2419 | __SetPageUptodate(new_page); |
2459 | 2420 | ||
2460 | /* | 2421 | /* |
@@ -2558,7 +2519,7 @@ retry: | |||
2558 | ret = -PTR_ERR(page); | 2519 | ret = -PTR_ERR(page); |
2559 | goto out; | 2520 | goto out; |
2560 | } | 2521 | } |
2561 | clear_huge_page(page, address, huge_page_size(h)); | 2522 | clear_huge_page(page, address, pages_per_huge_page(h)); |
2562 | __SetPageUptodate(page); | 2523 | __SetPageUptodate(page); |
2563 | 2524 | ||
2564 | if (vma->vm_flags & VM_MAYSHARE) { | 2525 | if (vma->vm_flags & VM_MAYSHARE) { |
@@ -2738,7 +2699,8 @@ out_page_table_lock: | |||
2738 | unlock_page(pagecache_page); | 2699 | unlock_page(pagecache_page); |
2739 | put_page(pagecache_page); | 2700 | put_page(pagecache_page); |
2740 | } | 2701 | } |
2741 | unlock_page(page); | 2702 | if (page != pagecache_page) |
2703 | unlock_page(page); | ||
2742 | 2704 | ||
2743 | out_mutex: | 2705 | out_mutex: |
2744 | mutex_unlock(&hugetlb_instantiation_mutex); | 2706 | mutex_unlock(&hugetlb_instantiation_mutex); |
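The final hugetlb.c hunk is a double-unlock guard: when the faulting page is the page-cache page itself, both pointers name the same struct page and the earlier unlock_page(pagecache_page) already dropped the lock. A toy model of the guard, with the page lock reduced to an asserted flag:

    #include <assert.h>
    #include <stdio.h>

    struct fake_page { int locked; };

    static void lock_page(struct fake_page *p)   { assert(!p->locked); p->locked = 1; }
    static void unlock_page(struct fake_page *p) { assert(p->locked);  p->locked = 0; }

    int main(void)
    {
        struct fake_page pg = { 0 };
        struct fake_page *page = &pg, *pagecache_page = &pg;  /* shared-mapping fault: same page */

        lock_page(page);
        unlock_page(pagecache_page);          /* lock released here...                     */
        if (page != pagecache_page)           /* ...so unlock again only if it is distinct */
            unlock_page(page);
        printf("no double unlock\n");
        return 0;
    }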
diff --git a/mm/internal.h b/mm/internal.h index dedb0aff673..69488205723 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
134 | } | 134 | } |
135 | } | 135 | } |
136 | 136 | ||
137 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
138 | extern unsigned long vma_address(struct page *page, | ||
139 | struct vm_area_struct *vma); | ||
140 | #endif | ||
137 | #else /* !CONFIG_MMU */ | 141 | #else /* !CONFIG_MMU */ |
138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 142 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
139 | { | 143 | { |
@@ -243,7 +247,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
243 | 247 | ||
244 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 248 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
245 | unsigned long start, int len, unsigned int foll_flags, | 249 | unsigned long start, int len, unsigned int foll_flags, |
246 | struct page **pages, struct vm_area_struct **vmas); | 250 | struct page **pages, struct vm_area_struct **vmas, |
251 | int *nonblocking); | ||
247 | 252 | ||
248 | #define ZONE_RECLAIM_NOSCAN -2 | 253 | #define ZONE_RECLAIM_NOSCAN -2 |
249 | #define ZONE_RECLAIM_FULL -1 | 254 | #define ZONE_RECLAIM_FULL -1 |
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbd..ff0d9779cec 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c | |||
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) | |||
75 | * after the module is removed. | 75 | * after the module is removed. |
76 | */ | 76 | */ |
77 | for (i = 0; i < 10; i++) { | 77 | for (i = 0; i < 10; i++) { |
78 | elem = kmalloc(sizeof(*elem), GFP_KERNEL); | 78 | elem = kzalloc(sizeof(*elem), GFP_KERNEL); |
79 | pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); | 79 | pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); |
80 | if (!elem) | 80 | if (!elem) |
81 | return -ENOMEM; | 81 | return -ENOMEM; |
82 | memset(elem, 0, sizeof(*elem)); | ||
83 | INIT_LIST_HEAD(&elem->list); | 82 | INIT_LIST_HEAD(&elem->list); |
84 | |||
85 | list_add_tail(&elem->list, &test_list); | 83 | list_add_tail(&elem->list, &test_list); |
86 | } | 84 | } |
87 | 85 | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091..84225f3b719 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -113,7 +113,9 @@ | |||
113 | #define BYTES_PER_POINTER sizeof(void *) | 113 | #define BYTES_PER_POINTER sizeof(void *) |
114 | 114 | ||
115 | /* GFP bitmask for kmemleak internal allocations */ | 115 | /* GFP bitmask for kmemleak internal allocations */ |
116 | #define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) | 116 | #define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ |
117 | __GFP_NORETRY | __GFP_NOMEMALLOC | \ | ||
118 | __GFP_NOWARN) | ||
117 | 119 | ||
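gfp_kmemleak_mask() keeps only the GFP_KERNEL/GFP_ATOMIC bits of the caller's flags and then forces __GFP_NORETRY, __GFP_NOMEMALLOC and __GFP_NOWARN, so kmemleak's own metadata allocations fail fast instead of deepening memory pressure. A bit-level illustration; the flag values below are invented for the demo and are not the kernel's real GFP encoding:

    #include <stdio.h>

    /* Invented bit values, for illustration only. */
    #define GFP_ATOMIC        0x01u
    #define GFP_KERNEL        0x06u
    #define __GFP_HIGHMEM     0x10u
    #define __GFP_NORETRY     0x100u
    #define __GFP_NOMEMALLOC  0x200u
    #define __GFP_NOWARN      0x400u

    #define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
                                    __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)

    int main(void)
    {
        unsigned int caller = GFP_KERNEL | __GFP_HIGHMEM;   /* extra bit gets dropped */

        printf("0x%x -> 0x%x\n", caller, gfp_kmemleak_mask(caller));
        return 0;
    }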
118 | /* scanning area inside a memory block */ | 120 | /* scanning area inside a memory block */ |
119 | struct kmemleak_scan_area { | 121 | struct kmemleak_scan_area { |
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
511 | struct kmemleak_object *object; | 513 | struct kmemleak_object *object; |
512 | struct prio_tree_node *node; | 514 | struct prio_tree_node *node; |
513 | 515 | ||
514 | object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); | 516 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); |
515 | if (!object) { | 517 | if (!object) { |
516 | kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); | 518 | pr_warning("Cannot allocate a kmemleak_object structure\n"); |
519 | kmemleak_disable(); | ||
517 | return NULL; | 520 | return NULL; |
518 | } | 521 | } |
519 | 522 | ||
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) | |||
734 | return; | 737 | return; |
735 | } | 738 | } |
736 | 739 | ||
737 | area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); | 740 | area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); |
738 | if (!area) { | 741 | if (!area) { |
739 | kmemleak_warn("Cannot allocate a scan area\n"); | 742 | pr_warning("Cannot allocate a scan area\n"); |
740 | goto out; | 743 | goto out; |
741 | } | 744 | } |
742 | 745 | ||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
37 | #include <linux/freezer.h> | ||
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
39 | #include "internal.h" | 40 | #include "internal.h" |
@@ -411,6 +412,20 @@ out: | |||
411 | up_read(&mm->mmap_sem); | 412 | up_read(&mm->mmap_sem); |
412 | } | 413 | } |
413 | 414 | ||
415 | static struct page *page_trans_compound_anon(struct page *page) | ||
416 | { | ||
417 | if (PageTransCompound(page)) { | ||
418 | struct page *head = compound_trans_head(page); | ||
419 | /* | ||
420 | * head may actually be split and freed out from under | ||
421 | * us, but that's OK here. | ||
422 | */ | ||
423 | if (PageAnon(head)) | ||
424 | return head; | ||
425 | } | ||
426 | return NULL; | ||
427 | } | ||
428 | |||
414 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) | 429 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) |
415 | { | 430 | { |
416 | struct mm_struct *mm = rmap_item->mm; | 431 | struct mm_struct *mm = rmap_item->mm; |
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
430 | page = follow_page(vma, addr, FOLL_GET); | 445 | page = follow_page(vma, addr, FOLL_GET); |
431 | if (IS_ERR_OR_NULL(page)) | 446 | if (IS_ERR_OR_NULL(page)) |
432 | goto out; | 447 | goto out; |
433 | if (PageAnon(page)) { | 448 | if (PageAnon(page) || page_trans_compound_anon(page)) { |
434 | flush_anon_page(vma, page, addr); | 449 | flush_anon_page(vma, page, addr); |
435 | flush_dcache_page(page); | 450 | flush_dcache_page(page); |
436 | } else { | 451 | } else { |
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
708 | if (addr == -EFAULT) | 723 | if (addr == -EFAULT) |
709 | goto out; | 724 | goto out; |
710 | 725 | ||
726 | BUG_ON(PageTransCompound(page)); | ||
711 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 727 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
712 | if (!ptep) | 728 | if (!ptep) |
713 | goto out; | 729 | goto out; |
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
783 | goto out; | 799 | goto out; |
784 | 800 | ||
785 | pmd = pmd_offset(pud, addr); | 801 | pmd = pmd_offset(pud, addr); |
802 | BUG_ON(pmd_trans_huge(*pmd)); | ||
786 | if (!pmd_present(*pmd)) | 803 | if (!pmd_present(*pmd)) |
787 | goto out; | 804 | goto out; |
788 | 805 | ||
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
800 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
801 | 818 | ||
802 | page_remove_rmap(page); | 819 | page_remove_rmap(page); |
820 | if (!page_mapped(page)) | ||
821 | try_to_free_swap(page); | ||
803 | put_page(page); | 822 | put_page(page); |
804 | 823 | ||
805 | pte_unmap_unlock(ptep, ptl); | 824 | pte_unmap_unlock(ptep, ptl); |
@@ -808,6 +827,33 @@ out: | |||
808 | return err; | 827 | return err; |
809 | } | 828 | } |
810 | 829 | ||
830 | static int page_trans_compound_anon_split(struct page *page) | ||
831 | { | ||
832 | int ret = 0; | ||
833 | struct page *transhuge_head = page_trans_compound_anon(page); | ||
834 | if (transhuge_head) { | ||
835 | /* Get the reference on the head to split it. */ | ||
836 | if (get_page_unless_zero(transhuge_head)) { | ||
837 | /* | ||
838 | * Recheck we got the reference while the head | ||
839 | * was still anonymous. | ||
840 | */ | ||
841 | if (PageAnon(transhuge_head)) | ||
842 | ret = split_huge_page(transhuge_head); | ||
843 | else | ||
844 | /* | ||
845 | * Retry later if split_huge_page ran | ||
846 | * from under us. | ||
847 | */ | ||
848 | ret = 1; | ||
849 | put_page(transhuge_head); | ||
850 | } else | ||
851 | /* Retry later if split_huge_page ran from under us. */ | ||
852 | ret = 1; | ||
853 | } | ||
854 | return ret; | ||
855 | } | ||
856 | |||
811 | /* | 857 | /* |
812 | * try_to_merge_one_page - take two pages and merge them into one | 858 | * try_to_merge_one_page - take two pages and merge them into one |
813 | * @vma: the vma that holds the pte pointing to page | 859 | * @vma: the vma that holds the pte pointing to page |
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
828 | 874 | ||
829 | if (!(vma->vm_flags & VM_MERGEABLE)) | 875 | if (!(vma->vm_flags & VM_MERGEABLE)) |
830 | goto out; | 876 | goto out; |
877 | if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | ||
878 | goto out; | ||
879 | BUG_ON(PageTransCompound(page)); | ||
831 | if (!PageAnon(page)) | 880 | if (!PageAnon(page)) |
832 | goto out; | 881 | goto out; |
833 | 882 | ||
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1247 | 1296 | ||
1248 | slot = ksm_scan.mm_slot; | 1297 | slot = ksm_scan.mm_slot; |
1249 | if (slot == &ksm_mm_head) { | 1298 | if (slot == &ksm_mm_head) { |
1299 | /* | ||
1300 | * A number of pages can hang around indefinitely on per-cpu | ||
1301 | * pagevecs, raised page count preventing write_protect_page | ||
1302 | * from merging them. Though it doesn't really matter much, | ||
1303 | * it is puzzling to see some stuck in pages_volatile until | ||
1304 | * other activity jostles them out, and they also prevented | ||
1305 | * LTP's KSM test from succeeding deterministically; so drain | ||
1306 | * them here (here rather than on entry to ksm_do_scan(), | ||
1307 | * so we don't IPI too often when pages_to_scan is set low). | ||
1308 | */ | ||
1309 | lru_add_drain_all(); | ||
1310 | |||
1250 | root_unstable_tree = RB_ROOT; | 1311 | root_unstable_tree = RB_ROOT; |
1251 | 1312 | ||
1252 | spin_lock(&ksm_mmlist_lock); | 1313 | spin_lock(&ksm_mmlist_lock); |
@@ -1277,7 +1338,13 @@ next_mm: | |||
1277 | if (ksm_test_exit(mm)) | 1338 | if (ksm_test_exit(mm)) |
1278 | break; | 1339 | break; |
1279 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | 1340 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); |
1280 | if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { | 1341 | if (IS_ERR_OR_NULL(*page)) { |
1342 | ksm_scan.address += PAGE_SIZE; | ||
1343 | cond_resched(); | ||
1344 | continue; | ||
1345 | } | ||
1346 | if (PageAnon(*page) || | ||
1347 | page_trans_compound_anon(*page)) { | ||
1281 | flush_anon_page(vma, *page, ksm_scan.address); | 1348 | flush_anon_page(vma, *page, ksm_scan.address); |
1282 | flush_dcache_page(*page); | 1349 | flush_dcache_page(*page); |
1283 | rmap_item = get_next_rmap_item(slot, | 1350 | rmap_item = get_next_rmap_item(slot, |
@@ -1291,8 +1358,7 @@ next_mm: | |||
1291 | up_read(&mm->mmap_sem); | 1358 | up_read(&mm->mmap_sem); |
1292 | return rmap_item; | 1359 | return rmap_item; |
1293 | } | 1360 | } |
1294 | if (!IS_ERR_OR_NULL(*page)) | 1361 | put_page(*page); |
1295 | put_page(*page); | ||
1296 | ksm_scan.address += PAGE_SIZE; | 1362 | ksm_scan.address += PAGE_SIZE; |
1297 | cond_resched(); | 1363 | cond_resched(); |
1298 | } | 1364 | } |
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1352 | struct rmap_item *rmap_item; | 1418 | struct rmap_item *rmap_item; |
1353 | struct page *uninitialized_var(page); | 1419 | struct page *uninitialized_var(page); |
1354 | 1420 | ||
1355 | while (scan_npages--) { | 1421 | while (scan_npages-- && likely(!freezing(current))) { |
1356 | cond_resched(); | 1422 | cond_resched(); |
1357 | rmap_item = scan_get_next_rmap_item(&page); | 1423 | rmap_item = scan_get_next_rmap_item(&page); |
1358 | if (!rmap_item) | 1424 | if (!rmap_item) |
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void) | |||
1370 | 1436 | ||
1371 | static int ksm_scan_thread(void *nothing) | 1437 | static int ksm_scan_thread(void *nothing) |
1372 | { | 1438 | { |
1439 | set_freezable(); | ||
1373 | set_user_nice(current, 5); | 1440 | set_user_nice(current, 5); |
1374 | 1441 | ||
1375 | while (!kthread_should_stop()) { | 1442 | while (!kthread_should_stop()) { |
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing) | |||
1378 | ksm_do_scan(ksm_thread_pages_to_scan); | 1445 | ksm_do_scan(ksm_thread_pages_to_scan); |
1379 | mutex_unlock(&ksm_thread_mutex); | 1446 | mutex_unlock(&ksm_thread_mutex); |
1380 | 1447 | ||
1448 | try_to_freeze(); | ||
1449 | |||
1381 | if (ksmd_should_run()) { | 1450 | if (ksmd_should_run()) { |
1382 | schedule_timeout_interruptible( | 1451 | schedule_timeout_interruptible( |
1383 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); | 1452 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); |
1384 | } else { | 1453 | } else { |
1385 | wait_event_interruptible(ksm_thread_wait, | 1454 | wait_event_freezable(ksm_thread_wait, |
1386 | ksmd_should_run() || kthread_should_stop()); | 1455 | ksmd_should_run() || kthread_should_stop()); |
1387 | } | 1456 | } |
1388 | } | 1457 | } |
@@ -1724,8 +1793,13 @@ static int ksm_memory_callback(struct notifier_block *self, | |||
1724 | /* | 1793 | /* |
1725 | * Keep it very simple for now: just lock out ksmd and | 1794 | * Keep it very simple for now: just lock out ksmd and |
1726 | * MADV_UNMERGEABLE while any memory is going offline. | 1795 | * MADV_UNMERGEABLE while any memory is going offline. |
1796 | * mutex_lock_nested() is necessary because lockdep was alarmed | ||
1797 | * that here we take ksm_thread_mutex inside notifier chain | ||
1798 | * mutex, and later take notifier chain mutex inside | ||
1799 | * ksm_thread_mutex to unlock it. But that's safe because both | ||
1800 | * are inside mem_hotplug_mutex. | ||
1727 | */ | 1801 | */ |
1728 | mutex_lock(&ksm_thread_mutex); | 1802 | mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); |
1729 | break; | 1803 | break; |
1730 | 1804 | ||
1731 | case MEM_OFFLINE: | 1805 | case MEM_OFFLINE: |
diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db7..2221491ed50 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
71 | if (error) | 71 | if (error) |
72 | goto out; | 72 | goto out; |
73 | break; | 73 | break; |
74 | case MADV_HUGEPAGE: | ||
75 | case MADV_NOHUGEPAGE: | ||
76 | error = hugepage_madvise(vma, &new_flags, behavior); | ||
77 | if (error) | ||
78 | goto out; | ||
79 | break; | ||
74 | } | 80 | } |
75 | 81 | ||
76 | if (new_flags == vma->vm_flags) { | 82 | if (new_flags == vma->vm_flags) { |
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior) | |||
283 | case MADV_MERGEABLE: | 289 | case MADV_MERGEABLE: |
284 | case MADV_UNMERGEABLE: | 290 | case MADV_UNMERGEABLE: |
285 | #endif | 291 | #endif |
292 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
293 | case MADV_HUGEPAGE: | ||
294 | case MADV_NOHUGEPAGE: | ||
295 | #endif | ||
286 | return 1; | 296 | return 1; |
287 | 297 | ||
288 | default: | 298 | default: |
diff --git a/mm/memblock.c b/mm/memblock.c index 400dc62697d..4618fda975a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, | |||
137 | 137 | ||
138 | BUG_ON(0 == size); | 138 | BUG_ON(0 == size); |
139 | 139 | ||
140 | size = memblock_align_up(size, align); | ||
141 | |||
142 | /* Pump up max_addr */ | 140 | /* Pump up max_addr */ |
143 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | 141 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) |
144 | end = memblock.current_limit; | 142 | end = memblock.current_limit; |
@@ -683,13 +681,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
683 | 681 | ||
684 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) | 682 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) |
685 | { | 683 | { |
686 | int idx = memblock_search(&memblock.reserved, base); | 684 | int idx = memblock_search(&memblock.memory, base); |
687 | 685 | ||
688 | if (idx == -1) | 686 | if (idx == -1) |
689 | return 0; | 687 | return 0; |
690 | return memblock.reserved.regions[idx].base <= base && | 688 | return memblock.memory.regions[idx].base <= base && |
691 | (memblock.reserved.regions[idx].base + | 689 | (memblock.memory.regions[idx].base + |
692 | memblock.reserved.regions[idx].size) >= (base + size); | 690 | memblock.memory.regions[idx].size) >= (base + size); |
693 | } | 691 | } |
694 | 692 | ||
695 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 693 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
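The memblock change is a semantic fix: memblock_is_region_memory() answers "is this range entirely covered by a memory region?", so it must search memblock.memory rather than the reserved list. The containment test itself is plain interval arithmetic; here is a stand-alone sketch with a hypothetical region array in place of the memblock types:

    #include <stdbool.h>
    #include <stdio.h>

    struct region { unsigned long base, size; };

    /* Hypothetical stand-in for memblock_search() plus the containment check. */
    static bool region_covers(const struct region *regs, int nr,
                              unsigned long base, unsigned long size)
    {
        for (int i = 0; i < nr; i++) {
            if (base >= regs[i].base &&
                base + size <= regs[i].base + regs[i].size)
                return true;                  /* whole [base, base+size) fits */
        }
        return false;
    }

    int main(void)
    {
        struct region memory[] = { { 0x0, 0x8000000 }, { 0x10000000, 0x8000000 } };

        printf("%d\n", region_covers(memory, 2, 0x100000, 0x200000));  /* 1               */
        printf("%d\n", region_covers(memory, 2, 0x7f00000, 0x200000)); /* 0: spans a hole */
        return 0;
    }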
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efa8ea07ff..da53a252b25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; | |||
61 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 61 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
62 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 62 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
63 | int do_swap_account __read_mostly; | 63 | int do_swap_account __read_mostly; |
64 | static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | 64 | |
65 | /* to remember the boot option */ | ||
66 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | ||
67 | static int really_do_swap_account __initdata = 1; | ||
68 | #else | ||
69 | static int really_do_swap_account __initdata = 0; | ||
70 | #endif | ||
71 | |||
65 | #else | 72 | #else |
66 | #define do_swap_account (0) | 73 | #define do_swap_account (0) |
67 | #endif | 74 | #endif |
@@ -278,7 +285,7 @@ enum move_type { | |||
278 | 285 | ||
279 | /* "mc" and its members are protected by cgroup_mutex */ | 286 | /* "mc" and its members are protected by cgroup_mutex */ |
280 | static struct move_charge_struct { | 287 | static struct move_charge_struct { |
281 | spinlock_t lock; /* for from, to, moving_task */ | 288 | spinlock_t lock; /* for from, to */ |
282 | struct mem_cgroup *from; | 289 | struct mem_cgroup *from; |
283 | struct mem_cgroup *to; | 290 | struct mem_cgroup *to; |
284 | unsigned long precharge; | 291 | unsigned long precharge; |
@@ -593,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | |||
593 | } | 600 | } |
594 | 601 | ||
595 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 602 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
596 | struct page_cgroup *pc, | 603 | bool file, int nr_pages) |
597 | bool charge) | ||
598 | { | 604 | { |
599 | int val = (charge) ? 1 : -1; | ||
600 | |||
601 | preempt_disable(); | 605 | preempt_disable(); |
602 | 606 | ||
603 | if (PageCgroupCache(pc)) | 607 | if (file) |
604 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); | 608 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); |
605 | else | 609 | else |
606 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); | 610 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); |
607 | 611 | ||
608 | if (charge) | 612 | /* pagein of a big page is an event. So, ignore page size */ |
613 | if (nr_pages > 0) | ||
609 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); | 614 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
610 | else | 615 | else { |
611 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); | 616 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
612 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); | 617 | nr_pages = -nr_pages; /* for event */ |
618 | } | ||
619 | |||
620 | __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); | ||
613 | 621 | ||
614 | preempt_enable(); | 622 | preempt_enable(); |
615 | } | 623 | } |
@@ -808,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | |||
808 | * removed from global LRU. | 816 | * removed from global LRU. |
809 | */ | 817 | */ |
810 | mz = page_cgroup_zoneinfo(pc); | 818 | mz = page_cgroup_zoneinfo(pc); |
811 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 819 | /* huge page split is done under lru_lock. so, we have no races. */ |
820 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | ||
812 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 821 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
813 | return; | 822 | return; |
814 | VM_BUG_ON(list_empty(&pc->lru)); | 823 | VM_BUG_ON(list_empty(&pc->lru)); |
815 | list_del_init(&pc->lru); | 824 | list_del_init(&pc->lru); |
816 | return; | ||
817 | } | 825 | } |
818 | 826 | ||
819 | void mem_cgroup_del_lru(struct page *page) | 827 | void mem_cgroup_del_lru(struct page *page) |
@@ -830,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
830 | return; | 838 | return; |
831 | 839 | ||
832 | pc = lookup_page_cgroup(page); | 840 | pc = lookup_page_cgroup(page); |
833 | /* | ||
834 | * Used bit is set without atomic ops but after smp_wmb(). | ||
835 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | ||
836 | */ | ||
837 | smp_rmb(); | ||
838 | /* unused or root page is not rotated. */ | 841 | /* unused or root page is not rotated. */ |
839 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) | 842 | if (!PageCgroupUsed(pc)) |
843 | return; | ||
844 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
845 | smp_rmb(); | ||
846 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
840 | return; | 847 | return; |
841 | mz = page_cgroup_zoneinfo(pc); | 848 | mz = page_cgroup_zoneinfo(pc); |
842 | list_move(&pc->lru, &mz->lists[lru]); | 849 | list_move(&pc->lru, &mz->lists[lru]); |
@@ -851,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
851 | return; | 858 | return; |
852 | pc = lookup_page_cgroup(page); | 859 | pc = lookup_page_cgroup(page); |
853 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | 860 | VM_BUG_ON(PageCgroupAcctLRU(pc)); |
854 | /* | ||
855 | * Used bit is set without atomic ops but after smp_wmb(). | ||
856 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | ||
857 | */ | ||
858 | smp_rmb(); | ||
859 | if (!PageCgroupUsed(pc)) | 861 | if (!PageCgroupUsed(pc)) |
860 | return; | 862 | return; |
861 | 863 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | |
864 | smp_rmb(); | ||
862 | mz = page_cgroup_zoneinfo(pc); | 865 | mz = page_cgroup_zoneinfo(pc); |
863 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 866 | /* huge page split is done under lru_lock. so, we have no races. */ |
867 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | ||
864 | SetPageCgroupAcctLRU(pc); | 868 | SetPageCgroupAcctLRU(pc); |
865 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 869 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
866 | return; | 870 | return; |
@@ -1024,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1024 | return NULL; | 1028 | return NULL; |
1025 | 1029 | ||
1026 | pc = lookup_page_cgroup(page); | 1030 | pc = lookup_page_cgroup(page); |
1027 | /* | ||
1028 | * Used bit is set without atomic ops but after smp_wmb(). | ||
1029 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | ||
1030 | */ | ||
1031 | smp_rmb(); | ||
1032 | if (!PageCgroupUsed(pc)) | 1031 | if (!PageCgroupUsed(pc)) |
1033 | return NULL; | 1032 | return NULL; |
1034 | 1033 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | |
1034 | smp_rmb(); | ||
1035 | mz = page_cgroup_zoneinfo(pc); | 1035 | mz = page_cgroup_zoneinfo(pc); |
1036 | if (!mz) | 1036 | if (!mz) |
1037 | return NULL; | 1037 | return NULL; |
@@ -1079,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1079 | case 0: | 1079 | case 0: |
1080 | list_move(&page->lru, dst); | 1080 | list_move(&page->lru, dst); |
1081 | mem_cgroup_del_lru(page); | 1081 | mem_cgroup_del_lru(page); |
1082 | nr_taken++; | 1082 | nr_taken += hpage_nr_pages(page); |
1083 | break; | 1083 | break; |
1084 | case -EBUSY: | 1084 | case -EBUSY: |
1085 | /* we don't affect global LRU but rotate in our LRU */ | 1085 | /* we don't affect global LRU but rotate in our LRU */ |
@@ -1113,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) | |||
1113 | return false; | 1113 | return false; |
1114 | } | 1114 | } |
1115 | 1115 | ||
1116 | /** | ||
1117 | * mem_cgroup_check_margin - check if the memory cgroup allows charging | ||
1118 | * @mem: memory cgroup to check | ||
1119 | * @bytes: the number of bytes the caller intends to charge | ||
1120 | * | ||
1121 | * Returns a boolean value on whether @mem can be charged @bytes or | ||
1122 | * whether this would exceed the limit. | ||
1123 | */ | ||
1124 | static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) | ||
1125 | { | ||
1126 | if (!res_counter_check_margin(&mem->res, bytes)) | ||
1127 | return false; | ||
1128 | if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) | ||
1129 | return false; | ||
1130 | return true; | ||
1131 | } | ||
1132 | |||
1116 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1133 | static unsigned int get_swappiness(struct mem_cgroup *memcg) |
1117 | { | 1134 | { |
1118 | struct cgroup *cgrp = memcg->css.cgroup; | 1135 | struct cgroup *cgrp = memcg->css.cgroup; |
@@ -1304,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1304 | u64 limit; | 1321 | u64 limit; |
1305 | u64 memsw; | 1322 | u64 memsw; |
1306 | 1323 | ||
1307 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + | 1324 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
1308 | total_swap_pages; | 1325 | limit += total_swap_pages << PAGE_SHIFT; |
1326 | |||
1309 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 1327 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
1310 | /* | 1328 | /* |
1311 | * If memsw is finite and limits the amount of swap space available | 1329 | * If memsw is finite and limits the amount of swap space available |
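The mem_cgroup_get_limit() change is a units fix: res_counter limits are kept in bytes while total_swap_pages counts pages, so the swap contribution must be shifted by PAGE_SHIFT before being added. A two-line illustration of the size of the old error (4 KiB pages assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* 4 KiB pages assumed for the example */

    int main(void)
    {
        unsigned long long limit_bytes = 512ULL << 20;   /* 512 MiB memory limit */
        unsigned long long total_swap_pages = 131072;    /* 512 MiB of swap      */

        /* old: adds a page count to a byte count -- off by a factor of 4096 */
        printf("buggy: %llu\n", limit_bytes + total_swap_pages);
        /* new: convert pages to bytes first */
        printf("fixed: %llu\n", limit_bytes + (total_swap_pages << PAGE_SHIFT));
        return 0;
    }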
@@ -1592,11 +1610,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1592 | * possibility of race condition. If there is, we take a lock. | 1610 | * possibility of race condition. If there is, we take a lock. |
1593 | */ | 1611 | */ |
1594 | 1612 | ||
1595 | static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | 1613 | void mem_cgroup_update_page_stat(struct page *page, |
1614 | enum mem_cgroup_page_stat_item idx, int val) | ||
1596 | { | 1615 | { |
1597 | struct mem_cgroup *mem; | 1616 | struct mem_cgroup *mem; |
1598 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1617 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1599 | bool need_unlock = false; | 1618 | bool need_unlock = false; |
1619 | unsigned long uninitialized_var(flags); | ||
1600 | 1620 | ||
1601 | if (unlikely(!pc)) | 1621 | if (unlikely(!pc)) |
1602 | return; | 1622 | return; |
@@ -1606,39 +1626,36 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | |||
1606 | if (unlikely(!mem || !PageCgroupUsed(pc))) | 1626 | if (unlikely(!mem || !PageCgroupUsed(pc))) |
1607 | goto out; | 1627 | goto out; |
1608 | /* pc->mem_cgroup is unstable ? */ | 1628 | /* pc->mem_cgroup is unstable ? */ |
1609 | if (unlikely(mem_cgroup_stealed(mem))) { | 1629 | if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { |
1610 | /* take a lock against to access pc->mem_cgroup */ | 1630 | /* take a lock against to access pc->mem_cgroup */ |
1611 | lock_page_cgroup(pc); | 1631 | move_lock_page_cgroup(pc, &flags); |
1612 | need_unlock = true; | 1632 | need_unlock = true; |
1613 | mem = pc->mem_cgroup; | 1633 | mem = pc->mem_cgroup; |
1614 | if (!mem || !PageCgroupUsed(pc)) | 1634 | if (!mem || !PageCgroupUsed(pc)) |
1615 | goto out; | 1635 | goto out; |
1616 | } | 1636 | } |
1617 | 1637 | ||
1618 | this_cpu_add(mem->stat->count[idx], val); | ||
1619 | |||
1620 | switch (idx) { | 1638 | switch (idx) { |
1621 | case MEM_CGROUP_STAT_FILE_MAPPED: | 1639 | case MEMCG_NR_FILE_MAPPED: |
1622 | if (val > 0) | 1640 | if (val > 0) |
1623 | SetPageCgroupFileMapped(pc); | 1641 | SetPageCgroupFileMapped(pc); |
1624 | else if (!page_mapped(page)) | 1642 | else if (!page_mapped(page)) |
1625 | ClearPageCgroupFileMapped(pc); | 1643 | ClearPageCgroupFileMapped(pc); |
1644 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | ||
1626 | break; | 1645 | break; |
1627 | default: | 1646 | default: |
1628 | BUG(); | 1647 | BUG(); |
1629 | } | 1648 | } |
1630 | 1649 | ||
1650 | this_cpu_add(mem->stat->count[idx], val); | ||
1651 | |||
1631 | out: | 1652 | out: |
1632 | if (unlikely(need_unlock)) | 1653 | if (unlikely(need_unlock)) |
1633 | unlock_page_cgroup(pc); | 1654 | move_unlock_page_cgroup(pc, &flags); |
1634 | rcu_read_unlock(); | 1655 | rcu_read_unlock(); |
1635 | return; | 1656 | return; |
1636 | } | 1657 | } |
1637 | 1658 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | |
1638 | void mem_cgroup_update_file_mapped(struct page *page, int val) | ||
1639 | { | ||
1640 | mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
1641 | } | ||
1642 | 1659 | ||
1643 | /* | 1660 | /* |
1644 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1661 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
@@ -1834,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1834 | if (likely(!ret)) | 1851 | if (likely(!ret)) |
1835 | return CHARGE_OK; | 1852 | return CHARGE_OK; |
1836 | 1853 | ||
1854 | res_counter_uncharge(&mem->res, csize); | ||
1837 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 1855 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
1838 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1856 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1839 | } else | 1857 | } else |
1840 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 1858 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
1841 | 1859 | /* | |
1842 | if (csize > PAGE_SIZE) /* change csize and retry */ | 1860 | * csize can be either a huge page (HPAGE_SIZE), a batch of |
1861 | * regular pages (CHARGE_SIZE), or a single regular page | ||
1862 | * (PAGE_SIZE). | ||
1863 | * | ||
1864 | * Never reclaim on behalf of optional batching; retry with a | ||
1865 | * single page instead. | ||
1866 | */ | ||
1867 | if (csize == CHARGE_SIZE) | ||
1843 | return CHARGE_RETRY; | 1868 | return CHARGE_RETRY; |
1844 | 1869 | ||
1845 | if (!(gfp_mask & __GFP_WAIT)) | 1870 | if (!(gfp_mask & __GFP_WAIT)) |
1846 | return CHARGE_WOULDBLOCK; | 1871 | return CHARGE_WOULDBLOCK; |
1847 | 1872 | ||
1848 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 1873 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1849 | gfp_mask, flags); | 1874 | gfp_mask, flags); |
1875 | if (mem_cgroup_check_margin(mem_over_limit, csize)) | ||
1876 | return CHARGE_RETRY; | ||
1850 | /* | 1877 | /* |
1851 | * try_to_free_mem_cgroup_pages() might not give us a full | 1878 | * Even though the limit is exceeded at this point, reclaim |
1852 | * picture of reclaim. Some pages are reclaimed and might be | 1879 | * may have been able to free some pages. Retry the charge |
1853 | * moved to swap cache or just unmapped from the cgroup. | 1880 | * before killing the task. |
1854 | * Check the limit again to see if the reclaim reduced the | 1881 | * |
1855 | * current usage of the cgroup before giving up | 1882 | * Only for regular pages, though: huge pages are rather |
1883 | * unlikely to succeed so close to the limit, and we fall back | ||
1884 | * to regular pages anyway in case of failure. | ||
1856 | */ | 1885 | */ |
1857 | if (ret || mem_cgroup_check_under_limit(mem_over_limit)) | 1886 | if (csize == PAGE_SIZE && ret) |
1858 | return CHARGE_RETRY; | 1887 | return CHARGE_RETRY; |
1859 | 1888 | ||
1860 | /* | 1889 | /* |
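The rewritten retry policy in __mem_cgroup_do_charge() distinguishes three charge sizes: a batch (CHARGE_SIZE) is never worth reclaiming for and simply falls back to a single page, a single page is retried when reclaim restored enough margin or made any progress, and a huge page is retried only when the margin check says it would now fit (otherwise the caller falls back to small pages). A compact decision sketch; the constants, the enum and the simplified final return are illustrative, not the kernel's exact flow:

    #include <stdio.h>

    enum charge_result { CHARGE_OK, CHARGE_RETRY, CHARGE_WOULDBLOCK, CHARGE_NOMEM };

    #define PAGE_SZ   (4UL << 10)
    #define BATCH_SZ  (32 * PAGE_SZ)     /* CHARGE_SIZE analogue */
    #define HPAGE_SZ  (2UL << 20)        /* huge page analogue   */

    /* Decision taken after the counter rejected a charge of 'csize' bytes. */
    static enum charge_result after_charge_failed(unsigned long csize, int can_wait,
                                                  int margin_ok, int reclaimed_some)
    {
        if (csize == BATCH_SZ)
            return CHARGE_RETRY;              /* drop batching, retry one page   */
        if (!can_wait)
            return CHARGE_WOULDBLOCK;         /* no __GFP_WAIT: give up now      */
        if (margin_ok)
            return CHARGE_RETRY;              /* reclaim freed enough room       */
        if (csize == PAGE_SZ && reclaimed_some)
            return CHARGE_RETRY;              /* single page: any progress helps */
        return CHARGE_NOMEM;                  /* huge page near the limit: caller
                                                 falls back to regular pages     */
    }

    int main(void)
    {
        printf("%d\n", after_charge_failed(BATCH_SZ, 1, 0, 0));  /* 1: CHARGE_RETRY */
        printf("%d\n", after_charge_failed(HPAGE_SZ, 1, 0, 1));  /* 3: CHARGE_NOMEM */
        printf("%d\n", after_charge_failed(PAGE_SZ,  1, 0, 1));  /* 1: CHARGE_RETRY */
        return 0;
    }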
@@ -1879,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1879 | * oom-killer can be invoked. | 1908 | * oom-killer can be invoked. |
1880 | */ | 1909 | */ |
1881 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1910 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1882 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | 1911 | gfp_t gfp_mask, |
1912 | struct mem_cgroup **memcg, bool oom, | ||
1913 | int page_size) | ||
1883 | { | 1914 | { |
1884 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1915 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1885 | struct mem_cgroup *mem = NULL; | 1916 | struct mem_cgroup *mem = NULL; |
1886 | int ret; | 1917 | int ret; |
1887 | int csize = CHARGE_SIZE; | 1918 | int csize = max(CHARGE_SIZE, (unsigned long) page_size); |
1888 | 1919 | ||
1889 | /* | 1920 | /* |
1890 | * Unlike the global VM's OOM-kill, we're not in memory shortage | 1921 | * Unlike the global VM's OOM-kill, we're not in memory shortage |
@@ -1909,7 +1940,7 @@ again: | |||
1909 | VM_BUG_ON(css_is_removed(&mem->css)); | 1940 | VM_BUG_ON(css_is_removed(&mem->css)); |
1910 | if (mem_cgroup_is_root(mem)) | 1941 | if (mem_cgroup_is_root(mem)) |
1911 | goto done; | 1942 | goto done; |
1912 | if (consume_stock(mem)) | 1943 | if (page_size == PAGE_SIZE && consume_stock(mem)) |
1913 | goto done; | 1944 | goto done; |
1914 | css_get(&mem->css); | 1945 | css_get(&mem->css); |
1915 | } else { | 1946 | } else { |
@@ -1917,23 +1948,22 @@ again: | |||
1917 | 1948 | ||
1918 | rcu_read_lock(); | 1949 | rcu_read_lock(); |
1919 | p = rcu_dereference(mm->owner); | 1950 | p = rcu_dereference(mm->owner); |
1920 | VM_BUG_ON(!p); | ||
1921 | /* | 1951 | /* |
1922 | * because we don't have task_lock(), "p" can exit while | 1952 | * Because we don't have task_lock(), "p" can exit. |
1923 | * we're here. In that case, "mem" can point to root | 1953 | * In that case, "mem" can point to root or p can be NULL with |
1924 | * cgroup but never be NULL. (and task_struct itself is freed | 1954 | * race with swapoff. Then, we have a small risk of mis-accounting. |
1925 | * by RCU, cgroup itself is RCU safe.) Then, we have small | 1955 | * But such kind of mis-account by race always happens because |
1926 | * risk here to get wrong cgroup. But such kind of mis-account | 1956 | * we don't have cgroup_mutex(). It's overkill and we allow that |
1927 | * by race always happens because we don't have cgroup_mutex(). | 1957 | * small race, here. |
1928 | * It's overkill and we allow that small race, here. | 1958 | * (*) swapoff et al. will charge against the mm_struct, not against |
1959 | * the task_struct, so mm->owner can be NULL. | ||
1929 | */ | 1960 | */ |
1930 | mem = mem_cgroup_from_task(p); | 1961 | mem = mem_cgroup_from_task(p); |
1931 | VM_BUG_ON(!mem); | 1962 | if (!mem || mem_cgroup_is_root(mem)) { |
1932 | if (mem_cgroup_is_root(mem)) { | ||
1933 | rcu_read_unlock(); | 1963 | rcu_read_unlock(); |
1934 | goto done; | 1964 | goto done; |
1935 | } | 1965 | } |
1936 | if (consume_stock(mem)) { | 1966 | if (page_size == PAGE_SIZE && consume_stock(mem)) { |
1937 | /* | 1967 | /* |
1938 | * It seems dangerous to access memcg without css_get(). | 1968 | * It seems dangerous to access memcg without css_get(). |
1939 | * But considering how consume_stock works, it's not | 1969 | * But considering how consume_stock works, it's not |
@@ -1974,7 +2004,7 @@ again: | |||
1974 | case CHARGE_OK: | 2004 | case CHARGE_OK: |
1975 | break; | 2005 | break; |
1976 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2006 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
1977 | csize = PAGE_SIZE; | 2007 | csize = page_size; |
1978 | css_put(&mem->css); | 2008 | css_put(&mem->css); |
1979 | mem = NULL; | 2009 | mem = NULL; |
1980 | goto again; | 2010 | goto again; |
@@ -1995,8 +2025,8 @@ again: | |||
1995 | } | 2025 | } |
1996 | } while (ret != CHARGE_OK); | 2026 | } while (ret != CHARGE_OK); |
1997 | 2027 | ||
1998 | if (csize > PAGE_SIZE) | 2028 | if (csize > page_size) |
1999 | refill_stock(mem, csize - PAGE_SIZE); | 2029 | refill_stock(mem, csize - page_size); |
2000 | css_put(&mem->css); | 2030 | css_put(&mem->css); |
2001 | done: | 2031 | done: |
2002 | *memcg = mem; | 2032 | *memcg = mem; |
@@ -2024,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | |||
2024 | } | 2054 | } |
2025 | } | 2055 | } |
2026 | 2056 | ||
2027 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 2057 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
2058 | int page_size) | ||
2028 | { | 2059 | { |
2029 | __mem_cgroup_cancel_charge(mem, 1); | 2060 | __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); |
2030 | } | 2061 | } |
2031 | 2062 | ||
2032 | /* | 2063 | /* |
@@ -2076,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2076 | return mem; | 2107 | return mem; |
2077 | } | 2108 | } |
2078 | 2109 | ||
2079 | /* | ||
2080 | * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be | ||
2081 | * USED state. If already USED, uncharge and return. | ||
2082 | */ | ||
2083 | |||
2084 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2110 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, |
2085 | struct page_cgroup *pc, | 2111 | struct page_cgroup *pc, |
2086 | enum charge_type ctype) | 2112 | enum charge_type ctype, |
2113 | int page_size) | ||
2087 | { | 2114 | { |
2115 | int nr_pages = page_size >> PAGE_SHIFT; | ||
2116 | |||
2088 | /* try_charge() can return NULL to *memcg, taking care of it. */ | 2117 | /* try_charge() can return NULL to *memcg, taking care of it. */ |
2089 | if (!mem) | 2118 | if (!mem) |
2090 | return; | 2119 | return; |
@@ -2092,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2092 | lock_page_cgroup(pc); | 2121 | lock_page_cgroup(pc); |
2093 | if (unlikely(PageCgroupUsed(pc))) { | 2122 | if (unlikely(PageCgroupUsed(pc))) { |
2094 | unlock_page_cgroup(pc); | 2123 | unlock_page_cgroup(pc); |
2095 | mem_cgroup_cancel_charge(mem); | 2124 | mem_cgroup_cancel_charge(mem, page_size); |
2096 | return; | 2125 | return; |
2097 | } | 2126 | } |
2098 | 2127 | /* | |
2128 | * we don't need page_cgroup_lock for tail pages, because they are not | ||
2129 | * accessed by any other context at this point. | ||
2130 | */ | ||
2099 | pc->mem_cgroup = mem; | 2131 | pc->mem_cgroup = mem; |
2100 | /* | 2132 | /* |
2101 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2133 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
@@ -2119,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2119 | break; | 2151 | break; |
2120 | } | 2152 | } |
2121 | 2153 | ||
2122 | mem_cgroup_charge_statistics(mem, pc, true); | 2154 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); |
2123 | |||
2124 | unlock_page_cgroup(pc); | 2155 | unlock_page_cgroup(pc); |
2125 | /* | 2156 | /* |
2126 | * "charge_statistics" updated event counter. Then, check it. | 2157 | * "charge_statistics" updated event counter. Then, check it. |
@@ -2130,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2130 | memcg_check_events(mem, pc->page); | 2161 | memcg_check_events(mem, pc->page); |
2131 | } | 2162 | } |
2132 | 2163 | ||
2164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
2165 | |||
2166 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | ||
2167 | (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) | ||
2168 | /* | ||
2169 | * Because tail pages are not marked as "used", set it here. We're under | ||
2170 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | ||
2171 | */ | ||
2172 | void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | ||
2173 | { | ||
2174 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | ||
2175 | struct page_cgroup *tail_pc = lookup_page_cgroup(tail); | ||
2176 | unsigned long flags; | ||
2177 | |||
2178 | if (mem_cgroup_disabled()) | ||
2179 | return; | ||
2180 | /* | ||
2181 | * We have no races with charge/uncharge but will have races with | ||
2182 | * page state accounting. | ||
2183 | */ | ||
2184 | move_lock_page_cgroup(head_pc, &flags); | ||
2185 | |||
2186 | tail_pc->mem_cgroup = head_pc->mem_cgroup; | ||
2187 | smp_wmb(); /* see __commit_charge() */ | ||
2188 | if (PageCgroupAcctLRU(head_pc)) { | ||
2189 | enum lru_list lru; | ||
2190 | struct mem_cgroup_per_zone *mz; | ||
2191 | |||
2192 | /* | ||
2193 | * LRU flags cannot be copied because we need to add the tail | ||
2194 | * page to the LRU by the generic call, and our hook will be called. | ||
2195 | * We hold lru_lock, so reduce the counter directly. | ||
2196 | */ | ||
2197 | lru = page_lru(head); | ||
2198 | mz = page_cgroup_zoneinfo(head_pc); | ||
2199 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | ||
2200 | } | ||
2201 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | ||
2202 | move_unlock_page_cgroup(head_pc, &flags); | ||
2203 | } | ||
2204 | #endif | ||
2205 | |||
2133 | /** | 2206 | /** |
2134 | * __mem_cgroup_move_account - move account of the page | 2207 | * __mem_cgroup_move_account - move account of the page |
2135 | * @pc: page_cgroup of the page. | 2208 | * @pc: page_cgroup of the page. |
@@ -2148,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2148 | */ | 2221 | */ |
2149 | 2222 | ||
2150 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 2223 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
2151 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | 2224 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, |
2225 | int charge_size) | ||
2152 | { | 2226 | { |
2227 | int nr_pages = charge_size >> PAGE_SHIFT; | ||
2228 | |||
2153 | VM_BUG_ON(from == to); | 2229 | VM_BUG_ON(from == to); |
2154 | VM_BUG_ON(PageLRU(pc->page)); | 2230 | VM_BUG_ON(PageLRU(pc->page)); |
2155 | VM_BUG_ON(!PageCgroupLocked(pc)); | 2231 | VM_BUG_ON(!page_is_cgroup_locked(pc)); |
2156 | VM_BUG_ON(!PageCgroupUsed(pc)); | 2232 | VM_BUG_ON(!PageCgroupUsed(pc)); |
2157 | VM_BUG_ON(pc->mem_cgroup != from); | 2233 | VM_BUG_ON(pc->mem_cgroup != from); |
2158 | 2234 | ||
@@ -2163,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2163 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2239 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2164 | preempt_enable(); | 2240 | preempt_enable(); |
2165 | } | 2241 | } |
2166 | mem_cgroup_charge_statistics(from, pc, false); | 2242 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); |
2167 | if (uncharge) | 2243 | if (uncharge) |
2168 | /* This is not "cancel", but cancel_charge does all we need. */ | 2244 | /* This is not "cancel", but cancel_charge does all we need. */ |
2169 | mem_cgroup_cancel_charge(from); | 2245 | mem_cgroup_cancel_charge(from, charge_size); |
2170 | 2246 | ||
2171 | /* caller should have done css_get */ | 2247 | /* caller should have done css_get */ |
2172 | pc->mem_cgroup = to; | 2248 | pc->mem_cgroup = to; |
2173 | mem_cgroup_charge_statistics(to, pc, true); | 2249 | mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); |
2174 | /* | 2250 | /* |
2175 | * We charge against "to" which may not have any tasks. Then, "to" | 2251 | * We charge against "to" which may not have any tasks. Then, "to" |
2176 | * can be under rmdir(). But in current implementation, caller of | 2252 | * can be under rmdir(). But in current implementation, caller of |
@@ -2185,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2185 | * __mem_cgroup_move_account() | 2261 | * __mem_cgroup_move_account() |
2186 | */ | 2262 | */ |
2187 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 2263 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
2188 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | 2264 | struct mem_cgroup *from, struct mem_cgroup *to, |
2265 | bool uncharge, int charge_size) | ||
2189 | { | 2266 | { |
2190 | int ret = -EINVAL; | 2267 | int ret = -EINVAL; |
2268 | unsigned long flags; | ||
2269 | /* | ||
2270 | * The page is isolated from LRU. So, collapse function | ||
2271 | * will not handle this page. But page splitting can happen. | ||
2272 | * Do this check under compound_page_lock(). The caller should | ||
2273 | * hold it. | ||
2274 | */ | ||
2275 | if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) | ||
2276 | return -EBUSY; | ||
2277 | |||
2191 | lock_page_cgroup(pc); | 2278 | lock_page_cgroup(pc); |
2192 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 2279 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
2193 | __mem_cgroup_move_account(pc, from, to, uncharge); | 2280 | move_lock_page_cgroup(pc, &flags); |
2281 | __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); | ||
2282 | move_unlock_page_cgroup(pc, &flags); | ||
2194 | ret = 0; | 2283 | ret = 0; |
2195 | } | 2284 | } |
2196 | unlock_page_cgroup(pc); | 2285 | unlock_page_cgroup(pc); |
@@ -2214,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
2214 | struct cgroup *cg = child->css.cgroup; | 2303 | struct cgroup *cg = child->css.cgroup; |
2215 | struct cgroup *pcg = cg->parent; | 2304 | struct cgroup *pcg = cg->parent; |
2216 | struct mem_cgroup *parent; | 2305 | struct mem_cgroup *parent; |
2306 | int page_size = PAGE_SIZE; | ||
2307 | unsigned long flags; | ||
2217 | int ret; | 2308 | int ret; |
2218 | 2309 | ||
2219 | /* Is ROOT ? */ | 2310 | /* Is ROOT ? */ |
@@ -2226,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
2226 | if (isolate_lru_page(page)) | 2317 | if (isolate_lru_page(page)) |
2227 | goto put; | 2318 | goto put; |
2228 | 2319 | ||
2320 | if (PageTransHuge(page)) | ||
2321 | page_size = HPAGE_SIZE; | ||
2322 | |||
2229 | parent = mem_cgroup_from_cont(pcg); | 2323 | parent = mem_cgroup_from_cont(pcg); |
2230 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 2324 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, |
2325 | &parent, false, page_size); | ||
2231 | if (ret || !parent) | 2326 | if (ret || !parent) |
2232 | goto put_back; | 2327 | goto put_back; |
2233 | 2328 | ||
2234 | ret = mem_cgroup_move_account(pc, child, parent, true); | 2329 | if (page_size > PAGE_SIZE) |
2330 | flags = compound_lock_irqsave(page); | ||
2331 | |||
2332 | ret = mem_cgroup_move_account(pc, child, parent, true, page_size); | ||
2235 | if (ret) | 2333 | if (ret) |
2236 | mem_cgroup_cancel_charge(parent); | 2334 | mem_cgroup_cancel_charge(parent, page_size); |
2335 | |||
2336 | if (page_size > PAGE_SIZE) | ||
2337 | compound_unlock_irqrestore(page, flags); | ||
2237 | put_back: | 2338 | put_back: |
2238 | putback_lru_page(page); | 2339 | putback_lru_page(page); |
2239 | put: | 2340 | put: |
@@ -2252,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2252 | gfp_t gfp_mask, enum charge_type ctype) | 2353 | gfp_t gfp_mask, enum charge_type ctype) |
2253 | { | 2354 | { |
2254 | struct mem_cgroup *mem = NULL; | 2355 | struct mem_cgroup *mem = NULL; |
2356 | int page_size = PAGE_SIZE; | ||
2255 | struct page_cgroup *pc; | 2357 | struct page_cgroup *pc; |
2358 | bool oom = true; | ||
2256 | int ret; | 2359 | int ret; |
2257 | 2360 | ||
2361 | if (PageTransHuge(page)) { | ||
2362 | page_size <<= compound_order(page); | ||
2363 | VM_BUG_ON(!PageTransHuge(page)); | ||
2364 | /* | ||
2365 | * Never OOM-kill a process for a huge page. The | ||
2366 | * fault handler will fall back to regular pages. | ||
2367 | */ | ||
2368 | oom = false; | ||
2369 | } | ||
2370 | |||
2258 | pc = lookup_page_cgroup(page); | 2371 | pc = lookup_page_cgroup(page); |
2259 | /* can happen at boot */ | 2372 | /* can happen at boot */ |
2260 | if (unlikely(!pc)) | 2373 | if (unlikely(!pc)) |
2261 | return 0; | 2374 | return 0; |
2262 | prefetchw(pc); | 2375 | prefetchw(pc); |
2263 | 2376 | ||
2264 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 2377 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); |
2265 | if (ret || !mem) | 2378 | if (ret || !mem) |
2266 | return ret; | 2379 | return ret; |
2267 | 2380 | ||
2268 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2381 | __mem_cgroup_commit_charge(mem, pc, ctype, page_size); |
2269 | return 0; | 2382 | return 0; |
2270 | } | 2383 | } |
2271 | 2384 | ||
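
With the hunk above, mem_cgroup_charge_common charges the whole compound page in one res_counter transaction and never OOM-kills for it, because the fault path can fall back to 4k pages. The size arithmetic, spelled out as a runnable sketch (4 KiB base pages and an order-9 THP assumed, as on x86-64):

#include <stdio.h>

#define PAGE_SHIFT 12                     /* assumption: 4 KiB base pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        int compound_order = 9;           /* assumption: 2 MiB THP */
        unsigned long page_size = PAGE_SIZE << compound_order;
        unsigned long nr_pages = page_size >> PAGE_SHIFT;

        /* one 2 MiB charge instead of 512 separate 4 KiB charges */
        printf("charge %lu bytes (%lu pages)\n", page_size, nr_pages);
        return 0;
}
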
@@ -2274,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2274 | { | 2387 | { |
2275 | if (mem_cgroup_disabled()) | 2388 | if (mem_cgroup_disabled()) |
2276 | return 0; | 2389 | return 0; |
2277 | if (PageCompound(page)) | ||
2278 | return 0; | ||
2279 | /* | 2390 | /* |
2280 | * If already mapped, we don't have to account. | 2391 | * If already mapped, we don't have to account. |
2281 | * If page cache, page->mapping has address_space. | 2392 | * If page cache, page->mapping has address_space. |
@@ -2381,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2381 | if (!mem) | 2492 | if (!mem) |
2382 | goto charge_cur_mm; | 2493 | goto charge_cur_mm; |
2383 | *ptr = mem; | 2494 | *ptr = mem; |
2384 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 2495 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); |
2385 | css_put(&mem->css); | 2496 | css_put(&mem->css); |
2386 | return ret; | 2497 | return ret; |
2387 | charge_cur_mm: | 2498 | charge_cur_mm: |
2388 | if (unlikely(!mm)) | 2499 | if (unlikely(!mm)) |
2389 | mm = &init_mm; | 2500 | mm = &init_mm; |
2390 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 2501 | return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); |
2391 | } | 2502 | } |
2392 | 2503 | ||
2393 | static void | 2504 | static void |
@@ -2403,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2403 | cgroup_exclude_rmdir(&ptr->css); | 2514 | cgroup_exclude_rmdir(&ptr->css); |
2404 | pc = lookup_page_cgroup(page); | 2515 | pc = lookup_page_cgroup(page); |
2405 | mem_cgroup_lru_del_before_commit_swapcache(page); | 2516 | mem_cgroup_lru_del_before_commit_swapcache(page); |
2406 | __mem_cgroup_commit_charge(ptr, pc, ctype); | 2517 | __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); |
2407 | mem_cgroup_lru_add_after_commit_swapcache(page); | 2518 | mem_cgroup_lru_add_after_commit_swapcache(page); |
2408 | /* | 2519 | /* |
2409 | * Now swap is on-memory. This means this page may be | 2520 | * Now swap is on-memory. This means this page may be |
@@ -2452,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
2452 | return; | 2563 | return; |
2453 | if (!mem) | 2564 | if (!mem) |
2454 | return; | 2565 | return; |
2455 | mem_cgroup_cancel_charge(mem); | 2566 | mem_cgroup_cancel_charge(mem, PAGE_SIZE); |
2456 | } | 2567 | } |
2457 | 2568 | ||
2458 | static void | 2569 | static void |
2459 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | 2570 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, |
2571 | int page_size) | ||
2460 | { | 2572 | { |
2461 | struct memcg_batch_info *batch = NULL; | 2573 | struct memcg_batch_info *batch = NULL; |
2462 | bool uncharge_memsw = true; | 2574 | bool uncharge_memsw = true; |
@@ -2483,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2483 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2595 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
2484 | goto direct_uncharge; | 2596 | goto direct_uncharge; |
2485 | 2597 | ||
2598 | if (page_size != PAGE_SIZE) | ||
2599 | goto direct_uncharge; | ||
2600 | |||
2486 | /* | 2601 | /* |
2487 | * In typical case, batch->memcg == mem. This means we can | 2602 | * In typical case, batch->memcg == mem. This means we can |
2488 | * merge a series of uncharges to an uncharge of res_counter. | 2603 | * merge a series of uncharges to an uncharge of res_counter. |
@@ -2496,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2496 | batch->memsw_bytes += PAGE_SIZE; | 2611 | batch->memsw_bytes += PAGE_SIZE; |
2497 | return; | 2612 | return; |
2498 | direct_uncharge: | 2613 | direct_uncharge: |
2499 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2614 | res_counter_uncharge(&mem->res, page_size); |
2500 | if (uncharge_memsw) | 2615 | if (uncharge_memsw) |
2501 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2616 | res_counter_uncharge(&mem->memsw, page_size); |
2502 | if (unlikely(batch->memcg != mem)) | 2617 | if (unlikely(batch->memcg != mem)) |
2503 | memcg_oom_recover(mem); | 2618 | memcg_oom_recover(mem); |
2504 | return; | 2619 | return; |
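
__do_uncharge keeps batching ordinary PAGE_SIZE uncharges per task but now sends anything larger straight to res_counter_uncharge(); the batch only ever accumulates in PAGE_SIZE steps, so folding a 2 MiB uncharge into it would complicate the bookkeeping for little gain. A hedged userspace analogue of that batch-small, go-direct-for-large shape (every name below is a stand-in, not kernel API):

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long batched_bytes;       /* stands in for batch->bytes */
static unsigned long counter = 1UL << 30; /* stands in for the res_counter */

static void direct_uncharge(unsigned long size) { counter -= size; }

static void do_uncharge(unsigned long page_size)
{
        if (page_size != PAGE_SIZE) {     /* huge page: bypass the batch */
                direct_uncharge(page_size);
                return;
        }
        batched_bytes += PAGE_SIZE;       /* flushed later in one go */
}

int main(void)
{
        do_uncharge(PAGE_SIZE);           /* batched */
        do_uncharge(2UL << 20);           /* 2 MiB: goes direct */
        printf("batched=%lu counter=%lu\n", batched_bytes, counter);
        return 0;
}
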
@@ -2510,8 +2625,10 @@ direct_uncharge: | |||
2510 | static struct mem_cgroup * | 2625 | static struct mem_cgroup * |
2511 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2626 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
2512 | { | 2627 | { |
2628 | int count; | ||
2513 | struct page_cgroup *pc; | 2629 | struct page_cgroup *pc; |
2514 | struct mem_cgroup *mem = NULL; | 2630 | struct mem_cgroup *mem = NULL; |
2631 | int page_size = PAGE_SIZE; | ||
2515 | 2632 | ||
2516 | if (mem_cgroup_disabled()) | 2633 | if (mem_cgroup_disabled()) |
2517 | return NULL; | 2634 | return NULL; |
@@ -2519,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2519 | if (PageSwapCache(page)) | 2636 | if (PageSwapCache(page)) |
2520 | return NULL; | 2637 | return NULL; |
2521 | 2638 | ||
2639 | if (PageTransHuge(page)) { | ||
2640 | page_size <<= compound_order(page); | ||
2641 | VM_BUG_ON(!PageTransHuge(page)); | ||
2642 | } | ||
2643 | |||
2644 | count = page_size >> PAGE_SHIFT; | ||
2522 | /* | 2645 | /* |
2523 | * Check if our page_cgroup is valid | 2646 | * Check if our page_cgroup is valid |
2524 | */ | 2647 | */ |
@@ -2551,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2551 | break; | 2674 | break; |
2552 | } | 2675 | } |
2553 | 2676 | ||
2554 | mem_cgroup_charge_statistics(mem, pc, false); | 2677 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); |
2555 | 2678 | ||
2556 | ClearPageCgroupUsed(pc); | 2679 | ClearPageCgroupUsed(pc); |
2557 | /* | 2680 | /* |
@@ -2572,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2572 | mem_cgroup_get(mem); | 2695 | mem_cgroup_get(mem); |
2573 | } | 2696 | } |
2574 | if (!mem_cgroup_is_root(mem)) | 2697 | if (!mem_cgroup_is_root(mem)) |
2575 | __do_uncharge(mem, ctype); | 2698 | __do_uncharge(mem, ctype, page_size); |
2576 | 2699 | ||
2577 | return mem; | 2700 | return mem; |
2578 | 2701 | ||
@@ -2767,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2767 | enum charge_type ctype; | 2890 | enum charge_type ctype; |
2768 | int ret = 0; | 2891 | int ret = 0; |
2769 | 2892 | ||
2893 | VM_BUG_ON(PageTransHuge(page)); | ||
2770 | if (mem_cgroup_disabled()) | 2894 | if (mem_cgroup_disabled()) |
2771 | return 0; | 2895 | return 0; |
2772 | 2896 | ||
@@ -2816,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2816 | return 0; | 2940 | return 0; |
2817 | 2941 | ||
2818 | *ptr = mem; | 2942 | *ptr = mem; |
2819 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2943 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); |
2820 | css_put(&mem->css);/* drop extra refcnt */ | 2944 | css_put(&mem->css);/* drop extra refcnt */ |
2821 | if (ret || *ptr == NULL) { | 2945 | if (ret || *ptr == NULL) { |
2822 | if (PageAnon(page)) { | 2946 | if (PageAnon(page)) { |
@@ -2843,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2843 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 2967 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
2844 | else | 2968 | else |
2845 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 2969 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
2846 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2970 | __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); |
2847 | return ret; | 2971 | return ret; |
2848 | } | 2972 | } |
2849 | 2973 | ||
2850 | /* remove redundant charge if migration failed*/ | 2974 | /* remove redundant charge if migration failed*/ |
2851 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2975 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
2852 | struct page *oldpage, struct page *newpage) | 2976 | struct page *oldpage, struct page *newpage, bool migration_ok) |
2853 | { | 2977 | { |
2854 | struct page *used, *unused; | 2978 | struct page *used, *unused; |
2855 | struct page_cgroup *pc; | 2979 | struct page_cgroup *pc; |
@@ -2858,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
2858 | return; | 2982 | return; |
2859 | /* blocks rmdir() */ | 2983 | /* blocks rmdir() */ |
2860 | cgroup_exclude_rmdir(&mem->css); | 2984 | cgroup_exclude_rmdir(&mem->css); |
2861 | /* at migration success, oldpage->mapping is NULL. */ | 2985 | if (!migration_ok) { |
2862 | if (oldpage->mapping) { | ||
2863 | used = oldpage; | 2986 | used = oldpage; |
2864 | unused = newpage; | 2987 | unused = newpage; |
2865 | } else { | 2988 | } else { |
@@ -4169,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
4169 | */ | 4292 | */ |
4170 | if (!node_state(node, N_NORMAL_MEMORY)) | 4293 | if (!node_state(node, N_NORMAL_MEMORY)) |
4171 | tmp = -1; | 4294 | tmp = -1; |
4172 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 4295 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
4173 | if (!pn) | 4296 | if (!pn) |
4174 | return 1; | 4297 | return 1; |
4175 | 4298 | ||
4176 | mem->info.nodeinfo[node] = pn; | 4299 | mem->info.nodeinfo[node] = pn; |
4177 | memset(pn, 0, sizeof(*pn)); | ||
4178 | |||
4179 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4300 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4180 | mz = &pn->zoneinfo[zone]; | 4301 | mz = &pn->zoneinfo[zone]; |
4181 | for_each_lru(l) | 4302 | for_each_lru(l) |
@@ -4199,14 +4320,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
4199 | 4320 | ||
4200 | /* Can be very big if MAX_NUMNODES is very big */ | 4321 | /* Can be very big if MAX_NUMNODES is very big */ |
4201 | if (size < PAGE_SIZE) | 4322 | if (size < PAGE_SIZE) |
4202 | mem = kmalloc(size, GFP_KERNEL); | 4323 | mem = kzalloc(size, GFP_KERNEL); |
4203 | else | 4324 | else |
4204 | mem = vmalloc(size); | 4325 | mem = vzalloc(size); |
4205 | 4326 | ||
4206 | if (!mem) | 4327 | if (!mem) |
4207 | return NULL; | 4328 | return NULL; |
4208 | 4329 | ||
4209 | memset(mem, 0, size); | ||
4210 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4330 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
4211 | if (!mem->stat) | 4331 | if (!mem->stat) |
4212 | goto out_free; | 4332 | goto out_free; |
@@ -4454,7 +4574,8 @@ one_by_one: | |||
4454 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4574 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
4455 | cond_resched(); | 4575 | cond_resched(); |
4456 | } | 4576 | } |
4457 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 4577 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
4578 | PAGE_SIZE); | ||
4458 | if (ret || !mem) | 4579 | if (ret || !mem) |
4459 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4580 | /* mem_cgroup_clear_mc() will do uncharge later */ |
4460 | return -ENOMEM; | 4581 | return -ENOMEM; |
@@ -4616,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
4616 | pte_t *pte; | 4737 | pte_t *pte; |
4617 | spinlock_t *ptl; | 4738 | spinlock_t *ptl; |
4618 | 4739 | ||
4740 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
4619 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4741 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4620 | for (; addr != end; pte++, addr += PAGE_SIZE) | 4742 | for (; addr != end; pte++, addr += PAGE_SIZE) |
4621 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 4743 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) |
@@ -4653,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4653 | 4775 | ||
4654 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | 4776 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) |
4655 | { | 4777 | { |
4656 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | 4778 | unsigned long precharge = mem_cgroup_count_precharge(mm); |
4779 | |||
4780 | VM_BUG_ON(mc.moving_task); | ||
4781 | mc.moving_task = current; | ||
4782 | return mem_cgroup_do_precharge(precharge); | ||
4657 | } | 4783 | } |
4658 | 4784 | ||
4659 | static void mem_cgroup_clear_mc(void) | 4785 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ |
4786 | static void __mem_cgroup_clear_mc(void) | ||
4660 | { | 4787 | { |
4661 | struct mem_cgroup *from = mc.from; | 4788 | struct mem_cgroup *from = mc.from; |
4662 | struct mem_cgroup *to = mc.to; | 4789 | struct mem_cgroup *to = mc.to; |
@@ -4691,18 +4818,28 @@ static void mem_cgroup_clear_mc(void) | |||
4691 | PAGE_SIZE * mc.moved_swap); | 4818 | PAGE_SIZE * mc.moved_swap); |
4692 | } | 4819 | } |
4693 | /* we've already done mem_cgroup_get(mc.to) */ | 4820 | /* we've already done mem_cgroup_get(mc.to) */ |
4694 | |||
4695 | mc.moved_swap = 0; | 4821 | mc.moved_swap = 0; |
4696 | } | 4822 | } |
4823 | memcg_oom_recover(from); | ||
4824 | memcg_oom_recover(to); | ||
4825 | wake_up_all(&mc.waitq); | ||
4826 | } | ||
4827 | |||
4828 | static void mem_cgroup_clear_mc(void) | ||
4829 | { | ||
4830 | struct mem_cgroup *from = mc.from; | ||
4831 | |||
4832 | /* | ||
4833 | * we must clear moving_task before waking up waiters at the end of | ||
4834 | * task migration. | ||
4835 | */ | ||
4836 | mc.moving_task = NULL; | ||
4837 | __mem_cgroup_clear_mc(); | ||
4697 | spin_lock(&mc.lock); | 4838 | spin_lock(&mc.lock); |
4698 | mc.from = NULL; | 4839 | mc.from = NULL; |
4699 | mc.to = NULL; | 4840 | mc.to = NULL; |
4700 | mc.moving_task = NULL; | ||
4701 | spin_unlock(&mc.lock); | 4841 | spin_unlock(&mc.lock); |
4702 | mem_cgroup_end_move(from); | 4842 | mem_cgroup_end_move(from); |
4703 | memcg_oom_recover(from); | ||
4704 | memcg_oom_recover(to); | ||
4705 | wake_up_all(&mc.waitq); | ||
4706 | } | 4843 | } |
4707 | 4844 | ||
4708 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 4845 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
@@ -4729,16 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4729 | VM_BUG_ON(mc.precharge); | 4866 | VM_BUG_ON(mc.precharge); |
4730 | VM_BUG_ON(mc.moved_charge); | 4867 | VM_BUG_ON(mc.moved_charge); |
4731 | VM_BUG_ON(mc.moved_swap); | 4868 | VM_BUG_ON(mc.moved_swap); |
4732 | VM_BUG_ON(mc.moving_task); | ||
4733 | mem_cgroup_start_move(from); | 4869 | mem_cgroup_start_move(from); |
4734 | spin_lock(&mc.lock); | 4870 | spin_lock(&mc.lock); |
4735 | mc.from = from; | 4871 | mc.from = from; |
4736 | mc.to = mem; | 4872 | mc.to = mem; |
4737 | mc.precharge = 0; | ||
4738 | mc.moved_charge = 0; | ||
4739 | mc.moved_swap = 0; | ||
4740 | mc.moving_task = current; | ||
4741 | spin_unlock(&mc.lock); | 4873 | spin_unlock(&mc.lock); |
4874 | /* We set mc.moving_task later */ | ||
4742 | 4875 | ||
4743 | ret = mem_cgroup_precharge_mc(mm); | 4876 | ret = mem_cgroup_precharge_mc(mm); |
4744 | if (ret) | 4877 | if (ret) |
@@ -4767,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
4767 | spinlock_t *ptl; | 4900 | spinlock_t *ptl; |
4768 | 4901 | ||
4769 | retry: | 4902 | retry: |
4903 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
4770 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4904 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4771 | for (; addr != end; addr += PAGE_SIZE) { | 4905 | for (; addr != end; addr += PAGE_SIZE) { |
4772 | pte_t ptent = *(pte++); | 4906 | pte_t ptent = *(pte++); |
@@ -4787,7 +4921,7 @@ retry: | |||
4787 | goto put; | 4921 | goto put; |
4788 | pc = lookup_page_cgroup(page); | 4922 | pc = lookup_page_cgroup(page); |
4789 | if (!mem_cgroup_move_account(pc, | 4923 | if (!mem_cgroup_move_account(pc, |
4790 | mc.from, mc.to, false)) { | 4924 | mc.from, mc.to, false, PAGE_SIZE)) { |
4791 | mc.precharge--; | 4925 | mc.precharge--; |
4792 | /* we uncharge from mc.from later. */ | 4926 | /* we uncharge from mc.from later. */ |
4793 | mc.moved_charge++; | 4927 | mc.moved_charge++; |
@@ -4832,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4832 | struct vm_area_struct *vma; | 4966 | struct vm_area_struct *vma; |
4833 | 4967 | ||
4834 | lru_add_drain_all(); | 4968 | lru_add_drain_all(); |
4835 | down_read(&mm->mmap_sem); | 4969 | retry: |
4970 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | ||
4971 | /* | ||
4972 | * Someone who is holding the mmap_sem might be waiting in | ||
4973 | * waitq. So we cancel all extra charges, wake up all waiters, | ||
4974 | * and retry. Because we cancel precharges, we might not be able | ||
4975 | * to move enough charges, but moving charge is a best-effort | ||
4976 | * feature anyway, so it wouldn't be a big problem. | ||
4977 | */ | ||
4978 | __mem_cgroup_clear_mc(); | ||
4979 | cond_resched(); | ||
4980 | goto retry; | ||
4981 | } | ||
4836 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4982 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4837 | int ret; | 4983 | int ret; |
4838 | struct mm_walk mem_cgroup_move_charge_walk = { | 4984 | struct mm_walk mem_cgroup_move_charge_walk = { |
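
mem_cgroup_move_charge no longer sleeps on mmap_sem: when the trylock fails it cancels the precharges (which wakes any charger waiting on mc.waitq while holding mmap_sem) and retries, accepting an incomplete move rather than risking a deadlock. A small pthread sketch of the same trylock, back off, retry shape; the function names are illustrative, not kernel calls:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static void cancel_extra_charges(void) { /* would wake waiters here */ }

static void move_charge(void)
{
        while (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
                cancel_extra_charges();   /* let the lock holder make progress */
                sched_yield();            /* cond_resched() stand-in */
        }
        /* ... walk the address space and move what we can ... */
        pthread_rwlock_unlock(&mmap_sem);
}

int main(void)
{
        move_charge();
        printf("moved what we could\n");
        return 0;
}
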
@@ -4911,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
4911 | }; | 5057 | }; |
4912 | 5058 | ||
4913 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5059 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
5060 | static int __init enable_swap_account(char *s) | ||
5061 | { | ||
5062 | /* consider enabled if no parameter or 1 is given */ | ||
5063 | if (!(*s) || !strcmp(s, "=1")) | ||
5064 | really_do_swap_account = 1; | ||
5065 | else if (!strcmp(s, "=0")) | ||
5066 | really_do_swap_account = 0; | ||
5067 | return 1; | ||
5068 | } | ||
5069 | __setup("swapaccount", enable_swap_account); | ||
4914 | 5070 | ||
4915 | static int __init disable_swap_account(char *s) | 5071 | static int __init disable_swap_account(char *s) |
4916 | { | 5072 | { |
4917 | really_do_swap_account = 0; | 5073 | printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); |
5074 | enable_swap_account("=0"); | ||
4918 | return 1; | 5075 | return 1; |
4919 | } | 5076 | } |
4920 | __setup("noswapaccount", disable_swap_account); | 5077 | __setup("noswapaccount", disable_swap_account); |
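
The new swapaccount= parameter accepts an empty value, "=1" or "=0", and noswapaccount survives only as a deprecated alias that funnels into the same parser. The string handling above can be exercised as-is in userspace (really_do_swap_account mocked as a plain int):

#include <stdio.h>
#include <string.h>

static int really_do_swap_account = 1;    /* mock of the kernel variable */

static int enable_swap_account(const char *s)
{
        /* consider enabled if no parameter or 1 is given */
        if (!*s || !strcmp(s, "=1"))
                really_do_swap_account = 1;
        else if (!strcmp(s, "=0"))
                really_do_swap_account = 0;
        return 1;
}

int main(void)
{
        const char *args[] = { "", "=1", "=0", "=junk" };
        for (unsigned i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
                enable_swap_account(args[i]);
                printf("swapaccount%-6s -> %d\n", args[i], really_do_swap_account);
        }
        return 0;
}

Note that an unrecognized value such as "=junk" leaves the previous setting untouched.
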
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 124324134ff..0207c2f6f8b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
52 | #include <linux/swapops.h> | 52 | #include <linux/swapops.h> |
53 | #include <linux/hugetlb.h> | 53 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | ||
54 | #include "internal.h" | 55 | #include "internal.h" |
55 | 56 | ||
56 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 57 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -202,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
202 | #ifdef __ARCH_SI_TRAPNO | 203 | #ifdef __ARCH_SI_TRAPNO |
203 | si.si_trapno = trapno; | 204 | si.si_trapno = trapno; |
204 | #endif | 205 | #endif |
205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 206 | si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; |
206 | /* | 207 | /* |
207 | * Don't use force here, it's convenient if the signal | 208 | * Don't use force here, it's convenient if the signal |
208 | * can be temporarily blocked. | 209 | * can be temporarily blocked. |
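
si_addr_lsb tells the signalled process how large a region around si_addr is affected; with compound_trans_order() a plain 4 KiB page reports 12 and a 2 MiB THP head reports 21 (assuming PAGE_SHIFT of 12 and order 9). The arithmetic, checked quickly:

#include <stdio.h>

#define PAGE_SHIFT 12                     /* assumption: 4 KiB base pages */

int main(void)
{
        int orders[] = { 0, 9 };          /* 4 KiB page, 2 MiB THP */
        for (int i = 0; i < 2; i++) {
                int lsb = orders[i] + PAGE_SHIFT;
                printf("order %d -> si_addr_lsb %d (%lu bytes)\n",
                       orders[i], lsb, 1UL << lsb);
        }
        return 0;
}
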
@@ -232,8 +233,8 @@ void shake_page(struct page *p, int access) | |||
232 | } | 233 | } |
233 | 234 | ||
234 | /* | 235 | /* |
235 | * Only all shrink_slab here (which would also | 236 | * Only call shrink_slab here (which would also shrink other caches) if |
236 | * shrink other caches) if access is not potentially fatal. | 237 | * access is not potentially fatal. |
237 | */ | 238 | */ |
238 | if (access) { | 239 | if (access) { |
239 | int nr; | 240 | int nr; |
@@ -853,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
853 | int ret; | 854 | int ret; |
854 | int kill = 1; | 855 | int kill = 1; |
855 | struct page *hpage = compound_head(p); | 856 | struct page *hpage = compound_head(p); |
857 | struct page *ppage; | ||
856 | 858 | ||
857 | if (PageReserved(p) || PageSlab(p)) | 859 | if (PageReserved(p) || PageSlab(p)) |
858 | return SWAP_SUCCESS; | 860 | return SWAP_SUCCESS; |
@@ -894,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
894 | } | 896 | } |
895 | 897 | ||
896 | /* | 898 | /* |
899 | * ppage: poisoned page | ||
900 | * if p is a regular page (4k page) | ||
901 | * ppage == real poisoned page; | ||
902 | * else p is hugetlb or THP, ppage == head page. | ||
903 | */ | ||
904 | ppage = hpage; | ||
905 | |||
906 | if (PageTransHuge(hpage)) { | ||
907 | /* | ||
908 | * Verify that this isn't a hugetlbfs head page; the check for | ||
909 | * PageAnon is just to avoid tripping a split_huge_page | ||
910 | * internal debug check, as split_huge_page refuses to deal with | ||
911 | * anything that isn't an anon page. PageAnon can't go away from | ||
912 | * under us because we hold a refcount on the hpage; without a | ||
913 | * refcount on the hpage, split_huge_page can't be safely called | ||
914 | * in the first place, and having a refcount on the tail isn't | ||
915 | * enough to be safe. | ||
916 | */ | ||
917 | if (!PageHuge(hpage) && PageAnon(hpage)) { | ||
918 | if (unlikely(split_huge_page(hpage))) { | ||
919 | /* | ||
920 | * FIXME: if splitting the THP fails, it is | ||
921 | * better to stop the following operation rather | ||
922 | * than cause a panic by unmapping. The system might | ||
923 | * survive if the page is freed later. | ||
924 | */ | ||
925 | printk(KERN_INFO | ||
926 | "MCE %#lx: failed to split THP\n", pfn); | ||
927 | |||
928 | BUG_ON(!PageHWPoison(p)); | ||
929 | return SWAP_FAIL; | ||
930 | } | ||
931 | /* THP is split, so ppage should be the real poisoned page. */ | ||
932 | ppage = p; | ||
933 | } | ||
934 | } | ||
935 | |||
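
The net effect of the block above is a three-way choice of which page to unmap: hugetlb keeps the head page, an anon THP is split so the 4 KiB page that actually took the error is used, and everything else is already a base page. A toy decision function that captures only that selection (the Page* predicates are reduced to booleans, and the failed-split SWAP_FAIL path is left out):

#include <stdio.h>
#include <stdbool.h>

enum target { USE_HEAD, USE_SPLIT_4K, USE_PAGE };

/* toy model of the ppage selection; not the kernel's actual helpers */
static enum target pick_poison_target(bool trans_huge, bool hugetlbfs, bool anon)
{
        if (!trans_huge)
                return USE_PAGE;          /* regular 4k page */
        if (hugetlbfs || !anon)
                return USE_HEAD;          /* hugetlb etc.: keep the head page */
        return USE_SPLIT_4K;              /* anon THP: split, then unmap p */
}

int main(void)
{
        printf("%d %d %d\n",
               pick_poison_target(false, false, false),
               pick_poison_target(true, true, false),
               pick_poison_target(true, false, true));
        return 0;
}
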
936 | /* | ||
897 | * First collect all the processes that have the page | 937 | * First collect all the processes that have the page |
898 | * mapped in dirty form. This has to be done before try_to_unmap, | 938 | * mapped in dirty form. This has to be done before try_to_unmap, |
899 | * because ttu takes the rmap data structures down. | 939 | * because ttu takes the rmap data structures down. |
@@ -902,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
902 | * there's nothing that can be done. | 942 | * there's nothing that can be done. |
903 | */ | 943 | */ |
904 | if (kill) | 944 | if (kill) |
905 | collect_procs(hpage, &tokill); | 945 | collect_procs(ppage, &tokill); |
906 | 946 | ||
907 | ret = try_to_unmap(hpage, ttu); | 947 | if (hpage != ppage) |
948 | lock_page_nosync(ppage); | ||
949 | |||
950 | ret = try_to_unmap(ppage, ttu); | ||
908 | if (ret != SWAP_SUCCESS) | 951 | if (ret != SWAP_SUCCESS) |
909 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 952 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
910 | pfn, page_mapcount(hpage)); | 953 | pfn, page_mapcount(ppage)); |
954 | |||
955 | if (hpage != ppage) | ||
956 | unlock_page(ppage); | ||
911 | 957 | ||
912 | /* | 958 | /* |
913 | * Now that the dirty bit has been propagated to the | 959 | * Now that the dirty bit has been propagated to the |
@@ -918,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
918 | * use a more forceful uncatchable kill to prevent | 964 | * use a more forceful uncatchable kill to prevent |
919 | * any accesses to the poisoned memory. | 965 | * any accesses to the poisoned memory. |
920 | */ | 966 | */ |
921 | kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, | 967 | kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, |
922 | ret != SWAP_SUCCESS, p, pfn); | 968 | ret != SWAP_SUCCESS, p, pfn); |
923 | 969 | ||
924 | return ret; | 970 | return ret; |
@@ -927,7 +973,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
927 | static void set_page_hwpoison_huge_page(struct page *hpage) | 973 | static void set_page_hwpoison_huge_page(struct page *hpage) |
928 | { | 974 | { |
929 | int i; | 975 | int i; |
930 | int nr_pages = 1 << compound_order(hpage); | 976 | int nr_pages = 1 << compound_trans_order(hpage); |
931 | for (i = 0; i < nr_pages; i++) | 977 | for (i = 0; i < nr_pages; i++) |
932 | SetPageHWPoison(hpage + i); | 978 | SetPageHWPoison(hpage + i); |
933 | } | 979 | } |
@@ -935,7 +981,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) | |||
935 | static void clear_page_hwpoison_huge_page(struct page *hpage) | 981 | static void clear_page_hwpoison_huge_page(struct page *hpage) |
936 | { | 982 | { |
937 | int i; | 983 | int i; |
938 | int nr_pages = 1 << compound_order(hpage); | 984 | int nr_pages = 1 << compound_trans_order(hpage); |
939 | for (i = 0; i < nr_pages; i++) | 985 | for (i = 0; i < nr_pages; i++) |
940 | ClearPageHWPoison(hpage + i); | 986 | ClearPageHWPoison(hpage + i); |
941 | } | 987 | } |
@@ -965,7 +1011,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
965 | return 0; | 1011 | return 0; |
966 | } | 1012 | } |
967 | 1013 | ||
968 | nr_pages = 1 << compound_order(hpage); | 1014 | nr_pages = 1 << compound_trans_order(hpage); |
969 | atomic_long_add(nr_pages, &mce_bad_pages); | 1015 | atomic_long_add(nr_pages, &mce_bad_pages); |
970 | 1016 | ||
971 | /* | 1017 | /* |
@@ -1019,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1019 | * The check (unnecessarily) ignores LRU pages being isolated and | 1065 | * The check (unnecessarily) ignores LRU pages being isolated and |
1020 | * walked by the page reclaim code, however that's not a big loss. | 1066 | * walked by the page reclaim code, however that's not a big loss. |
1021 | */ | 1067 | */ |
1022 | if (!PageLRU(p) && !PageHuge(p)) | 1068 | if (!PageHuge(p) && !PageTransCompound(p)) { |
1023 | shake_page(p, 0); | 1069 | if (!PageLRU(p)) |
1024 | if (!PageLRU(p) && !PageHuge(p)) { | 1070 | shake_page(p, 0); |
1025 | /* | 1071 | if (!PageLRU(p)) { |
1026 | * shake_page could have turned it free. | 1072 | /* |
1027 | */ | 1073 | * shake_page could have turned it free. |
1028 | if (is_free_buddy_page(p)) { | 1074 | */ |
1029 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1075 | if (is_free_buddy_page(p)) { |
1030 | return 0; | 1076 | action_result(pfn, "free buddy, 2nd try", |
1077 | DELAYED); | ||
1078 | return 0; | ||
1079 | } | ||
1080 | action_result(pfn, "non LRU", IGNORED); | ||
1081 | put_page(p); | ||
1082 | return -EBUSY; | ||
1031 | } | 1083 | } |
1032 | action_result(pfn, "non LRU", IGNORED); | ||
1033 | put_page(p); | ||
1034 | return -EBUSY; | ||
1035 | } | 1084 | } |
1036 | 1085 | ||
1037 | /* | 1086 | /* |
@@ -1061,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1061 | * For error on the tail page, we should set PG_hwpoison | 1110 | * For error on the tail page, we should set PG_hwpoison |
1062 | * on the head page to show that the hugepage is hwpoisoned | 1111 | * on the head page to show that the hugepage is hwpoisoned |
1063 | */ | 1112 | */ |
1064 | if (PageTail(p) && TestSetPageHWPoison(hpage)) { | 1113 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
1065 | action_result(pfn, "hugepage already hardware poisoned", | 1114 | action_result(pfn, "hugepage already hardware poisoned", |
1066 | IGNORED); | 1115 | IGNORED); |
1067 | unlock_page(hpage); | 1116 | unlock_page(hpage); |
@@ -1163,7 +1212,7 @@ int unpoison_memory(unsigned long pfn) | |||
1163 | return 0; | 1212 | return 0; |
1164 | } | 1213 | } |
1165 | 1214 | ||
1166 | nr_pages = 1 << compound_order(page); | 1215 | nr_pages = 1 << compound_trans_order(page); |
1167 | 1216 | ||
1168 | if (!get_page_unless_zero(page)) { | 1217 | if (!get_page_unless_zero(page)) { |
1169 | /* | 1218 | /* |
@@ -1230,11 +1279,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1230 | return 1; | 1279 | return 1; |
1231 | 1280 | ||
1232 | /* | 1281 | /* |
1233 | * The lock_system_sleep prevents a race with memory hotplug, | 1282 | * The lock_memory_hotplug prevents a race with memory hotplug. |
1234 | * because the isolation assumes there's only a single user. | ||
1235 | * This is a big hammer, a better one would be nicer. | 1283 | * This is a big hammer, a better one would be nicer. |
1236 | */ | 1284 | */ |
1237 | lock_system_sleep(); | 1285 | lock_memory_hotplug(); |
1238 | 1286 | ||
1239 | /* | 1287 | /* |
1240 | * Isolate the page, so that it doesn't get reallocated if it | 1288 | * Isolate the page, so that it doesn't get reallocated if it |
@@ -1264,7 +1312,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1264 | ret = 1; | 1312 | ret = 1; |
1265 | } | 1313 | } |
1266 | unset_migratetype_isolate(p); | 1314 | unset_migratetype_isolate(p); |
1267 | unlock_system_sleep(); | 1315 | unlock_memory_hotplug(); |
1268 | return ret; | 1316 | return ret; |
1269 | } | 1317 | } |
1270 | 1318 | ||
@@ -1290,9 +1338,13 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1290 | /* Keep page count to indicate a given hugepage is isolated. */ | 1338 | /* Keep page count to indicate a given hugepage is isolated. */ |
1291 | 1339 | ||
1292 | list_add(&hpage->lru, &pagelist); | 1340 | list_add(&hpage->lru, &pagelist); |
1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1341 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, |
1342 | true); | ||
1294 | if (ret) { | 1343 | if (ret) { |
1295 | putback_lru_pages(&pagelist); | 1344 | struct page *page1, *page2; |
1345 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | ||
1346 | put_page(page1); | ||
1347 | |||
1296 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1348 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", |
1297 | pfn, ret, page->flags); | 1349 | pfn, ret, page->flags); |
1298 | if (ret > 0) | 1350 | if (ret > 0) |
@@ -1301,7 +1353,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1301 | } | 1353 | } |
1302 | done: | 1354 | done: |
1303 | if (!PageHWPoison(hpage)) | 1355 | if (!PageHWPoison(hpage)) |
1304 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | 1356 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); |
1305 | set_page_hwpoison_huge_page(hpage); | 1357 | set_page_hwpoison_huge_page(hpage); |
1306 | dequeue_hwpoisoned_huge_page(hpage); | 1358 | dequeue_hwpoisoned_huge_page(hpage); |
1307 | /* keep elevated page count for bad page */ | 1359 | /* keep elevated page count for bad page */ |
@@ -1413,8 +1465,10 @@ int soft_offline_page(struct page *page, int flags) | |||
1413 | LIST_HEAD(pagelist); | 1465 | LIST_HEAD(pagelist); |
1414 | 1466 | ||
1415 | list_add(&page->lru, &pagelist); | 1467 | list_add(&page->lru, &pagelist); |
1416 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1468 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1469 | 0, true); | ||
1417 | if (ret) { | 1470 | if (ret) { |
1471 | putback_lru_pages(&pagelist); | ||
1418 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1472 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1419 | pfn, ret, page->flags); | 1473 | pfn, ret, page->flags); |
1420 | if (ret > 0) | 1474 | if (ret > 0) |
diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed1..8e8c1832486 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
394 | } | 394 | } |
395 | } | 395 | } |
396 | 396 | ||
397 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 397 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
398 | pmd_t *pmd, unsigned long address) | ||
398 | { | 399 | { |
399 | pgtable_t new = pte_alloc_one(mm, address); | 400 | pgtable_t new = pte_alloc_one(mm, address); |
401 | int wait_split_huge_page; | ||
400 | if (!new) | 402 | if (!new) |
401 | return -ENOMEM; | 403 | return -ENOMEM; |
402 | 404 | ||
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
416 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 418 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
417 | 419 | ||
418 | spin_lock(&mm->page_table_lock); | 420 | spin_lock(&mm->page_table_lock); |
419 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 421 | wait_split_huge_page = 0; |
422 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | ||
420 | mm->nr_ptes++; | 423 | mm->nr_ptes++; |
421 | pmd_populate(mm, pmd, new); | 424 | pmd_populate(mm, pmd, new); |
422 | new = NULL; | 425 | new = NULL; |
423 | } | 426 | } else if (unlikely(pmd_trans_splitting(*pmd))) |
427 | wait_split_huge_page = 1; | ||
424 | spin_unlock(&mm->page_table_lock); | 428 | spin_unlock(&mm->page_table_lock); |
425 | if (new) | 429 | if (new) |
426 | pte_free(mm, new); | 430 | pte_free(mm, new); |
431 | if (wait_split_huge_page) | ||
432 | wait_split_huge_page(vma->anon_vma, pmd); | ||
427 | return 0; | 433 | return 0; |
428 | } | 434 | } |
429 | 435 | ||
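
__pte_alloc keeps the usual allocate outside the lock, re-check under page_table_lock, free-if-lost pattern, and now additionally backs off to wait_split_huge_page() when a huge pmd is being split underneath it. The lost-allocation half of that pattern, sketched with pthreads (no THP case; the names are stand-ins):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *pmd_slot;                    /* stands in for *pmd */

static int pte_alloc(void)
{
        void *new = malloc(64);           /* pte_alloc_one(), done unlocked */
        if (!new)
                return -1;

        pthread_mutex_lock(&page_table_lock);
        if (pmd_slot == NULL) {           /* pmd_none(): we won the race */
                pmd_slot = new;
                new = NULL;
        }                                 /* else: someone else populated it */
        pthread_mutex_unlock(&page_table_lock);

        free(new);                        /* free(NULL) is a no-op */
        return 0;
}

int main(void)
{
        pte_alloc();
        pte_alloc();                      /* second call loses the "race" */
        printf("pmd populated: %s\n", pmd_slot ? "yes" : "no");
        return 0;
}
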
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
436 | smp_wmb(); /* See comment in __pte_alloc */ | 442 | smp_wmb(); /* See comment in __pte_alloc */ |
437 | 443 | ||
438 | spin_lock(&init_mm.page_table_lock); | 444 | spin_lock(&init_mm.page_table_lock); |
439 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 445 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
440 | pmd_populate_kernel(&init_mm, pmd, new); | 446 | pmd_populate_kernel(&init_mm, pmd, new); |
441 | new = NULL; | 447 | new = NULL; |
442 | } | 448 | } else |
449 | VM_BUG_ON(pmd_trans_splitting(*pmd)); | ||
443 | spin_unlock(&init_mm.page_table_lock); | 450 | spin_unlock(&init_mm.page_table_lock); |
444 | if (new) | 451 | if (new) |
445 | pte_free_kernel(&init_mm, new); | 452 | pte_free_kernel(&init_mm, new); |
@@ -719,9 +726,9 @@ out_set_pte: | |||
719 | return 0; | 726 | return 0; |
720 | } | 727 | } |
721 | 728 | ||
722 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 729 | int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
723 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 730 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
724 | unsigned long addr, unsigned long end) | 731 | unsigned long addr, unsigned long end) |
725 | { | 732 | { |
726 | pte_t *orig_src_pte, *orig_dst_pte; | 733 | pte_t *orig_src_pte, *orig_dst_pte; |
727 | pte_t *src_pte, *dst_pte; | 734 | pte_t *src_pte, *dst_pte; |
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src | |||
795 | src_pmd = pmd_offset(src_pud, addr); | 802 | src_pmd = pmd_offset(src_pud, addr); |
796 | do { | 803 | do { |
797 | next = pmd_addr_end(addr, end); | 804 | next = pmd_addr_end(addr, end); |
805 | if (pmd_trans_huge(*src_pmd)) { | ||
806 | int err; | ||
807 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); | ||
808 | err = copy_huge_pmd(dst_mm, src_mm, | ||
809 | dst_pmd, src_pmd, addr, vma); | ||
810 | if (err == -ENOMEM) | ||
811 | return -ENOMEM; | ||
812 | if (!err) | ||
813 | continue; | ||
814 | /* fall through */ | ||
815 | } | ||
798 | if (pmd_none_or_clear_bad(src_pmd)) | 816 | if (pmd_none_or_clear_bad(src_pmd)) |
799 | continue; | 817 | continue; |
800 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | 818 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, |
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
997 | pmd = pmd_offset(pud, addr); | 1015 | pmd = pmd_offset(pud, addr); |
998 | do { | 1016 | do { |
999 | next = pmd_addr_end(addr, end); | 1017 | next = pmd_addr_end(addr, end); |
1018 | if (pmd_trans_huge(*pmd)) { | ||
1019 | if (next-addr != HPAGE_PMD_SIZE) { | ||
1020 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | ||
1021 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
1022 | } else if (zap_huge_pmd(tlb, vma, pmd)) { | ||
1023 | (*zap_work)--; | ||
1024 | continue; | ||
1025 | } | ||
1026 | /* fall through */ | ||
1027 | } | ||
1000 | if (pmd_none_or_clear_bad(pmd)) { | 1028 | if (pmd_none_or_clear_bad(pmd)) { |
1001 | (*zap_work)--; | 1029 | (*zap_work)--; |
1002 | continue; | 1030 | continue; |
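
zap_pmd_range only zaps a huge pmd in one go when the request covers a full HPAGE_PMD_SIZE; anything smaller forces split_huge_page_pmd() first. Whether a given [addr, end) hits the fast path is plain arithmetic, assuming a 2 MiB huge pmd:

#include <stdio.h>

#define HPAGE_PMD_SIZE (2UL << 20)        /* assumption: 2 MiB huge pmd */
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

/* simplified pmd_addr_end(): clamp to the next pmd boundary or to 'end' */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + HPAGE_PMD_SIZE) & HPAGE_PMD_MASK;
        return boundary < end ? boundary : end;
}

int main(void)
{
        unsigned long addr = 0x40000000UL;
        unsigned long full = pmd_addr_end(addr, addr + HPAGE_PMD_SIZE);
        unsigned long part = pmd_addr_end(addr, addr + 0x1000);

        printf("full range   : %s\n",
               full - addr == HPAGE_PMD_SIZE ? "zap huge pmd" : "split first");
        printf("partial range: %s\n",
               part - addr == HPAGE_PMD_SIZE ? "zap huge pmd" : "split first");
        return 0;
}
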
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1262 | pud = pud_offset(pgd, address); | 1290 | pud = pud_offset(pgd, address); |
1263 | if (pud_none(*pud)) | 1291 | if (pud_none(*pud)) |
1264 | goto no_page_table; | 1292 | goto no_page_table; |
1265 | if (pud_huge(*pud)) { | 1293 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
1266 | BUG_ON(flags & FOLL_GET); | 1294 | BUG_ON(flags & FOLL_GET); |
1267 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 1295 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); |
1268 | goto out; | 1296 | goto out; |
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1273 | pmd = pmd_offset(pud, address); | 1301 | pmd = pmd_offset(pud, address); |
1274 | if (pmd_none(*pmd)) | 1302 | if (pmd_none(*pmd)) |
1275 | goto no_page_table; | 1303 | goto no_page_table; |
1276 | if (pmd_huge(*pmd)) { | 1304 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
1277 | BUG_ON(flags & FOLL_GET); | 1305 | BUG_ON(flags & FOLL_GET); |
1278 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1306 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1279 | goto out; | 1307 | goto out; |
1280 | } | 1308 | } |
1309 | if (pmd_trans_huge(*pmd)) { | ||
1310 | if (flags & FOLL_SPLIT) { | ||
1311 | split_huge_page_pmd(mm, pmd); | ||
1312 | goto split_fallthrough; | ||
1313 | } | ||
1314 | spin_lock(&mm->page_table_lock); | ||
1315 | if (likely(pmd_trans_huge(*pmd))) { | ||
1316 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1317 | spin_unlock(&mm->page_table_lock); | ||
1318 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1319 | } else { | ||
1320 | page = follow_trans_huge_pmd(mm, address, | ||
1321 | pmd, flags); | ||
1322 | spin_unlock(&mm->page_table_lock); | ||
1323 | goto out; | ||
1324 | } | ||
1325 | } else | ||
1326 | spin_unlock(&mm->page_table_lock); | ||
1327 | /* fall through */ | ||
1328 | } | ||
1329 | split_fallthrough: | ||
1281 | if (unlikely(pmd_bad(*pmd))) | 1330 | if (unlikely(pmd_bad(*pmd))) |
1282 | goto no_page_table; | 1331 | goto no_page_table; |
1283 | 1332 | ||
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1310 | */ | 1359 | */ |
1311 | mark_page_accessed(page); | 1360 | mark_page_accessed(page); |
1312 | } | 1361 | } |
1362 | if (flags & FOLL_MLOCK) { | ||
1363 | /* | ||
1364 | * The preliminary mapping check is mainly to avoid the | ||
1365 | * pointless overhead of lock_page on the ZERO_PAGE | ||
1366 | * which might bounce very badly if there is contention. | ||
1367 | * | ||
1368 | * If the page is already locked, we don't need to | ||
1369 | * handle it now - vmscan will handle it later if and | ||
1370 | * when it attempts to reclaim the page. | ||
1371 | */ | ||
1372 | if (page->mapping && trylock_page(page)) { | ||
1373 | lru_add_drain(); /* push cached pages to LRU */ | ||
1374 | /* | ||
1375 | * Because we lock page here and migration is | ||
1376 | * blocked by the pte's page reference, we need | ||
1377 | * only check for file-cache page truncation. | ||
1378 | */ | ||
1379 | if (page->mapping) | ||
1380 | mlock_vma_page(page); | ||
1381 | unlock_page(page); | ||
1382 | } | ||
1383 | } | ||
1313 | unlock: | 1384 | unlock: |
1314 | pte_unmap_unlock(ptep, ptl); | 1385 | pte_unmap_unlock(ptep, ptl); |
1315 | out: | 1386 | out: |
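
The FOLL_MLOCK path deliberately uses trylock_page(): if the page lock is contended it skips the mlock and leaves the page for vmscan, and it re-checks page->mapping after taking the lock because truncation may have raced in between. A pthread sketch of that opportunistic trylock-and-recheck shape (the struct and field names are illustrative only):

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

struct page {
        pthread_mutex_t lock;
        void *mapping;                    /* NULL once the page is truncated */
        bool mlocked;
};

static void maybe_mlock(struct page *page)
{
        if (page->mapping && pthread_mutex_trylock(&page->lock) == 0) {
                if (page->mapping)        /* re-check: truncation may have raced */
                        page->mlocked = true;
                pthread_mutex_unlock(&page->lock);
        }                                 /* contended: leave it to reclaim */
}

int main(void)
{
        struct page p = { .lock = PTHREAD_MUTEX_INITIALIZER, .mapping = (void *)&p };
        maybe_mlock(&p);
        printf("mlocked: %s\n", p.mlocked ? "yes" : "no");
        return 0;
}
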
@@ -1341,7 +1412,8 @@ no_page_table: | |||
1341 | 1412 | ||
1342 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1413 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1343 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1414 | unsigned long start, int nr_pages, unsigned int gup_flags, |
1344 | struct page **pages, struct vm_area_struct **vmas) | 1415 | struct page **pages, struct vm_area_struct **vmas, |
1416 | int *nonblocking) | ||
1345 | { | 1417 | { |
1346 | int i; | 1418 | int i; |
1347 | unsigned long vm_flags; | 1419 | unsigned long vm_flags; |
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1386 | pmd = pmd_offset(pud, pg); | 1458 | pmd = pmd_offset(pud, pg); |
1387 | if (pmd_none(*pmd)) | 1459 | if (pmd_none(*pmd)) |
1388 | return i ? : -EFAULT; | 1460 | return i ? : -EFAULT; |
1461 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1389 | pte = pte_offset_map(pmd, pg); | 1462 | pte = pte_offset_map(pmd, pg); |
1390 | if (pte_none(*pte)) { | 1463 | if (pte_none(*pte)) { |
1391 | pte_unmap(pte); | 1464 | pte_unmap(pte); |
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1441 | cond_resched(); | 1514 | cond_resched(); |
1442 | while (!(page = follow_page(vma, start, foll_flags))) { | 1515 | while (!(page = follow_page(vma, start, foll_flags))) { |
1443 | int ret; | 1516 | int ret; |
1517 | unsigned int fault_flags = 0; | ||
1518 | |||
1519 | if (foll_flags & FOLL_WRITE) | ||
1520 | fault_flags |= FAULT_FLAG_WRITE; | ||
1521 | if (nonblocking) | ||
1522 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
1444 | 1523 | ||
1445 | ret = handle_mm_fault(mm, vma, start, | 1524 | ret = handle_mm_fault(mm, vma, start, |
1446 | (foll_flags & FOLL_WRITE) ? | 1525 | fault_flags); |
1447 | FAULT_FLAG_WRITE : 0); | ||
1448 | 1526 | ||
1449 | if (ret & VM_FAULT_ERROR) { | 1527 | if (ret & VM_FAULT_ERROR) { |
1450 | if (ret & VM_FAULT_OOM) | 1528 | if (ret & VM_FAULT_OOM) |
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1460 | else | 1538 | else |
1461 | tsk->min_flt++; | 1539 | tsk->min_flt++; |
1462 | 1540 | ||
1541 | if (ret & VM_FAULT_RETRY) { | ||
1542 | *nonblocking = 0; | ||
1543 | return i; | ||
1544 | } | ||
1545 | |||
1463 | /* | 1546 | /* |
1464 | * The VM_FAULT_WRITE bit tells us that | 1547 | * The VM_FAULT_WRITE bit tells us that |
1465 | * do_wp_page has broken COW when necessary, | 1548 | * do_wp_page has broken COW when necessary, |
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1559 | if (force) | 1642 | if (force) |
1560 | flags |= FOLL_FORCE; | 1643 | flags |= FOLL_FORCE; |
1561 | 1644 | ||
1562 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 1645 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
1646 | NULL); | ||
1563 | } | 1647 | } |
1564 | EXPORT_SYMBOL(get_user_pages); | 1648 | EXPORT_SYMBOL(get_user_pages); |
1565 | 1649 | ||
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr) | |||
1584 | struct page *page; | 1668 | struct page *page; |
1585 | 1669 | ||
1586 | if (__get_user_pages(current, current->mm, addr, 1, | 1670 | if (__get_user_pages(current, current->mm, addr, 1, |
1587 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) | 1671 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, |
1672 | NULL) < 1) | ||
1588 | return NULL; | 1673 | return NULL; |
1589 | flush_cache_page(vma, addr, page_to_pfn(page)); | 1674 | flush_cache_page(vma, addr, page_to_pfn(page)); |
1590 | return page; | 1675 | return page; |
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | |||
1598 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1683 | pud_t * pud = pud_alloc(mm, pgd, addr); |
1599 | if (pud) { | 1684 | if (pud) { |
1600 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | 1685 | pmd_t * pmd = pmd_alloc(mm, pud, addr); |
1601 | if (pmd) | 1686 | if (pmd) { |
1687 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1602 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | 1688 | return pte_alloc_map_lock(mm, pmd, addr, ptl); |
1689 | } | ||
1603 | } | 1690 | } |
1604 | return NULL; | 1691 | return NULL; |
1605 | } | 1692 | } |
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1818 | pmd = pmd_alloc(mm, pud, addr); | 1905 | pmd = pmd_alloc(mm, pud, addr); |
1819 | if (!pmd) | 1906 | if (!pmd) |
1820 | return -ENOMEM; | 1907 | return -ENOMEM; |
1908 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1821 | do { | 1909 | do { |
1822 | next = pmd_addr_end(addr, end); | 1910 | next = pmd_addr_end(addr, end); |
1823 | if (remap_pte_range(mm, pmd, addr, next, | 1911 | if (remap_pte_range(mm, pmd, addr, next, |
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2048 | return same; | 2136 | return same; |
2049 | } | 2137 | } |
2050 | 2138 | ||
2051 | /* | ||
2052 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | ||
2053 | * servicing faults for write access. In the normal case, do always want | ||
2054 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | ||
2055 | * that do not have writing enabled, when used by access_process_vm. | ||
2056 | */ | ||
2057 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | ||
2058 | { | ||
2059 | if (likely(vma->vm_flags & VM_WRITE)) | ||
2060 | pte = pte_mkwrite(pte); | ||
2061 | return pte; | ||
2062 | } | ||
2063 | |||
2064 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2139 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2065 | { | 2140 | { |
2066 | /* | 2141 | /* |
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2112 | { | 2187 | { |
2113 | struct page *old_page, *new_page; | 2188 | struct page *old_page, *new_page; |
2114 | pte_t entry; | 2189 | pte_t entry; |
2115 | int reuse = 0, ret = 0; | 2190 | int ret = 0; |
2116 | int page_mkwrite = 0; | 2191 | int page_mkwrite = 0; |
2117 | struct page *dirty_page = NULL; | 2192 | struct page *dirty_page = NULL; |
2118 | 2193 | ||
@@ -2144,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2144 | &ptl); | 2219 | &ptl); |
2145 | if (!pte_same(*page_table, orig_pte)) { | 2220 | if (!pte_same(*page_table, orig_pte)) { |
2146 | unlock_page(old_page); | 2221 | unlock_page(old_page); |
2147 | page_cache_release(old_page); | ||
2148 | goto unlock; | 2222 | goto unlock; |
2149 | } | 2223 | } |
2150 | page_cache_release(old_page); | 2224 | page_cache_release(old_page); |
2151 | } | 2225 | } |
2152 | reuse = reuse_swap_page(old_page); | 2226 | if (reuse_swap_page(old_page)) { |
2153 | if (reuse) | ||
2154 | /* | 2227 | /* |
2155 | * The page is all ours. Move it to our anon_vma so | 2228 | * The page is all ours. Move it to our anon_vma so |
2156 | * the rmap code will not search our parent or siblings. | 2229 | * the rmap code will not search our parent or siblings. |
2157 | * Protected against the rmap code by the page lock. | 2230 | * Protected against the rmap code by the page lock. |
2158 | */ | 2231 | */ |
2159 | page_move_anon_rmap(old_page, vma, address); | 2232 | page_move_anon_rmap(old_page, vma, address); |
2233 | unlock_page(old_page); | ||
2234 | goto reuse; | ||
2235 | } | ||
2160 | unlock_page(old_page); | 2236 | unlock_page(old_page); |
2161 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2237 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2162 | (VM_WRITE|VM_SHARED))) { | 2238 | (VM_WRITE|VM_SHARED))) { |
@@ -2212,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2212 | &ptl); | 2288 | &ptl); |
2213 | if (!pte_same(*page_table, orig_pte)) { | 2289 | if (!pte_same(*page_table, orig_pte)) { |
2214 | unlock_page(old_page); | 2290 | unlock_page(old_page); |
2215 | page_cache_release(old_page); | ||
2216 | goto unlock; | 2291 | goto unlock; |
2217 | } | 2292 | } |
2218 | 2293 | ||
@@ -2220,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2220 | } | 2295 | } |
2221 | dirty_page = old_page; | 2296 | dirty_page = old_page; |
2222 | get_page(dirty_page); | 2297 | get_page(dirty_page); |
2223 | reuse = 1; | ||
2224 | } | ||
2225 | 2298 | ||
2226 | if (reuse) { | ||
2227 | reuse: | 2299 | reuse: |
2228 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2300 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2229 | entry = pte_mkyoung(orig_pte); | 2301 | entry = pte_mkyoung(orig_pte); |
2230 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2302 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2231 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2303 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2232 | update_mmu_cache(vma, address, page_table); | 2304 | update_mmu_cache(vma, address, page_table); |
2305 | pte_unmap_unlock(page_table, ptl); | ||
2233 | ret |= VM_FAULT_WRITE; | 2306 | ret |= VM_FAULT_WRITE; |
2234 | goto unlock; | 2307 | |
2308 | if (!dirty_page) | ||
2309 | return ret; | ||
2310 | |||
2311 | /* | ||
2312 | * Yes, Virginia, this is actually required to prevent a race | ||
2313 | * with clear_page_dirty_for_io() from clearing the page dirty | ||
2314 | * bit after it clears all dirty ptes, but before a racing | ||
2315 | * do_wp_page installs a dirty pte. | ||
2316 | * | ||
2317 | * do_no_page is protected similarly. | ||
2318 | */ | ||
2319 | if (!page_mkwrite) { | ||
2320 | wait_on_page_locked(dirty_page); | ||
2321 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
2322 | } | ||
2323 | put_page(dirty_page); | ||
2324 | if (page_mkwrite) { | ||
2325 | struct address_space *mapping = dirty_page->mapping; | ||
2326 | |||
2327 | set_page_dirty(dirty_page); | ||
2328 | unlock_page(dirty_page); | ||
2329 | page_cache_release(dirty_page); | ||
2330 | if (mapping) { | ||
2331 | /* | ||
2332 | * Some device drivers do not set page.mapping | ||
2333 | * but still dirty their pages | ||
2334 | */ | ||
2335 | balance_dirty_pages_ratelimited(mapping); | ||
2336 | } | ||
2337 | } | ||
2338 | |||
2339 | /* file_update_time outside page_lock */ | ||
2340 | if (vma->vm_file) | ||
2341 | file_update_time(vma->vm_file); | ||
2342 | |||
2343 | return ret; | ||
2235 | } | 2344 | } |
2236 | 2345 | ||
2237 | /* | 2346 | /* |
@@ -2256,16 +2365,6 @@ gotten: | |||
2256 | } | 2365 | } |
2257 | __SetPageUptodate(new_page); | 2366 | __SetPageUptodate(new_page); |
2258 | 2367 | ||
2259 | /* | ||
2260 | * Don't let another task, with possibly unlocked vma, | ||
2261 | * keep the mlocked page. | ||
2262 | */ | ||
2263 | if ((vma->vm_flags & VM_LOCKED) && old_page) { | ||
2264 | lock_page(old_page); /* for LRU manipulation */ | ||
2265 | clear_page_mlock(old_page); | ||
2266 | unlock_page(old_page); | ||
2267 | } | ||
2268 | |||
2269 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2368 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2270 | goto oom_free_new; | 2369 | goto oom_free_new; |
2271 | 2370 | ||
@@ -2333,42 +2432,19 @@ gotten: | |||
2333 | 2432 | ||
2334 | if (new_page) | 2433 | if (new_page) |
2335 | page_cache_release(new_page); | 2434 | page_cache_release(new_page); |
2336 | if (old_page) | ||
2337 | page_cache_release(old_page); | ||
2338 | unlock: | 2435 | unlock: |
2339 | pte_unmap_unlock(page_table, ptl); | 2436 | pte_unmap_unlock(page_table, ptl); |
2340 | if (dirty_page) { | 2437 | if (old_page) { |
2341 | /* | 2438 | /* |
2342 | * Yes, Virginia, this is actually required to prevent a race | 2439 | * Don't let another task, with possibly unlocked vma, |
2343 | * with clear_page_dirty_for_io() from clearing the page dirty | 2440 | * keep the mlocked page. |
2344 | * bit after it clear all dirty ptes, but before a racing | ||
2345 | * do_wp_page installs a dirty pte. | ||
2346 | * | ||
2347 | * do_no_page is protected similarly. | ||
2348 | */ | 2441 | */ |
2349 | if (!page_mkwrite) { | 2442 | if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { |
2350 | wait_on_page_locked(dirty_page); | 2443 | lock_page(old_page); /* LRU manipulation */ |
2351 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2444 | munlock_vma_page(old_page); |
2352 | } | 2445 | unlock_page(old_page); |
2353 | put_page(dirty_page); | ||
2354 | if (page_mkwrite) { | ||
2355 | struct address_space *mapping = dirty_page->mapping; | ||
2356 | |||
2357 | set_page_dirty(dirty_page); | ||
2358 | unlock_page(dirty_page); | ||
2359 | page_cache_release(dirty_page); | ||
2360 | if (mapping) { | ||
2361 | /* | ||
2362 | * Some device drivers do not set page.mapping | ||
2363 | * but still dirty their pages | ||
2364 | */ | ||
2365 | balance_dirty_pages_ratelimited(mapping); | ||
2366 | } | ||
2367 | } | 2446 | } |
2368 | 2447 | page_cache_release(old_page); | |
2369 | /* file_update_time outside page_lock */ | ||
2370 | if (vma->vm_file) | ||
2371 | file_update_time(vma->vm_file); | ||
2372 | } | 2448 | } |
2373 | return ret; | 2449 | return ret; |
2374 | oom_free_new: | 2450 | oom_free_new: |
@@ -2975,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2975 | goto out; | 3051 | goto out; |
2976 | } | 3052 | } |
2977 | charged = 1; | 3053 | charged = 1; |
2978 | /* | ||
2979 | * Don't let another task, with possibly unlocked vma, | ||
2980 | * keep the mlocked page. | ||
2981 | */ | ||
2982 | if (vma->vm_flags & VM_LOCKED) | ||
2983 | clear_page_mlock(vmf.page); | ||
2984 | copy_user_highpage(page, vmf.page, address, vma); | 3054 | copy_user_highpage(page, vmf.page, address, vma); |
2985 | __SetPageUptodate(page); | 3055 | __SetPageUptodate(page); |
2986 | } else { | 3056 | } else { |
@@ -3147,9 +3217,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3147 | * but allow concurrent faults), and pte mapped but not yet locked. | 3217 | * but allow concurrent faults), and pte mapped but not yet locked. |
3148 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3218 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
3149 | */ | 3219 | */ |
3150 | static inline int handle_pte_fault(struct mm_struct *mm, | 3220 | int handle_pte_fault(struct mm_struct *mm, |
3151 | struct vm_area_struct *vma, unsigned long address, | 3221 | struct vm_area_struct *vma, unsigned long address, |
3152 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3222 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
3153 | { | 3223 | { |
3154 | pte_t entry; | 3224 | pte_t entry; |
3155 | spinlock_t *ptl; | 3225 | spinlock_t *ptl; |
@@ -3228,9 +3298,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3228 | pmd = pmd_alloc(mm, pud, address); | 3298 | pmd = pmd_alloc(mm, pud, address); |
3229 | if (!pmd) | 3299 | if (!pmd) |
3230 | return VM_FAULT_OOM; | 3300 | return VM_FAULT_OOM; |
3231 | pte = pte_alloc_map(mm, pmd, address); | 3301 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
3232 | if (!pte) | 3302 | if (!vma->vm_ops) |
3303 | return do_huge_pmd_anonymous_page(mm, vma, address, | ||
3304 | pmd, flags); | ||
3305 | } else { | ||
3306 | pmd_t orig_pmd = *pmd; | ||
3307 | barrier(); | ||
3308 | if (pmd_trans_huge(orig_pmd)) { | ||
3309 | if (flags & FAULT_FLAG_WRITE && | ||
3310 | !pmd_write(orig_pmd) && | ||
3311 | !pmd_trans_splitting(orig_pmd)) | ||
3312 | return do_huge_pmd_wp_page(mm, vma, address, | ||
3313 | pmd, orig_pmd); | ||
3314 | return 0; | ||
3315 | } | ||
3316 | } | ||
3317 | |||
3318 | /* | ||
3319 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
3320 | * run pte_offset_map on the pmd, if an huge pmd could | ||
3321 | * materialize from under us from a different thread. | ||
3322 | */ | ||
3323 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3233 | return VM_FAULT_OOM; | 3324 | return VM_FAULT_OOM; |
3325 | /* if an huge pmd materialized from under us just retry later */ | ||
3326 | if (unlikely(pmd_trans_huge(*pmd))) | ||
3327 | return 0; | ||
3328 | /* | ||
3329 | * A regular pmd is established and it can't morph into a huge pmd | ||
3330 | * from under us anymore at this point because we hold the mmap_sem | ||
3331 | * read mode and khugepaged takes it in write mode. So now it's | ||
3332 | * safe to run pte_offset_map(). | ||
3333 | */ | ||
3334 | pte = pte_offset_map(pmd, address); | ||
3234 | 3335 | ||
3235 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3336 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3236 | } | 3337 | } |
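The hunk above is where transparent hugepages hook into the fault path: for an anonymous vma with THP enabled, a missing pmd is handed to do_huge_pmd_anonymous_page(), a write fault on a read-only huge pmd goes to do_huge_pmd_wp_page(), and otherwise the code falls back to allocating a pte table with __pte_alloc() (pte_alloc_map() is no longer safe because a huge pmd could materialize underneath it). From userspace, the usual way to reach this path is an anonymous mapping advised with MADV_HUGEPAGE, as in the sketch below. MADV_HUGEPAGE is introduced by this same patch series, so the fallback #define is an assumption for older headers, and whether a given fault really gets a huge pmd still depends on 2 MiB alignment and available memory.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MADV_HUGEPAGE
    #define MADV_HUGEPAGE 14        /* x86 value; assumption if the libc headers lack it */
    #endif

    int main(void)
    {
        size_t len = 4UL << 20;                 /* two 2 MiB pmds on x86 */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;
        /* Ask for huge pages; the first write fault in each aligned 2 MiB
         * region can then be served by do_huge_pmd_anonymous_page(). */
        if (madvise(p, len, MADV_HUGEPAGE))
            perror("madvise(MADV_HUGEPAGE)");
        memset(p, 0, len);                      /* touch it -> page faults */
        printf("mapped and touched %zu bytes at %p\n", len, p);
        return munmap(p, len);
    }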
@@ -3296,7 +3397,12 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
3296 | vma = find_vma(current->mm, addr); | 3397 | vma = find_vma(current->mm, addr); |
3297 | if (!vma) | 3398 | if (!vma) |
3298 | return -ENOMEM; | 3399 | return -ENOMEM; |
3299 | write = (vma->vm_flags & VM_WRITE) != 0; | 3400 | /* |
3401 | * We want to touch writable mappings with a write fault in order | ||
3402 | * to break COW, except for shared mappings because these don't COW | ||
3403 | * and we would not want to dirty them for nothing. | ||
3404 | */ | ||
3405 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
3300 | BUG_ON(addr >= end); | 3406 | BUG_ON(addr >= end); |
3301 | BUG_ON(end > vma->vm_end); | 3407 | BUG_ON(end > vma->vm_end); |
3302 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 3408 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
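The rewritten test above means make_pages_present() only uses write faults for private writable mappings; shared writable mappings are populated with read faults so they are not dirtied just to bring them in (the same rule reappears in the mm/mlock.c gup_flags change further down). A standalone illustration of the predicate, using stand-in flag values rather than the kernel's definitions:

    #include <stdbool.h>
    #include <stdio.h>

    #define VM_WRITE  0x2UL             /* stand-in values for illustration */
    #define VM_SHARED 0x8UL

    /* write-fault only when the mapping is writable *and* private */
    static bool fault_with_write(unsigned long vm_flags)
    {
        return (vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
    }

    int main(void)
    {
        printf("private rw: %d\n", fault_with_write(VM_WRITE));              /* 1 */
        printf("shared  rw: %d\n", fault_with_write(VM_WRITE | VM_SHARED));  /* 0 */
        printf("private ro: %d\n", fault_with_write(0));                     /* 0 */
        return 0;
    }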
@@ -3368,6 +3474,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, | |||
3368 | goto out; | 3474 | goto out; |
3369 | 3475 | ||
3370 | pmd = pmd_offset(pud, address); | 3476 | pmd = pmd_offset(pud, address); |
3477 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
3371 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 3478 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
3372 | goto out; | 3479 | goto out; |
3373 | 3480 | ||
@@ -3608,3 +3715,74 @@ void might_fault(void) | |||
3608 | } | 3715 | } |
3609 | EXPORT_SYMBOL(might_fault); | 3716 | EXPORT_SYMBOL(might_fault); |
3610 | #endif | 3717 | #endif |
3718 | |||
3719 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | ||
3720 | static void clear_gigantic_page(struct page *page, | ||
3721 | unsigned long addr, | ||
3722 | unsigned int pages_per_huge_page) | ||
3723 | { | ||
3724 | int i; | ||
3725 | struct page *p = page; | ||
3726 | |||
3727 | might_sleep(); | ||
3728 | for (i = 0; i < pages_per_huge_page; | ||
3729 | i++, p = mem_map_next(p, page, i)) { | ||
3730 | cond_resched(); | ||
3731 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
3732 | } | ||
3733 | } | ||
3734 | void clear_huge_page(struct page *page, | ||
3735 | unsigned long addr, unsigned int pages_per_huge_page) | ||
3736 | { | ||
3737 | int i; | ||
3738 | |||
3739 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
3740 | clear_gigantic_page(page, addr, pages_per_huge_page); | ||
3741 | return; | ||
3742 | } | ||
3743 | |||
3744 | might_sleep(); | ||
3745 | for (i = 0; i < pages_per_huge_page; i++) { | ||
3746 | cond_resched(); | ||
3747 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
3748 | } | ||
3749 | } | ||
3750 | |||
3751 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
3752 | unsigned long addr, | ||
3753 | struct vm_area_struct *vma, | ||
3754 | unsigned int pages_per_huge_page) | ||
3755 | { | ||
3756 | int i; | ||
3757 | struct page *dst_base = dst; | ||
3758 | struct page *src_base = src; | ||
3759 | |||
3760 | for (i = 0; i < pages_per_huge_page; ) { | ||
3761 | cond_resched(); | ||
3762 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
3763 | |||
3764 | i++; | ||
3765 | dst = mem_map_next(dst, dst_base, i); | ||
3766 | src = mem_map_next(src, src_base, i); | ||
3767 | } | ||
3768 | } | ||
3769 | |||
3770 | void copy_user_huge_page(struct page *dst, struct page *src, | ||
3771 | unsigned long addr, struct vm_area_struct *vma, | ||
3772 | unsigned int pages_per_huge_page) | ||
3773 | { | ||
3774 | int i; | ||
3775 | |||
3776 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
3777 | copy_user_gigantic_page(dst, src, addr, vma, | ||
3778 | pages_per_huge_page); | ||
3779 | return; | ||
3780 | } | ||
3781 | |||
3782 | might_sleep(); | ||
3783 | for (i = 0; i < pages_per_huge_page; i++) { | ||
3784 | cond_resched(); | ||
3785 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
3786 | } | ||
3787 | } | ||
3788 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | ||
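clear_huge_page() and copy_user_huge_page() above deliberately work one PAGE_SIZE subpage at a time, calling cond_resched() between subpages, so clearing or copying a 2 MiB page (or a gigantic page larger than MAX_ORDER_NR_PAGES) yields the CPU regularly on non-preemptible kernels. A rough userspace analogy of the same chunk-and-yield pattern; the sizes are illustrative and sched_yield() merely stands in for cond_resched():

    #include <sched.h>
    #include <stdlib.h>
    #include <string.h>

    #define SUBPAGE   4096UL
    #define NSUBPAGES 512UL          /* 512 * 4 KiB = 2 MiB, like an x86 huge page */

    /* clear a large buffer in subpage-sized steps, yielding between steps */
    static void clear_huge_buffer(unsigned char *buf)
    {
        for (unsigned long i = 0; i < NSUBPAGES; i++) {
            sched_yield();                          /* analogue of cond_resched() */
            memset(buf + i * SUBPAGE, 0, SUBPAGE);
        }
    }

    int main(void)
    {
        unsigned char *buf = malloc(SUBPAGE * NSUBPAGES);
        if (!buf)
            return 1;
        clear_huge_buffer(buf);
        free(buf);
        return 0;
    }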
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9260314a221..321fc7455df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -34,6 +34,23 @@ | |||
34 | 34 | ||
35 | #include "internal.h" | 35 | #include "internal.h" |
36 | 36 | ||
37 | DEFINE_MUTEX(mem_hotplug_mutex); | ||
38 | |||
39 | void lock_memory_hotplug(void) | ||
40 | { | ||
41 | mutex_lock(&mem_hotplug_mutex); | ||
42 | |||
43 | /* for exclusive hibernation if CONFIG_HIBERNATION=y */ | ||
44 | lock_system_sleep(); | ||
45 | } | ||
46 | |||
47 | void unlock_memory_hotplug(void) | ||
48 | { | ||
49 | unlock_system_sleep(); | ||
50 | mutex_unlock(&mem_hotplug_mutex); | ||
51 | } | ||
52 | |||
53 | |||
37 | /* add this memory to iomem resource */ | 54 | /* add this memory to iomem resource */ |
38 | static struct resource *register_memory_resource(u64 start, u64 size) | 55 | static struct resource *register_memory_resource(u64 start, u64 size) |
39 | { | 56 | { |
@@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res) | |||
65 | 82 | ||
66 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 83 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
67 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 84 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
68 | static void get_page_bootmem(unsigned long info, struct page *page, int type) | 85 | static void get_page_bootmem(unsigned long info, struct page *page, |
86 | unsigned long type) | ||
69 | { | 87 | { |
70 | atomic_set(&page->_mapcount, type); | 88 | page->lru.next = (struct list_head *) type; |
71 | SetPagePrivate(page); | 89 | SetPagePrivate(page); |
72 | set_page_private(page, info); | 90 | set_page_private(page, info); |
73 | atomic_inc(&page->_count); | 91 | atomic_inc(&page->_count); |
@@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
77 | * so use __ref to tell modpost not to generate a warning */ | 95 | * so use __ref to tell modpost not to generate a warning */ |
78 | void __ref put_page_bootmem(struct page *page) | 96 | void __ref put_page_bootmem(struct page *page) |
79 | { | 97 | { |
80 | int type; | 98 | unsigned long type; |
81 | 99 | ||
82 | type = atomic_read(&page->_mapcount); | 100 | type = (unsigned long) page->lru.next; |
83 | BUG_ON(type >= -1); | 101 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
102 | type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); | ||
84 | 103 | ||
85 | if (atomic_dec_return(&page->_count) == 1) { | 104 | if (atomic_dec_return(&page->_count) == 1) { |
86 | ClearPagePrivate(page); | 105 | ClearPagePrivate(page); |
87 | set_page_private(page, 0); | 106 | set_page_private(page, 0); |
88 | reset_page_mapcount(page); | 107 | INIT_LIST_HEAD(&page->lru); |
89 | __free_pages_bootmem(page, 0); | 108 | __free_pages_bootmem(page, 0); |
90 | } | 109 | } |
91 | 110 | ||
@@ -390,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
390 | int ret; | 409 | int ret; |
391 | struct memory_notify arg; | 410 | struct memory_notify arg; |
392 | 411 | ||
412 | lock_memory_hotplug(); | ||
393 | arg.start_pfn = pfn; | 413 | arg.start_pfn = pfn; |
394 | arg.nr_pages = nr_pages; | 414 | arg.nr_pages = nr_pages; |
395 | arg.status_change_nid = -1; | 415 | arg.status_change_nid = -1; |
@@ -402,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
402 | ret = notifier_to_errno(ret); | 422 | ret = notifier_to_errno(ret); |
403 | if (ret) { | 423 | if (ret) { |
404 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 424 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
425 | unlock_memory_hotplug(); | ||
405 | return ret; | 426 | return ret; |
406 | } | 427 | } |
407 | /* | 428 | /* |
@@ -426,6 +447,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
426 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 447 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
427 | nr_pages, pfn); | 448 | nr_pages, pfn); |
428 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 449 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
450 | unlock_memory_hotplug(); | ||
429 | return ret; | 451 | return ret; |
430 | } | 452 | } |
431 | 453 | ||
@@ -450,6 +472,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
450 | 472 | ||
451 | if (onlined_pages) | 473 | if (onlined_pages) |
452 | memory_notify(MEM_ONLINE, &arg); | 474 | memory_notify(MEM_ONLINE, &arg); |
475 | unlock_memory_hotplug(); | ||
453 | 476 | ||
454 | return 0; | 477 | return 0; |
455 | } | 478 | } |
@@ -493,7 +516,7 @@ int mem_online_node(int nid) | |||
493 | pg_data_t *pgdat; | 516 | pg_data_t *pgdat; |
494 | int ret; | 517 | int ret; |
495 | 518 | ||
496 | lock_system_sleep(); | 519 | lock_memory_hotplug(); |
497 | pgdat = hotadd_new_pgdat(nid, 0); | 520 | pgdat = hotadd_new_pgdat(nid, 0); |
498 | if (pgdat) { | 521 | if (pgdat) { |
499 | ret = -ENOMEM; | 522 | ret = -ENOMEM; |
@@ -504,7 +527,7 @@ int mem_online_node(int nid) | |||
504 | BUG_ON(ret); | 527 | BUG_ON(ret); |
505 | 528 | ||
506 | out: | 529 | out: |
507 | unlock_system_sleep(); | 530 | unlock_memory_hotplug(); |
508 | return ret; | 531 | return ret; |
509 | } | 532 | } |
510 | 533 | ||
@@ -516,7 +539,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
516 | struct resource *res; | 539 | struct resource *res; |
517 | int ret; | 540 | int ret; |
518 | 541 | ||
519 | lock_system_sleep(); | 542 | lock_memory_hotplug(); |
520 | 543 | ||
521 | res = register_memory_resource(start, size); | 544 | res = register_memory_resource(start, size); |
522 | ret = -EEXIST; | 545 | ret = -EEXIST; |
@@ -563,7 +586,7 @@ error: | |||
563 | release_memory_resource(res); | 586 | release_memory_resource(res); |
564 | 587 | ||
565 | out: | 588 | out: |
566 | unlock_system_sleep(); | 589 | unlock_memory_hotplug(); |
567 | return ret; | 590 | return ret; |
568 | } | 591 | } |
569 | EXPORT_SYMBOL_GPL(add_memory); | 592 | EXPORT_SYMBOL_GPL(add_memory); |
@@ -716,7 +739,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
716 | goto out; | 739 | goto out; |
717 | } | 740 | } |
718 | /* this function returns # of failed pages */ | 741 | /* this function returns # of failed pages */ |
719 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); | 742 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, |
743 | true, true); | ||
720 | if (ret) | 744 | if (ret) |
721 | putback_lru_pages(&source); | 745 | putback_lru_pages(&source); |
722 | } | 746 | } |
@@ -791,7 +815,7 @@ static int offline_pages(unsigned long start_pfn, | |||
791 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 815 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
792 | return -EINVAL; | 816 | return -EINVAL; |
793 | 817 | ||
794 | lock_system_sleep(); | 818 | lock_memory_hotplug(); |
795 | 819 | ||
796 | zone = page_zone(pfn_to_page(start_pfn)); | 820 | zone = page_zone(pfn_to_page(start_pfn)); |
797 | node = zone_to_nid(zone); | 821 | node = zone_to_nid(zone); |
@@ -880,7 +904,7 @@ repeat: | |||
880 | writeback_set_ratelimit(); | 904 | writeback_set_ratelimit(); |
881 | 905 | ||
882 | memory_notify(MEM_OFFLINE, &arg); | 906 | memory_notify(MEM_OFFLINE, &arg); |
883 | unlock_system_sleep(); | 907 | unlock_memory_hotplug(); |
884 | return 0; | 908 | return 0; |
885 | 909 | ||
886 | failed_removal: | 910 | failed_removal: |
@@ -891,7 +915,7 @@ failed_removal: | |||
891 | undo_isolate_page_range(start_pfn, end_pfn); | 915 | undo_isolate_page_range(start_pfn, end_pfn); |
892 | 916 | ||
893 | out: | 917 | out: |
894 | unlock_system_sleep(); | 918 | unlock_memory_hotplug(); |
895 | return ret; | 919 | return ret; |
896 | } | 920 | } |
897 | 921 | ||
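The memory_hotplug.c hunks above introduce mem_hotplug_mutex and convert online_pages(), mem_online_node(), add_memory() and offline_pages() from bare lock_system_sleep() to lock_memory_hotplug()/unlock_memory_hotplug(), so hot-plug operations are serialized against each other as well as against hibernation. A sketch of the calling convention the converted paths follow; the operation body is a placeholder, not a real kernel function:

    /* hypothetical caller -- each hot-plug entry point brackets its whole
     * critical section, including the early-error returns, with the new
     * helpers instead of lock_system_sleep()/unlock_system_sleep() */
    static int example_hotplug_operation(void)
    {
        int ret;

        lock_memory_hotplug();
        ret = do_the_actual_work();     /* placeholder for online/offline/add */
        unlock_memory_hotplug();
        return ret;
    }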
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4a57f135b76..368fc9d2361 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
514 | pmd = pmd_offset(pud, addr); | 514 | pmd = pmd_offset(pud, addr); |
515 | do { | 515 | do { |
516 | next = pmd_addr_end(addr, end); | 516 | next = pmd_addr_end(addr, end); |
517 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
517 | if (pmd_none_or_clear_bad(pmd)) | 518 | if (pmd_none_or_clear_bad(pmd)) |
518 | continue; | 519 | continue; |
519 | if (check_pte_range(vma, pmd, addr, next, nodes, | 520 | if (check_pte_range(vma, pmd, addr, next, nodes, |
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
935 | return PTR_ERR(vma); | 936 | return PTR_ERR(vma); |
936 | 937 | ||
937 | if (!list_empty(&pagelist)) { | 938 | if (!list_empty(&pagelist)) { |
938 | err = migrate_pages(&pagelist, new_node_page, dest, 0); | 939 | err = migrate_pages(&pagelist, new_node_page, dest, |
940 | false, true); | ||
939 | if (err) | 941 | if (err) |
940 | putback_lru_pages(&pagelist); | 942 | putback_lru_pages(&pagelist); |
941 | } | 943 | } |
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1155 | 1157 | ||
1156 | if (!list_empty(&pagelist)) { | 1158 | if (!list_empty(&pagelist)) { |
1157 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1159 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1158 | (unsigned long)vma, 0); | 1160 | (unsigned long)vma, |
1161 | false, true); | ||
1159 | if (nr_failed) | 1162 | if (nr_failed) |
1160 | putback_lru_pages(&pagelist); | 1163 | putback_lru_pages(&pagelist); |
1161 | } | 1164 | } |
@@ -1307,15 +1310,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1307 | goto out; | 1310 | goto out; |
1308 | 1311 | ||
1309 | /* Find the mm_struct */ | 1312 | /* Find the mm_struct */ |
1310 | read_lock(&tasklist_lock); | 1313 | rcu_read_lock(); |
1311 | task = pid ? find_task_by_vpid(pid) : current; | 1314 | task = pid ? find_task_by_vpid(pid) : current; |
1312 | if (!task) { | 1315 | if (!task) { |
1313 | read_unlock(&tasklist_lock); | 1316 | rcu_read_unlock(); |
1314 | err = -ESRCH; | 1317 | err = -ESRCH; |
1315 | goto out; | 1318 | goto out; |
1316 | } | 1319 | } |
1317 | mm = get_task_mm(task); | 1320 | mm = get_task_mm(task); |
1318 | read_unlock(&tasklist_lock); | 1321 | rcu_read_unlock(); |
1319 | 1322 | ||
1320 | err = -EINVAL; | 1323 | err = -EINVAL; |
1321 | if (!mm) | 1324 | if (!mm) |
@@ -1793,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1793 | } | 1796 | } |
1794 | 1797 | ||
1795 | /** | 1798 | /** |
1796 | * alloc_page_vma - Allocate a page for a VMA. | 1799 | * alloc_pages_vma - Allocate a page for a VMA. |
1797 | * | 1800 | * |
1798 | * @gfp: | 1801 | * @gfp: |
1799 | * %GFP_USER user allocation. | 1802 | * %GFP_USER user allocation. |
@@ -1802,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1802 | * %GFP_FS allocation should not call back into a file system. | 1805 | * %GFP_FS allocation should not call back into a file system. |
1803 | * %GFP_ATOMIC don't sleep. | 1806 | * %GFP_ATOMIC don't sleep. |
1804 | * | 1807 | * |
1808 | * @order:Order of the GFP allocation. | ||
1805 | * @vma: Pointer to VMA or NULL if not available. | 1809 | * @vma: Pointer to VMA or NULL if not available. |
1806 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1810 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
1807 | * | 1811 | * |
@@ -1815,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1815 | * Should be called with the mm_sem of the vma hold. | 1819 | * Should be called with the mm_sem of the vma hold. |
1816 | */ | 1820 | */ |
1817 | struct page * | 1821 | struct page * |
1818 | alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | 1822 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1823 | unsigned long addr) | ||
1819 | { | 1824 | { |
1820 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1825 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
1821 | struct zonelist *zl; | 1826 | struct zonelist *zl; |
@@ -1827,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1827 | 1832 | ||
1828 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1833 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
1829 | mpol_cond_put(pol); | 1834 | mpol_cond_put(pol); |
1830 | page = alloc_page_interleave(gfp, 0, nid); | 1835 | page = alloc_page_interleave(gfp, order, nid); |
1831 | put_mems_allowed(); | 1836 | put_mems_allowed(); |
1832 | return page; | 1837 | return page; |
1833 | } | 1838 | } |
@@ -1836,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1836 | /* | 1841 | /* |
1837 | * slow path: ref counted shared policy | 1842 | * slow path: ref counted shared policy |
1838 | */ | 1843 | */ |
1839 | struct page *page = __alloc_pages_nodemask(gfp, 0, | 1844 | struct page *page = __alloc_pages_nodemask(gfp, order, |
1840 | zl, policy_nodemask(gfp, pol)); | 1845 | zl, policy_nodemask(gfp, pol)); |
1841 | __mpol_put(pol); | 1846 | __mpol_put(pol); |
1842 | put_mems_allowed(); | 1847 | put_mems_allowed(); |
@@ -1845,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1845 | /* | 1850 | /* |
1846 | * fast path: default or task policy | 1851 | * fast path: default or task policy |
1847 | */ | 1852 | */ |
1848 | page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); | 1853 | page = __alloc_pages_nodemask(gfp, order, zl, |
1854 | policy_nodemask(gfp, pol)); | ||
1849 | put_mems_allowed(); | 1855 | put_mems_allowed(); |
1850 | return page; | 1856 | return page; |
1851 | } | 1857 | } |
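The mempolicy.c hunks rename alloc_page_vma() to alloc_pages_vma() and thread an order argument through the interleave, shared-policy and fast paths, so the THP code can allocate a pmd-sized compound page that still respects the vma's NUMA policy. A sketch of the two call styles; order 0 behaves exactly like the old helper, while the huge-page caller in mm/huge_memory.c passes HPAGE_PMD_ORDER and a pmd-aligned address (the gfp mask shown is illustrative, not the one that caller actually builds):

    struct page *page, *hpage;
    gfp_t gfp = GFP_HIGHUSER_MOVABLE;       /* whatever mask the caller needs */

    /* order-0: drop-in replacement for the old alloc_page_vma(gfp, vma, addr) */
    page = alloc_pages_vma(gfp, 0, vma, addr);

    /* higher order (sketch): same policy lookup, one compound allocation */
    hpage = alloc_pages_vma(gfp | __GFP_COMP, HPAGE_PMD_ORDER,
                            vma, addr & HPAGE_PMD_MASK);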
diff --git a/mm/migrate.c b/mm/migrate.c index fe5a3c6a542..76611525380 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -35,6 +35,8 @@ | |||
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/gfp.h> | 36 | #include <linux/gfp.h> |
37 | 37 | ||
38 | #include <asm/tlbflush.h> | ||
39 | |||
38 | #include "internal.h" | 40 | #include "internal.h" |
39 | 41 | ||
40 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 42 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
@@ -111,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
111 | goto out; | 113 | goto out; |
112 | 114 | ||
113 | pmd = pmd_offset(pud, addr); | 115 | pmd = pmd_offset(pud, addr); |
116 | if (pmd_trans_huge(*pmd)) | ||
117 | goto out; | ||
114 | if (!pmd_present(*pmd)) | 118 | if (!pmd_present(*pmd)) |
115 | goto out; | 119 | goto out; |
116 | 120 | ||
@@ -244,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
244 | 248 | ||
245 | expected_count = 2 + page_has_private(page); | 249 | expected_count = 2 + page_has_private(page); |
246 | if (page_count(page) != expected_count || | 250 | if (page_count(page) != expected_count || |
247 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 251 | radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { |
248 | spin_unlock_irq(&mapping->tree_lock); | 252 | spin_unlock_irq(&mapping->tree_lock); |
249 | return -EAGAIN; | 253 | return -EAGAIN; |
250 | } | 254 | } |
@@ -316,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
316 | 320 | ||
317 | expected_count = 2 + page_has_private(page); | 321 | expected_count = 2 + page_has_private(page); |
318 | if (page_count(page) != expected_count || | 322 | if (page_count(page) != expected_count || |
319 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 323 | radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { |
320 | spin_unlock_irq(&mapping->tree_lock); | 324 | spin_unlock_irq(&mapping->tree_lock); |
321 | return -EAGAIN; | 325 | return -EAGAIN; |
322 | } | 326 | } |
@@ -612,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
612 | * to the newly allocated page in newpage. | 616 | * to the newly allocated page in newpage. |
613 | */ | 617 | */ |
614 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 618 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
615 | struct page *page, int force, int offlining) | 619 | struct page *page, int force, bool offlining, bool sync) |
616 | { | 620 | { |
617 | int rc = 0; | 621 | int rc = 0; |
618 | int *result = NULL; | 622 | int *result = NULL; |
619 | struct page *newpage = get_new_page(page, private, &result); | 623 | struct page *newpage = get_new_page(page, private, &result); |
620 | int remap_swapcache = 1; | 624 | int remap_swapcache = 1; |
621 | int rcu_locked = 0; | ||
622 | int charge = 0; | 625 | int charge = 0; |
623 | struct mem_cgroup *mem = NULL; | 626 | struct mem_cgroup *mem = NULL; |
624 | struct anon_vma *anon_vma = NULL; | 627 | struct anon_vma *anon_vma = NULL; |
@@ -630,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
630 | /* page was freed from under us. So we are done. */ | 633 | /* page was freed from under us. So we are done. */ |
631 | goto move_newpage; | 634 | goto move_newpage; |
632 | } | 635 | } |
636 | if (unlikely(PageTransHuge(page))) | ||
637 | if (unlikely(split_huge_page(page))) | ||
638 | goto move_newpage; | ||
633 | 639 | ||
634 | /* prepare cgroup just returns 0 or -ENOMEM */ | 640 | /* prepare cgroup just returns 0 or -ENOMEM */ |
635 | rc = -EAGAIN; | 641 | rc = -EAGAIN; |
@@ -637,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
637 | if (!trylock_page(page)) { | 643 | if (!trylock_page(page)) { |
638 | if (!force) | 644 | if (!force) |
639 | goto move_newpage; | 645 | goto move_newpage; |
646 | |||
647 | /* | ||
648 | * It's not safe for direct compaction to call lock_page. | ||
649 | * For example, during page readahead pages are added locked | ||
650 | * to the LRU. Later, when the IO completes the pages are | ||
651 | * marked uptodate and unlocked. However, the queueing | ||
652 | * could be merging multiple pages for one bio (e.g. | ||
653 | * mpage_readpages). If an allocation happens for the | ||
654 | * second or third page, the process can end up locking | ||
655 | * the same page twice and deadlocking. Rather than | ||
656 | * trying to be clever about what pages can be locked, | ||
657 | * avoid the use of lock_page for direct compaction | ||
658 | * altogether. | ||
659 | */ | ||
660 | if (current->flags & PF_MEMALLOC) | ||
661 | goto move_newpage; | ||
662 | |||
640 | lock_page(page); | 663 | lock_page(page); |
641 | } | 664 | } |
642 | 665 | ||
@@ -663,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
663 | BUG_ON(charge); | 686 | BUG_ON(charge); |
664 | 687 | ||
665 | if (PageWriteback(page)) { | 688 | if (PageWriteback(page)) { |
666 | if (!force) | 689 | if (!force || !sync) |
667 | goto uncharge; | 690 | goto uncharge; |
668 | wait_on_page_writeback(page); | 691 | wait_on_page_writeback(page); |
669 | } | 692 | } |
670 | /* | 693 | /* |
671 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, | 694 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, |
672 | * we cannot notice that anon_vma is freed while we migrates a page. | 695 | * we cannot notice that anon_vma is freed while we migrates a page. |
673 | * This rcu_read_lock() delays freeing anon_vma pointer until the end | 696 | * This get_anon_vma() delays freeing anon_vma pointer until the end |
674 | * of migration. File cache pages are no problem because of page_lock() | 697 | * of migration. File cache pages are no problem because of page_lock() |
675 | * File Caches may use write_page() or lock_page() in migration, then, | 698 | * File Caches may use write_page() or lock_page() in migration, then, |
676 | * just care Anon page here. | 699 | * just care Anon page here. |
677 | */ | 700 | */ |
678 | if (PageAnon(page)) { | 701 | if (PageAnon(page)) { |
679 | rcu_read_lock(); | 702 | /* |
680 | rcu_locked = 1; | 703 | * Only page_lock_anon_vma() understands the subtleties of |
681 | 704 | * getting a hold on an anon_vma from outside one of its mms. | |
682 | /* Determine how to safely use anon_vma */ | 705 | */ |
683 | if (!page_mapped(page)) { | 706 | anon_vma = page_lock_anon_vma(page); |
684 | if (!PageSwapCache(page)) | 707 | if (anon_vma) { |
685 | goto rcu_unlock; | 708 | /* |
686 | 709 | * Take a reference count on the anon_vma if the | |
710 | * page is mapped so that it is guaranteed to | ||
711 | * exist when the page is remapped later | ||
712 | */ | ||
713 | get_anon_vma(anon_vma); | ||
714 | page_unlock_anon_vma(anon_vma); | ||
715 | } else if (PageSwapCache(page)) { | ||
687 | /* | 716 | /* |
688 | * We cannot be sure that the anon_vma of an unmapped | 717 | * We cannot be sure that the anon_vma of an unmapped |
689 | * swapcache page is safe to use because we don't | 718 | * swapcache page is safe to use because we don't |
@@ -698,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
698 | */ | 727 | */ |
699 | remap_swapcache = 0; | 728 | remap_swapcache = 0; |
700 | } else { | 729 | } else { |
701 | /* | 730 | goto uncharge; |
702 | * Take a reference count on the anon_vma if the | ||
703 | * page is mapped so that it is guaranteed to | ||
704 | * exist when the page is remapped later | ||
705 | */ | ||
706 | anon_vma = page_anon_vma(page); | ||
707 | get_anon_vma(anon_vma); | ||
708 | } | 731 | } |
709 | } | 732 | } |
710 | 733 | ||
@@ -721,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
721 | * free the metadata, so the page can be freed. | 744 | * free the metadata, so the page can be freed. |
722 | */ | 745 | */ |
723 | if (!page->mapping) { | 746 | if (!page->mapping) { |
724 | if (!PageAnon(page) && page_has_private(page)) { | 747 | VM_BUG_ON(PageAnon(page)); |
725 | /* | 748 | if (page_has_private(page)) { |
726 | * Go direct to try_to_free_buffers() here because | ||
727 | * a) that's what try_to_release_page() would do anyway | ||
728 | * b) we may be under rcu_read_lock() here, so we can't | ||
729 | * use GFP_KERNEL which is what try_to_release_page() | ||
730 | * needs to be effective. | ||
731 | */ | ||
732 | try_to_free_buffers(page); | 749 | try_to_free_buffers(page); |
733 | goto rcu_unlock; | 750 | goto uncharge; |
734 | } | 751 | } |
735 | goto skip_unmap; | 752 | goto skip_unmap; |
736 | } | 753 | } |
@@ -744,20 +761,18 @@ skip_unmap: | |||
744 | 761 | ||
745 | if (rc && remap_swapcache) | 762 | if (rc && remap_swapcache) |
746 | remove_migration_ptes(page, page); | 763 | remove_migration_ptes(page, page); |
747 | rcu_unlock: | ||
748 | 764 | ||
749 | /* Drop an anon_vma reference if we took one */ | 765 | /* Drop an anon_vma reference if we took one */ |
750 | if (anon_vma) | 766 | if (anon_vma) |
751 | drop_anon_vma(anon_vma); | 767 | drop_anon_vma(anon_vma); |
752 | 768 | ||
753 | if (rcu_locked) | ||
754 | rcu_read_unlock(); | ||
755 | uncharge: | 769 | uncharge: |
756 | if (!charge) | 770 | if (!charge) |
757 | mem_cgroup_end_migration(mem, page, newpage); | 771 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
758 | unlock: | 772 | unlock: |
759 | unlock_page(page); | 773 | unlock_page(page); |
760 | 774 | ||
775 | move_newpage: | ||
761 | if (rc != -EAGAIN) { | 776 | if (rc != -EAGAIN) { |
762 | /* | 777 | /* |
763 | * A page that has been migrated has all references | 778 | * A page that has been migrated has all references |
@@ -771,8 +786,6 @@ unlock: | |||
771 | putback_lru_page(page); | 786 | putback_lru_page(page); |
772 | } | 787 | } |
773 | 788 | ||
774 | move_newpage: | ||
775 | |||
776 | /* | 789 | /* |
777 | * Move the new page to the LRU. If migration was not successful | 790 | * Move the new page to the LRU. If migration was not successful |
778 | * then this will free the page. | 791 | * then this will free the page. |
@@ -808,12 +821,11 @@ move_newpage: | |||
808 | */ | 821 | */ |
809 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 822 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
810 | unsigned long private, struct page *hpage, | 823 | unsigned long private, struct page *hpage, |
811 | int force, int offlining) | 824 | int force, bool offlining, bool sync) |
812 | { | 825 | { |
813 | int rc = 0; | 826 | int rc = 0; |
814 | int *result = NULL; | 827 | int *result = NULL; |
815 | struct page *new_hpage = get_new_page(hpage, private, &result); | 828 | struct page *new_hpage = get_new_page(hpage, private, &result); |
816 | int rcu_locked = 0; | ||
817 | struct anon_vma *anon_vma = NULL; | 829 | struct anon_vma *anon_vma = NULL; |
818 | 830 | ||
819 | if (!new_hpage) | 831 | if (!new_hpage) |
@@ -822,18 +834,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
822 | rc = -EAGAIN; | 834 | rc = -EAGAIN; |
823 | 835 | ||
824 | if (!trylock_page(hpage)) { | 836 | if (!trylock_page(hpage)) { |
825 | if (!force) | 837 | if (!force || !sync) |
826 | goto out; | 838 | goto out; |
827 | lock_page(hpage); | 839 | lock_page(hpage); |
828 | } | 840 | } |
829 | 841 | ||
830 | if (PageAnon(hpage)) { | 842 | if (PageAnon(hpage)) { |
831 | rcu_read_lock(); | 843 | anon_vma = page_lock_anon_vma(hpage); |
832 | rcu_locked = 1; | 844 | if (anon_vma) { |
833 | 845 | get_anon_vma(anon_vma); | |
834 | if (page_mapped(hpage)) { | 846 | page_unlock_anon_vma(anon_vma); |
835 | anon_vma = page_anon_vma(hpage); | ||
836 | atomic_inc(&anon_vma->external_refcount); | ||
837 | } | 847 | } |
838 | } | 848 | } |
839 | 849 | ||
@@ -845,16 +855,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
845 | if (rc) | 855 | if (rc) |
846 | remove_migration_ptes(hpage, hpage); | 856 | remove_migration_ptes(hpage, hpage); |
847 | 857 | ||
848 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | 858 | if (anon_vma) |
849 | &anon_vma->lock)) { | 859 | drop_anon_vma(anon_vma); |
850 | int empty = list_empty(&anon_vma->head); | ||
851 | spin_unlock(&anon_vma->lock); | ||
852 | if (empty) | ||
853 | anon_vma_free(anon_vma); | ||
854 | } | ||
855 | |||
856 | if (rcu_locked) | ||
857 | rcu_read_unlock(); | ||
858 | out: | 860 | out: |
859 | unlock_page(hpage); | 861 | unlock_page(hpage); |
860 | 862 | ||
@@ -885,12 +887,13 @@ out: | |||
885 | * are movable anymore because to has become empty | 887 | * are movable anymore because to has become empty |
886 | * or no retryable pages exist anymore. | 888 | * or no retryable pages exist anymore. |
887 | * Caller should call putback_lru_pages to return pages to the LRU | 889 | * Caller should call putback_lru_pages to return pages to the LRU |
888 | * or free list. | 890 | * or free list only if ret != 0. |
889 | * | 891 | * |
890 | * Return: Number of pages not migrated or error code. | 892 | * Return: Number of pages not migrated or error code. |
891 | */ | 893 | */ |
892 | int migrate_pages(struct list_head *from, | 894 | int migrate_pages(struct list_head *from, |
893 | new_page_t get_new_page, unsigned long private, int offlining) | 895 | new_page_t get_new_page, unsigned long private, bool offlining, |
896 | bool sync) | ||
894 | { | 897 | { |
895 | int retry = 1; | 898 | int retry = 1; |
896 | int nr_failed = 0; | 899 | int nr_failed = 0; |
@@ -910,7 +913,8 @@ int migrate_pages(struct list_head *from, | |||
910 | cond_resched(); | 913 | cond_resched(); |
911 | 914 | ||
912 | rc = unmap_and_move(get_new_page, private, | 915 | rc = unmap_and_move(get_new_page, private, |
913 | page, pass > 2, offlining); | 916 | page, pass > 2, offlining, |
917 | sync); | ||
914 | 918 | ||
915 | switch(rc) { | 919 | switch(rc) { |
916 | case -ENOMEM: | 920 | case -ENOMEM: |
@@ -939,7 +943,8 @@ out: | |||
939 | } | 943 | } |
940 | 944 | ||
941 | int migrate_huge_pages(struct list_head *from, | 945 | int migrate_huge_pages(struct list_head *from, |
942 | new_page_t get_new_page, unsigned long private, int offlining) | 946 | new_page_t get_new_page, unsigned long private, bool offlining, |
947 | bool sync) | ||
943 | { | 948 | { |
944 | int retry = 1; | 949 | int retry = 1; |
945 | int nr_failed = 0; | 950 | int nr_failed = 0; |
@@ -955,7 +960,8 @@ int migrate_huge_pages(struct list_head *from, | |||
955 | cond_resched(); | 960 | cond_resched(); |
956 | 961 | ||
957 | rc = unmap_and_move_huge_page(get_new_page, | 962 | rc = unmap_and_move_huge_page(get_new_page, |
958 | private, page, pass > 2, offlining); | 963 | private, page, pass > 2, offlining, |
964 | sync); | ||
959 | 965 | ||
960 | switch(rc) { | 966 | switch(rc) { |
961 | case -ENOMEM: | 967 | case -ENOMEM: |
@@ -974,10 +980,6 @@ int migrate_huge_pages(struct list_head *from, | |||
974 | } | 980 | } |
975 | rc = 0; | 981 | rc = 0; |
976 | out: | 982 | out: |
977 | |||
978 | list_for_each_entry_safe(page, page2, from, lru) | ||
979 | put_page(page); | ||
980 | |||
981 | if (rc) | 983 | if (rc) |
982 | return rc; | 984 | return rc; |
983 | 985 | ||
@@ -1040,7 +1042,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1040 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) | 1042 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) |
1041 | goto set_status; | 1043 | goto set_status; |
1042 | 1044 | ||
1043 | page = follow_page(vma, pp->addr, FOLL_GET); | 1045 | page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); |
1044 | 1046 | ||
1045 | err = PTR_ERR(page); | 1047 | err = PTR_ERR(page); |
1046 | if (IS_ERR(page)) | 1048 | if (IS_ERR(page)) |
@@ -1088,7 +1090,7 @@ set_status: | |||
1088 | err = 0; | 1090 | err = 0; |
1089 | if (!list_empty(&pagelist)) { | 1091 | if (!list_empty(&pagelist)) { |
1090 | err = migrate_pages(&pagelist, new_page_node, | 1092 | err = migrate_pages(&pagelist, new_page_node, |
1091 | (unsigned long)pm, 0); | 1093 | (unsigned long)pm, 0, true); |
1092 | if (err) | 1094 | if (err) |
1093 | putback_lru_pages(&pagelist); | 1095 | putback_lru_pages(&pagelist); |
1094 | } | 1096 | } |
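Inside the kernel, every migrate_pages()/migrate_huge_pages() caller above now spells out the offlining and sync booleans, and do_move_page_to_node_array() adds FOLL_SPLIT so a transparent huge page is split before it is queued for migration. The userspace interface is untouched; a minimal move_pages(2) example, assuming libnuma's <numaif.h> is installed (link with -lnuma):

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        long pagesize = sysconf(_SC_PAGESIZE);
        void *buf = NULL;
        if (posix_memalign(&buf, pagesize, pagesize))
            return 1;
        memset(buf, 0xaa, pagesize);            /* fault the page in */

        void *pages[1]  = { buf };
        int   nodes[1]  = { 0 };                /* ask for node 0 */
        int   status[1] = { -1 };

        if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE))
            perror("move_pages");
        else
            printf("page now on node %d\n", status[0]);

        free(buf);
        return 0;
    }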
diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b..a4e6b9d75c7 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
154 | pmd = pmd_offset(pud, addr); | 154 | pmd = pmd_offset(pud, addr); |
155 | do { | 155 | do { |
156 | next = pmd_addr_end(addr, end); | 156 | next = pmd_addr_end(addr, end); |
157 | if (pmd_trans_huge(*pmd)) { | ||
158 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
159 | vec += (next - addr) >> PAGE_SHIFT; | ||
160 | continue; | ||
161 | } | ||
162 | /* fall through */ | ||
163 | } | ||
157 | if (pmd_none_or_clear_bad(pmd)) | 164 | if (pmd_none_or_clear_bad(pmd)) |
158 | mincore_unmapped_range(vma, addr, next, vec); | 165 | mincore_unmapped_range(vma, addr, next, vec); |
159 | else | 166 | else |
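The mincore.c hunk lets mincore() answer for a transparent huge pmd directly: mincore_huge_pmd() fills the per-4K vector for the whole pmd, and only when the pmd is not (or no longer) huge does the code fall through to the normal pte walk. From userspace the call is unchanged:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long psz = sysconf(_SC_PAGESIZE);
        size_t len = 32 * psz;
        unsigned char *vec = malloc(len / psz);
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED || !vec)
            return 1;

        memset(p, 1, 4 * psz);                  /* fault in the first few pages */
        if (mincore(p, len, vec)) {
            perror("mincore");
            return 1;
        }
        for (size_t i = 0; i < len / psz; i++)
            printf("page %2zu: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");
        return 0;
    }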
diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f7..c3924c7f00b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add | |||
155 | * vma->vm_mm->mmap_sem must be held for at least read. | 155 | * vma->vm_mm->mmap_sem must be held for at least read. |
156 | */ | 156 | */ |
157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, |
158 | unsigned long start, unsigned long end) | 158 | unsigned long start, unsigned long end, |
159 | int *nonblocking) | ||
159 | { | 160 | { |
160 | struct mm_struct *mm = vma->vm_mm; | 161 | struct mm_struct *mm = vma->vm_mm; |
161 | unsigned long addr = start; | 162 | unsigned long addr = start; |
162 | struct page *pages[16]; /* 16 gives a reasonable batch */ | ||
163 | int nr_pages = (end - start) / PAGE_SIZE; | 163 | int nr_pages = (end - start) / PAGE_SIZE; |
164 | int ret = 0; | ||
165 | int gup_flags; | 164 | int gup_flags; |
166 | 165 | ||
167 | VM_BUG_ON(start & ~PAGE_MASK); | 166 | VM_BUG_ON(start & ~PAGE_MASK); |
@@ -170,73 +169,33 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
170 | VM_BUG_ON(end > vma->vm_end); | 169 | VM_BUG_ON(end > vma->vm_end); |
171 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 170 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
172 | 171 | ||
173 | gup_flags = FOLL_TOUCH | FOLL_GET; | 172 | gup_flags = FOLL_TOUCH; |
174 | if (vma->vm_flags & VM_WRITE) | 173 | /* |
174 | * We want to touch writable mappings with a write fault in order | ||
175 | * to break COW, except for shared mappings because these don't COW | ||
176 | * and we would not want to dirty them for nothing. | ||
177 | */ | ||
178 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
175 | gup_flags |= FOLL_WRITE; | 179 | gup_flags |= FOLL_WRITE; |
176 | 180 | ||
181 | /* | ||
182 | * We want mlock to succeed for regions that have any permissions | ||
183 | * other than PROT_NONE. | ||
184 | */ | ||
185 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | ||
186 | gup_flags |= FOLL_FORCE; | ||
187 | |||
188 | if (vma->vm_flags & VM_LOCKED) | ||
189 | gup_flags |= FOLL_MLOCK; | ||
190 | |||
177 | /* We don't try to access the guard page of a stack vma */ | 191 | /* We don't try to access the guard page of a stack vma */ |
178 | if (stack_guard_page(vma, start)) { | 192 | if (stack_guard_page(vma, start)) { |
179 | addr += PAGE_SIZE; | 193 | addr += PAGE_SIZE; |
180 | nr_pages--; | 194 | nr_pages--; |
181 | } | 195 | } |
182 | 196 | ||
183 | while (nr_pages > 0) { | 197 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
184 | int i; | 198 | NULL, NULL, nonblocking); |
185 | |||
186 | cond_resched(); | ||
187 | |||
188 | /* | ||
189 | * get_user_pages makes pages present if we are | ||
190 | * setting mlock. and this extra reference count will | ||
191 | * disable migration of this page. However, page may | ||
192 | * still be truncated out from under us. | ||
193 | */ | ||
194 | ret = __get_user_pages(current, mm, addr, | ||
195 | min_t(int, nr_pages, ARRAY_SIZE(pages)), | ||
196 | gup_flags, pages, NULL); | ||
197 | /* | ||
198 | * This can happen for, e.g., VM_NONLINEAR regions before | ||
199 | * a page has been allocated and mapped at a given offset, | ||
200 | * or for addresses that map beyond end of a file. | ||
201 | * We'll mlock the pages if/when they get faulted in. | ||
202 | */ | ||
203 | if (ret < 0) | ||
204 | break; | ||
205 | |||
206 | lru_add_drain(); /* push cached pages to LRU */ | ||
207 | |||
208 | for (i = 0; i < ret; i++) { | ||
209 | struct page *page = pages[i]; | ||
210 | |||
211 | if (page->mapping) { | ||
212 | /* | ||
213 | * That preliminary check is mainly to avoid | ||
214 | * the pointless overhead of lock_page on the | ||
215 | * ZERO_PAGE: which might bounce very badly if | ||
216 | * there is contention. However, we're still | ||
217 | * dirtying its cacheline with get/put_page: | ||
218 | * we'll add another __get_user_pages flag to | ||
219 | * avoid it if that case turns out to matter. | ||
220 | */ | ||
221 | lock_page(page); | ||
222 | /* | ||
223 | * Because we lock page here and migration is | ||
224 | * blocked by the elevated reference, we need | ||
225 | * only check for file-cache page truncation. | ||
226 | */ | ||
227 | if (page->mapping) | ||
228 | mlock_vma_page(page); | ||
229 | unlock_page(page); | ||
230 | } | ||
231 | put_page(page); /* ref from get_user_pages() */ | ||
232 | } | ||
233 | |||
234 | addr += ret * PAGE_SIZE; | ||
235 | nr_pages -= ret; | ||
236 | ret = 0; | ||
237 | } | ||
238 | |||
239 | return ret; /* 0 or negative error code */ | ||
240 | } | 199 | } |
241 | 200 | ||
242 | /* | 201 | /* |
@@ -280,7 +239,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
280 | is_vm_hugetlb_page(vma) || | 239 | is_vm_hugetlb_page(vma) || |
281 | vma == get_gate_vma(current))) { | 240 | vma == get_gate_vma(current))) { |
282 | 241 | ||
283 | __mlock_vma_pages_range(vma, start, end); | 242 | __mlock_vma_pages_range(vma, start, end, NULL); |
284 | 243 | ||
285 | /* Hide errors from mmap() and other callers */ | 244 | /* Hide errors from mmap() and other callers */ |
286 | return 0; | 245 | return 0; |
@@ -372,18 +331,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
372 | int ret = 0; | 331 | int ret = 0; |
373 | int lock = newflags & VM_LOCKED; | 332 | int lock = newflags & VM_LOCKED; |
374 | 333 | ||
375 | if (newflags == vma->vm_flags || | 334 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
376 | (vma->vm_flags & (VM_IO | VM_PFNMAP))) | 335 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) |
377 | goto out; /* don't set VM_LOCKED, don't count */ | 336 | goto out; /* don't set VM_LOCKED, don't count */ |
378 | 337 | ||
379 | if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
380 | is_vm_hugetlb_page(vma) || | ||
381 | vma == get_gate_vma(current)) { | ||
382 | if (lock) | ||
383 | make_pages_present(start, end); | ||
384 | goto out; /* don't set VM_LOCKED, don't count */ | ||
385 | } | ||
386 | |||
387 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 338 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
388 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, | 339 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, |
389 | vma->vm_file, pgoff, vma_policy(vma)); | 340 | vma->vm_file, pgoff, vma_policy(vma)); |
@@ -419,14 +370,10 @@ success: | |||
419 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. | 370 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. |
420 | */ | 371 | */ |
421 | 372 | ||
422 | if (lock) { | 373 | if (lock) |
423 | vma->vm_flags = newflags; | 374 | vma->vm_flags = newflags; |
424 | ret = __mlock_vma_pages_range(vma, start, end); | 375 | else |
425 | if (ret < 0) | ||
426 | ret = __mlock_posix_error_return(ret); | ||
427 | } else { | ||
428 | munlock_vma_pages_range(vma, start, end); | 376 | munlock_vma_pages_range(vma, start, end); |
429 | } | ||
430 | 377 | ||
431 | out: | 378 | out: |
432 | *prev = vma; | 379 | *prev = vma; |
@@ -439,7 +386,8 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
439 | struct vm_area_struct * vma, * prev; | 386 | struct vm_area_struct * vma, * prev; |
440 | int error; | 387 | int error; |
441 | 388 | ||
442 | len = PAGE_ALIGN(len); | 389 | VM_BUG_ON(start & ~PAGE_MASK); |
390 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
443 | end = start + len; | 391 | end = start + len; |
444 | if (end < start) | 392 | if (end < start) |
445 | return -EINVAL; | 393 | return -EINVAL; |
@@ -482,6 +430,62 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
482 | return error; | 430 | return error; |
483 | } | 431 | } |
484 | 432 | ||
433 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | ||
434 | { | ||
435 | struct mm_struct *mm = current->mm; | ||
436 | unsigned long end, nstart, nend; | ||
437 | struct vm_area_struct *vma = NULL; | ||
438 | int locked = 0; | ||
439 | int ret = 0; | ||
440 | |||
441 | VM_BUG_ON(start & ~PAGE_MASK); | ||
442 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
443 | end = start + len; | ||
444 | |||
445 | for (nstart = start; nstart < end; nstart = nend) { | ||
446 | /* | ||
447 | * We want to fault in pages for [nstart; end) address range. | ||
448 | * Find first corresponding VMA. | ||
449 | */ | ||
450 | if (!locked) { | ||
451 | locked = 1; | ||
452 | down_read(&mm->mmap_sem); | ||
453 | vma = find_vma(mm, nstart); | ||
454 | } else if (nstart >= vma->vm_end) | ||
455 | vma = vma->vm_next; | ||
456 | if (!vma || vma->vm_start >= end) | ||
457 | break; | ||
458 | /* | ||
459 | * Set [nstart; nend) to intersection of desired address | ||
460 | * range with the first VMA. Also, skip undesirable VMA types. | ||
461 | */ | ||
462 | nend = min(end, vma->vm_end); | ||
463 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
464 | continue; | ||
465 | if (nstart < vma->vm_start) | ||
466 | nstart = vma->vm_start; | ||
467 | /* | ||
468 | * Now fault in a range of pages. __mlock_vma_pages_range() | ||
469 | * double checks the vma flags, so that it won't mlock pages | ||
470 | * if the vma was already munlocked. | ||
471 | */ | ||
472 | ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); | ||
473 | if (ret < 0) { | ||
474 | if (ignore_errors) { | ||
475 | ret = 0; | ||
476 | continue; /* continue at next VMA */ | ||
477 | } | ||
478 | ret = __mlock_posix_error_return(ret); | ||
479 | break; | ||
480 | } | ||
481 | nend = nstart + ret * PAGE_SIZE; | ||
482 | ret = 0; | ||
483 | } | ||
484 | if (locked) | ||
485 | up_read(&mm->mmap_sem); | ||
486 | return ret; /* 0 or negative error code */ | ||
487 | } | ||
488 | |||
485 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | 489 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) |
486 | { | 490 | { |
487 | unsigned long locked; | 491 | unsigned long locked; |
@@ -507,6 +511,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
507 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 511 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
508 | error = do_mlock(start, len, 1); | 512 | error = do_mlock(start, len, 1); |
509 | up_write(¤t->mm->mmap_sem); | 513 | up_write(¤t->mm->mmap_sem); |
514 | if (!error) | ||
515 | error = do_mlock_pages(start, len, 0); | ||
510 | return error; | 516 | return error; |
511 | } | 517 | } |
512 | 518 | ||
@@ -571,6 +577,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
571 | capable(CAP_IPC_LOCK)) | 577 | capable(CAP_IPC_LOCK)) |
572 | ret = do_mlockall(flags); | 578 | ret = do_mlockall(flags); |
573 | up_write(&current->mm->mmap_sem); | 579 | up_write(&current->mm->mmap_sem); |
580 | if (!ret && (flags & MCL_CURRENT)) { | ||
581 | /* Ignore errors */ | ||
582 | do_mlock_pages(0, TASK_SIZE, 1); | ||
583 | } | ||
574 | out: | 584 | out: |
575 | return ret; | 585 | return ret; |
576 | } | 586 | } |
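Taken together, the mlock.c changes move page population out of mlock_fixup(): mlock() and mlockall(MCL_CURRENT) first update the VMA flags with mmap_sem held for write, then the new do_mlock_pages() walks the range with mmap_sem held only for read and faults pages in through __get_user_pages() using the new FOLL_MLOCK flag (FOLL_FORCE is added so any region with permissions other than PROT_NONE can still be locked). The syscalls themselves behave as before; locking may still require CAP_IPC_LOCK or a sufficient RLIMIT_MEMLOCK:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        static char buf[1 << 20];               /* 1 MiB to pin */

        if (mlock(buf, sizeof(buf))) {          /* may need CAP_IPC_LOCK / ulimit -l */
            perror("mlock");
            return 1;
        }
        memset(buf, 0, sizeof(buf));            /* already resident: no major faults */

        if (mlockall(MCL_CURRENT))              /* pin everything currently mapped */
            perror("mlockall");

        munlockall();
        return 0;
    }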
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/mmu_notifier.h> | 29 | #include <linux/mmu_notifier.h> |
30 | #include <linux/perf_event.h> | 30 | #include <linux/perf_event.h> |
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | ||
32 | 33 | ||
33 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
34 | #include <asm/cacheflush.h> | 35 | #include <asm/cacheflush.h> |
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
253 | down_write(&mm->mmap_sem); | 254 | down_write(&mm->mmap_sem); |
254 | 255 | ||
255 | #ifdef CONFIG_COMPAT_BRK | 256 | #ifdef CONFIG_COMPAT_BRK |
256 | min_brk = mm->end_code; | 257 | /* |
258 | * CONFIG_COMPAT_BRK can still be overridden by setting | ||
259 | * randomize_va_space to 2, which will still cause mm->start_brk | ||
260 | * to be arbitrarily shifted | ||
261 | */ | ||
262 | if (mm->start_brk > PAGE_ALIGN(mm->end_data)) | ||
263 | min_brk = mm->start_brk; | ||
264 | else | ||
265 | min_brk = mm->end_data; | ||
257 | #else | 266 | #else |
258 | min_brk = mm->start_brk; | 267 | min_brk = mm->start_brk; |
259 | #endif | 268 | #endif |
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
588 | } | 597 | } |
589 | } | 598 | } |
590 | 599 | ||
600 | vma_adjust_trans_huge(vma, start, end, adjust_next); | ||
601 | |||
591 | /* | 602 | /* |
592 | * When changing only vma->vm_end, we don't really need anon_vma | 603 | * When changing only vma->vm_end, we don't really need anon_vma |
593 | * lock. This is a fairly rare case by itself, but the anon_vma | 604 | * lock. This is a fairly rare case by itself, but the anon_vma |
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
815 | end, prev->vm_pgoff, NULL); | 826 | end, prev->vm_pgoff, NULL); |
816 | if (err) | 827 | if (err) |
817 | return NULL; | 828 | return NULL; |
829 | khugepaged_enter_vma_merge(prev); | ||
818 | return prev; | 830 | return prev; |
819 | } | 831 | } |
820 | 832 | ||
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
833 | next->vm_pgoff - pglen, NULL); | 845 | next->vm_pgoff - pglen, NULL); |
834 | if (err) | 846 | if (err) |
835 | return NULL; | 847 | return NULL; |
848 | khugepaged_enter_vma_merge(area); | ||
836 | return area; | 849 | return area; |
837 | } | 850 | } |
838 | 851 | ||
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1761 | } | 1774 | } |
1762 | } | 1775 | } |
1763 | vma_unlock_anon_vma(vma); | 1776 | vma_unlock_anon_vma(vma); |
1777 | khugepaged_enter_vma_merge(vma); | ||
1764 | return error; | 1778 | return error; |
1765 | } | 1779 | } |
1766 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1780 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma, | |||
1808 | } | 1822 | } |
1809 | } | 1823 | } |
1810 | vma_unlock_anon_vma(vma); | 1824 | vma_unlock_anon_vma(vma); |
1825 | khugepaged_enter_vma_merge(vma); | ||
1811 | return error; | 1826 | return error; |
1812 | } | 1827 | } |
1813 | 1828 | ||
@@ -2462,6 +2477,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2462 | unsigned long addr, unsigned long len, | 2477 | unsigned long addr, unsigned long len, |
2463 | unsigned long vm_flags, struct page **pages) | 2478 | unsigned long vm_flags, struct page **pages) |
2464 | { | 2479 | { |
2480 | int ret; | ||
2465 | struct vm_area_struct *vma; | 2481 | struct vm_area_struct *vma; |
2466 | 2482 | ||
2467 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2483 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
@@ -2479,16 +2495,23 @@ int install_special_mapping(struct mm_struct *mm, | |||
2479 | vma->vm_ops = &special_mapping_vmops; | 2495 | vma->vm_ops = &special_mapping_vmops; |
2480 | vma->vm_private_data = pages; | 2496 | vma->vm_private_data = pages; |
2481 | 2497 | ||
2482 | if (unlikely(insert_vm_struct(mm, vma))) { | 2498 | ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); |
2483 | kmem_cache_free(vm_area_cachep, vma); | 2499 | if (ret) |
2484 | return -ENOMEM; | 2500 | goto out; |
2485 | } | 2501 | |
2502 | ret = insert_vm_struct(mm, vma); | ||
2503 | if (ret) | ||
2504 | goto out; | ||
2486 | 2505 | ||
2487 | mm->total_vm += len >> PAGE_SHIFT; | 2506 | mm->total_vm += len >> PAGE_SHIFT; |
2488 | 2507 | ||
2489 | perf_event_mmap(vma); | 2508 | perf_event_mmap(vma); |
2490 | 2509 | ||
2491 | return 0; | 2510 | return 0; |
2511 | |||
2512 | out: | ||
2513 | kmem_cache_free(vm_area_cachep, vma); | ||
2514 | return ret; | ||
2492 | } | 2515 | } |
2493 | 2516 | ||
2494 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2517 | static DEFINE_MUTEX(mm_all_locks_mutex); |
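
Editor's note: install_special_mapping() gains a security_file_mmap() check and switches from a single inline failure branch to a shared "out:" label, so every failure after the VMA allocation frees it the same way. A generic sketch of that goto-based cleanup idiom; the step names below are hypothetical stand-ins for the security check and insert_vm_struct().

#include <stdlib.h>

/* Hypothetical fallible steps standing in for security_file_mmap() and
 * insert_vm_struct(); each returns 0 on success or a negative errno. */
static int check_policy(void *obj)    { return obj ? 0 : -1; }
static int register_object(void *obj) { return obj ? 0 : -1; }

static int install_object(void **out)
{
	int ret;
	void *obj = calloc(1, 64);

	if (!obj)
		return -12; /* -ENOMEM */

	ret = check_policy(obj);
	if (ret)
		goto out;

	ret = register_object(obj);
	if (ret)
		goto out;

	*out = obj;          /* success: ownership passes to the caller */
	return 0;

out:
	free(obj);           /* single cleanup path for every failure */
	return ret;
}

int main(void)
{
	void *obj = NULL;
	return install_object(&obj);
}
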
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f..8d032de4088 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
100 | return young; | 100 | return young; |
101 | } | 101 | } |
102 | 102 | ||
103 | int __mmu_notifier_test_young(struct mm_struct *mm, | ||
104 | unsigned long address) | ||
105 | { | ||
106 | struct mmu_notifier *mn; | ||
107 | struct hlist_node *n; | ||
108 | int young = 0; | ||
109 | |||
110 | rcu_read_lock(); | ||
111 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
112 | if (mn->ops->test_young) { | ||
113 | young = mn->ops->test_young(mn, mm, address); | ||
114 | if (young) | ||
115 | break; | ||
116 | } | ||
117 | } | ||
118 | rcu_read_unlock(); | ||
119 | |||
120 | return young; | ||
121 | } | ||
122 | |||
103 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | 123 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, |
104 | pte_t pte) | 124 | pte_t pte) |
105 | { | 125 | { |
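
Editor's note: the new __mmu_notifier_test_young() walks the registered notifiers under rcu_read_lock() and stops at the first one whose ->test_young() reports the address as recently referenced; unlike the clear_flush variant it only samples the young state. A small user-space sketch of that "iterate callbacks, break on first hit" shape, with the RCU list replaced by a plain array and the locking omitted.

#include <stdio.h>

struct notifier {
	/* stand-in for mmu_notifier_ops->test_young */
	int (*test_young)(unsigned long address);
};

static int never_young(unsigned long address)  { (void)address; return 0; }
static int always_young(unsigned long address) { (void)address; return 1; }

static int test_young(const struct notifier *list, int n, unsigned long address)
{
	int young = 0;

	for (int i = 0; i < n; i++) {
		if (!list[i].test_young)
			continue;
		young = list[i].test_young(address);
		if (young)
			break;      /* one positive answer is enough */
	}
	return young;
}

int main(void)
{
	struct notifier list[] = { { never_young }, { always_young } };

	printf("young: %d\n", test_young(list, 2, 0x1000));
	return 0;
}
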
diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c85..f5b7d176021 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, | |||
87 | return 1; | 87 | return 1; |
88 | } | 88 | } |
89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
90 | |||
91 | #ifdef CONFIG_SMP | ||
92 | /* Called when a more accurate view of NR_FREE_PAGES is needed */ | ||
93 | unsigned long zone_nr_free_pages(struct zone *zone) | ||
94 | { | ||
95 | unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
96 | |||
97 | /* | ||
98 | * While kswapd is awake, it is considered the zone is under some | ||
99 | * memory pressure. Under pressure, there is a risk that | ||
100 | * per-cpu-counter-drift will allow the min watermark to be breached | ||
101 | * potentially causing a live-lock. While kswapd is awake and | ||
102 | * free pages are low, get a better estimate for free pages | ||
103 | */ | ||
104 | if (nr_free_pages < zone->percpu_drift_mark && | ||
105 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
106 | return zone_page_state_snapshot(zone, NR_FREE_PAGES); | ||
107 | |||
108 | return nr_free_pages; | ||
109 | } | ||
110 | #endif /* CONFIG_SMP */ | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c513387309..5a688a2756b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
78 | pte_unmap_unlock(pte - 1, ptl); | 78 | pte_unmap_unlock(pte - 1, ptl); |
79 | } | 79 | } |
80 | 80 | ||
81 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 82 | unsigned long addr, unsigned long end, pgprot_t newprot, |
83 | int dirty_accountable) | 83 | int dirty_accountable) |
84 | { | 84 | { |
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
88 | pmd = pmd_offset(pud, addr); | 88 | pmd = pmd_offset(pud, addr); |
89 | do { | 89 | do { |
90 | next = pmd_addr_end(addr, end); | 90 | next = pmd_addr_end(addr, end); |
91 | if (pmd_trans_huge(*pmd)) { | ||
92 | if (next - addr != HPAGE_PMD_SIZE) | ||
93 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | ||
95 | continue; | ||
96 | /* fall through */ | ||
97 | } | ||
91 | if (pmd_none_or_clear_bad(pmd)) | 98 | if (pmd_none_or_clear_bad(pmd)) |
92 | continue; | 99 | continue; |
93 | change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); | 100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, |
101 | dirty_accountable); | ||
94 | } while (pmd++, addr = next, addr != end); | 102 | } while (pmd++, addr = next, addr != end); |
95 | } | 103 | } |
96 | 104 | ||
97 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | 105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
98 | unsigned long addr, unsigned long end, pgprot_t newprot, | 106 | unsigned long addr, unsigned long end, pgprot_t newprot, |
99 | int dirty_accountable) | 107 | int dirty_accountable) |
100 | { | 108 | { |
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
106 | next = pud_addr_end(addr, end); | 114 | next = pud_addr_end(addr, end); |
107 | if (pud_none_or_clear_bad(pud)) | 115 | if (pud_none_or_clear_bad(pud)) |
108 | continue; | 116 | continue; |
109 | change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); | 117 | change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | ||
110 | } while (pud++, addr = next, addr != end); | 119 | } while (pud++, addr = next, addr != end); |
111 | } | 120 | } |
112 | 121 | ||
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, | |||
126 | next = pgd_addr_end(addr, end); | 135 | next = pgd_addr_end(addr, end); |
127 | if (pgd_none_or_clear_bad(pgd)) | 136 | if (pgd_none_or_clear_bad(pgd)) |
128 | continue; | 137 | continue; |
129 | change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); | 138 | change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | ||
130 | } while (pgd++, addr = next, addr != end); | 140 | } while (pgd++, addr = next, addr != end); |
131 | flush_tlb_range(vma, start, end); | 141 | flush_tlb_range(vma, start, end); |
132 | } | 142 | } |
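
Editor's note: change_pmd_range() now inspects each PMD for a transparent huge mapping first. If the mprotect() range does not cover the whole huge page, the page is split back to PTEs and the normal PTE loop runs; if it does, change_huge_pmd() updates the protection in one step and the PTE loop is skipped. A user-space sketch of that decision over a PMD-stepped range; the constants and the "every PMD is huge" assumption are stand-ins, not kernel behaviour.

#include <stdio.h>

#define PMD_SIZE   (2UL << 20)                  /* assume 2 MiB PMD mappings */
#define PMD_MASK   (~(PMD_SIZE - 1))

/* Equivalent of pmd_addr_end(): end of the current PMD, clamped to 'end'. */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;
	return boundary < end ? boundary : end;
}

static void change_range(unsigned long addr, unsigned long end)
{
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		int huge = 1;                    /* pretend every PMD is huge here */

		if (huge) {
			if (next - addr != PMD_SIZE)
				printf("%#lx-%#lx: split huge page, fall back to PTEs\n",
				       addr, next);
			else
				printf("%#lx-%#lx: change protection on whole huge PMD\n",
				       addr, next);
		}
	} while ((addr = next) != end);
}

int main(void)
{
	/* Partially covers the first PMD, fully covers the next two. */
	change_range(0x100000, 0x600000);
	return 0;
}
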
diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293..9925b6391b8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | |||
41 | return NULL; | 41 | return NULL; |
42 | 42 | ||
43 | pmd = pmd_offset(pud, addr); | 43 | pmd = pmd_offset(pud, addr); |
44 | split_huge_page_pmd(mm, pmd); | ||
44 | if (pmd_none_or_clear_bad(pmd)) | 45 | if (pmd_none_or_clear_bad(pmd)) |
45 | return NULL; | 46 | return NULL; |
46 | 47 | ||
47 | return pmd; | 48 | return pmd; |
48 | } | 49 | } |
49 | 50 | ||
50 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) | 51 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, |
52 | unsigned long addr) | ||
51 | { | 53 | { |
52 | pgd_t *pgd; | 54 | pgd_t *pgd; |
53 | pud_t *pud; | 55 | pud_t *pud; |
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) | |||
62 | if (!pmd) | 64 | if (!pmd) |
63 | return NULL; | 65 | return NULL; |
64 | 66 | ||
65 | if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) | 67 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
68 | if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) | ||
66 | return NULL; | 69 | return NULL; |
67 | 70 | ||
68 | return pmd; | 71 | return pmd; |
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
147 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | 150 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); |
148 | if (!old_pmd) | 151 | if (!old_pmd) |
149 | continue; | 152 | continue; |
150 | new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); | 153 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); |
151 | if (!new_pmd) | 154 | if (!new_pmd) |
152 | break; | 155 | break; |
153 | next = (new_addr + PMD_SIZE) & PMD_MASK; | 156 | next = (new_addr + PMD_SIZE) & PMD_MASK; |
diff --git a/mm/nommu.c b/mm/nommu.c index 3613517c759..f59e1424d3d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> | 10 | * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> |
11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> | 11 | * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> |
12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> | 12 | * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> |
13 | * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> | 13 | * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) | |||
127 | 127 | ||
128 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 128 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
129 | unsigned long start, int nr_pages, unsigned int foll_flags, | 129 | unsigned long start, int nr_pages, unsigned int foll_flags, |
130 | struct page **pages, struct vm_area_struct **vmas) | 130 | struct page **pages, struct vm_area_struct **vmas, |
131 | int *retry) | ||
131 | { | 132 | { |
132 | struct vm_area_struct *vma; | 133 | struct vm_area_struct *vma; |
133 | unsigned long vm_flags; | 134 | unsigned long vm_flags; |
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
185 | if (force) | 186 | if (force) |
186 | flags |= FOLL_FORCE; | 187 | flags |= FOLL_FORCE; |
187 | 188 | ||
188 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 189 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
190 | NULL); | ||
189 | } | 191 | } |
190 | EXPORT_SYMBOL(get_user_pages); | 192 | EXPORT_SYMBOL(get_user_pages); |
191 | 193 | ||
@@ -328,6 +330,7 @@ void *vmalloc_node(unsigned long size, int node) | |||
328 | { | 330 | { |
329 | return vmalloc(size); | 331 | return vmalloc(size); |
330 | } | 332 | } |
333 | EXPORT_SYMBOL(vmalloc_node); | ||
331 | 334 | ||
332 | /** | 335 | /** |
333 | * vzalloc_node - allocate memory on a specific node with zero fill | 336 | * vzalloc_node - allocate memory on a specific node with zero fill |
@@ -440,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void) | |||
440 | { | 443 | { |
441 | } | 444 | } |
442 | 445 | ||
446 | /** | ||
447 | * alloc_vm_area - allocate a range of kernel address space | ||
448 | * @size: size of the area | ||
449 | * | ||
450 | * Returns: NULL on failure, vm_struct on success | ||
451 | * | ||
452 | * This function reserves a range of kernel address space, and | ||
453 | * allocates pagetables to map that range. No actual mappings | ||
454 | * are created. If the kernel address space is not shared | ||
455 | * between processes, it syncs the pagetable across all | ||
456 | * processes. | ||
457 | */ | ||
458 | struct vm_struct *alloc_vm_area(size_t size) | ||
459 | { | ||
460 | BUG(); | ||
461 | return NULL; | ||
462 | } | ||
463 | EXPORT_SYMBOL_GPL(alloc_vm_area); | ||
464 | |||
465 | void free_vm_area(struct vm_struct *area) | ||
466 | { | ||
467 | BUG(); | ||
468 | } | ||
469 | EXPORT_SYMBOL_GPL(free_vm_area); | ||
470 | |||
443 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 471 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
444 | struct page *page) | 472 | struct page *page) |
445 | { | 473 | { |
@@ -1717,6 +1745,7 @@ void exit_mmap(struct mm_struct *mm) | |||
1717 | mm->mmap = vma->vm_next; | 1745 | mm->mmap = vma->vm_next; |
1718 | delete_vma_from_mm(vma); | 1746 | delete_vma_from_mm(vma); |
1719 | delete_vma(mm, vma); | 1747 | delete_vma(mm, vma); |
1748 | cond_resched(); | ||
1720 | } | 1749 | } |
1721 | 1750 | ||
1722 | kleave(""); | 1751 | kleave(""); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b840afa8976..2cb01f6ec5d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void) | |||
404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | 404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes |
405 | * - vm.dirty_ratio or vm.dirty_bytes | 405 | * - vm.dirty_ratio or vm.dirty_bytes |
406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | 406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
407 | * runtime tasks. | 407 | * real-time tasks. |
408 | */ | 408 | */ |
409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | 409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) |
410 | { | 410 | { |
411 | unsigned long background; | 411 | unsigned long background; |
412 | unsigned long dirty; | 412 | unsigned long dirty; |
413 | unsigned long available_memory = determine_dirtyable_memory(); | 413 | unsigned long uninitialized_var(available_memory); |
414 | struct task_struct *tsk; | 414 | struct task_struct *tsk; |
415 | 415 | ||
416 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
417 | available_memory = determine_dirtyable_memory(); | ||
418 | |||
416 | if (vm_dirty_bytes) | 419 | if (vm_dirty_bytes) |
417 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | 420 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
418 | else | 421 | else |
@@ -563,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
563 | break; /* We've done our duty */ | 566 | break; /* We've done our duty */ |
564 | } | 567 | } |
565 | trace_wbc_balance_dirty_wait(&wbc, bdi); | 568 | trace_wbc_balance_dirty_wait(&wbc, bdi); |
566 | __set_current_state(TASK_INTERRUPTIBLE); | 569 | __set_current_state(TASK_UNINTERRUPTIBLE); |
567 | io_schedule_timeout(pause); | 570 | io_schedule_timeout(pause); |
568 | 571 | ||
569 | /* | 572 | /* |
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page); | |||
1103 | int __set_page_dirty_no_writeback(struct page *page) | 1106 | int __set_page_dirty_no_writeback(struct page *page) |
1104 | { | 1107 | { |
1105 | if (!PageDirty(page)) | 1108 | if (!PageDirty(page)) |
1106 | SetPageDirty(page); | 1109 | return !TestSetPageDirty(page); |
1107 | return 0; | 1110 | return 0; |
1108 | } | 1111 | } |
1109 | 1112 | ||
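
Editor's note: global_dirty_limits() now defers the relatively expensive determine_dirtyable_memory() call until a ratio-based limit actually needs it, i.e. whenever one of the byte-based sysctls is unset. A user-space sketch of the two-way computation; the sysctl names are modelled as plain variables, and the 25% boost for PF_LESS_THROTTLE and real-time tasks is left out.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Tunables, normally /proc/sys/vm/dirty_*; one of each pair is zero. */
static unsigned long vm_dirty_bytes = 0;
static unsigned long vm_dirty_ratio = 20;
static unsigned long dirty_background_bytes = 0;
static unsigned long dirty_background_ratio = 10;

/* Stand-in for determine_dirtyable_memory(): pages available for dirtying. */
static unsigned long dirtyable_pages(void)
{
	return 4UL << 20;     /* pretend 16 GiB of dirtyable memory */
}

static void dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	unsigned long available = 0;

	/* Only look up available memory when a ratio-based limit needs it. */
	if (!vm_dirty_bytes || !dirty_background_bytes)
		available = dirtyable_pages();

	*pdirty = vm_dirty_bytes ? DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE)
				 : vm_dirty_ratio * available / 100;
	*pbackground = dirty_background_bytes
			? DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE)
			: dirty_background_ratio * available / 100;

	if (*pbackground >= *pdirty)
		*pbackground = *pdirty / 2;   /* background must trip first */
}

int main(void)
{
	unsigned long background, dirty;

	dirty_limits(&background, &dirty);
	printf("background: %lu pages, dirty: %lu pages\n", background, dirty);
	return 0;
}
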
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19413bfdef9..887ce3bd823 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | |||
104 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 104 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
105 | * guaranteed not to run in parallel with that modification). | 105 | * guaranteed not to run in parallel with that modification). |
106 | */ | 106 | */ |
107 | void set_gfp_allowed_mask(gfp_t mask) | 107 | |
108 | static gfp_t saved_gfp_mask; | ||
109 | |||
110 | void pm_restore_gfp_mask(void) | ||
108 | { | 111 | { |
109 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 112 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
110 | gfp_allowed_mask = mask; | 113 | if (saved_gfp_mask) { |
114 | gfp_allowed_mask = saved_gfp_mask; | ||
115 | saved_gfp_mask = 0; | ||
116 | } | ||
111 | } | 117 | } |
112 | 118 | ||
113 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | 119 | void pm_restrict_gfp_mask(void) |
114 | { | 120 | { |
115 | gfp_t ret = gfp_allowed_mask; | ||
116 | |||
117 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 121 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
118 | gfp_allowed_mask &= ~mask; | 122 | WARN_ON(saved_gfp_mask); |
119 | return ret; | 123 | saved_gfp_mask = gfp_allowed_mask; |
124 | gfp_allowed_mask &= ~GFP_IOFS; | ||
120 | } | 125 | } |
121 | #endif /* CONFIG_PM_SLEEP */ | 126 | #endif /* CONFIG_PM_SLEEP */ |
122 | 127 | ||
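
Editor's note: set_gfp_allowed_mask()/clear_gfp_allowed_mask() are replaced by pm_restrict_gfp_mask()/pm_restore_gfp_mask(), which save the current gfp_allowed_mask, strip the IO/FS bits for the duration of suspend, and restore the saved value afterwards, so callers can no longer pass an arbitrary mask. A sketch of that save/restore pattern with made-up flag values; the real __GFP_IO/__GFP_FS bits differ.

#include <stdio.h>

/* Stand-in flag bits; the real __GFP_IO/__GFP_FS values are different. */
#define FLAG_IO    0x1u
#define FLAG_FS    0x2u
#define FLAG_WAIT  0x4u
#define FLAG_IOFS  (FLAG_IO | FLAG_FS)

static unsigned int allowed_mask = FLAG_IO | FLAG_FS | FLAG_WAIT;
static unsigned int saved_mask;

static void restrict_mask(void)
{
	/* Nesting is not expected; a second call would lose the saved value. */
	saved_mask = allowed_mask;
	allowed_mask &= ~FLAG_IOFS;
}

static void restore_mask(void)
{
	if (saved_mask) {
		allowed_mask = saved_mask;
		saved_mask = 0;
	}
}

int main(void)
{
	restrict_mask();
	printf("during suspend: %#x\n", allowed_mask);   /* IO/FS cleared */
	restore_mask();
	printf("after resume:   %#x\n", allowed_mask);   /* original mask back */
	return 0;
}
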
@@ -352,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
352 | } | 357 | } |
353 | } | 358 | } |
354 | 359 | ||
360 | /* update __split_huge_page_refcount if you change this function */ | ||
355 | static int destroy_compound_page(struct page *page, unsigned long order) | 361 | static int destroy_compound_page(struct page *page, unsigned long order) |
356 | { | 362 | { |
357 | int i; | 363 | int i; |
@@ -421,18 +427,10 @@ static inline void rmv_page_order(struct page *page) | |||
421 | * | 427 | * |
422 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 428 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
423 | */ | 429 | */ |
424 | static inline struct page * | ||
425 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | ||
426 | { | ||
427 | unsigned long buddy_idx = page_idx ^ (1 << order); | ||
428 | |||
429 | return page + (buddy_idx - page_idx); | ||
430 | } | ||
431 | |||
432 | static inline unsigned long | 430 | static inline unsigned long |
433 | __find_combined_index(unsigned long page_idx, unsigned int order) | 431 | __find_buddy_index(unsigned long page_idx, unsigned int order) |
434 | { | 432 | { |
435 | return (page_idx & ~(1 << order)); | 433 | return page_idx ^ (1 << order); |
436 | } | 434 | } |
437 | 435 | ||
438 | /* | 436 | /* |
@@ -443,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
443 | * (c) a page and its buddy have the same order && | 441 | * (c) a page and its buddy have the same order && |
444 | * (d) a page and its buddy are in the same zone. | 442 | * (d) a page and its buddy are in the same zone. |
445 | * | 443 | * |
446 | * For recording whether a page is in the buddy system, we use PG_buddy. | 444 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. |
447 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 445 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. |
448 | * | 446 | * |
449 | * For recording page's order, we use page_private(page). | 447 | * For recording page's order, we use page_private(page). |
450 | */ | 448 | */ |
@@ -477,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
477 | * as necessary, plus some accounting needed to play nicely with other | 475 | * as necessary, plus some accounting needed to play nicely with other |
478 | * parts of the VM system. | 476 | * parts of the VM system. |
479 | * At each level, we keep a list of pages, which are heads of continuous | 477 | * At each level, we keep a list of pages, which are heads of continuous |
480 | * free pages of length of (1 << order) and marked with PG_buddy. Page's | 478 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
481 | * order is recorded in page_private(page) field. | 479 | * order is recorded in page_private(page) field. |
482 | * So when we are allocating or freeing one, we can derive the state of the | 480 | * So when we are allocating or freeing one, we can derive the state of the |
483 | * other. That is, if we allocate a small block, and both were | 481 | * other. That is, if we allocate a small block, and both were |
@@ -494,6 +492,7 @@ static inline void __free_one_page(struct page *page, | |||
494 | { | 492 | { |
495 | unsigned long page_idx; | 493 | unsigned long page_idx; |
496 | unsigned long combined_idx; | 494 | unsigned long combined_idx; |
495 | unsigned long uninitialized_var(buddy_idx); | ||
497 | struct page *buddy; | 496 | struct page *buddy; |
498 | 497 | ||
499 | if (unlikely(PageCompound(page))) | 498 | if (unlikely(PageCompound(page))) |
@@ -508,7 +507,8 @@ static inline void __free_one_page(struct page *page, | |||
508 | VM_BUG_ON(bad_range(zone, page)); | 507 | VM_BUG_ON(bad_range(zone, page)); |
509 | 508 | ||
510 | while (order < MAX_ORDER-1) { | 509 | while (order < MAX_ORDER-1) { |
511 | buddy = __page_find_buddy(page, page_idx, order); | 510 | buddy_idx = __find_buddy_index(page_idx, order); |
511 | buddy = page + (buddy_idx - page_idx); | ||
512 | if (!page_is_buddy(page, buddy, order)) | 512 | if (!page_is_buddy(page, buddy, order)) |
513 | break; | 513 | break; |
514 | 514 | ||
@@ -516,7 +516,7 @@ static inline void __free_one_page(struct page *page, | |||
516 | list_del(&buddy->lru); | 516 | list_del(&buddy->lru); |
517 | zone->free_area[order].nr_free--; | 517 | zone->free_area[order].nr_free--; |
518 | rmv_page_order(buddy); | 518 | rmv_page_order(buddy); |
519 | combined_idx = __find_combined_index(page_idx, order); | 519 | combined_idx = buddy_idx & page_idx; |
520 | page = page + (combined_idx - page_idx); | 520 | page = page + (combined_idx - page_idx); |
521 | page_idx = combined_idx; | 521 | page_idx = combined_idx; |
522 | order++; | 522 | order++; |
@@ -533,9 +533,10 @@ static inline void __free_one_page(struct page *page, | |||
533 | */ | 533 | */ |
534 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { | 534 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
535 | struct page *higher_page, *higher_buddy; | 535 | struct page *higher_page, *higher_buddy; |
536 | combined_idx = __find_combined_index(page_idx, order); | 536 | combined_idx = buddy_idx & page_idx; |
537 | higher_page = page + combined_idx - page_idx; | 537 | higher_page = page + (combined_idx - page_idx); |
538 | higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); | 538 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
539 | higher_buddy = page + (buddy_idx - combined_idx); | ||
539 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 540 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
540 | list_add_tail(&page->lru, | 541 | list_add_tail(&page->lru, |
541 | &zone->free_area[order].free_list[migratetype]); | 542 | &zone->free_area[order].free_list[migratetype]); |
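
Editor's note: the old __page_find_buddy()/__find_combined_index() pair is folded into a single index helper. The buddy of a block at page_idx for a given order is found by flipping bit 'order' (XOR), and the merged block starts at the lower of the two indexes, which is simply their bitwise AND since they differ in exactly that one bit. A standalone demonstration of the arithmetic.

#include <stdio.h>

/* Buddy of the block starting at page_idx for the given order. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;          /* block of 2^order pages at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);
	unsigned long combined_idx = buddy_idx & page_idx;

	/* The two indexes differ only in bit 'order', so AND picks the lower one,
	 * which is where the merged order+1 block starts. */
	printf("page %lu, buddy %lu, merged block starts at %lu (order %u)\n",
	       page_idx, buddy_idx, combined_idx, order + 1);
	return 0;
}
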
@@ -646,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
646 | trace_mm_page_free_direct(page, order); | 647 | trace_mm_page_free_direct(page, order); |
647 | kmemcheck_free_shadow(page, order); | 648 | kmemcheck_free_shadow(page, order); |
648 | 649 | ||
649 | for (i = 0; i < (1 << order); i++) { | 650 | if (PageAnon(page)) |
650 | struct page *pg = page + i; | 651 | page->mapping = NULL; |
651 | 652 | for (i = 0; i < (1 << order); i++) | |
652 | if (PageAnon(pg)) | 653 | bad += free_pages_check(page + i); |
653 | pg->mapping = NULL; | ||
654 | bad += free_pages_check(pg); | ||
655 | } | ||
656 | if (bad) | 654 | if (bad) |
657 | return false; | 655 | return false; |
658 | 656 | ||
@@ -1090,8 +1088,10 @@ static void drain_pages(unsigned int cpu) | |||
1090 | pset = per_cpu_ptr(zone->pageset, cpu); | 1088 | pset = per_cpu_ptr(zone->pageset, cpu); |
1091 | 1089 | ||
1092 | pcp = &pset->pcp; | 1090 | pcp = &pset->pcp; |
1093 | free_pcppages_bulk(zone, pcp->count, pcp); | 1091 | if (pcp->count) { |
1094 | pcp->count = 0; | 1092 | free_pcppages_bulk(zone, pcp->count, pcp); |
1093 | pcp->count = 0; | ||
1094 | } | ||
1095 | local_irq_restore(flags); | 1095 | local_irq_restore(flags); |
1096 | } | 1096 | } |
1097 | } | 1097 | } |
@@ -1455,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1455 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1455 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1456 | 1456 | ||
1457 | /* | 1457 | /* |
1458 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1458 | * Return true if free pages are above 'mark'. This takes into account the order |
1459 | * of the allocation. | 1459 | * of the allocation. |
1460 | */ | 1460 | */ |
1461 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1461 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1462 | int classzone_idx, int alloc_flags) | 1462 | int classzone_idx, int alloc_flags, long free_pages) |
1463 | { | 1463 | { |
1464 | /* free_pages my go negative - that's OK */ | 1464 | /* free_pages my go negative - that's OK */ |
1465 | long min = mark; | 1465 | long min = mark; |
1466 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; | ||
1467 | int o; | 1466 | int o; |
1468 | 1467 | ||
1468 | free_pages -= (1 << order) + 1; | ||
1469 | if (alloc_flags & ALLOC_HIGH) | 1469 | if (alloc_flags & ALLOC_HIGH) |
1470 | min -= min / 2; | 1470 | min -= min / 2; |
1471 | if (alloc_flags & ALLOC_HARDER) | 1471 | if (alloc_flags & ALLOC_HARDER) |
1472 | min -= min / 4; | 1472 | min -= min / 4; |
1473 | 1473 | ||
1474 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1474 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1475 | return 0; | 1475 | return false; |
1476 | for (o = 0; o < order; o++) { | 1476 | for (o = 0; o < order; o++) { |
1477 | /* At the next order, this order's pages become unavailable */ | 1477 | /* At the next order, this order's pages become unavailable */ |
1478 | free_pages -= z->free_area[o].nr_free << o; | 1478 | free_pages -= z->free_area[o].nr_free << o; |
@@ -1481,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1481 | min >>= 1; | 1481 | min >>= 1; |
1482 | 1482 | ||
1483 | if (free_pages <= min) | 1483 | if (free_pages <= min) |
1484 | return 0; | 1484 | return false; |
1485 | } | 1485 | } |
1486 | return 1; | 1486 | return true; |
1487 | } | ||
1488 | |||
1489 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
1490 | int classzone_idx, int alloc_flags) | ||
1491 | { | ||
1492 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1493 | zone_page_state(z, NR_FREE_PAGES)); | ||
1494 | } | ||
1495 | |||
1496 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | ||
1497 | int classzone_idx, int alloc_flags) | ||
1498 | { | ||
1499 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | ||
1500 | |||
1501 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | ||
1502 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | ||
1503 | |||
1504 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1505 | free_pages); | ||
1487 | } | 1506 | } |
1488 | 1507 | ||
1489 | #ifdef CONFIG_NUMA | 1508 | #ifdef CONFIG_NUMA |
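
Editor's note: zone_watermark_ok() now delegates to a helper that takes the free-page count as a parameter, and the new zone_watermark_ok_safe() feeds it the slower but exact per-cpu snapshot once the cheap NR_FREE_PAGES counter has drifted close to the watermark; this replaces the zone_nr_free_pages() helper removed from mm/mmzone.c earlier in this diff. A user-space sketch of the order-aware check itself, following the arithmetic in the hunk; the lowmem reserve and the ALLOC_HIGH/ALLOC_HARDER adjustments are left out. It also shows why a high-order request can fail even when plenty of order-0 pages are free.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* free_area[o] = number of free blocks of size 2^o pages. */
static bool watermark_ok(const unsigned long *free_area, unsigned int order,
			 long mark)
{
	long min = mark;
	long free_pages = 0;

	for (unsigned int o = 0; o < MAX_ORDER; o++)
		free_pages += (long)(free_area[o] << o);

	free_pages -= (1L << order) + 1;        /* as in the hunk above */
	if (free_pages <= min)
		return false;

	for (unsigned int o = 0; o < order; o++) {
		/* Blocks of this order cannot satisfy the request... */
		free_pages -= (long)(free_area[o] << o);
		/* ...and the bar for the remaining, larger blocks is relaxed. */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	unsigned long free_area[MAX_ORDER] = { 512, 64, 8, 1 }; /* mostly order-0 */

	printf("order-0 ok: %d\n", watermark_ok(free_area, 0, 128));
	printf("order-3 ok: %d\n", watermark_ok(free_area, 3, 128));
	return 0;
}
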
@@ -1788,15 +1807,18 @@ static struct page * | |||
1788 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1807 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1789 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1808 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1790 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1809 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1791 | int migratetype, unsigned long *did_some_progress) | 1810 | int migratetype, unsigned long *did_some_progress, |
1811 | bool sync_migration) | ||
1792 | { | 1812 | { |
1793 | struct page *page; | 1813 | struct page *page; |
1794 | 1814 | ||
1795 | if (!order || compaction_deferred(preferred_zone)) | 1815 | if (!order || compaction_deferred(preferred_zone)) |
1796 | return NULL; | 1816 | return NULL; |
1797 | 1817 | ||
1818 | current->flags |= PF_MEMALLOC; | ||
1798 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1819 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
1799 | nodemask); | 1820 | nodemask, sync_migration); |
1821 | current->flags &= ~PF_MEMALLOC; | ||
1800 | if (*did_some_progress != COMPACT_SKIPPED) { | 1822 | if (*did_some_progress != COMPACT_SKIPPED) { |
1801 | 1823 | ||
1802 | /* Page migration frees to the PCP lists but we want merging */ | 1824 | /* Page migration frees to the PCP lists but we want merging */ |
@@ -1832,7 +1854,8 @@ static inline struct page * | |||
1832 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1854 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1833 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1855 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1834 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1856 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1835 | int migratetype, unsigned long *did_some_progress) | 1857 | int migratetype, unsigned long *did_some_progress, |
1858 | bool sync_migration) | ||
1836 | { | 1859 | { |
1837 | return NULL; | 1860 | return NULL; |
1838 | } | 1861 | } |
@@ -1847,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1847 | { | 1870 | { |
1848 | struct page *page = NULL; | 1871 | struct page *page = NULL; |
1849 | struct reclaim_state reclaim_state; | 1872 | struct reclaim_state reclaim_state; |
1850 | struct task_struct *p = current; | ||
1851 | bool drained = false; | 1873 | bool drained = false; |
1852 | 1874 | ||
1853 | cond_resched(); | 1875 | cond_resched(); |
1854 | 1876 | ||
1855 | /* We now go into synchronous reclaim */ | 1877 | /* We now go into synchronous reclaim */ |
1856 | cpuset_memory_pressure_bump(); | 1878 | cpuset_memory_pressure_bump(); |
1857 | p->flags |= PF_MEMALLOC; | 1879 | current->flags |= PF_MEMALLOC; |
1858 | lockdep_set_current_reclaim_state(gfp_mask); | 1880 | lockdep_set_current_reclaim_state(gfp_mask); |
1859 | reclaim_state.reclaimed_slab = 0; | 1881 | reclaim_state.reclaimed_slab = 0; |
1860 | p->reclaim_state = &reclaim_state; | 1882 | current->reclaim_state = &reclaim_state; |
1861 | 1883 | ||
1862 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 1884 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
1863 | 1885 | ||
1864 | p->reclaim_state = NULL; | 1886 | current->reclaim_state = NULL; |
1865 | lockdep_clear_current_reclaim_state(); | 1887 | lockdep_clear_current_reclaim_state(); |
1866 | p->flags &= ~PF_MEMALLOC; | 1888 | current->flags &= ~PF_MEMALLOC; |
1867 | 1889 | ||
1868 | cond_resched(); | 1890 | cond_resched(); |
1869 | 1891 | ||
@@ -1915,19 +1937,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
1915 | 1937 | ||
1916 | static inline | 1938 | static inline |
1917 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 1939 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, |
1918 | enum zone_type high_zoneidx) | 1940 | enum zone_type high_zoneidx, |
1941 | enum zone_type classzone_idx) | ||
1919 | { | 1942 | { |
1920 | struct zoneref *z; | 1943 | struct zoneref *z; |
1921 | struct zone *zone; | 1944 | struct zone *zone; |
1922 | 1945 | ||
1923 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1946 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1924 | wakeup_kswapd(zone, order); | 1947 | wakeup_kswapd(zone, order, classzone_idx); |
1925 | } | 1948 | } |
1926 | 1949 | ||
1927 | static inline int | 1950 | static inline int |
1928 | gfp_to_alloc_flags(gfp_t gfp_mask) | 1951 | gfp_to_alloc_flags(gfp_t gfp_mask) |
1929 | { | 1952 | { |
1930 | struct task_struct *p = current; | ||
1931 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | 1953 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
1932 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1954 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1933 | 1955 | ||
@@ -1943,18 +1965,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1943 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); | 1965 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
1944 | 1966 | ||
1945 | if (!wait) { | 1967 | if (!wait) { |
1946 | alloc_flags |= ALLOC_HARDER; | 1968 | /* |
1969 | * Not worth trying to allocate harder for | ||
1970 | * __GFP_NOMEMALLOC even if it can't schedule. | ||
1971 | */ | ||
1972 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
1973 | alloc_flags |= ALLOC_HARDER; | ||
1947 | /* | 1974 | /* |
1948 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1975 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1949 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1976 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1950 | */ | 1977 | */ |
1951 | alloc_flags &= ~ALLOC_CPUSET; | 1978 | alloc_flags &= ~ALLOC_CPUSET; |
1952 | } else if (unlikely(rt_task(p)) && !in_interrupt()) | 1979 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
1953 | alloc_flags |= ALLOC_HARDER; | 1980 | alloc_flags |= ALLOC_HARDER; |
1954 | 1981 | ||
1955 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 1982 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
1956 | if (!in_interrupt() && | 1983 | if (!in_interrupt() && |
1957 | ((p->flags & PF_MEMALLOC) || | 1984 | ((current->flags & PF_MEMALLOC) || |
1958 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 1985 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
1959 | alloc_flags |= ALLOC_NO_WATERMARKS; | 1986 | alloc_flags |= ALLOC_NO_WATERMARKS; |
1960 | } | 1987 | } |
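
Editor's note: atomic (!wait) allocations still get ALLOC_HARDER, but no longer when the caller also passed __GFP_NOMEMALLOC, since such callers explicitly asked not to dip into reserves; real-time tasks outside interrupt context keep the harder treatment. A sketch of the flag derivation with stand-in bit values; the in_interrupt() and ALLOC_NO_WATERMARKS parts are omitted.

#include <stdio.h>
#include <stdbool.h>

/* Stand-in flag bits; the kernel's GFP/ALLOC values are different. */
#define GFP_WAIT        0x1u
#define GFP_HIGH        0x2u
#define GFP_NOMEMALLOC  0x4u

#define ALLOC_WMARK_MIN 0x01u
#define ALLOC_HIGH      0x02u
#define ALLOC_HARDER    0x04u
#define ALLOC_CPUSET    0x08u

static unsigned int to_alloc_flags(unsigned int gfp_mask, bool rt_task)
{
	unsigned int flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	if (gfp_mask & GFP_HIGH)
		flags |= ALLOC_HIGH;

	if (!(gfp_mask & GFP_WAIT)) {
		/* Atomic allocation: try harder, unless the caller opted out
		 * of touching reserves with __GFP_NOMEMALLOC. */
		if (!(gfp_mask & GFP_NOMEMALLOC))
			flags |= ALLOC_HARDER;
		flags &= ~ALLOC_CPUSET;      /* don't fail GFP_ATOMIC on cpusets */
	} else if (rt_task) {
		flags |= ALLOC_HARDER;
	}
	return flags;
}

int main(void)
{
	printf("atomic:             %#x\n", to_alloc_flags(GFP_HIGH, false));
	printf("atomic, nomemalloc: %#x\n",
	       to_alloc_flags(GFP_HIGH | GFP_NOMEMALLOC, false));
	return 0;
}
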
@@ -1973,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1973 | int alloc_flags; | 2000 | int alloc_flags; |
1974 | unsigned long pages_reclaimed = 0; | 2001 | unsigned long pages_reclaimed = 0; |
1975 | unsigned long did_some_progress; | 2002 | unsigned long did_some_progress; |
1976 | struct task_struct *p = current; | 2003 | bool sync_migration = false; |
1977 | 2004 | ||
1978 | /* | 2005 | /* |
1979 | * In the slowpath, we sanity check order to avoid ever trying to | 2006 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -1998,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1998 | goto nopage; | 2025 | goto nopage; |
1999 | 2026 | ||
2000 | restart: | 2027 | restart: |
2001 | wake_all_kswapd(order, zonelist, high_zoneidx); | 2028 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2029 | wake_all_kswapd(order, zonelist, high_zoneidx, | ||
2030 | zone_idx(preferred_zone)); | ||
2002 | 2031 | ||
2003 | /* | 2032 | /* |
2004 | * OK, we're below the kswapd watermark and have kicked background | 2033 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2007,6 +2036,14 @@ restart: | |||
2007 | */ | 2036 | */ |
2008 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | 2037 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
2009 | 2038 | ||
2039 | /* | ||
2040 | * Find the true preferred zone if the allocation is unconstrained by | ||
2041 | * cpusets. | ||
2042 | */ | ||
2043 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | ||
2044 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | ||
2045 | &preferred_zone); | ||
2046 | |||
2010 | /* This is the last chance, in general, before the goto nopage. */ | 2047 | /* This is the last chance, in general, before the goto nopage. */ |
2011 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2048 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2012 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2049 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
@@ -2029,21 +2066,26 @@ rebalance: | |||
2029 | goto nopage; | 2066 | goto nopage; |
2030 | 2067 | ||
2031 | /* Avoid recursion of direct reclaim */ | 2068 | /* Avoid recursion of direct reclaim */ |
2032 | if (p->flags & PF_MEMALLOC) | 2069 | if (current->flags & PF_MEMALLOC) |
2033 | goto nopage; | 2070 | goto nopage; |
2034 | 2071 | ||
2035 | /* Avoid allocations with no watermarks from looping endlessly */ | 2072 | /* Avoid allocations with no watermarks from looping endlessly */ |
2036 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2073 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
2037 | goto nopage; | 2074 | goto nopage; |
2038 | 2075 | ||
2039 | /* Try direct compaction */ | 2076 | /* |
2077 | * Try direct compaction. The first pass is asynchronous. Subsequent | ||
2078 | * attempts after direct reclaim are synchronous | ||
2079 | */ | ||
2040 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2080 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2041 | zonelist, high_zoneidx, | 2081 | zonelist, high_zoneidx, |
2042 | nodemask, | 2082 | nodemask, |
2043 | alloc_flags, preferred_zone, | 2083 | alloc_flags, preferred_zone, |
2044 | migratetype, &did_some_progress); | 2084 | migratetype, &did_some_progress, |
2085 | sync_migration); | ||
2045 | if (page) | 2086 | if (page) |
2046 | goto got_pg; | 2087 | goto got_pg; |
2088 | sync_migration = true; | ||
2047 | 2089 | ||
2048 | /* Try direct reclaim and then allocating */ | 2090 | /* Try direct reclaim and then allocating */ |
2049 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2091 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
@@ -2097,13 +2139,27 @@ rebalance: | |||
2097 | /* Wait for some write requests to complete then retry */ | 2139 | /* Wait for some write requests to complete then retry */ |
2098 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2140 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2099 | goto rebalance; | 2141 | goto rebalance; |
2142 | } else { | ||
2143 | /* | ||
2144 | * High-order allocations do not necessarily loop after | ||
2145 | * direct reclaim and reclaim/compaction depends on compaction | ||
2146 | * being called after reclaim so call directly if necessary | ||
2147 | */ | ||
2148 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
2149 | zonelist, high_zoneidx, | ||
2150 | nodemask, | ||
2151 | alloc_flags, preferred_zone, | ||
2152 | migratetype, &did_some_progress, | ||
2153 | sync_migration); | ||
2154 | if (page) | ||
2155 | goto got_pg; | ||
2100 | } | 2156 | } |
2101 | 2157 | ||
2102 | nopage: | 2158 | nopage: |
2103 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 2159 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
2104 | printk(KERN_WARNING "%s: page allocation failure." | 2160 | printk(KERN_WARNING "%s: page allocation failure." |
2105 | " order:%d, mode:0x%x\n", | 2161 | " order:%d, mode:0x%x\n", |
2106 | p->comm, order, gfp_mask); | 2162 | current->comm, order, gfp_mask); |
2107 | dump_stack(); | 2163 | dump_stack(); |
2108 | show_mem(); | 2164 | show_mem(); |
2109 | } | 2165 | } |
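
Editor's note: the slowpath now runs direct compaction twice: an asynchronous attempt before direct reclaim (after which sync_migration becomes true), and, when the allocation is not going to loop again, a synchronous attempt after reclaim so reclaim/compaction can still produce a high-order page. A stripped-down control-flow sketch with stub functions standing in for the real allocator stages; it only illustrates the ordering, not the actual retry heuristics.

#include <stdio.h>
#include <stdbool.h>

/* Stubs standing in for the real allocator stages. */
static bool compact_and_alloc(bool sync)
{
	printf("  compaction (%s)\n", sync ? "sync" : "async");
	return false;                 /* pretend compaction alone never suffices */
}

static bool reclaim_and_alloc(void)
{
	printf("  direct reclaim\n");
	return false;
}

static bool should_retry(int attempts)
{
	return attempts < 1;          /* a costly high-order alloc retries once */
}

static bool alloc_slowpath(void)
{
	bool sync_migration = false;
	int attempts = 0;

	for (;;) {
		if (compact_and_alloc(sync_migration))
			return true;
		sync_migration = true;           /* later passes migrate synchronously */

		if (reclaim_and_alloc())
			return true;

		if (should_retry(attempts++))
			continue;                    /* loop back, reclaim again */

		/* Not retrying: give compaction one last synchronous chance. */
		return compact_and_alloc(sync_migration);
	}
}

int main(void)
{
	printf("allocation %s\n", alloc_slowpath() ? "succeeded" : "failed");
	return 0;
}
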
@@ -2146,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2146 | 2202 | ||
2147 | get_mems_allowed(); | 2203 | get_mems_allowed(); |
2148 | /* The preferred zone is used for statistics later */ | 2204 | /* The preferred zone is used for statistics later */ |
2149 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | 2205 | first_zones_zonelist(zonelist, high_zoneidx, |
2206 | nodemask ? : &cpuset_current_mems_allowed, | ||
2207 | &preferred_zone); | ||
2150 | if (!preferred_zone) { | 2208 | if (!preferred_zone) { |
2151 | put_mems_allowed(); | 2209 | put_mems_allowed(); |
2152 | return NULL; | 2210 | return NULL; |
@@ -2437,7 +2495,7 @@ void show_free_areas(void) | |||
2437 | " all_unreclaimable? %s" | 2495 | " all_unreclaimable? %s" |
2438 | "\n", | 2496 | "\n", |
2439 | zone->name, | 2497 | zone->name, |
2440 | K(zone_nr_free_pages(zone)), | 2498 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2441 | K(min_wmark_pages(zone)), | 2499 | K(min_wmark_pages(zone)), |
2442 | K(low_wmark_pages(zone)), | 2500 | K(low_wmark_pages(zone)), |
2443 | K(high_wmark_pages(zone)), | 2501 | K(high_wmark_pages(zone)), |
@@ -2580,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s) | |||
2580 | 2638 | ||
2581 | static __init int setup_numa_zonelist_order(char *s) | 2639 | static __init int setup_numa_zonelist_order(char *s) |
2582 | { | 2640 | { |
2583 | if (s) | 2641 | int ret; |
2584 | return __parse_numa_zonelist_order(s); | 2642 | |
2585 | return 0; | 2643 | if (!s) |
2644 | return 0; | ||
2645 | |||
2646 | ret = __parse_numa_zonelist_order(s); | ||
2647 | if (ret == 0) | ||
2648 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | ||
2649 | |||
2650 | return ret; | ||
2586 | } | 2651 | } |
2587 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 2652 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
2588 | 2653 | ||
@@ -3008,14 +3073,6 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3008 | build_zonelist_cache(pgdat); | 3073 | build_zonelist_cache(pgdat); |
3009 | } | 3074 | } |
3010 | 3075 | ||
3011 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
3012 | /* Setup real pagesets for the new zone */ | ||
3013 | if (data) { | ||
3014 | struct zone *zone = data; | ||
3015 | setup_zone_pageset(zone); | ||
3016 | } | ||
3017 | #endif | ||
3018 | |||
3019 | /* | 3076 | /* |
3020 | * Initialize the boot_pagesets that are going to be used | 3077 | * Initialize the boot_pagesets that are going to be used |
3021 | * for bootstrapping processors. The real pagesets for | 3078 | * for bootstrapping processors. The real pagesets for |
@@ -3064,7 +3121,11 @@ void build_all_zonelists(void *data) | |||
3064 | } else { | 3121 | } else { |
3065 | /* we have to stop all cpus to guarantee there is no user | 3122 | /* we have to stop all cpus to guarantee there is no user |
3066 | of zonelist */ | 3123 | of zonelist */ |
3067 | stop_machine(__build_all_zonelists, data, NULL); | 3124 | #ifdef CONFIG_MEMORY_HOTPLUG |
3125 | if (data) | ||
3126 | setup_zone_pageset((struct zone *)data); | ||
3127 | #endif | ||
3128 | stop_machine(__build_all_zonelists, NULL, NULL); | ||
3068 | /* cpuset refresh routine should be here */ | 3129 | /* cpuset refresh routine should be here */ |
3069 | } | 3130 | } |
3070 | vm_total_pages = nr_free_pagecache_pages(); | 3131 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -4045,7 +4106,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4045 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 4106 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
4046 | } | 4107 | } |
4047 | #else | 4108 | #else |
4048 | static void inline setup_usemap(struct pglist_data *pgdat, | 4109 | static inline void setup_usemap(struct pglist_data *pgdat, |
4049 | struct zone *zone, unsigned long zonesize) {} | 4110 | struct zone *zone, unsigned long zonesize) {} |
4050 | #endif /* CONFIG_SPARSEMEM */ | 4111 | #endif /* CONFIG_SPARSEMEM */ |
4051 | 4112 | ||
@@ -5548,7 +5609,6 @@ static struct trace_print_flags pageflag_names[] = { | |||
5548 | {1UL << PG_swapcache, "swapcache" }, | 5609 | {1UL << PG_swapcache, "swapcache" }, |
5549 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | 5610 | {1UL << PG_mappedtodisk, "mappedtodisk" }, |
5550 | {1UL << PG_reclaim, "reclaim" }, | 5611 | {1UL << PG_reclaim, "reclaim" }, |
5551 | {1UL << PG_buddy, "buddy" }, | ||
5552 | {1UL << PG_swapbacked, "swapbacked" }, | 5612 | {1UL << PG_swapbacked, "swapbacked" }, |
5553 | {1UL << PG_unevictable, "unevictable" }, | 5613 | {1UL << PG_unevictable, "unevictable" }, |
5554 | #ifdef CONFIG_MMU | 5614 | #ifdef CONFIG_MMU |
@@ -5596,7 +5656,7 @@ void dump_page(struct page *page) | |||
5596 | { | 5656 | { |
5597 | printk(KERN_ALERT | 5657 | printk(KERN_ALERT |
5598 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 5658 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
5599 | page, page_count(page), page_mapcount(page), | 5659 | page, atomic_read(&page->_count), page_mapcount(page), |
5600 | page->mapping, page->index); | 5660 | page->mapping, page->index); |
5601 | dump_page_flags(page->flags); | 5661 | dump_page_flags(page->flags); |
5602 | } | 5662 | } |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8b1a2ce21ee..7cfa6ae0230 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
34 | pmd = pmd_offset(pud, addr); | 34 | pmd = pmd_offset(pud, addr); |
35 | do { | 35 | do { |
36 | next = pmd_addr_end(addr, end); | 36 | next = pmd_addr_end(addr, end); |
37 | split_huge_page_pmd(walk->mm, pmd); | ||
37 | if (pmd_none_or_clear_bad(pmd)) { | 38 | if (pmd_none_or_clear_bad(pmd)) { |
38 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
39 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
@@ -139,7 +140,6 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
139 | pgd_t *pgd; | 140 | pgd_t *pgd; |
140 | unsigned long next; | 141 | unsigned long next; |
141 | int err = 0; | 142 | int err = 0; |
142 | struct vm_area_struct *vma; | ||
143 | 143 | ||
144 | if (addr >= end) | 144 | if (addr >= end) |
145 | return err; | 145 | return err; |
@@ -149,15 +149,17 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
149 | 149 | ||
150 | pgd = pgd_offset(walk->mm, addr); | 150 | pgd = pgd_offset(walk->mm, addr); |
151 | do { | 151 | do { |
152 | struct vm_area_struct *uninitialized_var(vma); | ||
153 | |||
152 | next = pgd_addr_end(addr, end); | 154 | next = pgd_addr_end(addr, end); |
153 | 155 | ||
156 | #ifdef CONFIG_HUGETLB_PAGE | ||
154 | /* | 157 | /* |
155 | * handle hugetlb vma individually because pagetable walk for | 158 | * handle hugetlb vma individually because pagetable walk for |
156 | * the hugetlb page is dependent on the architecture and | 159 | * the hugetlb page is dependent on the architecture and |
157 | * we can't handled it in the same manner as non-huge pages. | 160 | * we can't handled it in the same manner as non-huge pages. |
158 | */ | 161 | */ |
159 | vma = find_vma(walk->mm, addr); | 162 | vma = find_vma(walk->mm, addr); |
160 | #ifdef CONFIG_HUGETLB_PAGE | ||
161 | if (vma && is_vm_hugetlb_page(vma)) { | 163 | if (vma && is_vm_hugetlb_page(vma)) { |
162 | if (vma->vm_end < next) | 164 | if (vma->vm_end < next) |
163 | next = vma->vm_end; | 165 | next = vma->vm_end; |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3..ea534960a04 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) | |||
421 | return NULL; | 421 | return NULL; |
422 | 422 | ||
423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | 423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
424 | pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); | 424 | pcpu_nr_groups, pcpu_atom_size); |
425 | if (!vms) { | 425 | if (!vms) { |
426 | pcpu_free_chunk(chunk); | 426 | pcpu_free_chunk(chunk); |
427 | return NULL; | 427 | return NULL; |
diff --git a/mm/percpu.c b/mm/percpu.c index efe816856a9..3f930018aa6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, | |||
258 | 258 | ||
259 | /* | 259 | /* |
260 | * (Un)populated page region iterators. Iterate over (un)populated | 260 | * (Un)populated page region iterators. Iterate over (un)populated |
261 | * page regions betwen @start and @end in @chunk. @rs and @re should | 261 | * page regions between @start and @end in @chunk. @rs and @re should |
262 | * be integer variables and will be set to start and end page index of | 262 | * be integer variables and will be set to start and end page index of |
263 | * the current region. | 263 | * the current region. |
264 | */ | 264 | */ |
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size) | |||
293 | 293 | ||
294 | if (size <= PAGE_SIZE) | 294 | if (size <= PAGE_SIZE) |
295 | return kzalloc(size, GFP_KERNEL); | 295 | return kzalloc(size, GFP_KERNEL); |
296 | else { | 296 | else |
297 | void *ptr = vmalloc(size); | 297 | return vzalloc(size); |
298 | if (ptr) | ||
299 | memset(ptr, 0, size); | ||
300 | return ptr; | ||
301 | } | ||
302 | } | 298 | } |
303 | 299 | ||
304 | /** | 300 | /** |
@@ -1268,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1268 | 1264 | ||
1269 | /* we're done parsing the input, undefine BUG macro and dump config */ | 1265 | /* we're done parsing the input, undefine BUG macro and dump config */ |
1270 | #undef PCPU_SETUP_BUG_ON | 1266 | #undef PCPU_SETUP_BUG_ON |
1271 | pcpu_dump_alloc_info(KERN_INFO, ai); | 1267 | pcpu_dump_alloc_info(KERN_DEBUG, ai); |
1272 | 1268 | ||
1273 | pcpu_nr_groups = ai->nr_groups; | 1269 | pcpu_nr_groups = ai->nr_groups; |
1274 | pcpu_group_offsets = group_offsets; | 1270 | pcpu_group_offsets = group_offsets; |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 00000000000..eb663fb533e --- /dev/null +++ b/mm/pgtable-generic.c | |||
@@ -0,0 +1,121 @@ | |||
1 | /* | ||
2 | * mm/pgtable-generic.c | ||
3 | * | ||
4 | * Generic pgtable methods declared in asm-generic/pgtable.h | ||
5 | * | ||
6 | * Copyright (C) 2010 Linus Torvalds | ||
7 | */ | ||
8 | |||
9 | #include <linux/pagemap.h> | ||
10 | #include <asm/tlb.h> | ||
11 | #include <asm-generic/pgtable.h> | ||
12 | |||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | ||
14 | /* | ||
15 | * Only sets the access flags (dirty, accessed, and | ||
16 | * writable). Furthermore, we know it always gets set to a "more | ||
17 | * permissive" setting, which allows most architectures to optimize | ||
18 | * this. We return whether the PTE actually changed, which in turn | ||
19 | * instructs the caller to do things like update__mmu_cache. This | ||
20 | * used to be done in the caller, but sparc needs minor faults to | ||
21 | * force that call on sun4c so we changed this macro slightly | ||
22 | */ | ||
23 | int ptep_set_access_flags(struct vm_area_struct *vma, | ||
24 | unsigned long address, pte_t *ptep, | ||
25 | pte_t entry, int dirty) | ||
26 | { | ||
27 | int changed = !pte_same(*ptep, entry); | ||
28 | if (changed) { | ||
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | ||
30 | flush_tlb_page(vma, address); | ||
31 | } | ||
32 | return changed; | ||
33 | } | ||
34 | #endif | ||
35 | |||
36 | #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS | ||
37 | int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
38 | unsigned long address, pmd_t *pmdp, | ||
39 | pmd_t entry, int dirty) | ||
40 | { | ||
41 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
42 | int changed = !pmd_same(*pmdp, entry); | ||
43 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
44 | if (changed) { | ||
45 | set_pmd_at(vma->vm_mm, address, pmdp, entry); | ||
46 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
47 | } | ||
48 | return changed; | ||
49 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
50 | BUG(); | ||
51 | return 0; | ||
52 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
53 | } | ||
54 | #endif | ||
55 | |||
56 | #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH | ||
57 | int ptep_clear_flush_young(struct vm_area_struct *vma, | ||
58 | unsigned long address, pte_t *ptep) | ||
59 | { | ||
60 | int young; | ||
61 | young = ptep_test_and_clear_young(vma, address, ptep); | ||
62 | if (young) | ||
63 | flush_tlb_page(vma, address); | ||
64 | return young; | ||
65 | } | ||
66 | #endif | ||
67 | |||
68 | #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH | ||
69 | int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
70 | unsigned long address, pmd_t *pmdp) | ||
71 | { | ||
72 | int young; | ||
73 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
74 | BUG(); | ||
75 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
76 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
77 | young = pmdp_test_and_clear_young(vma, address, pmdp); | ||
78 | if (young) | ||
79 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
80 | return young; | ||
81 | } | ||
82 | #endif | ||
83 | |||
84 | #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH | ||
85 | pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
86 | pte_t *ptep) | ||
87 | { | ||
88 | pte_t pte; | ||
89 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | ||
90 | flush_tlb_page(vma, address); | ||
91 | return pte; | ||
92 | } | ||
93 | #endif | ||
94 | |||
95 | #ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH | ||
96 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
97 | pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
98 | pmd_t *pmdp) | ||
99 | { | ||
100 | pmd_t pmd; | ||
101 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
102 | pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); | ||
103 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
104 | return pmd; | ||
105 | } | ||
106 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
107 | #endif | ||
108 | |||
109 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
110 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
111 | pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
112 | pmd_t *pmdp) | ||
113 | { | ||
114 | pmd_t pmd = pmd_mksplitting(*pmdp); | ||
115 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
116 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); | ||
117 | /* tlb flush only to serialize against gup-fast */ | ||
118 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
119 | } | ||
120 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
121 | #endif | ||
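
Editor's note: the new mm/pgtable-generic.c supplies generic pte/pmd manipulation helpers, each compiled only when the architecture has not defined the corresponding __HAVE_ARCH_* macro, with the pmd variants additionally gated on CONFIG_TRANSPARENT_HUGEPAGE. A minimal standalone illustration of that "generic unless the arch overrides it" idea; the macro and function names below are made up, and both variants are kept in one file with #ifdef/#else so the sketch stays runnable, whereas the kernel uses #ifndef guards and lets the arch header provide the symbol.

#include <stdio.h>

/* An "architecture header" would define HAVE_ARCH_CLEAR_FLUSH and provide its
 * own optimized clear_flush(); uncomment the next line to simulate that. */
/* #define HAVE_ARCH_CLEAR_FLUSH */

#ifdef HAVE_ARCH_CLEAR_FLUSH
/* Arch-specific version: imagine a single instruction clears and flushes. */
static int clear_flush(int *entry)
{
	int old = *entry;
	*entry = 0;
	return old;
}
#else
/* Generic fallback, the shape used by mm/pgtable-generic.c: do the portable
 * operations one by one (read, clear, then flush). */
static int clear_flush(int *entry)
{
	int old = *entry;
	*entry = 0;
	printf("generic: flushing TLB for this entry\n");
	return old;
}
#endif

int main(void)
{
	int pte = 42;
	int old = clear_flush(&pte);

	printf("old entry: %d, now: %d\n", old, pte);
	return 0;
}
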
@@ -94,7 +94,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
94 | * anonymous pages mapped into it with that anon_vma. | 94 | * anonymous pages mapped into it with that anon_vma. |
95 | * | 95 | * |
96 | * The common case will be that we already have one, but if | 96 | * The common case will be that we already have one, but if |
97 | * if not we either need to find an adjacent mapping that we | 97 | * not we either need to find an adjacent mapping that we |
98 | * can re-use the anon_vma from (very common when the only | 98 | * can re-use the anon_vma from (very common when the only |
99 | * reason for splitting a vma has been mprotect()), or we | 99 | * reason for splitting a vma has been mprotect()), or we |
100 | * allocate a new one. | 100 | * allocate a new one. |
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
177 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 177 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
178 | 178 | ||
179 | anon_vma_lock(anon_vma); | 179 | anon_vma_lock(anon_vma); |
180 | /* | ||
181 | * It's critical to add new vmas to the tail of the anon_vma, | ||
182 | * see comment in huge_memory.c:__split_huge_page(). | ||
183 | */ | ||
180 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 184 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
181 | anon_vma_unlock(anon_vma); | 185 | anon_vma_unlock(anon_vma); |
182 | } | 186 | } |
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
360 | * Returns virtual address or -EFAULT if page's index/offset is not | 364 | * Returns virtual address or -EFAULT if page's index/offset is not |
361 | * within the range mapped the @vma. | 365 | * within the range mapped the @vma. |
362 | */ | 366 | */ |
363 | static inline unsigned long | 367 | inline unsigned long |
364 | vma_address(struct page *page, struct vm_area_struct *vma) | 368 | vma_address(struct page *page, struct vm_area_struct *vma) |
365 | { | 369 | { |
366 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 370 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
435 | pmd = pmd_offset(pud, address); | 439 | pmd = pmd_offset(pud, address); |
436 | if (!pmd_present(*pmd)) | 440 | if (!pmd_present(*pmd)) |
437 | return NULL; | 441 | return NULL; |
442 | if (pmd_trans_huge(*pmd)) | ||
443 | return NULL; | ||
438 | 444 | ||
439 | pte = pte_offset_map(pmd, address); | 445 | pte = pte_offset_map(pmd, address); |
440 | /* Make a quick check before getting the lock */ | 446 | /* Make a quick check before getting the lock */ |
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
489 | unsigned long *vm_flags) | 495 | unsigned long *vm_flags) |
490 | { | 496 | { |
491 | struct mm_struct *mm = vma->vm_mm; | 497 | struct mm_struct *mm = vma->vm_mm; |
492 | pte_t *pte; | ||
493 | spinlock_t *ptl; | ||
494 | int referenced = 0; | 498 | int referenced = 0; |
495 | 499 | ||
496 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
497 | if (!pte) | ||
498 | goto out; | ||
499 | |||
500 | /* | 500 | /* |
501 | * Don't want to elevate referenced for mlocked page that gets this far, | 501 | * Don't want to elevate referenced for mlocked page that gets this far, |
502 | * in order that it progresses to try_to_unmap and is moved to the | 502 | * in order that it progresses to try_to_unmap and is moved to the |
503 | * unevictable list. | 503 | * unevictable list. |
504 | */ | 504 | */ |
505 | if (vma->vm_flags & VM_LOCKED) { | 505 | if (vma->vm_flags & VM_LOCKED) { |
506 | *mapcount = 1; /* break early from loop */ | 506 | *mapcount = 0; /* break early from loop */ |
507 | *vm_flags |= VM_LOCKED; | 507 | *vm_flags |= VM_LOCKED; |
508 | goto out_unmap; | 508 | goto out; |
509 | } | ||
510 | |||
511 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
512 | /* | ||
513 | * Don't treat a reference through a sequentially read | ||
514 | * mapping as such. If the page has been used in | ||
515 | * another mapping, we will catch it; if this other | ||
516 | * mapping is already gone, the unmap path will have | ||
517 | * set PG_referenced or activated the page. | ||
518 | */ | ||
519 | if (likely(!VM_SequentialReadHint(vma))) | ||
520 | referenced++; | ||
521 | } | 509 | } |
522 | 510 | ||
523 | /* Pretend the page is referenced if the task has the | 511 | /* Pretend the page is referenced if the task has the |
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
526 | rwsem_is_locked(&mm->mmap_sem)) | 514 | rwsem_is_locked(&mm->mmap_sem)) |
527 | referenced++; | 515 | referenced++; |
528 | 516 | ||
529 | out_unmap: | 517 | if (unlikely(PageTransHuge(page))) { |
518 | pmd_t *pmd; | ||
519 | |||
520 | spin_lock(&mm->page_table_lock); | ||
521 | pmd = page_check_address_pmd(page, mm, address, | ||
522 | PAGE_CHECK_ADDRESS_PMD_FLAG); | ||
523 | if (pmd && !pmd_trans_splitting(*pmd) && | ||
524 | pmdp_clear_flush_young_notify(vma, address, pmd)) | ||
525 | referenced++; | ||
526 | spin_unlock(&mm->page_table_lock); | ||
527 | } else { | ||
528 | pte_t *pte; | ||
529 | spinlock_t *ptl; | ||
530 | |||
531 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
532 | if (!pte) | ||
533 | goto out; | ||
534 | |||
535 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
536 | /* | ||
537 | * Don't treat a reference through a sequentially read | ||
538 | * mapping as such. If the page has been used in | ||
539 | * another mapping, we will catch it; if this other | ||
540 | * mapping is already gone, the unmap path will have | ||
541 | * set PG_referenced or activated the page. | ||
542 | */ | ||
543 | if (likely(!VM_SequentialReadHint(vma))) | ||
544 | referenced++; | ||
545 | } | ||
546 | pte_unmap_unlock(pte, ptl); | ||
547 | } | ||
548 | |||
530 | (*mapcount)--; | 549 | (*mapcount)--; |
531 | pte_unmap_unlock(pte, ptl); | ||
532 | 550 | ||
533 | if (referenced) | 551 | if (referenced) |
534 | *vm_flags |= vma->vm_flags; | 552 | *vm_flags |= vma->vm_flags; |
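The page_referenced_one() rework above branches on PageTransHuge(): for a huge page the accessed bit lives in the PMD, so it is tested and cleared under mm->page_table_lock via page_check_address_pmd(), skipping PMDs that are in the middle of a split, while ordinary pages keep the existing PTE path. A minimal sketch of the huge-page side, using only the calls visible in the hunk (the helper name is made up for illustration):

```c
/* Illustrative helper only; the calls and locking mirror the hunk above. */
static int page_referenced_trans_huge(struct page *page, struct mm_struct *mm,
				      struct vm_area_struct *vma,
				      unsigned long address)
{
	pmd_t *pmd;
	int referenced = 0;

	spin_lock(&mm->page_table_lock);
	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_FLAG);
	/* Skip PMDs that a concurrent huge-page split is tearing down. */
	if (pmd && !pmd_trans_splitting(*pmd) &&
	    pmdp_clear_flush_young_notify(vma, address, pmd))
		referenced++;
	spin_unlock(&mm->page_table_lock);

	return referenced;
}
```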
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, | |||
864 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 882 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
865 | { | 883 | { |
866 | int first = atomic_inc_and_test(&page->_mapcount); | 884 | int first = atomic_inc_and_test(&page->_mapcount); |
867 | if (first) | 885 | if (first) { |
868 | __inc_zone_page_state(page, NR_ANON_PAGES); | 886 | if (!PageTransHuge(page)) |
887 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
888 | else | ||
889 | __inc_zone_page_state(page, | ||
890 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
891 | } | ||
869 | if (unlikely(PageKsm(page))) | 892 | if (unlikely(PageKsm(page))) |
870 | return; | 893 | return; |
871 | 894 | ||
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
893 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 916 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
894 | SetPageSwapBacked(page); | 917 | SetPageSwapBacked(page); |
895 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 918 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
896 | __inc_zone_page_state(page, NR_ANON_PAGES); | 919 | if (!PageTransHuge(page)) |
920 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
921 | else | ||
922 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
897 | __page_set_anon_rmap(page, vma, address, 1); | 923 | __page_set_anon_rmap(page, vma, address, 1); |
898 | if (page_evictable(page, vma)) | 924 | if (page_evictable(page, vma)) |
899 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 925 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page) | |||
911 | { | 937 | { |
912 | if (atomic_inc_and_test(&page->_mapcount)) { | 938 | if (atomic_inc_and_test(&page->_mapcount)) { |
913 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 939 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
914 | mem_cgroup_update_file_mapped(page, 1); | 940 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
915 | } | 941 | } |
916 | } | 942 | } |
917 | 943 | ||
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page) | |||
946 | return; | 972 | return; |
947 | if (PageAnon(page)) { | 973 | if (PageAnon(page)) { |
948 | mem_cgroup_uncharge_page(page); | 974 | mem_cgroup_uncharge_page(page); |
949 | __dec_zone_page_state(page, NR_ANON_PAGES); | 975 | if (!PageTransHuge(page)) |
976 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
977 | else | ||
978 | __dec_zone_page_state(page, | ||
979 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
950 | } else { | 980 | } else { |
951 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 981 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
952 | mem_cgroup_update_file_mapped(page, -1); | 982 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
953 | } | 983 | } |
954 | /* | 984 | /* |
955 | * It would be tidy to reset the PageAnon mapping here, | 985 | * It would be tidy to reset the PageAnon mapping here, |
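The accounting hunks above (do_page_add_anon_rmap, page_add_new_anon_rmap, page_remove_rmap) split the anonymous-page statistics: mapped transparent huge pages are counted under NR_ANON_TRANSPARENT_HUGEPAGES rather than NR_ANON_PAGES, and the file-mapped memcg counter moves to the new mem_cgroup_inc_page_stat()/mem_cgroup_dec_page_stat() interface. The shared pattern, as a hedged sketch (the helper is hypothetical; the vmstat calls are the ones used above):

```c
/* Hypothetical helper showing the counter selection used in the hunks above. */
static void account_anon_mapping(struct page *page, bool mapped)
{
	enum zone_stat_item item = PageTransHuge(page) ?
			NR_ANON_TRANSPARENT_HUGEPAGES : NR_ANON_PAGES;

	if (mapped)
		__inc_zone_page_state(page, item);
	else
		__dec_zone_page_state(page, item);
}
```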
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1202 | return ret; | 1232 | return ret; |
1203 | } | 1233 | } |
1204 | 1234 | ||
1205 | static bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1235 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
1206 | { | 1236 | { |
1207 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1237 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
1208 | 1238 | ||
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1400 | int ret; | 1430 | int ret; |
1401 | 1431 | ||
1402 | BUG_ON(!PageLocked(page)); | 1432 | BUG_ON(!PageLocked(page)); |
1433 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | ||
1403 | 1434 | ||
1404 | if (unlikely(PageKsm(page))) | 1435 | if (unlikely(PageKsm(page))) |
1405 | ret = try_to_unmap_ksm(page, flags); | 1436 | ret = try_to_unmap_ksm(page, flags); |
diff --git a/mm/shmem.c b/mm/shmem.c index 47fdeeb9d63..5ee67c99060 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) | |||
2415 | return &p->vfs_inode; | 2415 | return &p->vfs_inode; |
2416 | } | 2416 | } |
2417 | 2417 | ||
2418 | static void shmem_i_callback(struct rcu_head *head) | ||
2419 | { | ||
2420 | struct inode *inode = container_of(head, struct inode, i_rcu); | ||
2421 | INIT_LIST_HEAD(&inode->i_dentry); | ||
2422 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | ||
2423 | } | ||
2424 | |||
2418 | static void shmem_destroy_inode(struct inode *inode) | 2425 | static void shmem_destroy_inode(struct inode *inode) |
2419 | { | 2426 | { |
2420 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2427 | if ((inode->i_mode & S_IFMT) == S_IFREG) { |
2421 | /* only struct inode is valid if it's an inline symlink */ | 2428 | /* only struct inode is valid if it's an inline symlink */ |
2422 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2429 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2423 | } | 2430 | } |
2424 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2431 | call_rcu(&inode->i_rcu, shmem_i_callback); |
2425 | } | 2432 | } |
2426 | 2433 | ||
2427 | static void init_once(void *foo) | 2434 | static void init_once(void *foo) |
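shmem now frees its inodes through an RCU grace period: destroy_inode queues the object with call_rcu() and the callback does the actual kmem_cache_free(), matching the VFS move to RCU-walk path lookup. The general idiom, restated with placeholder names (my_* and MY_I() are not real symbols):

```c
/* Hedged restatement of the destroy-inode-under-RCU idiom used above. */
static void my_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(my_inode_cachep, MY_I(inode));
}

static void my_destroy_inode(struct inode *inode)
{
	/* per-fs teardown that is still safe to do immediately goes here */
	call_rcu(&inode->i_rcu, my_i_callback);
}
```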
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -284,7 +284,7 @@ struct kmem_list3 { | |||
284 | * Need this for bootstrapping a per node allocator. | 284 | * Need this for bootstrapping a per node allocator. |
285 | */ | 285 | */ |
286 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) | 286 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) |
287 | struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | 287 | static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; |
288 | #define CACHE_CACHE 0 | 288 | #define CACHE_CACHE 0 |
289 | #define SIZE_AC MAX_NUMNODES | 289 | #define SIZE_AC MAX_NUMNODES |
290 | #define SIZE_L3 (2 * MAX_NUMNODES) | 290 | #define SIZE_L3 (2 * MAX_NUMNODES) |
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu) | |||
829 | 829 | ||
830 | static void next_reap_node(void) | 830 | static void next_reap_node(void) |
831 | { | 831 | { |
832 | int node = __get_cpu_var(slab_reap_node); | 832 | int node = __this_cpu_read(slab_reap_node); |
833 | 833 | ||
834 | node = next_node(node, node_online_map); | 834 | node = next_node(node, node_online_map); |
835 | if (unlikely(node >= MAX_NUMNODES)) | 835 | if (unlikely(node >= MAX_NUMNODES)) |
836 | node = first_node(node_online_map); | 836 | node = first_node(node_online_map); |
837 | __get_cpu_var(slab_reap_node) = node; | 837 | __this_cpu_write(slab_reap_node, node); |
838 | } | 838 | } |
839 | 839 | ||
840 | #else | 840 | #else |
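The reaper code switches from the __get_cpu_var() lvalue to the __this_cpu_read()/__this_cpu_write() accessors, which let the compiler emit single per-cpu instructions instead of first computing the per-cpu address. The conversion pattern, assuming a plain DEFINE_PER_CPU(int, ...) variable like slab_reap_node (my_counter below is a placeholder):

```c
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);		/* placeholder variable */

static void advance_my_counter(void)
{
	/* old style:  __get_cpu_var(my_counter)++;  */
	int v = __this_cpu_read(my_counter);

	__this_cpu_write(my_counter, v + 1);
	/* a pure increment could also use __this_cpu_inc(my_counter) */
}
```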
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
1012 | */ | 1012 | */ |
1013 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | 1013 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) |
1014 | { | 1014 | { |
1015 | int node = __get_cpu_var(slab_reap_node); | 1015 | int node = __this_cpu_read(slab_reap_node); |
1016 | 1016 | ||
1017 | if (l3->alien) { | 1017 | if (l3->alien) { |
1018 | struct array_cache *ac = l3->alien[node]; | 1018 | struct array_cache *ac = l3->alien[node]; |
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1293 | * anything expensive but will only modify reap_work | 1293 | * anything expensive but will only modify reap_work |
1294 | * and reschedule the timer. | 1294 | * and reschedule the timer. |
1295 | */ | 1295 | */ |
1296 | cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); | 1296 | cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); |
1297 | /* Now the cache_reaper is guaranteed to be not running. */ | 1297 | /* Now the cache_reaper is guaranteed to be not running. */ |
1298 | per_cpu(slab_reap_work, cpu).work.func = NULL; | 1298 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
1299 | break; | 1299 | break; |
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
2781 | /* | 2781 | /* |
2782 | * Map pages beginning at addr to the given cache and slab. This is required | 2782 | * Map pages beginning at addr to the given cache and slab. This is required |
2783 | * for the slab allocator to be able to lookup the cache and slab of a | 2783 | * for the slab allocator to be able to lookup the cache and slab of a |
2784 | * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. | 2784 | * virtual address for kfree, ksize, and slab debugging. |
2785 | */ | 2785 | */ |
2786 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | 2786 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, |
2787 | void *addr) | 2787 | void *addr) |
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3653 | EXPORT_SYMBOL(kmem_cache_alloc); | 3653 | EXPORT_SYMBOL(kmem_cache_alloc); |
3654 | 3654 | ||
3655 | #ifdef CONFIG_TRACING | 3655 | #ifdef CONFIG_TRACING |
3656 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3656 | void * |
3657 | kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) | ||
3657 | { | 3658 | { |
3658 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3659 | void *ret; |
3659 | } | ||
3660 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | ||
3661 | #endif | ||
3662 | 3660 | ||
3663 | /** | 3661 | ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
3664 | * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. | ||
3665 | * @cachep: the cache we're checking against | ||
3666 | * @ptr: pointer to validate | ||
3667 | * | ||
3668 | * This verifies that the untrusted pointer looks sane; | ||
3669 | * it is _not_ a guarantee that the pointer is actually | ||
3670 | * part of the slab cache in question, but it at least | ||
3671 | * validates that the pointer can be dereferenced and | ||
3672 | * looks half-way sane. | ||
3673 | * | ||
3674 | * Currently only used for dentry validation. | ||
3675 | */ | ||
3676 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) | ||
3677 | { | ||
3678 | unsigned long size = cachep->buffer_size; | ||
3679 | struct page *page; | ||
3680 | 3662 | ||
3681 | if (unlikely(!kern_ptr_validate(ptr, size))) | 3663 | trace_kmalloc(_RET_IP_, ret, |
3682 | goto out; | 3664 | size, slab_buffer_size(cachep), flags); |
3683 | page = virt_to_page(ptr); | 3665 | return ret; |
3684 | if (unlikely(!PageSlab(page))) | ||
3685 | goto out; | ||
3686 | if (unlikely(page_get_cache(page) != cachep)) | ||
3687 | goto out; | ||
3688 | return 1; | ||
3689 | out: | ||
3690 | return 0; | ||
3691 | } | 3666 | } |
3667 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | ||
3668 | #endif | ||
3692 | 3669 | ||
3693 | #ifdef CONFIG_NUMA | 3670 | #ifdef CONFIG_NUMA |
3694 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3671 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
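kmem_cache_alloc_notrace() becomes kmem_cache_alloc_trace(): the requested size is passed into the allocator so trace_kmalloc can be emitted here with both the requested size and the actual slab_buffer_size(), instead of each kmalloc call site open-coding the tracepoint; kmem_ptr_validate() is removed outright since, per its own comment, dentry validation was its only user. A hedged sketch of how an inline wrapper could route through the new entry point (the wrapper itself is illustrative, not the real slab_def.h code):

```c
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(size_t size,
				    struct kmem_cache *cachep, gfp_t flags);
#endif

/* Illustrative caller: a fixed-size allocation that stays traceable. */
static inline void *alloc_traced(struct kmem_cache *cachep, size_t size,
				 gfp_t flags)
{
#ifdef CONFIG_TRACING
	return kmem_cache_alloc_trace(size, cachep, flags);
#else
	return kmem_cache_alloc(cachep, flags);
#endif
}
```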
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3705 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3682 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3706 | 3683 | ||
3707 | #ifdef CONFIG_TRACING | 3684 | #ifdef CONFIG_TRACING |
3708 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3685 | void *kmem_cache_alloc_node_trace(size_t size, |
3709 | gfp_t flags, | 3686 | struct kmem_cache *cachep, |
3710 | int nodeid) | 3687 | gfp_t flags, |
3688 | int nodeid) | ||
3711 | { | 3689 | { |
3712 | return __cache_alloc_node(cachep, flags, nodeid, | 3690 | void *ret; |
3691 | |||
3692 | ret = __cache_alloc_node(cachep, flags, nodeid, | ||
3713 | __builtin_return_address(0)); | 3693 | __builtin_return_address(0)); |
3694 | trace_kmalloc_node(_RET_IP_, ret, | ||
3695 | size, slab_buffer_size(cachep), | ||
3696 | flags, nodeid); | ||
3697 | return ret; | ||
3714 | } | 3698 | } |
3715 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | 3699 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
3716 | #endif | 3700 | #endif |
3717 | 3701 | ||
3718 | static __always_inline void * | 3702 | static __always_inline void * |
3719 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | 3703 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) |
3720 | { | 3704 | { |
3721 | struct kmem_cache *cachep; | 3705 | struct kmem_cache *cachep; |
3722 | void *ret; | ||
3723 | 3706 | ||
3724 | cachep = kmem_find_general_cachep(size, flags); | 3707 | cachep = kmem_find_general_cachep(size, flags); |
3725 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3708 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3726 | return cachep; | 3709 | return cachep; |
3727 | ret = kmem_cache_alloc_node_notrace(cachep, flags, node); | 3710 | return kmem_cache_alloc_node_trace(size, cachep, flags, node); |
3728 | |||
3729 | trace_kmalloc_node((unsigned long) caller, ret, | ||
3730 | size, cachep->buffer_size, flags, node); | ||
3731 | |||
3732 | return ret; | ||
3733 | } | 3711 | } |
3734 | 3712 | ||
3735 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3713 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
@@ -4075,7 +4053,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4075 | * necessary. Note that the l3 listlock also protects the array_cache | 4053 | * necessary. Note that the l3 listlock also protects the array_cache |
4076 | * if drain_array() is used on the shared array. | 4054 | * if drain_array() is used on the shared array. |
4077 | */ | 4055 | */ |
4078 | void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | 4056 | static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, |
4079 | struct array_cache *ac, int force, int node) | 4057 | struct array_cache *ac, int force, int node) |
4080 | { | 4058 | { |
4081 | int tofree; | 4059 | int tofree; |
@@ -4339,7 +4317,7 @@ static const struct seq_operations slabinfo_op = { | |||
4339 | * @count: data length | 4317 | * @count: data length |
4340 | * @ppos: unused | 4318 | * @ppos: unused |
4341 | */ | 4319 | */ |
4342 | ssize_t slabinfo_write(struct file *file, const char __user * buffer, | 4320 | static ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
4343 | size_t count, loff_t *ppos) | 4321 | size_t count, loff_t *ppos) |
4344 | { | 4322 | { |
4345 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 4323 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d) | |||
678 | } | 678 | } |
679 | EXPORT_SYMBOL(kmem_cache_shrink); | 679 | EXPORT_SYMBOL(kmem_cache_shrink); |
680 | 680 | ||
681 | int kmem_ptr_validate(struct kmem_cache *a, const void *b) | ||
682 | { | ||
683 | return 0; | ||
684 | } | ||
685 | |||
686 | static unsigned int slob_ready __read_mostly; | 681 | static unsigned int slob_ready __read_mostly; |
687 | 682 | ||
688 | int slab_is_available(void) | 683 | int slab_is_available(void) |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include <linux/math64.h> | 28 | #include <linux/math64.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | 30 | ||
31 | #include <trace/events/kmem.h> | ||
32 | |||
31 | /* | 33 | /* |
32 | * Lock order: | 34 | * Lock order: |
33 | * 1. slab_lock(page) | 35 | * 1. slab_lock(page) |
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
1774 | EXPORT_SYMBOL(kmem_cache_alloc); | 1776 | EXPORT_SYMBOL(kmem_cache_alloc); |
1775 | 1777 | ||
1776 | #ifdef CONFIG_TRACING | 1778 | #ifdef CONFIG_TRACING |
1777 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1779 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) |
1780 | { | ||
1781 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | ||
1782 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | ||
1783 | return ret; | ||
1784 | } | ||
1785 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | ||
1786 | |||
1787 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | ||
1778 | { | 1788 | { |
1779 | return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 1789 | void *ret = kmalloc_order(size, flags, order); |
1790 | trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); | ||
1791 | return ret; | ||
1780 | } | 1792 | } |
1781 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | 1793 | EXPORT_SYMBOL(kmalloc_order_trace); |
1782 | #endif | 1794 | #endif |
1783 | 1795 | ||
1784 | #ifdef CONFIG_NUMA | 1796 | #ifdef CONFIG_NUMA |
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
1794 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1806 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
1795 | 1807 | ||
1796 | #ifdef CONFIG_TRACING | 1808 | #ifdef CONFIG_TRACING |
1797 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1809 | void *kmem_cache_alloc_node_trace(struct kmem_cache *s, |
1798 | gfp_t gfpflags, | 1810 | gfp_t gfpflags, |
1799 | int node) | 1811 | int node, size_t size) |
1800 | { | 1812 | { |
1801 | return slab_alloc(s, gfpflags, node, _RET_IP_); | 1813 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
1814 | |||
1815 | trace_kmalloc_node(_RET_IP_, ret, | ||
1816 | size, s->size, gfpflags, node); | ||
1817 | return ret; | ||
1802 | } | 1818 | } |
1803 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | 1819 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
1804 | #endif | 1820 | #endif |
1805 | #endif | 1821 | #endif |
1806 | 1822 | ||
@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
1917 | } | 1933 | } |
1918 | EXPORT_SYMBOL(kmem_cache_free); | 1934 | EXPORT_SYMBOL(kmem_cache_free); |
1919 | 1935 | ||
1920 | /* Figure out on which slab page the object resides */ | ||
1921 | static struct page *get_object_page(const void *x) | ||
1922 | { | ||
1923 | struct page *page = virt_to_head_page(x); | ||
1924 | |||
1925 | if (!PageSlab(page)) | ||
1926 | return NULL; | ||
1927 | |||
1928 | return page; | ||
1929 | } | ||
1930 | |||
1931 | /* | 1936 | /* |
1932 | * Object placement in a slab is made very easy because we always start at | 1937 | * Object placement in a slab is made very easy because we always start at |
1933 | * offset 0. If we tune the size of the object to the alignment then we can | 1938 | * offset 0. If we tune the size of the object to the alignment then we can |
@@ -2386,35 +2391,6 @@ error: | |||
2386 | } | 2391 | } |
2387 | 2392 | ||
2388 | /* | 2393 | /* |
2389 | * Check if a given pointer is valid | ||
2390 | */ | ||
2391 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | ||
2392 | { | ||
2393 | struct page *page; | ||
2394 | |||
2395 | if (!kern_ptr_validate(object, s->size)) | ||
2396 | return 0; | ||
2397 | |||
2398 | page = get_object_page(object); | ||
2399 | |||
2400 | if (!page || s != page->slab) | ||
2401 | /* No slab or wrong slab */ | ||
2402 | return 0; | ||
2403 | |||
2404 | if (!check_valid_pointer(s, page, object)) | ||
2405 | return 0; | ||
2406 | |||
2407 | /* | ||
2408 | * We could also check if the object is on the slabs freelist. | ||
2409 | * But this would be too expensive and it seems that the main | ||
2410 | * purpose of kmem_ptr_valid() is to check if the object belongs | ||
2411 | * to a certain slab. | ||
2412 | */ | ||
2413 | return 1; | ||
2414 | } | ||
2415 | EXPORT_SYMBOL(kmem_ptr_validate); | ||
2416 | |||
2417 | /* | ||
2418 | * Determine the size of a slab object | 2394 | * Determine the size of a slab object |
2419 | */ | 2395 | */ |
2420 | unsigned int kmem_cache_size(struct kmem_cache *s) | 2396 | unsigned int kmem_cache_size(struct kmem_cache *s) |
@@ -3401,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3401 | 3377 | ||
3402 | for_each_free_object(p, s, page->freelist) { | 3378 | for_each_free_object(p, s, page->freelist) { |
3403 | set_bit(slab_index(p, s, addr), map); | 3379 | set_bit(slab_index(p, s, addr), map); |
3404 | if (!check_object(s, page, p, 0)) | 3380 | if (!check_object(s, page, p, SLUB_RED_INACTIVE)) |
3405 | return 0; | 3381 | return 0; |
3406 | } | 3382 | } |
3407 | 3383 | ||
3408 | for_each_object(p, s, addr, page->objects) | 3384 | for_each_object(p, s, addr, page->objects) |
3409 | if (!test_bit(slab_index(p, s, addr), map)) | 3385 | if (!test_bit(slab_index(p, s, addr), map)) |
3410 | if (!check_object(s, page, p, 1)) | 3386 | if (!check_object(s, page, p, SLUB_RED_ACTIVE)) |
3411 | return 0; | 3387 | return 0; |
3412 | return 1; | 3388 | return 1; |
3413 | } | 3389 | } |
@@ -3660,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3660 | len += sprintf(buf + len, "%7ld ", l->count); | 3636 | len += sprintf(buf + len, "%7ld ", l->count); |
3661 | 3637 | ||
3662 | if (l->addr) | 3638 | if (l->addr) |
3663 | len += sprint_symbol(buf + len, (unsigned long)l->addr); | 3639 | len += sprintf(buf + len, "%pS", (void *)l->addr); |
3664 | else | 3640 | else |
3665 | len += sprintf(buf + len, "<not-available>"); | 3641 | len += sprintf(buf + len, "<not-available>"); |
3666 | 3642 | ||
@@ -3821,7 +3797,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3821 | } | 3797 | } |
3822 | } | 3798 | } |
3823 | 3799 | ||
3824 | down_read(&slub_lock); | 3800 | lock_memory_hotplug(); |
3825 | #ifdef CONFIG_SLUB_DEBUG | 3801 | #ifdef CONFIG_SLUB_DEBUG |
3826 | if (flags & SO_ALL) { | 3802 | if (flags & SO_ALL) { |
3827 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3803 | for_each_node_state(node, N_NORMAL_MEMORY) { |
@@ -3862,7 +3838,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3862 | x += sprintf(buf + x, " N%d=%lu", | 3838 | x += sprintf(buf + x, " N%d=%lu", |
3863 | node, nodes[node]); | 3839 | node, nodes[node]); |
3864 | #endif | 3840 | #endif |
3865 | up_read(&slub_lock); | 3841 | unlock_memory_hotplug(); |
3866 | kfree(nodes); | 3842 | kfree(nodes); |
3867 | return x + sprintf(buf + x, "\n"); | 3843 | return x + sprintf(buf + x, "\n"); |
3868 | } | 3844 | } |
@@ -3970,12 +3946,9 @@ SLAB_ATTR(min_partial); | |||
3970 | 3946 | ||
3971 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) | 3947 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) |
3972 | { | 3948 | { |
3973 | if (s->ctor) { | 3949 | if (!s->ctor) |
3974 | int n = sprint_symbol(buf, (unsigned long)s->ctor); | 3950 | return 0; |
3975 | 3951 | return sprintf(buf, "%pS\n", s->ctor); | |
3976 | return n + sprintf(buf + n, "\n"); | ||
3977 | } | ||
3978 | return 0; | ||
3979 | } | 3952 | } |
3980 | SLAB_ATTR_RO(ctor); | 3953 | SLAB_ATTR_RO(ctor); |
3981 | 3954 | ||
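Both sprint_symbol() users in this file are converted to the %pS printf extension, which resolves a kernel text address to symbol+offset inside vsnprintf() and drops the intermediate KSYM_SYMBOL_LEN buffer; the same cleanup is applied to s_show() in mm/vmalloc.c further down. For example (hypothetical call site):

```c
/* %pS takes a code address and prints "symbol+0xoff/0xsize". */
pr_info("constructor at %pS\n", s->ctor);
```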
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 29d6cbffb28..64b984091ed 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * However, virtual mappings need a page table and TLBs. Many Linux | 10 | * However, virtual mappings need a page table and TLBs. Many Linux |
11 | * architectures already map their physical space using 1-1 mappings | 11 | * architectures already map their physical space using 1-1 mappings |
12 | * via TLBs. For those arches the virtual memmory map is essentially | 12 | * via TLBs. For those arches the virtual memory map is essentially |
13 | * for free if we use the same page size as the 1-1 mappings. In that | 13 | * for free if we use the same page size as the 1-1 mappings. In that |
14 | * case the overhead consists of a few additional pages that are | 14 | * case the overhead consists of a few additional pages that are |
15 | * allocated to create a view of memory for vmemmap. | 15 | * allocated to create a view of memory for vmemmap. |
diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af37..93250207c5c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
671 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | 671 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) |
672 | { | 672 | { |
673 | unsigned long maps_section_nr, removing_section_nr, i; | 673 | unsigned long maps_section_nr, removing_section_nr, i; |
674 | int magic; | 674 | unsigned long magic; |
675 | 675 | ||
676 | for (i = 0; i < nr_pages; i++, page++) { | 676 | for (i = 0; i < nr_pages; i++, page++) { |
677 | magic = atomic_read(&page->_mapcount); | 677 | magic = (unsigned long) page->lru.next; |
678 | 678 | ||
679 | BUG_ON(magic == NODE_INFO); | 679 | BUG_ON(magic == NODE_INFO); |
680 | 680 | ||
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page) | |||
56 | del_page_from_lru(zone, page); | 56 | del_page_from_lru(zone, page); |
57 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
58 | } | 58 | } |
59 | } | ||
60 | |||
61 | static void __put_single_page(struct page *page) | ||
62 | { | ||
63 | __page_cache_release(page); | ||
59 | free_hot_cold_page(page, 0); | 64 | free_hot_cold_page(page, 0); |
60 | } | 65 | } |
61 | 66 | ||
62 | static void put_compound_page(struct page *page) | 67 | static void __put_compound_page(struct page *page) |
63 | { | 68 | { |
64 | page = compound_head(page); | 69 | compound_page_dtor *dtor; |
65 | if (put_page_testzero(page)) { | ||
66 | compound_page_dtor *dtor; | ||
67 | 70 | ||
68 | dtor = get_compound_page_dtor(page); | 71 | __page_cache_release(page); |
69 | (*dtor)(page); | 72 | dtor = get_compound_page_dtor(page); |
73 | (*dtor)(page); | ||
74 | } | ||
75 | |||
76 | static void put_compound_page(struct page *page) | ||
77 | { | ||
78 | if (unlikely(PageTail(page))) { | ||
79 | /* __split_huge_page_refcount can run under us */ | ||
80 | struct page *page_head = page->first_page; | ||
81 | smp_rmb(); | ||
82 | /* | ||
83 | * If PageTail is still set after smp_rmb() we can be sure | ||
84 | * that the page->first_page we read wasn't a dangling pointer. | ||
85 | * See __split_huge_page_refcount() smp_wmb(). | ||
86 | */ | ||
87 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
88 | unsigned long flags; | ||
89 | /* | ||
90 | * Verify that our page_head wasn't converted | ||
91 | * to a a regular page before we got a | ||
92 | * reference on it. | ||
93 | */ | ||
94 | if (unlikely(!PageHead(page_head))) { | ||
95 | /* PageHead is cleared after PageTail */ | ||
96 | smp_rmb(); | ||
97 | VM_BUG_ON(PageTail(page)); | ||
98 | goto out_put_head; | ||
99 | } | ||
100 | /* | ||
101 | * Only run compound_lock on a valid PageHead, | ||
102 | * after having it pinned with | ||
103 | * get_page_unless_zero() above. | ||
104 | */ | ||
105 | smp_mb(); | ||
106 | /* page_head wasn't a dangling pointer */ | ||
107 | flags = compound_lock_irqsave(page_head); | ||
108 | if (unlikely(!PageTail(page))) { | ||
109 | /* __split_huge_page_refcount run before us */ | ||
110 | compound_unlock_irqrestore(page_head, flags); | ||
111 | VM_BUG_ON(PageHead(page_head)); | ||
112 | out_put_head: | ||
113 | if (put_page_testzero(page_head)) | ||
114 | __put_single_page(page_head); | ||
115 | out_put_single: | ||
116 | if (put_page_testzero(page)) | ||
117 | __put_single_page(page); | ||
118 | return; | ||
119 | } | ||
120 | VM_BUG_ON(page_head != page->first_page); | ||
121 | /* | ||
122 | * We can release the refcount taken by | ||
123 | * get_page_unless_zero now that | ||
124 | * split_huge_page_refcount is blocked on the | ||
125 | * compound_lock. | ||
126 | */ | ||
127 | if (put_page_testzero(page_head)) | ||
128 | VM_BUG_ON(1); | ||
129 | /* __split_huge_page_refcount will wait now */ | ||
130 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
131 | atomic_dec(&page->_count); | ||
132 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
133 | compound_unlock_irqrestore(page_head, flags); | ||
134 | if (put_page_testzero(page_head)) { | ||
135 | if (PageHead(page_head)) | ||
136 | __put_compound_page(page_head); | ||
137 | else | ||
138 | __put_single_page(page_head); | ||
139 | } | ||
140 | } else { | ||
141 | /* page_head is a dangling pointer */ | ||
142 | VM_BUG_ON(PageTail(page)); | ||
143 | goto out_put_single; | ||
144 | } | ||
145 | } else if (put_page_testzero(page)) { | ||
146 | if (PageHead(page)) | ||
147 | __put_compound_page(page); | ||
148 | else | ||
149 | __put_single_page(page); | ||
70 | } | 150 | } |
71 | } | 151 | } |
72 | 152 | ||
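put_compound_page() is restructured because a THP tail page can stop being a tail at any moment while __split_huge_page_refcount() runs. The new code therefore pins the head speculatively with get_page_unless_zero(), serializes against the split with the compound lock, and only then decides whether the dropped reference belongs to the head or to what has just become an order-0 page. Condensed ordering, as a summary of the hunk above rather than additional code:

```c
/*
 * Condensed ordering of the tail-page path above (pseudocode):
 *
 *   if (PageTail(page)) {
 *           head = page->first_page;
 *           if (get_page_unless_zero(head)) {         <- speculative pin
 *                   flags = compound_lock_irqsave(head);
 *                   if (PageTail(page))                <- still a tail?
 *                           move the reference from the tail to the head;
 *                   compound_unlock_irqrestore(head, flags);
 *                   drop the speculative pin on the head;
 *           } else {
 *                   the split already ran: treat the page as order-0;
 *           }
 *   }
 */
```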
@@ -75,7 +155,7 @@ void put_page(struct page *page) | |||
75 | if (unlikely(PageCompound(page))) | 155 | if (unlikely(PageCompound(page))) |
76 | put_compound_page(page); | 156 | put_compound_page(page); |
77 | else if (put_page_testzero(page)) | 157 | else if (put_page_testzero(page)) |
78 | __page_cache_release(page); | 158 | __put_single_page(page); |
79 | } | 159 | } |
80 | EXPORT_SYMBOL(put_page); | 160 | EXPORT_SYMBOL(put_page); |
81 | 161 | ||
@@ -399,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec) | |||
399 | 479 | ||
400 | EXPORT_SYMBOL(__pagevec_release); | 480 | EXPORT_SYMBOL(__pagevec_release); |
401 | 481 | ||
482 | /* used by __split_huge_page_refcount() */ | ||
483 | void lru_add_page_tail(struct zone* zone, | ||
484 | struct page *page, struct page *page_tail) | ||
485 | { | ||
486 | int active; | ||
487 | enum lru_list lru; | ||
488 | const int file = 0; | ||
489 | struct list_head *head; | ||
490 | |||
491 | VM_BUG_ON(!PageHead(page)); | ||
492 | VM_BUG_ON(PageCompound(page_tail)); | ||
493 | VM_BUG_ON(PageLRU(page_tail)); | ||
494 | VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); | ||
495 | |||
496 | SetPageLRU(page_tail); | ||
497 | |||
498 | if (page_evictable(page_tail, NULL)) { | ||
499 | if (PageActive(page)) { | ||
500 | SetPageActive(page_tail); | ||
501 | active = 1; | ||
502 | lru = LRU_ACTIVE_ANON; | ||
503 | } else { | ||
504 | active = 0; | ||
505 | lru = LRU_INACTIVE_ANON; | ||
506 | } | ||
507 | update_page_reclaim_stat(zone, page_tail, file, active); | ||
508 | if (likely(PageLRU(page))) | ||
509 | head = page->lru.prev; | ||
510 | else | ||
511 | head = &zone->lru[lru].list; | ||
512 | __add_page_to_lru_list(zone, page_tail, lru, head); | ||
513 | } else { | ||
514 | SetPageUnevictable(page_tail); | ||
515 | add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); | ||
516 | } | ||
517 | } | ||
518 | |||
402 | /* | 519 | /* |
403 | * Add the passed pages to the LRU, then drop the caller's refcount | 520 | * Add the passed pages to the LRU, then drop the caller's refcount |
404 | * on them. Reinitialises the caller's pagevec. | 521 | * on them. Reinitialises the caller's pagevec. |
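lru_add_page_tail() lets __split_huge_page_refcount() slot each freshly created tail page into the LRU directly behind its head, so reclaim scans the pieces in the same order it would have scanned the huge page; the caller must already hold zone->lru_lock, which the spin_is_locked() check enforces. A rough caller sketch (the real caller is __split_huge_page_refcount(); the loop shape here is assumed):

```c
spin_lock_irq(&zone->lru_lock);
for (i = 1; i < HPAGE_PMD_NR; i++)
	lru_add_page_tail(zone, head_page, head_page + i);
spin_unlock_irq(&zone->lru_lock);
```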
diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167..5c8cfabbc9b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page) | |||
157 | if (!entry.val) | 157 | if (!entry.val) |
158 | return 0; | 158 | return 0; |
159 | 159 | ||
160 | if (unlikely(PageTransHuge(page))) | ||
161 | if (unlikely(split_huge_page(page))) { | ||
162 | swapcache_free(entry, NULL); | ||
163 | return 0; | ||
164 | } | ||
165 | |||
160 | /* | 166 | /* |
161 | * Radix-tree node allocations from PF_MEMALLOC contexts could | 167 | * Radix-tree node allocations from PF_MEMALLOC contexts could |
162 | * completely exhaust the page allocator. __GFP_NOMEMALLOC | 168 | * completely exhaust the page allocator. __GFP_NOMEMALLOC |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c7..07a458d72fa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
964 | pmd = pmd_offset(pud, addr); | 964 | pmd = pmd_offset(pud, addr); |
965 | do { | 965 | do { |
966 | next = pmd_addr_end(addr, end); | 966 | next = pmd_addr_end(addr, end); |
967 | if (unlikely(pmd_trans_huge(*pmd))) | ||
968 | continue; | ||
967 | if (pmd_none_or_clear_bad(pmd)) | 969 | if (pmd_none_or_clear_bad(pmd)) |
968 | continue; | 970 | continue; |
969 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 971 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
@@ -1677,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1677 | if (S_ISBLK(inode->i_mode)) { | 1679 | if (S_ISBLK(inode->i_mode)) { |
1678 | struct block_device *bdev = I_BDEV(inode); | 1680 | struct block_device *bdev = I_BDEV(inode); |
1679 | set_blocksize(bdev, p->old_block_size); | 1681 | set_blocksize(bdev, p->old_block_size); |
1680 | bd_release(bdev); | 1682 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1681 | } else { | 1683 | } else { |
1682 | mutex_lock(&inode->i_mutex); | 1684 | mutex_lock(&inode->i_mutex); |
1683 | inode->i_flags &= ~S_SWAPFILE; | 1685 | inode->i_flags &= ~S_SWAPFILE; |
@@ -1939,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1939 | error = -EINVAL; | 1941 | error = -EINVAL; |
1940 | if (S_ISBLK(inode->i_mode)) { | 1942 | if (S_ISBLK(inode->i_mode)) { |
1941 | bdev = I_BDEV(inode); | 1943 | bdev = I_BDEV(inode); |
1942 | error = bd_claim(bdev, sys_swapon); | 1944 | error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
1945 | sys_swapon); | ||
1943 | if (error < 0) { | 1946 | if (error < 0) { |
1944 | bdev = NULL; | 1947 | bdev = NULL; |
1945 | error = -EINVAL; | 1948 | error = -EINVAL; |
@@ -2136,7 +2139,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2136 | bad_swap: | 2139 | bad_swap: |
2137 | if (bdev) { | 2140 | if (bdev) { |
2138 | set_blocksize(bdev, p->old_block_size); | 2141 | set_blocksize(bdev, p->old_block_size); |
2139 | bd_release(bdev); | 2142 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2140 | } | 2143 | } |
2141 | destroy_swap_extents(p); | 2144 | destroy_swap_extents(p); |
2142 | swap_cgroup_swapoff(type); | 2145 | swap_cgroup_swapoff(type); |
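swapon/swapoff move from bd_claim()/bd_release() to blkdev_get()/blkdev_put() with FMODE_EXCL, the consolidated exclusive-open API; the third argument to blkdev_get() is the holder cookie identifying the claimer (sys_swapon here). The idiom, in outline:

```c
/* Exclusive claim of a block device; 'holder' is any stable pointer
 * identifying the owner, matching the usage in the hunks above. */
err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, holder);
if (err < 0)
	return err;
/* ... exclusive use of the device ... */
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
```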
diff --git a/mm/truncate.c b/mm/truncate.c index ba887bff48c..49feb46e77b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
390 | __remove_from_page_cache(page); | 390 | __remove_from_page_cache(page); |
391 | spin_unlock_irq(&mapping->tree_lock); | 391 | spin_unlock_irq(&mapping->tree_lock); |
392 | mem_cgroup_uncharge_cache_page(page); | 392 | mem_cgroup_uncharge_cache_page(page); |
393 | |||
394 | if (mapping->a_ops->freepage) | ||
395 | mapping->a_ops->freepage(page); | ||
396 | |||
393 | page_cache_release(page); /* pagecache ref */ | 397 | page_cache_release(page); /* pagecache ref */ |
394 | return 1; | 398 | return 1; |
395 | failed: | 399 | failed: |
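invalidate_complete_page2() now invokes the new address_space_operations ->freepage() hook once the page has been removed from the page cache, giving the filesystem a last chance to drop per-page private state; since the page is already detached, the callback must not rely on page->mapping. A hedged sketch of a filesystem wiring it up (all myfs_* names are placeholders):

```c
static void myfs_freepage(struct page *page)
{
	/* release fs-private per-page bookkeeping; the page is already
	 * gone from the page cache at this point */
}

static const struct address_space_operations myfs_aops = {
	.freepage	= myfs_freepage,
	/* .readpage, .writepage, ... */
};
```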
@@ -545,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache); | |||
545 | * @inode: inode | 549 | * @inode: inode |
546 | * @newsize: new file size | 550 | * @newsize: new file size |
547 | * | 551 | * |
548 | * truncate_setsize updastes i_size update and performs pagecache | 552 | * truncate_setsize updates i_size and performs pagecache truncation (if |
549 | * truncation (if necessary) for a file size updates. It will be | 553 | * necessary) to @newsize. It will be typically be called from the filesystem's |
550 | * typically be called from the filesystem's setattr function when | 554 | * setattr function when ATTR_SIZE is passed in. |
551 | * ATTR_SIZE is passed in. | ||
552 | * | 555 | * |
553 | * Must be called with inode_mutex held and after all filesystem | 556 | * Must be called with inode_mutex held and before all filesystem specific |
554 | * specific block truncation has been performed. | 557 | * block truncation has been performed. |
555 | */ | 558 | */ |
556 | void truncate_setsize(struct inode *inode, loff_t newsize) | 559 | void truncate_setsize(struct inode *inode, loff_t newsize) |
557 | { | 560 | { |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -186,27 +186,6 @@ void kzfree(const void *p) | |||
186 | } | 186 | } |
187 | EXPORT_SYMBOL(kzfree); | 187 | EXPORT_SYMBOL(kzfree); |
188 | 188 | ||
189 | int kern_ptr_validate(const void *ptr, unsigned long size) | ||
190 | { | ||
191 | unsigned long addr = (unsigned long)ptr; | ||
192 | unsigned long min_addr = PAGE_OFFSET; | ||
193 | unsigned long align_mask = sizeof(void *) - 1; | ||
194 | |||
195 | if (unlikely(addr < min_addr)) | ||
196 | goto out; | ||
197 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
198 | goto out; | ||
199 | if (unlikely(addr & align_mask)) | ||
200 | goto out; | ||
201 | if (unlikely(!kern_addr_valid(addr))) | ||
202 | goto out; | ||
203 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
204 | goto out; | ||
205 | return 1; | ||
206 | out: | ||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | /* | 189 | /* |
211 | * strndup_user - duplicate an existing string from user space | 190 | * strndup_user - duplicate an existing string from user space |
212 | * @s: The string to duplicate | 191 | * @s: The string to duplicate |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3d66b3dc5c..f9b166732e7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -31,8 +31,6 @@ | |||
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | 32 | #include <asm/shmparam.h> |
33 | 33 | ||
34 | bool vmap_lazy_unmap __read_mostly = true; | ||
35 | |||
36 | /*** Page table manipulation functions ***/ | 34 | /*** Page table manipulation functions ***/ |
37 | 35 | ||
38 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 36 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
@@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void) | |||
503 | { | 501 | { |
504 | unsigned int log; | 502 | unsigned int log; |
505 | 503 | ||
506 | if (!vmap_lazy_unmap) | ||
507 | return 0; | ||
508 | |||
509 | log = fls(num_online_cpus()); | 504 | log = fls(num_online_cpus()); |
510 | 505 | ||
511 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | 506 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); |
@@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
566 | if (va->va_end > *end) | 561 | if (va->va_end > *end) |
567 | *end = va->va_end; | 562 | *end = va->va_end; |
568 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | 563 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; |
569 | unmap_vmap_area(va); | ||
570 | list_add_tail(&va->purge_list, &valist); | 564 | list_add_tail(&va->purge_list, &valist); |
571 | va->flags |= VM_LAZY_FREEING; | 565 | va->flags |= VM_LAZY_FREEING; |
572 | va->flags &= ~VM_LAZY_FREE; | 566 | va->flags &= ~VM_LAZY_FREE; |
@@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void) | |||
611 | } | 605 | } |
612 | 606 | ||
613 | /* | 607 | /* |
614 | * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been | 608 | * Free a vmap area, caller ensuring that the area has been unmapped |
615 | * called for the correct range previously. | 609 | * and flush_cache_vunmap had been called for the correct range |
610 | * previously. | ||
616 | */ | 611 | */ |
617 | static void free_unmap_vmap_area_noflush(struct vmap_area *va) | 612 | static void free_vmap_area_noflush(struct vmap_area *va) |
618 | { | 613 | { |
619 | va->flags |= VM_LAZY_FREE; | 614 | va->flags |= VM_LAZY_FREE; |
620 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | 615 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); |
@@ -623,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) | |||
623 | } | 618 | } |
624 | 619 | ||
625 | /* | 620 | /* |
621 | * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been | ||
622 | * called for the correct range previously. | ||
623 | */ | ||
624 | static void free_unmap_vmap_area_noflush(struct vmap_area *va) | ||
625 | { | ||
626 | unmap_vmap_area(va); | ||
627 | free_vmap_area_noflush(va); | ||
628 | } | ||
629 | |||
630 | /* | ||
626 | * Free and unmap a vmap area | 631 | * Free and unmap a vmap area |
627 | */ | 632 | */ |
628 | static void free_unmap_vmap_area(struct vmap_area *va) | 633 | static void free_unmap_vmap_area(struct vmap_area *va) |
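Together with the vb_free() and vm_unmap_aliases() hunks below, this splits the old lazy free into two steps: page tables are now unmapped immediately when an area is released, and only the vmap address-space reuse plus the TLB flush stay deferred and batched. A summary of the assumed division of labour:

```c
/*
 * free_vmap_area_noflush(va)       - defer only VA reuse and the TLB flush
 * free_unmap_vmap_area_noflush(va) - vunmap the page tables now, then defer
 * free_unmap_vmap_area(va)         - as above, after flush_cache_vunmap()
 *
 * so a freed area never keeps live page-table entries behind it.
 */
```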
@@ -743,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
743 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | 748 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, |
744 | VMALLOC_START, VMALLOC_END, | 749 | VMALLOC_START, VMALLOC_END, |
745 | node, gfp_mask); | 750 | node, gfp_mask); |
746 | if (unlikely(IS_ERR(va))) { | 751 | if (IS_ERR(va)) { |
747 | kfree(vb); | 752 | kfree(vb); |
748 | return ERR_CAST(va); | 753 | return ERR_CAST(va); |
749 | } | 754 | } |
@@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb) | |||
798 | spin_unlock(&vmap_block_tree_lock); | 803 | spin_unlock(&vmap_block_tree_lock); |
799 | BUG_ON(tmp != vb); | 804 | BUG_ON(tmp != vb); |
800 | 805 | ||
801 | free_unmap_vmap_area_noflush(vb->va); | 806 | free_vmap_area_noflush(vb->va); |
802 | call_rcu(&vb->rcu_head, rcu_free_vb); | 807 | call_rcu(&vb->rcu_head, rcu_free_vb); |
803 | } | 808 | } |
804 | 809 | ||
@@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size) | |||
936 | rcu_read_unlock(); | 941 | rcu_read_unlock(); |
937 | BUG_ON(!vb); | 942 | BUG_ON(!vb); |
938 | 943 | ||
944 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); | ||
945 | |||
939 | spin_lock(&vb->lock); | 946 | spin_lock(&vb->lock); |
940 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); | 947 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); |
941 | 948 | ||
@@ -988,7 +995,6 @@ void vm_unmap_aliases(void) | |||
988 | 995 | ||
989 | s = vb->va->va_start + (i << PAGE_SHIFT); | 996 | s = vb->va->va_start + (i << PAGE_SHIFT); |
990 | e = vb->va->va_start + (j << PAGE_SHIFT); | 997 | e = vb->va->va_start + (j << PAGE_SHIFT); |
991 | vunmap_page_range(s, e); | ||
992 | flush = 1; | 998 | flush = 1; |
993 | 999 | ||
994 | if (s < start) | 1000 | if (s < start) |
@@ -1169,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) | |||
1169 | { | 1175 | { |
1170 | vunmap_page_range(addr, addr + size); | 1176 | vunmap_page_range(addr, addr + size); |
1171 | } | 1177 | } |
1178 | EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); | ||
1172 | 1179 | ||
1173 | /** | 1180 | /** |
1174 | * unmap_kernel_range - unmap kernel VM area and flush cache and TLB | 1181 | * unmap_kernel_range - unmap kernel VM area and flush cache and TLB |
@@ -1309,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1309 | -1, GFP_KERNEL, caller); | 1316 | -1, GFP_KERNEL, caller); |
1310 | } | 1317 | } |
1311 | 1318 | ||
1312 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | ||
1313 | int node, gfp_t gfp_mask) | ||
1314 | { | ||
1315 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | ||
1316 | node, gfp_mask, __builtin_return_address(0)); | ||
1317 | } | ||
1318 | |||
1319 | static struct vm_struct *find_vm_area(const void *addr) | 1319 | static struct vm_struct *find_vm_area(const void *addr) |
1320 | { | 1320 | { |
1321 | struct vmap_area *va; | 1321 | struct vmap_area *va; |
@@ -1531,25 +1531,12 @@ fail: | |||
1531 | return NULL; | 1531 | return NULL; |
1532 | } | 1532 | } |
1533 | 1533 | ||
1534 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | ||
1535 | { | ||
1536 | void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, | ||
1537 | __builtin_return_address(0)); | ||
1538 | |||
1539 | /* | ||
1540 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
1541 | * structures allocated in the __get_vm_area_node() function contain | ||
1542 | * references to the virtual address of the vmalloc'ed block. | ||
1543 | */ | ||
1544 | kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); | ||
1545 | |||
1546 | return addr; | ||
1547 | } | ||
1548 | |||
1549 | /** | 1534 | /** |
1550 | * __vmalloc_node - allocate virtually contiguous memory | 1535 | * __vmalloc_node_range - allocate virtually contiguous memory |
1551 | * @size: allocation size | 1536 | * @size: allocation size |
1552 | * @align: desired alignment | 1537 | * @align: desired alignment |
1538 | * @start: vm area range start | ||
1539 | * @end: vm area range end | ||
1553 | * @gfp_mask: flags for the page level allocator | 1540 | * @gfp_mask: flags for the page level allocator |
1554 | * @prot: protection mask for the allocated pages | 1541 | * @prot: protection mask for the allocated pages |
1555 | * @node: node to use for allocation or -1 | 1542 | * @node: node to use for allocation or -1 |
@@ -1559,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
1559 | * allocator with @gfp_mask flags. Map them into contiguous | 1546 | * allocator with @gfp_mask flags. Map them into contiguous |
1560 | * kernel virtual space, using a pagetable protection of @prot. | 1547 | * kernel virtual space, using a pagetable protection of @prot. |
1561 | */ | 1548 | */ |
1562 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1549 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1563 | gfp_t gfp_mask, pgprot_t prot, | 1550 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
1564 | int node, void *caller) | 1551 | pgprot_t prot, int node, void *caller) |
1565 | { | 1552 | { |
1566 | struct vm_struct *area; | 1553 | struct vm_struct *area; |
1567 | void *addr; | 1554 | void *addr; |
@@ -1571,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1571 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1558 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1572 | return NULL; | 1559 | return NULL; |
1573 | 1560 | ||
1574 | area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, | 1561 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, |
1575 | VMALLOC_END, node, gfp_mask, caller); | 1562 | gfp_mask, caller); |
1576 | 1563 | ||
1577 | if (!area) | 1564 | if (!area) |
1578 | return NULL; | 1565 | return NULL; |
@@ -1589,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1589 | return addr; | 1576 | return addr; |
1590 | } | 1577 | } |
1591 | 1578 | ||
1579 | /** | ||
1580 | * __vmalloc_node - allocate virtually contiguous memory | ||
1581 | * @size: allocation size | ||
1582 | * @align: desired alignment | ||
1583 | * @gfp_mask: flags for the page level allocator | ||
1584 | * @prot: protection mask for the allocated pages | ||
1585 | * @node: node to use for allocation or -1 | ||
1586 | * @caller: caller's return address | ||
1587 | * | ||
1588 | * Allocate enough pages to cover @size from the page level | ||
1589 | * allocator with @gfp_mask flags. Map them into contiguous | ||
1590 | * kernel virtual space, using a pagetable protection of @prot. | ||
1591 | */ | ||
1592 | static void *__vmalloc_node(unsigned long size, unsigned long align, | ||
1593 | gfp_t gfp_mask, pgprot_t prot, | ||
1594 | int node, void *caller) | ||
1595 | { | ||
1596 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | ||
1597 | gfp_mask, prot, node, caller); | ||
1598 | } | ||
1599 | |||
1592 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1600 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
1593 | { | 1601 | { |
1594 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, | 1602 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, |
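__vmalloc_node() is reduced to a wrapper around the newly exported __vmalloc_node_range(), which additionally takes the start and end of the virtual window to allocate from, so callers that need a dedicated range can reuse the vmalloc machinery instead of duplicating it. An example call under that assumption (MY_AREA_START/MY_AREA_END stand in for a caller-specific range):

```c
void *p = __vmalloc_node_range(size, 1, MY_AREA_START, MY_AREA_END,
			       GFP_KERNEL, PAGE_KERNEL, -1,
			       __builtin_return_address(0));
```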
@@ -2197,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
2197 | * @sizes: array containing size of each area | 2205 | * @sizes: array containing size of each area |
2198 | * @nr_vms: the number of areas to allocate | 2206 | * @nr_vms: the number of areas to allocate |
2199 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this | 2207 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this |
2200 | * @gfp_mask: allocation mask | ||
2201 | * | 2208 | * |
2202 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated | 2209 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated |
2203 | * vm_structs on success, %NULL on failure | 2210 | * vm_structs on success, %NULL on failure |
2204 | * | 2211 | * |
2205 | * Percpu allocator wants to use congruent vm areas so that it can | 2212 | * Percpu allocator wants to use congruent vm areas so that it can |
2206 | * maintain the offsets among percpu areas. This function allocates | 2213 | * maintain the offsets among percpu areas. This function allocates |
2207 | * congruent vmalloc areas for it. These areas tend to be scattered | 2214 | * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to |
2208 | * pretty far, distance between two areas easily going up to | 2215 | * be scattered pretty far, distance between two areas easily going up |
2209 | * gigabytes. To avoid interacting with regular vmallocs, these areas | 2216 | * to gigabytes. To avoid interacting with regular vmallocs, these |
2210 | * are allocated from top. | 2217 | * areas are allocated from top. |
2211 | * | 2218 | * |
2212 | * Despite its complicated look, this allocator is rather simple. It | 2219 | * Despite its complicated look, this allocator is rather simple. It |
2213 | * does everything top-down and scans areas from the end looking for | 2220 | * does everything top-down and scans areas from the end looking for |
@@ -2218,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
2218 | */ | 2225 | */ |
2219 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | 2226 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, |
2220 | const size_t *sizes, int nr_vms, | 2227 | const size_t *sizes, int nr_vms, |
2221 | size_t align, gfp_t gfp_mask) | 2228 | size_t align) |
2222 | { | 2229 | { |
2223 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); | 2230 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); |
2224 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); | 2231 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); |
@@ -2228,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2228 | unsigned long base, start, end, last_end; | 2235 | unsigned long base, start, end, last_end; |
2229 | bool purged = false; | 2236 | bool purged = false; |
2230 | 2237 | ||
2231 | gfp_mask &= GFP_RECLAIM_MASK; | ||
2232 | |||
2233 | /* verify parameters and allocate data structures */ | 2238 | /* verify parameters and allocate data structures */ |
2234 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); | 2239 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); |
2235 | for (last_area = 0, area = 0; area < nr_vms; area++) { | 2240 | for (last_area = 0, area = 0; area < nr_vms; area++) { |
@@ -2262,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2262 | return NULL; | 2267 | return NULL; |
2263 | } | 2268 | } |
2264 | 2269 | ||
2265 | vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); | 2270 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); |
2266 | vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); | 2271 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); |
2267 | if (!vas || !vms) | 2272 | if (!vas || !vms) |
2268 | goto err_free; | 2273 | goto err_free; |
2269 | 2274 | ||
2270 | for (area = 0; area < nr_vms; area++) { | 2275 | for (area = 0; area < nr_vms; area++) { |
2271 | vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); | 2276 | vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); |
2272 | vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); | 2277 | vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); |
2273 | if (!vas[area] || !vms[area]) | 2278 | if (!vas[area] || !vms[area]) |
2274 | goto err_free; | 2279 | goto err_free; |
2275 | } | 2280 | } |
@@ -2450,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p) | |||
2450 | seq_printf(m, "0x%p-0x%p %7ld", | 2455 | seq_printf(m, "0x%p-0x%p %7ld", |
2451 | v->addr, v->addr + v->size, v->size); | 2456 | v->addr, v->addr + v->size, v->size); |
2452 | 2457 | ||
2453 | if (v->caller) { | 2458 | if (v->caller) |
2454 | char buff[KSYM_SYMBOL_LEN]; | 2459 | seq_printf(m, " %pS", v->caller); |
2455 | |||
2456 | seq_putc(m, ' '); | ||
2457 | sprint_symbol(buff, (unsigned long)v->caller); | ||
2458 | seq_puts(m, buff); | ||
2459 | } | ||
2460 | 2460 | ||
2461 | if (v->nr_pages) | 2461 | if (v->nr_pages) |
2462 | seq_printf(m, " pages=%d", v->nr_pages); | 2462 | seq_printf(m, " pages=%d", v->nr_pages); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d31d7ce52c0..17497d0cd8b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
35 | #include <linux/compaction.h> | ||
35 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
36 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
@@ -51,11 +52,23 @@ | |||
51 | #define CREATE_TRACE_POINTS | 52 | #define CREATE_TRACE_POINTS |
52 | #include <trace/events/vmscan.h> | 53 | #include <trace/events/vmscan.h> |
53 | 54 | ||
54 | enum lumpy_mode { | 55 | /* |
55 | LUMPY_MODE_NONE, | 56 | * reclaim_mode determines how the inactive list is shrunk |
56 | LUMPY_MODE_ASYNC, | 57 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages |
57 | LUMPY_MODE_SYNC, | 58 | * RECLAIM_MODE_ASYNC: Do not block |
58 | }; | 59 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback |
60 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
61 | * page from the LRU and reclaim all pages within a | ||
62 | * naturally aligned range | ||
63 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
64 | * order-0 pages and then compact the zone | ||
65 | */ | ||
66 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
67 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
68 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
69 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
70 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
71 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
59 | 72 | ||
60 | struct scan_control { | 73 | struct scan_control { |
61 | /* Incremented by the number of inactive pages that were scanned */ | 74 | /* Incremented by the number of inactive pages that were scanned */ |
@@ -88,7 +101,7 @@ struct scan_control { | |||
88 | * Intend to reclaim enough continuous memory rather than reclaim | 101 | * Intend to reclaim enough continuous memory rather than reclaim |
89 | * enough amount of memory. i.e., mode for high order allocation. | 102 | * enough amount of memory. i.e., mode for high order allocation. |
90 | */ | 103 | */ |
91 | enum lumpy_mode lumpy_reclaim_mode; | 104 | reclaim_mode_t reclaim_mode; |
92 | 105 | ||
93 | /* Which cgroup do we reclaim from */ | 106 | /* Which cgroup do we reclaim from */ |
94 | struct mem_cgroup *mem_cgroup; | 107 | struct mem_cgroup *mem_cgroup; |
@@ -271,34 +284,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
271 | return ret; | 284 | return ret; |
272 | } | 285 | } |
273 | 286 | ||
274 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, | 287 | static void set_reclaim_mode(int priority, struct scan_control *sc, |
275 | bool sync) | 288 | bool sync) |
276 | { | 289 | { |
277 | enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; | 290 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; |
278 | 291 | ||
279 | /* | 292 | /* |
280 | * Some reclaim have alredy been failed. No worth to try synchronous | 293 | * Initially assume we are entering either lumpy reclaim or |
281 | * lumpy reclaim. | 294 | * reclaim/compaction. Depending on the order, we will either set the |
295 | * sync mode or just reclaim order-0 pages later. | ||
282 | */ | 296 | */ |
283 | if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 297 | if (COMPACTION_BUILD) |
284 | return; | 298 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; |
299 | else | ||
300 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
285 | 301 | ||
286 | /* | 302 | /* |
287 | * If we need a large contiguous chunk of memory, or have | 303 | * Avoid using lumpy reclaim or reclaim/compaction if possible by |
288 | * trouble getting a small set of contiguous pages, we | 304 | * restricting when it is set to either costly allocations or when |
289 | * will reclaim both active and inactive pages. | 305 | * under memory pressure |
290 | */ | 306 | */ |
291 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 307 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
292 | sc->lumpy_reclaim_mode = mode; | 308 | sc->reclaim_mode |= syncmode; |
293 | else if (sc->order && priority < DEF_PRIORITY - 2) | 309 | else if (sc->order && priority < DEF_PRIORITY - 2) |
294 | sc->lumpy_reclaim_mode = mode; | 310 | sc->reclaim_mode |= syncmode; |
295 | else | 311 | else |
296 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 312 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
297 | } | 313 | } |
298 | 314 | ||
299 | static void disable_lumpy_reclaim_mode(struct scan_control *sc) | 315 | static void reset_reclaim_mode(struct scan_control *sc) |
300 | { | 316 | { |
301 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 317 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
302 | } | 318 | } |
303 | 319 | ||
304 | static inline int is_page_cache_freeable(struct page *page) | 320 | static inline int is_page_cache_freeable(struct page *page) |
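The reclaim_mode_t flags introduced above form a bitmask rather than a three-state enum, so the sync/async bit can be combined with either lumpy reclaim or reclaim/compaction. The standalone sketch below mirrors the decision made in set_reclaim_mode(); the flag values and the PAGE_ALLOC_COSTLY_ORDER/DEF_PRIORITY constants match the kernel, but struct scan_control, COMPACTION_BUILD and the sparse __bitwise__ annotations are replaced by plain parameters for illustration.

```c
#include <stdio.h>

#define RECLAIM_MODE_SINGLE        0x01u
#define RECLAIM_MODE_ASYNC         0x02u
#define RECLAIM_MODE_SYNC          0x04u
#define RECLAIM_MODE_LUMPYRECLAIM  0x08u
#define RECLAIM_MODE_COMPACTION    0x10u

#define PAGE_ALLOC_COSTLY_ORDER 3
#define DEF_PRIORITY 12

/* Mirrors set_reclaim_mode(): pick lumpy or compaction as the base mode,
 * OR in sync/async for costly or hard-pressed allocations, otherwise fall
 * back to plain asynchronous order-0 reclaim. */
static unsigned pick_mode(int order, int priority, int sync, int compaction_build)
{
	unsigned syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
	unsigned mode = compaction_build ? RECLAIM_MODE_COMPACTION
					 : RECLAIM_MODE_LUMPYRECLAIM;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return mode | syncmode;
	if (order && priority < DEF_PRIORITY - 2)
		return mode | syncmode;
	return RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

int main(void)
{
	unsigned m = pick_mode(9, DEF_PRIORITY, 0, 1);

	printf("order-9, first pass: compaction=%d lumpy=%d sync=%d\n",
	       !!(m & RECLAIM_MODE_COMPACTION),
	       !!(m & RECLAIM_MODE_LUMPYRECLAIM),
	       !!(m & RECLAIM_MODE_SYNC));
	return 0;
}
```

An order-9 (THP-sized) request on a COMPACTION_BUILD kernel therefore starts in asynchronous reclaim/compaction mode; only should_reclaim_stall() later upgrades it to RECLAIM_MODE_SYNC via the second set_reclaim_mode() call.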
@@ -429,7 +445,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
429 | * first attempt to free a range of pages fails. | 445 | * first attempt to free a range of pages fails. |
430 | */ | 446 | */ |
431 | if (PageWriteback(page) && | 447 | if (PageWriteback(page) && |
432 | sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) | 448 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) |
433 | wait_on_page_writeback(page); | 449 | wait_on_page_writeback(page); |
434 | 450 | ||
435 | if (!PageWriteback(page)) { | 451 | if (!PageWriteback(page)) { |
@@ -437,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
437 | ClearPageReclaim(page); | 453 | ClearPageReclaim(page); |
438 | } | 454 | } |
439 | trace_mm_vmscan_writepage(page, | 455 | trace_mm_vmscan_writepage(page, |
440 | trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); | 456 | trace_reclaim_flags(page, sc->reclaim_mode)); |
441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 457 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
442 | return PAGE_SUCCESS; | 458 | return PAGE_SUCCESS; |
443 | } | 459 | } |
@@ -494,9 +510,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
494 | spin_unlock_irq(&mapping->tree_lock); | 510 | spin_unlock_irq(&mapping->tree_lock); |
495 | swapcache_free(swap, page); | 511 | swapcache_free(swap, page); |
496 | } else { | 512 | } else { |
513 | void (*freepage)(struct page *); | ||
514 | |||
515 | freepage = mapping->a_ops->freepage; | ||
516 | |||
497 | __remove_from_page_cache(page); | 517 | __remove_from_page_cache(page); |
498 | spin_unlock_irq(&mapping->tree_lock); | 518 | spin_unlock_irq(&mapping->tree_lock); |
499 | mem_cgroup_uncharge_cache_page(page); | 519 | mem_cgroup_uncharge_cache_page(page); |
520 | |||
521 | if (freepage != NULL) | ||
522 | freepage(page); | ||
500 | } | 523 | } |
501 | 524 | ||
502 | return 1; | 525 | return 1; |
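The new freepage hook is sampled before the page is detached from the mapping and invoked only after the tree_lock has been dropped, so the callback never runs under the mapping lock. A userspace model of that ordering, with the lock reduced to comments and all names purely illustrative:

```c
#include <stdio.h>
#include <stddef.h>

struct address_space_ops { void (*freepage)(void *page); };
struct address_space     { const struct address_space_ops *a_ops; };

static void demo_freepage(void *page)
{
	printf("freepage callback for %p, no locks held\n", page);
}

/* Models __remove_mapping(): snapshot the callback under the "lock",
 * detach the page, drop the "lock", then call the snapshot. */
static void remove_mapping_model(struct address_space *mapping, void *page)
{
	void (*freepage)(void *);

	/* spin_lock_irq(&mapping->tree_lock); */
	freepage = mapping->a_ops->freepage;
	/* ... __remove_from_page_cache(page) ... */
	/* spin_unlock_irq(&mapping->tree_lock); */

	if (freepage != NULL)
		freepage(page);
}

int main(void)
{
	static const struct address_space_ops ops = { .freepage = demo_freepage };
	struct address_space mapping = { .a_ops = &ops };
	int page_stub;

	remove_mapping_model(&mapping, &page_stub);
	return 0;
}
```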
@@ -615,7 +638,7 @@ static enum page_references page_check_references(struct page *page, | |||
615 | referenced_page = TestClearPageReferenced(page); | 638 | referenced_page = TestClearPageReferenced(page); |
616 | 639 | ||
617 | /* Lumpy reclaim - ignore references */ | 640 | /* Lumpy reclaim - ignore references */ |
618 | if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) | 641 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
619 | return PAGEREF_RECLAIM; | 642 | return PAGEREF_RECLAIM; |
620 | 643 | ||
621 | /* | 644 | /* |
@@ -732,7 +755,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
732 | * for any page for which writeback has already | 755 | * for any page for which writeback has already |
733 | * started. | 756 | * started. |
734 | */ | 757 | */ |
735 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && | 758 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
736 | may_enter_fs) | 759 | may_enter_fs) |
737 | wait_on_page_writeback(page); | 760 | wait_on_page_writeback(page); |
738 | else { | 761 | else { |
@@ -888,7 +911,7 @@ cull_mlocked: | |||
888 | try_to_free_swap(page); | 911 | try_to_free_swap(page); |
889 | unlock_page(page); | 912 | unlock_page(page); |
890 | putback_lru_page(page); | 913 | putback_lru_page(page); |
891 | disable_lumpy_reclaim_mode(sc); | 914 | reset_reclaim_mode(sc); |
892 | continue; | 915 | continue; |
893 | 916 | ||
894 | activate_locked: | 917 | activate_locked: |
@@ -901,7 +924,7 @@ activate_locked: | |||
901 | keep_locked: | 924 | keep_locked: |
902 | unlock_page(page); | 925 | unlock_page(page); |
903 | keep: | 926 | keep: |
904 | disable_lumpy_reclaim_mode(sc); | 927 | reset_reclaim_mode(sc); |
905 | keep_lumpy: | 928 | keep_lumpy: |
906 | list_add(&page->lru, &ret_pages); | 929 | list_add(&page->lru, &ret_pages); |
907 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 930 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
@@ -1021,7 +1044,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1021 | case 0: | 1044 | case 0: |
1022 | list_move(&page->lru, dst); | 1045 | list_move(&page->lru, dst); |
1023 | mem_cgroup_del_lru(page); | 1046 | mem_cgroup_del_lru(page); |
1024 | nr_taken++; | 1047 | nr_taken += hpage_nr_pages(page); |
1025 | break; | 1048 | break; |
1026 | 1049 | ||
1027 | case -EBUSY: | 1050 | case -EBUSY: |
@@ -1079,7 +1102,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1079 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1102 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1080 | list_move(&cursor_page->lru, dst); | 1103 | list_move(&cursor_page->lru, dst); |
1081 | mem_cgroup_del_lru(cursor_page); | 1104 | mem_cgroup_del_lru(cursor_page); |
1082 | nr_taken++; | 1105 | nr_taken += hpage_nr_pages(page); |
1083 | nr_lumpy_taken++; | 1106 | nr_lumpy_taken++; |
1084 | if (PageDirty(cursor_page)) | 1107 | if (PageDirty(cursor_page)) |
1085 | nr_lumpy_dirty++; | 1108 | nr_lumpy_dirty++; |
@@ -1134,14 +1157,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
1134 | struct page *page; | 1157 | struct page *page; |
1135 | 1158 | ||
1136 | list_for_each_entry(page, page_list, lru) { | 1159 | list_for_each_entry(page, page_list, lru) { |
1160 | int numpages = hpage_nr_pages(page); | ||
1137 | lru = page_lru_base_type(page); | 1161 | lru = page_lru_base_type(page); |
1138 | if (PageActive(page)) { | 1162 | if (PageActive(page)) { |
1139 | lru += LRU_ACTIVE; | 1163 | lru += LRU_ACTIVE; |
1140 | ClearPageActive(page); | 1164 | ClearPageActive(page); |
1141 | nr_active++; | 1165 | nr_active += numpages; |
1142 | } | 1166 | } |
1143 | if (count) | 1167 | if (count) |
1144 | count[lru]++; | 1168 | count[lru] += numpages; |
1145 | } | 1169 | } |
1146 | 1170 | ||
1147 | return nr_active; | 1171 | return nr_active; |
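Several of the surrounding hunks change LRU accounting from a plain increment to += hpage_nr_pages(page), because a transparent huge page sitting on the LRU represents HPAGE_PMD_NR base pages (512 with 4K pages and 2M THPs on x86). A toy model of the difference; the struct and helper below are illustrative stand-ins, not the kernel's definitions:

```c
#include <stdio.h>

#define HPAGE_PMD_NR 512	/* base pages per THP with 4K pages / 2M PMDs */

struct page_model { int is_thp_head; };

static int hpage_nr_pages_model(const struct page_model *page)
{
	return page->is_thp_head ? HPAGE_PMD_NR : 1;
}

int main(void)
{
	struct page_model base = { 0 }, thp = { 1 };
	unsigned long nr_taken = 0;

	/* Counting both as "1" would under-report the THP by 511 pages. */
	nr_taken += hpage_nr_pages_model(&base);
	nr_taken += hpage_nr_pages_model(&thp);
	printf("nr_taken = %lu (1 base page + 1 THP)\n", nr_taken);
	return 0;
}
```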
@@ -1251,7 +1275,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1251 | add_page_to_lru_list(zone, page, lru); | 1275 | add_page_to_lru_list(zone, page, lru); |
1252 | if (is_active_lru(lru)) { | 1276 | if (is_active_lru(lru)) { |
1253 | int file = is_file_lru(lru); | 1277 | int file = is_file_lru(lru); |
1254 | reclaim_stat->recent_rotated[file]++; | 1278 | int numpages = hpage_nr_pages(page); |
1279 | reclaim_stat->recent_rotated[file] += numpages; | ||
1255 | } | 1280 | } |
1256 | if (!pagevec_add(&pvec, page)) { | 1281 | if (!pagevec_add(&pvec, page)) { |
1257 | spin_unlock_irq(&zone->lru_lock); | 1282 | spin_unlock_irq(&zone->lru_lock); |
@@ -1317,7 +1342,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1317 | return false; | 1342 | return false; |
1318 | 1343 | ||
1319 | /* Only stall on lumpy reclaim */ | 1344 | /* Only stall on lumpy reclaim */ |
1320 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 1345 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1321 | return false; | 1346 | return false; |
1322 | 1347 | ||
1323 | /* If we have reclaimed everything on the isolated list, no stall */ | 1348 | /* If we have reclaimed everything on the isolated list, no stall */ |
@@ -1361,15 +1386,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1361 | return SWAP_CLUSTER_MAX; | 1386 | return SWAP_CLUSTER_MAX; |
1362 | } | 1387 | } |
1363 | 1388 | ||
1364 | set_lumpy_reclaim_mode(priority, sc, false); | 1389 | set_reclaim_mode(priority, sc, false); |
1365 | lru_add_drain(); | 1390 | lru_add_drain(); |
1366 | spin_lock_irq(&zone->lru_lock); | 1391 | spin_lock_irq(&zone->lru_lock); |
1367 | 1392 | ||
1368 | if (scanning_global_lru(sc)) { | 1393 | if (scanning_global_lru(sc)) { |
1369 | nr_taken = isolate_pages_global(nr_to_scan, | 1394 | nr_taken = isolate_pages_global(nr_to_scan, |
1370 | &page_list, &nr_scanned, sc->order, | 1395 | &page_list, &nr_scanned, sc->order, |
1371 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1396 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1372 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1397 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1373 | zone, 0, file); | 1398 | zone, 0, file); |
1374 | zone->pages_scanned += nr_scanned; | 1399 | zone->pages_scanned += nr_scanned; |
1375 | if (current_is_kswapd()) | 1400 | if (current_is_kswapd()) |
@@ -1381,8 +1406,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1381 | } else { | 1406 | } else { |
1382 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1407 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, |
1383 | &page_list, &nr_scanned, sc->order, | 1408 | &page_list, &nr_scanned, sc->order, |
1384 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1409 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1385 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1410 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1386 | zone, sc->mem_cgroup, | 1411 | zone, sc->mem_cgroup, |
1387 | 0, file); | 1412 | 0, file); |
1388 | /* | 1413 | /* |
@@ -1404,7 +1429,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1404 | 1429 | ||
1405 | /* Check if we should synchronously wait for writeback */ | 1430 | /* Check if we should synchronously wait for writeback */ |
1406 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1431 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1407 | set_lumpy_reclaim_mode(priority, sc, true); | 1432 | set_reclaim_mode(priority, sc, true); |
1408 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1433 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
1409 | } | 1434 | } |
1410 | 1435 | ||
@@ -1419,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1419 | zone_idx(zone), | 1444 | zone_idx(zone), |
1420 | nr_scanned, nr_reclaimed, | 1445 | nr_scanned, nr_reclaimed, |
1421 | priority, | 1446 | priority, |
1422 | trace_shrink_flags(file, sc->lumpy_reclaim_mode)); | 1447 | trace_shrink_flags(file, sc->reclaim_mode)); |
1423 | return nr_reclaimed; | 1448 | return nr_reclaimed; |
1424 | } | 1449 | } |
1425 | 1450 | ||
@@ -1459,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1459 | 1484 | ||
1460 | list_move(&page->lru, &zone->lru[lru].list); | 1485 | list_move(&page->lru, &zone->lru[lru].list); |
1461 | mem_cgroup_add_lru_list(page, lru); | 1486 | mem_cgroup_add_lru_list(page, lru); |
1462 | pgmoved++; | 1487 | pgmoved += hpage_nr_pages(page); |
1463 | 1488 | ||
1464 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1489 | if (!pagevec_add(&pvec, page) || list_empty(list)) { |
1465 | spin_unlock_irq(&zone->lru_lock); | 1490 | spin_unlock_irq(&zone->lru_lock); |
@@ -1527,7 +1552,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1527 | } | 1552 | } |
1528 | 1553 | ||
1529 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1554 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1530 | nr_rotated++; | 1555 | nr_rotated += hpage_nr_pages(page); |
1531 | /* | 1556 | /* |
1532 | * Identify referenced, file-backed active pages and | 1557 | * Identify referenced, file-backed active pages and |
1533 | * give them one more trip around the active list. So | 1558 | * give them one more trip around the active list. So |
@@ -1798,6 +1823,57 @@ out: | |||
1798 | } | 1823 | } |
1799 | 1824 | ||
1800 | /* | 1825 | /* |
1826 | * Reclaim/compaction depends on a number of pages being freed. To avoid | ||
1827 | * disruption to the system, a small number of order-0 pages continue to be | ||
1828 | * rotated and reclaimed in the normal fashion. However, by the time we get | ||
1829 | * back to the allocator and call try_to_compact_zone(), we ensure that | ||
1830 | * there are enough free pages for it to be likely successful | ||
1831 | */ | ||
1832 | static inline bool should_continue_reclaim(struct zone *zone, | ||
1833 | unsigned long nr_reclaimed, | ||
1834 | unsigned long nr_scanned, | ||
1835 | struct scan_control *sc) | ||
1836 | { | ||
1837 | unsigned long pages_for_compaction; | ||
1838 | unsigned long inactive_lru_pages; | ||
1839 | |||
1840 | /* If not in reclaim/compaction mode, stop */ | ||
1841 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | ||
1842 | return false; | ||
1843 | |||
1844 | /* | ||
1845 | * If we failed to reclaim and have scanned the full list, stop. | ||
1846 | * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far | ||
1847 | * faster but obviously would be less likely to succeed | ||
1848 | * allocation. If this is desirable, use GFP_REPEAT to decide | ||
1849 | * if both reclaimed and scanned should be checked or just | ||
1850 | * reclaimed | ||
1851 | */ | ||
1852 | if (!nr_reclaimed && !nr_scanned) | ||
1853 | return false; | ||
1854 | |||
1855 | /* | ||
1856 | * If we have not reclaimed enough pages for compaction and the | ||
1857 | * inactive lists are large enough, continue reclaiming | ||
1858 | */ | ||
1859 | pages_for_compaction = (2UL << sc->order); | ||
1860 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | ||
1861 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1862 | if (sc->nr_reclaimed < pages_for_compaction && | ||
1863 | inactive_lru_pages > pages_for_compaction) | ||
1864 | return true; | ||
1865 | |||
1866 | /* If compaction would go ahead or the allocation would succeed, stop */ | ||
1867 | switch (compaction_suitable(zone, sc->order)) { | ||
1868 | case COMPACT_PARTIAL: | ||
1869 | case COMPACT_CONTINUE: | ||
1870 | return false; | ||
1871 | default: | ||
1872 | return true; | ||
1873 | } | ||
1874 | } | ||
1875 | |||
1876 | /* | ||
1801 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1877 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1802 | */ | 1878 | */ |
1803 | static void shrink_zone(int priority, struct zone *zone, | 1879 | static void shrink_zone(int priority, struct zone *zone, |
@@ -1806,9 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1806 | unsigned long nr[NR_LRU_LISTS]; | 1882 | unsigned long nr[NR_LRU_LISTS]; |
1807 | unsigned long nr_to_scan; | 1883 | unsigned long nr_to_scan; |
1808 | enum lru_list l; | 1884 | enum lru_list l; |
1809 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1885 | unsigned long nr_reclaimed, nr_scanned; |
1810 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1886 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1811 | 1887 | ||
1888 | restart: | ||
1889 | nr_reclaimed = 0; | ||
1890 | nr_scanned = sc->nr_scanned; | ||
1812 | get_scan_count(zone, sc, nr, priority); | 1891 | get_scan_count(zone, sc, nr, priority); |
1813 | 1892 | ||
1814 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1893 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
@@ -1834,8 +1913,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1834 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1913 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1835 | break; | 1914 | break; |
1836 | } | 1915 | } |
1837 | 1916 | sc->nr_reclaimed += nr_reclaimed; | |
1838 | sc->nr_reclaimed = nr_reclaimed; | ||
1839 | 1917 | ||
1840 | /* | 1918 | /* |
1841 | * Even if we did not try to evict anon pages at all, we want to | 1919 | * Even if we did not try to evict anon pages at all, we want to |
@@ -1844,6 +1922,11 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1844 | if (inactive_anon_is_low(zone, sc)) | 1922 | if (inactive_anon_is_low(zone, sc)) |
1845 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1923 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1846 | 1924 | ||
1925 | /* reclaim/compaction might need reclaim to continue */ | ||
1926 | if (should_continue_reclaim(zone, nr_reclaimed, | ||
1927 | sc->nr_scanned - nr_scanned, sc)) | ||
1928 | goto restart; | ||
1929 | |||
1847 | throttle_vm_writeout(sc->gfp_mask); | 1930 | throttle_vm_writeout(sc->gfp_mask); |
1848 | } | 1931 | } |
1849 | 1932 | ||
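should_continue_reclaim() is what lets shrink_zone() loop back to its restart label: reclaim keeps going until roughly pages_for_compaction = 2UL << order order-0 pages have been reclaimed, the inactive lists run short, or compaction_suitable() reports the zone is ready. The arithmetic for typical orders, assuming 4K base pages:

```c
#include <stdio.h>

int main(void)
{
	int orders[] = { 3, 9 };	/* PAGE_ALLOC_COSTLY_ORDER, THP order on x86 */
	unsigned i;

	for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		unsigned long pages = 2UL << orders[i];

		printf("order %d: continue until ~%lu pages (%lu KB) reclaimed\n",
		       orders[i], pages, pages * 4);
	}
	return 0;
}
```

For an order-9 request that target is 1024 order-0 pages (4MB), a small multiple of the huge page itself, which is the "small number of order-0 pages" the comment above refers to.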
@@ -2000,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2000 | struct zone *preferred_zone; | 2083 | struct zone *preferred_zone; |
2001 | 2084 | ||
2002 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | 2085 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), |
2003 | NULL, &preferred_zone); | 2086 | &cpuset_current_mems_allowed, |
2087 | &preferred_zone); | ||
2004 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | 2088 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); |
2005 | } | 2089 | } |
2006 | } | 2090 | } |
@@ -2117,38 +2201,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2117 | } | 2201 | } |
2118 | #endif | 2202 | #endif |
2119 | 2203 | ||
2204 | /* | ||
2205 | * pgdat_balanced is used when checking if a node is balanced for high-order | ||
2206 | * allocations. Only zones that meet watermarks and are in a zone allowed | ||
2207 | * by the caller's classzone_idx are added to balanced_pages. The total of | ||
2208 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | ||
2209 | * for the node to be considered balanced. Forcing all zones to be balanced | ||
2210 | * for high orders can cause excessive reclaim when there are imbalanced zones. | ||
2211 | * The choice of 25% is due to | ||
2212 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
2213 | * reasonable sized machine | ||
2214 | * o On all other machines, the top zone must be at least a reasonable | ||
2215 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
2216 | * would need to be at least 256M for it to balance a whole node. | ||
2217 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
2218 | * to balance a node on its own. These seemed like reasonable ratios. | ||
2219 | */ | ||
2220 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | ||
2221 | int classzone_idx) | ||
2222 | { | ||
2223 | unsigned long present_pages = 0; | ||
2224 | int i; | ||
2225 | |||
2226 | for (i = 0; i <= classzone_idx; i++) | ||
2227 | present_pages += pgdat->node_zones[i].present_pages; | ||
2228 | |||
2229 | return balanced_pages > (present_pages >> 2); | ||
2230 | } | ||
2231 | |||
2120 | /* is kswapd sleeping prematurely? */ | 2232 | /* is kswapd sleeping prematurely? */ |
2121 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | 2233 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
2234 | int classzone_idx) | ||
2122 | { | 2235 | { |
2123 | int i; | 2236 | int i; |
2237 | unsigned long balanced = 0; | ||
2238 | bool all_zones_ok = true; | ||
2124 | 2239 | ||
2125 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2240 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2126 | if (remaining) | 2241 | if (remaining) |
2127 | return 1; | 2242 | return true; |
2128 | 2243 | ||
2129 | /* If after HZ/10, a zone is below the high mark, it's premature */ | 2244 | /* Check the watermark levels */ |
2130 | for (i = 0; i < pgdat->nr_zones; i++) { | 2245 | for (i = 0; i < pgdat->nr_zones; i++) { |
2131 | struct zone *zone = pgdat->node_zones + i; | 2246 | struct zone *zone = pgdat->node_zones + i; |
2132 | 2247 | ||
2133 | if (!populated_zone(zone)) | 2248 | if (!populated_zone(zone)) |
2134 | continue; | 2249 | continue; |
2135 | 2250 | ||
2136 | if (zone->all_unreclaimable) | 2251 | /* |
2252 | * balance_pgdat() skips over all_unreclaimable after | ||
2253 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2254 | * they must be considered balanced here as well if kswapd | ||
2255 | * is to sleep | ||
2256 | */ | ||
2257 | if (zone->all_unreclaimable) { | ||
2258 | balanced += zone->present_pages; | ||
2137 | continue; | 2259 | continue; |
2260 | } | ||
2138 | 2261 | ||
2139 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2262 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2140 | 0, 0)) | 2263 | classzone_idx, 0)) |
2141 | return 1; | 2264 | all_zones_ok = false; |
2265 | else | ||
2266 | balanced += zone->present_pages; | ||
2142 | } | 2267 | } |
2143 | 2268 | ||
2144 | return 0; | 2269 | /* |
2270 | * For high-order requests, the balanced zones must contain at least | ||
2271 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones | ||
2272 | * must be balanced | ||
2273 | */ | ||
2274 | if (order) | ||
2275 | return pgdat_balanced(pgdat, balanced, classzone_idx); | ||
2276 | else | ||
2277 | return !all_zones_ok; | ||
2145 | } | 2278 | } |
2146 | 2279 | ||
2147 | /* | 2280 | /* |
2148 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2281 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2149 | * they are all at high_wmark_pages(zone). | 2282 | * they are all at high_wmark_pages(zone). |
2150 | * | 2283 | * |
2151 | * Returns the number of pages which were actually freed. | 2284 | * Returns the final order kswapd was reclaiming at |
2152 | * | 2285 | * |
2153 | * There is special handling here for zones which are full of pinned pages. | 2286 | * There is special handling here for zones which are full of pinned pages. |
2154 | * This can happen if the pages are all mlocked, or if they are all used by | 2287 | * This can happen if the pages are all mlocked, or if they are all used by |
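pgdat_balanced() implements the 25% rule described in its comment block: sum the present pages of every zone up to classzone_idx and require the balanced zones to exceed a quarter of that total. A standalone sketch with made-up zone sizes, not taken from any real machine:

```c
#include <stdio.h>
#include <stdbool.h>

static bool pgdat_balanced_model(const unsigned long *zone_pages, int nr_zones,
				 unsigned long balanced_pages, int classzone_idx)
{
	unsigned long present_pages = 0;
	int i;

	for (i = 0; i <= classzone_idx && i < nr_zones; i++)
		present_pages += zone_pages[i];

	return balanced_pages > (present_pages >> 2);
}

int main(void)
{
	/* Illustrative zone sizes in pages: DMA, DMA32, Normal */
	unsigned long zones[] = { 4000, 250000, 750000 };

	/* Only the Normal zone meets its high watermark, yet it alone is
	 * well over 25% of the node, so a high-order kswapd pass may stop. */
	printf("node balanced for high-order: %s\n",
	       pgdat_balanced_model(zones, 3, zones[2], 2) ? "yes" : "no");
	return 0;
}
```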
@@ -2165,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2165 | * interoperates with the page allocator fallback scheme to ensure that aging | 2298 | * interoperates with the page allocator fallback scheme to ensure that aging |
2166 | * of pages is balanced across the zones. | 2299 | * of pages is balanced across the zones. |
2167 | */ | 2300 | */ |
2168 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 2301 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2302 | int *classzone_idx) | ||
2169 | { | 2303 | { |
2170 | int all_zones_ok; | 2304 | int all_zones_ok; |
2305 | unsigned long balanced; | ||
2171 | int priority; | 2306 | int priority; |
2172 | int i; | 2307 | int i; |
2308 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2173 | unsigned long total_scanned; | 2309 | unsigned long total_scanned; |
2174 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2310 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2175 | struct scan_control sc = { | 2311 | struct scan_control sc = { |
@@ -2192,7 +2328,6 @@ loop_again: | |||
2192 | count_vm_event(PAGEOUTRUN); | 2328 | count_vm_event(PAGEOUTRUN); |
2193 | 2329 | ||
2194 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2330 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2195 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2196 | unsigned long lru_pages = 0; | 2331 | unsigned long lru_pages = 0; |
2197 | int has_under_min_watermark_zone = 0; | 2332 | int has_under_min_watermark_zone = 0; |
2198 | 2333 | ||
@@ -2201,6 +2336,7 @@ loop_again: | |||
2201 | disable_swap_token(); | 2336 | disable_swap_token(); |
2202 | 2337 | ||
2203 | all_zones_ok = 1; | 2338 | all_zones_ok = 1; |
2339 | balanced = 0; | ||
2204 | 2340 | ||
2205 | /* | 2341 | /* |
2206 | * Scan in the highmem->dma direction for the highest | 2342 | * Scan in the highmem->dma direction for the highest |
@@ -2223,9 +2359,10 @@ loop_again: | |||
2223 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2359 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2224 | &sc, priority, 0); | 2360 | &sc, priority, 0); |
2225 | 2361 | ||
2226 | if (!zone_watermark_ok(zone, order, | 2362 | if (!zone_watermark_ok_safe(zone, order, |
2227 | high_wmark_pages(zone), 0, 0)) { | 2363 | high_wmark_pages(zone), 0, 0)) { |
2228 | end_zone = i; | 2364 | end_zone = i; |
2365 | *classzone_idx = i; | ||
2229 | break; | 2366 | break; |
2230 | } | 2367 | } |
2231 | } | 2368 | } |
@@ -2248,6 +2385,7 @@ loop_again: | |||
2248 | * cause too much scanning of the lower zones. | 2385 | * cause too much scanning of the lower zones. |
2249 | */ | 2386 | */ |
2250 | for (i = 0; i <= end_zone; i++) { | 2387 | for (i = 0; i <= end_zone; i++) { |
2388 | int compaction; | ||
2251 | struct zone *zone = pgdat->node_zones + i; | 2389 | struct zone *zone = pgdat->node_zones + i; |
2252 | int nr_slab; | 2390 | int nr_slab; |
2253 | 2391 | ||
@@ -2269,7 +2407,7 @@ loop_again: | |||
2269 | * We put equal pressure on every zone, unless one | 2407 | * We put equal pressure on every zone, unless one |
2270 | * zone has way too many pages free already. | 2408 | * zone has way too many pages free already. |
2271 | */ | 2409 | */ |
2272 | if (!zone_watermark_ok(zone, order, | 2410 | if (!zone_watermark_ok_safe(zone, order, |
2273 | 8*high_wmark_pages(zone), end_zone, 0)) | 2411 | 8*high_wmark_pages(zone), end_zone, 0)) |
2274 | shrink_zone(priority, zone, &sc); | 2412 | shrink_zone(priority, zone, &sc); |
2275 | reclaim_state->reclaimed_slab = 0; | 2413 | reclaim_state->reclaimed_slab = 0; |
@@ -2277,9 +2415,26 @@ loop_again: | |||
2277 | lru_pages); | 2415 | lru_pages); |
2278 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2416 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2279 | total_scanned += sc.nr_scanned; | 2417 | total_scanned += sc.nr_scanned; |
2418 | |||
2419 | compaction = 0; | ||
2420 | if (order && | ||
2421 | zone_watermark_ok(zone, 0, | ||
2422 | high_wmark_pages(zone), | ||
2423 | end_zone, 0) && | ||
2424 | !zone_watermark_ok(zone, order, | ||
2425 | high_wmark_pages(zone), | ||
2426 | end_zone, 0)) { | ||
2427 | compact_zone_order(zone, | ||
2428 | order, | ||
2429 | sc.gfp_mask, false, | ||
2430 | COMPACT_MODE_KSWAPD); | ||
2431 | compaction = 1; | ||
2432 | } | ||
2433 | |||
2280 | if (zone->all_unreclaimable) | 2434 | if (zone->all_unreclaimable) |
2281 | continue; | 2435 | continue; |
2282 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2436 | if (!compaction && nr_slab == 0 && |
2437 | !zone_reclaimable(zone)) | ||
2283 | zone->all_unreclaimable = 1; | 2438 | zone->all_unreclaimable = 1; |
2284 | /* | 2439 | /* |
2285 | * If we've done a decent amount of scanning and | 2440 | * If we've done a decent amount of scanning and |
@@ -2290,7 +2445,7 @@ loop_again: | |||
2290 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2445 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2291 | sc.may_writepage = 1; | 2446 | sc.may_writepage = 1; |
2292 | 2447 | ||
2293 | if (!zone_watermark_ok(zone, order, | 2448 | if (!zone_watermark_ok_safe(zone, order, |
2294 | high_wmark_pages(zone), end_zone, 0)) { | 2449 | high_wmark_pages(zone), end_zone, 0)) { |
2295 | all_zones_ok = 0; | 2450 | all_zones_ok = 0; |
2296 | /* | 2451 | /* |
@@ -2298,7 +2453,7 @@ loop_again: | |||
2298 | * means that we have a GFP_ATOMIC allocation | 2453 | * means that we have a GFP_ATOMIC allocation |
2299 | * failure risk. Hurry up! | 2454 | * failure risk. Hurry up! |
2300 | */ | 2455 | */ |
2301 | if (!zone_watermark_ok(zone, order, | 2456 | if (!zone_watermark_ok_safe(zone, order, |
2302 | min_wmark_pages(zone), end_zone, 0)) | 2457 | min_wmark_pages(zone), end_zone, 0)) |
2303 | has_under_min_watermark_zone = 1; | 2458 | has_under_min_watermark_zone = 1; |
2304 | } else { | 2459 | } else { |
@@ -2310,10 +2465,12 @@ loop_again: | |||
2310 | * spectulatively avoid congestion waits | 2465 | * spectulatively avoid congestion waits |
2311 | */ | 2466 | */ |
2312 | zone_clear_flag(zone, ZONE_CONGESTED); | 2467 | zone_clear_flag(zone, ZONE_CONGESTED); |
2468 | if (i <= *classzone_idx) | ||
2469 | balanced += zone->present_pages; | ||
2313 | } | 2470 | } |
2314 | 2471 | ||
2315 | } | 2472 | } |
2316 | if (all_zones_ok) | 2473 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2317 | break; /* kswapd: all done */ | 2474 | break; /* kswapd: all done */ |
2318 | /* | 2475 | /* |
2319 | * OK, kswapd is getting into trouble. Take a nap, then take | 2476 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2336,7 +2493,13 @@ loop_again: | |||
2336 | break; | 2493 | break; |
2337 | } | 2494 | } |
2338 | out: | 2495 | out: |
2339 | if (!all_zones_ok) { | 2496 | |
2497 | /* | ||
2498 | * order-0: All zones must meet high watermark for a balanced node | ||
2499 | * high-order: Balanced zones must make up at least 25% of the node | ||
2500 | * for the node to be balanced | ||
2501 | */ | ||
2502 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
2340 | cond_resched(); | 2503 | cond_resched(); |
2341 | 2504 | ||
2342 | try_to_freeze(); | 2505 | try_to_freeze(); |
@@ -2361,7 +2524,88 @@ out: | |||
2361 | goto loop_again; | 2524 | goto loop_again; |
2362 | } | 2525 | } |
2363 | 2526 | ||
2364 | return sc.nr_reclaimed; | 2527 | /* |
2528 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2529 | * sleeping without all zones being balanced. Before it does, it must | ||
2530 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2531 | * that the congestion flags are cleared. The congestion flag must | ||
2532 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2533 | * and it is potentially going to sleep here. | ||
2534 | */ | ||
2535 | if (order) { | ||
2536 | for (i = 0; i <= end_zone; i++) { | ||
2537 | struct zone *zone = pgdat->node_zones + i; | ||
2538 | |||
2539 | if (!populated_zone(zone)) | ||
2540 | continue; | ||
2541 | |||
2542 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
2543 | continue; | ||
2544 | |||
2545 | /* Confirm the zone is balanced for order-0 */ | ||
2546 | if (!zone_watermark_ok(zone, 0, | ||
2547 | high_wmark_pages(zone), 0, 0)) { | ||
2548 | order = sc.order = 0; | ||
2549 | goto loop_again; | ||
2550 | } | ||
2551 | |||
2552 | /* If balanced, clear the congested flag */ | ||
2553 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2554 | } | ||
2555 | } | ||
2556 | |||
2557 | /* | ||
2558 | * Return the order we were reclaiming at so sleeping_prematurely() | ||
2559 | * makes a decision on the order we were last reclaiming at. However, | ||
2560 | * if another caller entered the allocator slow path while kswapd | ||
2561 | * was awake, order will remain at the higher level | ||
2562 | */ | ||
2563 | *classzone_idx = end_zone; | ||
2564 | return order; | ||
2565 | } | ||
2566 | |||
2567 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | ||
2568 | { | ||
2569 | long remaining = 0; | ||
2570 | DEFINE_WAIT(wait); | ||
2571 | |||
2572 | if (freezing(current) || kthread_should_stop()) | ||
2573 | return; | ||
2574 | |||
2575 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2576 | |||
2577 | /* Try to sleep for a short interval */ | ||
2578 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2579 | remaining = schedule_timeout(HZ/10); | ||
2580 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2581 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2582 | } | ||
2583 | |||
2584 | /* | ||
2585 | * After a short sleep, check if it was a premature sleep. If not, then | ||
2586 | * go fully to sleep until explicitly woken up. | ||
2587 | */ | ||
2588 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2589 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2590 | |||
2591 | /* | ||
2592 | * vmstat counters are not perfectly accurate and the estimated | ||
2593 | * value for counters such as NR_FREE_PAGES can deviate from the | ||
2594 | * true value by nr_online_cpus * threshold. To avoid the zone | ||
2595 | * watermarks being breached while under pressure, we reduce the | ||
2596 | * per-cpu vmstat threshold while kswapd is awake and restore | ||
2597 | * them before going back to sleep. | ||
2598 | */ | ||
2599 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | ||
2600 | schedule(); | ||
2601 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | ||
2602 | } else { | ||
2603 | if (remaining) | ||
2604 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2605 | else | ||
2606 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2607 | } | ||
2608 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2365 | } | 2609 | } |
2366 | 2610 | ||
2367 | /* | 2611 | /* |
@@ -2380,9 +2624,10 @@ out: | |||
2380 | static int kswapd(void *p) | 2624 | static int kswapd(void *p) |
2381 | { | 2625 | { |
2382 | unsigned long order; | 2626 | unsigned long order; |
2627 | int classzone_idx; | ||
2383 | pg_data_t *pgdat = (pg_data_t*)p; | 2628 | pg_data_t *pgdat = (pg_data_t*)p; |
2384 | struct task_struct *tsk = current; | 2629 | struct task_struct *tsk = current; |
2385 | DEFINE_WAIT(wait); | 2630 | |
2386 | struct reclaim_state reclaim_state = { | 2631 | struct reclaim_state reclaim_state = { |
2387 | .reclaimed_slab = 0, | 2632 | .reclaimed_slab = 0, |
2388 | }; | 2633 | }; |
@@ -2410,49 +2655,30 @@ static int kswapd(void *p) | |||
2410 | set_freezable(); | 2655 | set_freezable(); |
2411 | 2656 | ||
2412 | order = 0; | 2657 | order = 0; |
2658 | classzone_idx = MAX_NR_ZONES - 1; | ||
2413 | for ( ; ; ) { | 2659 | for ( ; ; ) { |
2414 | unsigned long new_order; | 2660 | unsigned long new_order; |
2661 | int new_classzone_idx; | ||
2415 | int ret; | 2662 | int ret; |
2416 | 2663 | ||
2417 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2418 | new_order = pgdat->kswapd_max_order; | 2664 | new_order = pgdat->kswapd_max_order; |
2665 | new_classzone_idx = pgdat->classzone_idx; | ||
2419 | pgdat->kswapd_max_order = 0; | 2666 | pgdat->kswapd_max_order = 0; |
2420 | if (order < new_order) { | 2667 | pgdat->classzone_idx = MAX_NR_ZONES - 1; |
2668 | if (order < new_order || classzone_idx > new_classzone_idx) { | ||
2421 | /* | 2669 | /* |
2422 | * Don't sleep if someone wants a larger 'order' | 2670 | * Don't sleep if someone wants a larger 'order' |
2423 | * allocation | 2671 | * allocation or has tighter zone constraints |
2424 | */ | 2672 | */ |
2425 | order = new_order; | 2673 | order = new_order; |
2674 | classzone_idx = new_classzone_idx; | ||
2426 | } else { | 2675 | } else { |
2427 | if (!freezing(current) && !kthread_should_stop()) { | 2676 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
2428 | long remaining = 0; | ||
2429 | |||
2430 | /* Try to sleep for a short interval */ | ||
2431 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2432 | remaining = schedule_timeout(HZ/10); | ||
2433 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2434 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2435 | } | ||
2436 | |||
2437 | /* | ||
2438 | * After a short sleep, check if it was a | ||
2439 | * premature sleep. If not, then go fully | ||
2440 | * to sleep until explicitly woken up | ||
2441 | */ | ||
2442 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2443 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2444 | schedule(); | ||
2445 | } else { | ||
2446 | if (remaining) | ||
2447 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2448 | else | ||
2449 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2450 | } | ||
2451 | } | ||
2452 | |||
2453 | order = pgdat->kswapd_max_order; | 2677 | order = pgdat->kswapd_max_order; |
2678 | classzone_idx = pgdat->classzone_idx; | ||
2679 | pgdat->kswapd_max_order = 0; | ||
2680 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | ||
2454 | } | 2681 | } |
2455 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2456 | 2682 | ||
2457 | ret = try_to_freeze(); | 2683 | ret = try_to_freeze(); |
2458 | if (kthread_should_stop()) | 2684 | if (kthread_should_stop()) |
@@ -2464,7 +2690,7 @@ static int kswapd(void *p) | |||
2464 | */ | 2690 | */ |
2465 | if (!ret) { | 2691 | if (!ret) { |
2466 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2692 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2467 | balance_pgdat(pgdat, order); | 2693 | order = balance_pgdat(pgdat, order, &classzone_idx); |
2468 | } | 2694 | } |
2469 | } | 2695 | } |
2470 | return 0; | 2696 | return 0; |
@@ -2473,23 +2699,26 @@ static int kswapd(void *p) | |||
2473 | /* | 2699 | /* |
2474 | * A zone is low on free memory, so wake its kswapd task to service it. | 2700 | * A zone is low on free memory, so wake its kswapd task to service it. |
2475 | */ | 2701 | */ |
2476 | void wakeup_kswapd(struct zone *zone, int order) | 2702 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
2477 | { | 2703 | { |
2478 | pg_data_t *pgdat; | 2704 | pg_data_t *pgdat; |
2479 | 2705 | ||
2480 | if (!populated_zone(zone)) | 2706 | if (!populated_zone(zone)) |
2481 | return; | 2707 | return; |
2482 | 2708 | ||
2483 | pgdat = zone->zone_pgdat; | ||
2484 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2485 | return; | ||
2486 | if (pgdat->kswapd_max_order < order) | ||
2487 | pgdat->kswapd_max_order = order; | ||
2488 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2489 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2709 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2490 | return; | 2710 | return; |
2711 | pgdat = zone->zone_pgdat; | ||
2712 | if (pgdat->kswapd_max_order < order) { | ||
2713 | pgdat->kswapd_max_order = order; | ||
2714 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
2715 | } | ||
2491 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2716 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2492 | return; | 2717 | return; |
2718 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2719 | return; | ||
2720 | |||
2721 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2493 | wake_up_interruptible(&pgdat->kswapd_wait); | 2722 | wake_up_interruptible(&pgdat->kswapd_wait); |
2494 | } | 2723 | } |
2495 | 2724 | ||
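wakeup_kswapd() now records a classzone_idx alongside the order. The bookkeeping only ever strengthens a pending request: the order can only grow and, when it does, the classzone_idx can only shrink, so a later order-0 wakeup cannot weaken an outstanding high-order one (kswapd itself resets both fields when it picks the request up). A simplified model of just that bookkeeping, with no waitqueue or watermark checks:

```c
#include <stdio.h>

#define MAX_NR_ZONES 4	/* illustrative; the real value depends on configuration */

struct pgdat_model {
	int kswapd_max_order;
	int classzone_idx;
};

static int min_int(int a, int b) { return a < b ? a : b; }

/* Mirrors the update in wakeup_kswapd(): ratchet order up, classzone down. */
static void wakeup_model(struct pgdat_model *pgdat, int order, int classzone_idx)
{
	if (pgdat->kswapd_max_order < order) {
		pgdat->kswapd_max_order = order;
		pgdat->classzone_idx = min_int(pgdat->classzone_idx, classzone_idx);
	}
}

int main(void)
{
	struct pgdat_model pgdat = { 0, MAX_NR_ZONES - 1 };

	wakeup_model(&pgdat, 9, 2);	/* THP-sized request, Normal zone or below */
	wakeup_model(&pgdat, 0, 3);	/* later order-0 request changes nothing */
	printf("pending: order=%d classzone_idx=%d\n",
	       pgdat.kswapd_max_order, pgdat.classzone_idx);
	return 0;
}
```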
diff --git a/mm/vmstat.c b/mm/vmstat.c index 42eac4d3321..0c3b5048773 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); | |||
83 | 83 | ||
84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
85 | 85 | ||
86 | static int calculate_threshold(struct zone *zone) | 86 | int calculate_pressure_threshold(struct zone *zone) |
87 | { | ||
88 | int threshold; | ||
89 | int watermark_distance; | ||
90 | |||
91 | /* | ||
92 | * As vmstats are not up to date, there is drift between the estimated | ||
93 | * and real values. For high thresholds and a high number of CPUs, it | ||
94 | * is possible for the min watermark to be breached while the estimated | ||
95 | * value looks fine. The pressure threshold is a reduced value such | ||
96 | * that even the maximum amount of drift will not accidentally breach | ||
97 | * the min watermark | ||
98 | */ | ||
99 | watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); | ||
100 | threshold = max(1, (int)(watermark_distance / num_online_cpus())); | ||
101 | |||
102 | /* | ||
103 | * Maximum threshold is 125 | ||
104 | */ | ||
105 | threshold = min(125, threshold); | ||
106 | |||
107 | return threshold; | ||
108 | } | ||
109 | |||
110 | int calculate_normal_threshold(struct zone *zone) | ||
87 | { | 111 | { |
88 | int threshold; | 112 | int threshold; |
89 | int mem; /* memory in 128 MB units */ | 113 | int mem; /* memory in 128 MB units */ |
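The point of calculate_pressure_threshold() is that worst-case vmstat drift is roughly threshold * nr_online_cpus, so while kswapd is awake the per-cpu threshold is derived from the low-to-min watermark gap rather than from zone size. A worked example with illustrative watermark and CPU counts (not from any real machine); the 125 cap matches the one used for the normal threshold:

```c
#include <stdio.h>

static int max_int(int a, int b) { return a > b ? a : b; }
static int min_int(int a, int b) { return a < b ? a : b; }

/* Mirrors calculate_pressure_threshold(). */
static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
{
	int watermark_distance = (int)(low_wmark - min_wmark);
	int threshold = max_int(1, watermark_distance / online_cpus);

	return min_int(125, threshold);
}

int main(void)
{
	/* low watermark 16384 pages, min watermark 12288 pages, 16 CPUs:
	 * 4096 / 16 = 256, capped to 125 pages of drift per cpu. */
	printf("pressure threshold = %d\n", pressure_threshold(16384, 12288, 16));

	/* With 64 CPUs the same gap allows only 64 pages of drift per cpu. */
	printf("pressure threshold = %d\n", pressure_threshold(16384, 12288, 64));
	return 0;
}
```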
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) | |||
142 | for_each_populated_zone(zone) { | 166 | for_each_populated_zone(zone) { |
143 | unsigned long max_drift, tolerate_drift; | 167 | unsigned long max_drift, tolerate_drift; |
144 | 168 | ||
145 | threshold = calculate_threshold(zone); | 169 | threshold = calculate_normal_threshold(zone); |
146 | 170 | ||
147 | for_each_online_cpu(cpu) | 171 | for_each_online_cpu(cpu) |
148 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 172 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void) | |||
161 | } | 185 | } |
162 | } | 186 | } |
163 | 187 | ||
188 | void set_pgdat_percpu_threshold(pg_data_t *pgdat, | ||
189 | int (*calculate_pressure)(struct zone *)) | ||
190 | { | ||
191 | struct zone *zone; | ||
192 | int cpu; | ||
193 | int threshold; | ||
194 | int i; | ||
195 | |||
196 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
197 | zone = &pgdat->node_zones[i]; | ||
198 | if (!zone->percpu_drift_mark) | ||
199 | continue; | ||
200 | |||
201 | threshold = (*calculate_pressure)(zone); | ||
202 | for_each_possible_cpu(cpu) | ||
203 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
204 | = threshold; | ||
205 | } | ||
206 | } | ||
207 | |||
164 | /* | 208 | /* |
165 | * For use when we know that interrupts are disabled. | 209 | * For use when we know that interrupts are disabled. |
166 | */ | 210 | */ |
167 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 211 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
168 | int delta) | 212 | int delta) |
169 | { | 213 | { |
170 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 214 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
171 | 215 | s8 __percpu *p = pcp->vm_stat_diff + item; | |
172 | s8 *p = pcp->vm_stat_diff + item; | ||
173 | long x; | 216 | long x; |
217 | long t; | ||
174 | 218 | ||
175 | x = delta + *p; | 219 | x = delta + __this_cpu_read(*p); |
176 | 220 | ||
177 | if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { | 221 | t = __this_cpu_read(pcp->stat_threshold); |
222 | |||
223 | if (unlikely(x > t || x < -t)) { | ||
178 | zone_page_state_add(x, zone, item); | 224 | zone_page_state_add(x, zone, item); |
179 | x = 0; | 225 | x = 0; |
180 | } | 226 | } |
181 | *p = x; | 227 | __this_cpu_write(*p, x); |
182 | } | 228 | } |
183 | EXPORT_SYMBOL(__mod_zone_page_state); | 229 | EXPORT_SYMBOL(__mod_zone_page_state); |
184 | 230 | ||
185 | /* | 231 | /* |
186 | * For an unknown interrupt state | ||
187 | */ | ||
188 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
189 | int delta) | ||
190 | { | ||
191 | unsigned long flags; | ||
192 | |||
193 | local_irq_save(flags); | ||
194 | __mod_zone_page_state(zone, item, delta); | ||
195 | local_irq_restore(flags); | ||
196 | } | ||
197 | EXPORT_SYMBOL(mod_zone_page_state); | ||
198 | |||
199 | /* | ||
200 | * Optimized increment and decrement functions. | 232 | * Optimized increment and decrement functions. |
201 | * | 233 | * |
202 | * These are only for a single page and therefore can take a struct page * | 234 | * These are only for a single page and therefore can take a struct page * |
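__mod_zone_page_state() and friends keep a small per-cpu signed delta and only fold it into the global vm_stat counter once it crosses stat_threshold; the hunk above merely switches the accesses to __this_cpu operations. The userspace model below shows the fold logic for a single cpu, with no real percpu machinery:

```c
#include <stdio.h>

struct zone_model {
	long global_count;	/* stands in for the global vm_stat counter */
	signed char cpu_diff;	/* one cpu's vm_stat_diff slot */
	signed char threshold;	/* stat_threshold */
};

/* Accumulate delta per cpu; fold into the global counter past the threshold. */
static void mod_page_state_model(struct zone_model *z, int delta)
{
	long x = delta + z->cpu_diff;

	if (x > z->threshold || x < -z->threshold) {
		z->global_count += x;
		x = 0;
	}
	z->cpu_diff = (signed char)x;
}

int main(void)
{
	struct zone_model z = { .global_count = 1000, .threshold = 32 };
	int i;

	for (i = 0; i < 100; i++)
		mod_page_state_model(&z, 1);
	printf("global=%ld cpu_diff=%d\n", z.global_count, z.cpu_diff);
	return 0;
}
```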
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
221 | */ | 253 | */ |
222 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 254 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
223 | { | 255 | { |
224 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 256 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
225 | s8 *p = pcp->vm_stat_diff + item; | 257 | s8 __percpu *p = pcp->vm_stat_diff + item; |
226 | 258 | s8 v, t; | |
227 | (*p)++; | ||
228 | 259 | ||
229 | if (unlikely(*p > pcp->stat_threshold)) { | 260 | v = __this_cpu_inc_return(*p); |
230 | int overstep = pcp->stat_threshold / 2; | 261 | t = __this_cpu_read(pcp->stat_threshold); |
262 | if (unlikely(v > t)) { | ||
263 | s8 overstep = t >> 1; | ||
231 | 264 | ||
232 | zone_page_state_add(*p + overstep, zone, item); | 265 | zone_page_state_add(v + overstep, zone, item); |
233 | *p = -overstep; | 266 | __this_cpu_write(*p, -overstep); |
234 | } | 267 | } |
235 | } | 268 | } |
236 | 269 | ||
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
242 | 275 | ||
243 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 276 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
244 | { | 277 | { |
245 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 278 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
246 | s8 *p = pcp->vm_stat_diff + item; | 279 | s8 __percpu *p = pcp->vm_stat_diff + item; |
280 | s8 v, t; | ||
247 | 281 | ||
248 | (*p)--; | 282 | v = __this_cpu_dec_return(*p); |
283 | t = __this_cpu_read(pcp->stat_threshold); | ||
284 | if (unlikely(v < - t)) { | ||
285 | s8 overstep = t >> 1; | ||
249 | 286 | ||
250 | if (unlikely(*p < - pcp->stat_threshold)) { | 287 | zone_page_state_add(v - overstep, zone, item); |
251 | int overstep = pcp->stat_threshold / 2; | 288 | __this_cpu_write(*p, overstep); |
252 | |||
253 | zone_page_state_add(*p - overstep, zone, item); | ||
254 | *p = overstep; | ||
255 | } | 289 | } |
256 | } | 290 | } |
257 | 291 | ||
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
261 | } | 295 | } |
262 | EXPORT_SYMBOL(__dec_zone_page_state); | 296 | EXPORT_SYMBOL(__dec_zone_page_state); |
263 | 297 | ||
298 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
299 | /* | ||
300 | * If we have cmpxchg_local support then we do not need to incur the overhead | ||
301 | * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. | ||
302 | * | ||
303 | * mod_state() modifies the zone counter state through atomic per cpu | ||
304 | * operations. | ||
305 | * | ||
306 | * Overstep mode specifies how overstep should handled: | ||
307 | * 0 No overstepping | ||
308 | * 1 Overstepping half of threshold | ||
309 | * -1 Overstepping minus half of threshold | ||
310 | */ | ||
311 | static inline void mod_state(struct zone *zone, | ||
312 | enum zone_stat_item item, int delta, int overstep_mode) | ||
313 | { | ||
314 | struct per_cpu_pageset __percpu *pcp = zone->pageset; | ||
315 | s8 __percpu *p = pcp->vm_stat_diff + item; | ||
316 | long o, n, t, z; | ||
317 | |||
318 | do { | ||
319 | z = 0; /* overflow to zone counters */ | ||
320 | |||
321 | /* | ||
322 | * The fetching of the stat_threshold is racy. We may apply | ||
323 | * a counter threshold to the wrong cpu if we get | ||
324 | * rescheduled while executing here. However, the following | ||
325 | * will apply the threshold again and therefore bring the | ||
326 | * counter under the threshold. | ||
327 | */ | ||
328 | t = this_cpu_read(pcp->stat_threshold); | ||
329 | |||
330 | o = this_cpu_read(*p); | ||
331 | n = delta + o; | ||
332 | |||
333 | if (n > t || n < -t) { | ||
334 | int os = overstep_mode * (t >> 1) ; | ||
335 | |||
336 | /* Overflow must be added to zone counters */ | ||
337 | z = n + os; | ||
338 | n = -os; | ||
339 | } | ||
340 | } while (this_cpu_cmpxchg(*p, o, n) != o); | ||
341 | |||
342 | if (z) | ||
343 | zone_page_state_add(z, zone, item); | ||
344 | } | ||
345 | |||
346 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
347 | int delta) | ||
348 | { | ||
349 | mod_state(zone, item, delta, 0); | ||
350 | } | ||
351 | EXPORT_SYMBOL(mod_zone_page_state); | ||
352 | |||
353 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
354 | { | ||
355 | mod_state(zone, item, 1, 1); | ||
356 | } | ||
357 | |||
358 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
359 | { | ||
360 | mod_state(page_zone(page), item, 1, 1); | ||
361 | } | ||
362 | EXPORT_SYMBOL(inc_zone_page_state); | ||
363 | |||
364 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
365 | { | ||
366 | mod_state(page_zone(page), item, -1, -1); | ||
367 | } | ||
368 | EXPORT_SYMBOL(dec_zone_page_state); | ||
369 | #else | ||
370 | /* | ||
371 | * Use interrupt disable to serialize counter updates | ||
372 | */ | ||
373 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
374 | int delta) | ||
375 | { | ||
376 | unsigned long flags; | ||
377 | |||
378 | local_irq_save(flags); | ||
379 | __mod_zone_page_state(zone, item, delta); | ||
380 | local_irq_restore(flags); | ||
381 | } | ||
382 | EXPORT_SYMBOL(mod_zone_page_state); | ||
383 | |||
264 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | 384 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) |
265 | { | 385 | { |
266 | unsigned long flags; | 386 | unsigned long flags; |
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
291 | local_irq_restore(flags); | 411 | local_irq_restore(flags); |
292 | } | 412 | } |
293 | EXPORT_SYMBOL(dec_zone_page_state); | 413 | EXPORT_SYMBOL(dec_zone_page_state); |
414 | #endif | ||
294 | 415 | ||
295 | /* | 416 | /* |
296 | * Update the zone counters for one cpu. | 417 | * Update the zone counters for one cpu. |
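On CONFIG_CMPXCHG_LOCAL kernels mod_state() drops the local_irq_save/restore pair and instead retries a this_cpu_cmpxchg() until the per-cpu delta is updated atomically. The single-threaded sketch below reproduces the retry-and-overstep structure using C11 atomics as a stand-in for the per-cpu primitives; it models the control flow only, not the per-cpu placement:

```c
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long counter_diff;	/* stands in for one cpu's vm_stat_diff slot */
static _Atomic long global_count;	/* stands in for the global zone counter */
static const long threshold = 32;	/* stands in for pcp->stat_threshold */

static void mod_state_model(int delta, int overstep_mode)
{
	long o, n, z;

	do {
		z = 0;			/* overflow destined for the global counter */
		o = atomic_load(&counter_diff);
		n = delta + o;

		if (n > threshold || n < -threshold) {
			long os = overstep_mode * (threshold >> 1);

			z = n + os;	/* fold the overflow plus the overstep */
			n = -os;
		}
	} while (!atomic_compare_exchange_weak(&counter_diff, &o, n));

	if (z)
		atomic_fetch_add(&global_count, z);
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		mod_state_model(1, 1);	/* overstep mode 1, as inc_zone_state() uses */
	printf("global=%ld per-cpu diff=%ld\n",
	       atomic_load(&global_count), atomic_load(&counter_diff));
	return 0;
}
```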
@@ -750,8 +871,6 @@ static const char * const vmstat_text[] = { | |||
750 | "nr_shmem", | 871 | "nr_shmem", |
751 | "nr_dirtied", | 872 | "nr_dirtied", |
752 | "nr_written", | 873 | "nr_written", |
753 | "nr_dirty_threshold", | ||
754 | "nr_dirty_background_threshold", | ||
755 | 874 | ||
756 | #ifdef CONFIG_NUMA | 875 | #ifdef CONFIG_NUMA |
757 | "numa_hit", | 876 | "numa_hit", |
@@ -761,6 +880,9 @@ static const char * const vmstat_text[] = { | |||
761 | "numa_local", | 880 | "numa_local", |
762 | "numa_other", | 881 | "numa_other", |
763 | #endif | 882 | #endif |
883 | "nr_anon_transparent_hugepages", | ||
884 | "nr_dirty_threshold", | ||
885 | "nr_dirty_background_threshold", | ||
764 | 886 | ||
765 | #ifdef CONFIG_VM_EVENT_COUNTERS | 887 | #ifdef CONFIG_VM_EVENT_COUNTERS |
766 | "pgpgin", | 888 | "pgpgin", |
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
834 | "\n scanned %lu" | 956 | "\n scanned %lu" |
835 | "\n spanned %lu" | 957 | "\n spanned %lu" |
836 | "\n present %lu", | 958 | "\n present %lu", |
837 | zone_nr_free_pages(zone), | 959 | zone_page_state(zone, NR_FREE_PAGES), |
838 | min_wmark_pages(zone), | 960 | min_wmark_pages(zone), |
839 | low_wmark_pages(zone), | 961 | low_wmark_pages(zone), |
840 | high_wmark_pages(zone), | 962 | high_wmark_pages(zone), |
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
1033 | break; | 1155 | break; |
1034 | case CPU_DOWN_PREPARE: | 1156 | case CPU_DOWN_PREPARE: |
1035 | case CPU_DOWN_PREPARE_FROZEN: | 1157 | case CPU_DOWN_PREPARE_FROZEN: |
1036 | cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); | 1158 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); |
1037 | per_cpu(vmstat_work, cpu).work.func = NULL; | 1159 | per_cpu(vmstat_work, cpu).work.func = NULL; |
1038 | break; | 1160 | break; |
1039 | case CPU_DOWN_FAILED: | 1161 | case CPU_DOWN_FAILED: |