Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig              38
 -rw-r--r--  mm/Makefile              3
 -rw-r--r--  mm/compaction.c        174
 -rw-r--r--  mm/dmapool.c            16
 -rw-r--r--  mm/filemap.c            20
 -rw-r--r--  mm/huge_memory.c      2346
 -rw-r--r--  mm/hugetlb.c           111
 -rw-r--r--  mm/internal.h           16
 -rw-r--r--  mm/ksm.c                81
 -rw-r--r--  mm/madvise.c            10
 -rw-r--r--  mm/memcontrol.c        258
 -rw-r--r--  mm/memory-failure.c     22
 -rw-r--r--  mm/memory.c            336
 -rw-r--r--  mm/memory_hotplug.c     17
 -rw-r--r--  mm/mempolicy.c          23
 -rw-r--r--  mm/migrate.c           123
 -rw-r--r--  mm/mincore.c             7
 -rw-r--r--  mm/mlock.c             163
 -rw-r--r--  mm/mmap.c               17
 -rw-r--r--  mm/mmu_notifier.c       20
 -rw-r--r--  mm/mmzone.c             21
 -rw-r--r--  mm/mprotect.c           20
 -rw-r--r--  mm/mremap.c              9
 -rw-r--r--  mm/nommu.c               6
 -rw-r--r--  mm/page-writeback.c      9
 -rw-r--r--  mm/page_alloc.c        165
 -rw-r--r--  mm/pagewalk.c            1
 -rw-r--r--  mm/percpu-vm.c           2
 -rw-r--r--  mm/percpu.c             10
 -rw-r--r--  mm/pgtable-generic.c   123
 -rw-r--r--  mm/rmap.c               93
 -rw-r--r--  mm/shmem.c               9
 -rw-r--r--  mm/slab.c               76
 -rw-r--r--  mm/slob.c                5
 -rw-r--r--  mm/slub.c               81
 -rw-r--r--  mm/sparse-vmemmap.c      2
 -rw-r--r--  mm/sparse.c              4
 -rw-r--r--  mm/swap.c              322
 -rw-r--r--  mm/swap_state.c          6
 -rw-r--r--  mm/swapfile.c            9
 -rw-r--r--  mm/util.c               21
 -rw-r--r--  mm/vmalloc.c            89
 -rw-r--r--  mm/vmscan.c            432
 -rw-r--r--  mm/vmstat.c            202

 44 files changed, 4508 insertions, 1010 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS | |||
302 | 302 | ||
303 | See Documentation/nommu-mmap.txt for more information. | 303 | See Documentation/nommu-mmap.txt for more information. |
304 | 304 | ||
305 | config TRANSPARENT_HUGEPAGE | ||
306 | bool "Transparent Hugepage Support" | ||
307 | depends on X86 && MMU | ||
308 | select COMPACTION | ||
309 | help | ||
310 | Transparent Hugepages allows the kernel to use huge pages and | ||
311 | huge tlb transparently to the applications whenever possible. | ||
312 | This feature can improve computing performance to certain | ||
313 | applications by speeding up page faults during memory | ||
314 | allocation, by reducing the number of tlb misses and by speeding | ||
315 | up the pagetable walking. | ||
316 | |||
317 | If memory constrained on embedded, you may want to say N. | ||
318 | |||
319 | choice | ||
320 | prompt "Transparent Hugepage Support sysfs defaults" | ||
321 | depends on TRANSPARENT_HUGEPAGE | ||
322 | default TRANSPARENT_HUGEPAGE_ALWAYS | ||
323 | help | ||
324 | Selects the sysfs defaults for Transparent Hugepage Support. | ||
325 | |||
326 | config TRANSPARENT_HUGEPAGE_ALWAYS | ||
327 | bool "always" | ||
328 | help | ||
329 | Enabling Transparent Hugepage always, can increase the | ||
330 | memory footprint of applications without a guaranteed | ||
331 | benefit but it will work automatically for all applications. | ||
332 | |||
333 | config TRANSPARENT_HUGEPAGE_MADVISE | ||
334 | bool "madvise" | ||
335 | help | ||
336 | Enabling Transparent Hugepage madvise, will only provide a | ||
337 | performance improvement benefit to the applications using | ||
338 | madvise(MADV_HUGEPAGE) but it won't risk to increase the | ||
339 | memory footprint of applications without a guaranteed | ||
340 | benefit. | ||
341 | endchoice | ||
342 | |||
305 | # | 343 | # |
306 | # UP and nommu archs use km based percpu allocator | 344 | # UP and nommu archs use km based percpu allocator |
307 | # | 345 | # |
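
The "madvise" default above only covers regions that explicitly opt in from userspace. As a rough illustration of that opt-in (a hedged sketch, not part of this patch: the buffer size is arbitrary, and MADV_HUGEPAGE must be exported by the installed kernel/libc headers):

#include <stdlib.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)	/* assumed x86 transparent hugepage size */

int main(void)
{
	void *buf;

	/* Align the buffer so whole 2MB huge pages can back it. */
	if (posix_memalign(&buf, HPAGE_SIZE, 16 * HPAGE_SIZE))
		return 1;

	/* Hint that this range should be backed by transparent hugepages. */
	madvise(buf, 16 * HPAGE_SIZE, MADV_HUGEPAGE);

	/* ... touch the memory; faults may now be served with huge pages ... */
	free(buf);
	return 0;
}
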
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o \ | 11 | maccess.o page_alloc.o page-writeback.o \ |
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | |||
37 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 37 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
38 | obj-$(CONFIG_MIGRATION) += migrate.o | 38 | obj-$(CONFIG_MIGRATION) += migrate.o |
39 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 39 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
40 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | ||
40 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 41 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
41 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 42 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
42 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 43 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
diff --git a/mm/compaction.c b/mm/compaction.c
index 1a8894eadf72..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@ | |||
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include "internal.h" | 17 | #include "internal.h" |
18 | 18 | ||
19 | #define CREATE_TRACE_POINTS | ||
20 | #include <trace/events/compaction.h> | ||
21 | |||
19 | /* | 22 | /* |
20 | * compact_control is used to track pages being migrated and the free pages | 23 | * compact_control is used to track pages being migrated and the free pages |
21 | * they are being migrated to during memory compaction. The free_pfn starts | 24 | * they are being migrated to during memory compaction. The free_pfn starts |
@@ -30,6 +33,7 @@ struct compact_control { | |||
30 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 33 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
31 | unsigned long free_pfn; /* isolate_freepages search base */ | 34 | unsigned long free_pfn; /* isolate_freepages search base */ |
32 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | ||
33 | 37 | ||
34 | /* Account for isolated anon and file pages */ | 38 | /* Account for isolated anon and file pages */ |
35 | unsigned long nr_anon; | 39 | unsigned long nr_anon; |
@@ -38,6 +42,8 @@ struct compact_control { | |||
38 | unsigned int order; /* order a direct compactor needs */ | 42 | unsigned int order; /* order a direct compactor needs */ |
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 43 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
40 | struct zone *zone; | 44 | struct zone *zone; |
45 | |||
46 | int compact_mode; | ||
41 | }; | 47 | }; |
42 | 48 | ||
43 | static unsigned long release_freepages(struct list_head *freelist) | 49 | static unsigned long release_freepages(struct list_head *freelist) |
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
60 | struct list_head *freelist) | 66 | struct list_head *freelist) |
61 | { | 67 | { |
62 | unsigned long zone_end_pfn, end_pfn; | 68 | unsigned long zone_end_pfn, end_pfn; |
63 | int total_isolated = 0; | 69 | int nr_scanned = 0, total_isolated = 0; |
64 | struct page *cursor; | 70 | struct page *cursor; |
65 | 71 | ||
66 | /* Get the last PFN we should scan for free pages at */ | 72 | /* Get the last PFN we should scan for free pages at */ |
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
81 | 87 | ||
82 | if (!pfn_valid_within(blockpfn)) | 88 | if (!pfn_valid_within(blockpfn)) |
83 | continue; | 89 | continue; |
90 | nr_scanned++; | ||
84 | 91 | ||
85 | if (!PageBuddy(page)) | 92 | if (!PageBuddy(page)) |
86 | continue; | 93 | continue; |
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, | |||
100 | } | 107 | } |
101 | } | 108 | } |
102 | 109 | ||
110 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
103 | return total_isolated; | 111 | return total_isolated; |
104 | } | 112 | } |
105 | 113 | ||
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
234 | struct compact_control *cc) | 242 | struct compact_control *cc) |
235 | { | 243 | { |
236 | unsigned long low_pfn, end_pfn; | 244 | unsigned long low_pfn, end_pfn; |
245 | unsigned long last_pageblock_nr = 0, pageblock_nr; | ||
246 | unsigned long nr_scanned = 0, nr_isolated = 0; | ||
237 | struct list_head *migratelist = &cc->migratepages; | 247 | struct list_head *migratelist = &cc->migratepages; |
238 | 248 | ||
239 | /* Do not scan outside zone boundaries */ | 249 | /* Do not scan outside zone boundaries */ |
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
266 | struct page *page; | 276 | struct page *page; |
267 | if (!pfn_valid_within(low_pfn)) | 277 | if (!pfn_valid_within(low_pfn)) |
268 | continue; | 278 | continue; |
279 | nr_scanned++; | ||
269 | 280 | ||
270 | /* Get the page and skip if free */ | 281 | /* Get the page and skip if free */ |
271 | page = pfn_to_page(low_pfn); | 282 | page = pfn_to_page(low_pfn); |
272 | if (PageBuddy(page)) | 283 | if (PageBuddy(page)) |
273 | continue; | 284 | continue; |
274 | 285 | ||
286 | /* | ||
287 | * For async migration, also only scan in MOVABLE blocks. Async | ||
288 | * migration is optimistic to see if the minimum amount of work | ||
289 | * satisfies the allocation | ||
290 | */ | ||
291 | pageblock_nr = low_pfn >> pageblock_order; | ||
292 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | ||
293 | get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { | ||
294 | low_pfn += pageblock_nr_pages; | ||
295 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
296 | last_pageblock_nr = pageblock_nr; | ||
297 | continue; | ||
298 | } | ||
299 | |||
300 | if (!PageLRU(page)) | ||
301 | continue; | ||
302 | |||
303 | /* | ||
304 | * PageLRU is set, and lru_lock excludes isolation, | ||
305 | * splitting and collapsing (collapsing has already | ||
306 | * happened if PageLRU is set). | ||
307 | */ | ||
308 | if (PageTransHuge(page)) { | ||
309 | low_pfn += (1 << compound_order(page)) - 1; | ||
310 | continue; | ||
311 | } | ||
312 | |||
275 | /* Try isolate the page */ | 313 | /* Try isolate the page */ |
276 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) | 314 | if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) |
277 | continue; | 315 | continue; |
278 | 316 | ||
317 | VM_BUG_ON(PageTransCompound(page)); | ||
318 | |||
279 | /* Successfully isolated */ | 319 | /* Successfully isolated */ |
280 | del_page_from_lru_list(zone, page, page_lru(page)); | 320 | del_page_from_lru_list(zone, page, page_lru(page)); |
281 | list_add(&page->lru, migratelist); | 321 | list_add(&page->lru, migratelist); |
282 | cc->nr_migratepages++; | 322 | cc->nr_migratepages++; |
323 | nr_isolated++; | ||
283 | 324 | ||
284 | /* Avoid isolating too much */ | 325 | /* Avoid isolating too much */ |
285 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) | 326 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) |
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
291 | spin_unlock_irq(&zone->lru_lock); | 332 | spin_unlock_irq(&zone->lru_lock); |
292 | cc->migrate_pfn = low_pfn; | 333 | cc->migrate_pfn = low_pfn; |
293 | 334 | ||
335 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | ||
336 | |||
294 | return cc->nr_migratepages; | 337 | return cc->nr_migratepages; |
295 | } | 338 | } |
296 | 339 | ||
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) | |||
341 | } | 384 | } |
342 | 385 | ||
343 | static int compact_finished(struct zone *zone, | 386 | static int compact_finished(struct zone *zone, |
344 | struct compact_control *cc) | 387 | struct compact_control *cc) |
345 | { | 388 | { |
346 | unsigned int order; | 389 | unsigned int order; |
347 | unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); | 390 | unsigned long watermark; |
348 | 391 | ||
349 | if (fatal_signal_pending(current)) | 392 | if (fatal_signal_pending(current)) |
350 | return COMPACT_PARTIAL; | 393 | return COMPACT_PARTIAL; |
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone, | |||
354 | return COMPACT_COMPLETE; | 397 | return COMPACT_COMPLETE; |
355 | 398 | ||
356 | /* Compaction run is not finished if the watermark is not met */ | 399 | /* Compaction run is not finished if the watermark is not met */ |
400 | if (cc->compact_mode != COMPACT_MODE_KSWAPD) | ||
401 | watermark = low_wmark_pages(zone); | ||
402 | else | ||
403 | watermark = high_wmark_pages(zone); | ||
404 | watermark += (1 << cc->order); | ||
405 | |||
357 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 406 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) |
358 | return COMPACT_CONTINUE; | 407 | return COMPACT_CONTINUE; |
359 | 408 | ||
360 | if (cc->order == -1) | 409 | if (cc->order == -1) |
361 | return COMPACT_CONTINUE; | 410 | return COMPACT_CONTINUE; |
362 | 411 | ||
412 | /* | ||
413 | * Generating only one page of the right order is not enough | ||
414 | * for kswapd, we must continue until we're above the high | ||
415 | * watermark as a pool for high order GFP_ATOMIC allocations | ||
416 | * too. | ||
417 | */ | ||
418 | if (cc->compact_mode == COMPACT_MODE_KSWAPD) | ||
419 | return COMPACT_CONTINUE; | ||
420 | |||
363 | /* Direct compactor: Is a suitable page free? */ | 421 | /* Direct compactor: Is a suitable page free? */ |
364 | for (order = cc->order; order < MAX_ORDER; order++) { | 422 | for (order = cc->order; order < MAX_ORDER; order++) { |
365 | /* Job done if page is free of the right migratetype */ | 423 | /* Job done if page is free of the right migratetype */ |
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone, | |||
374 | return COMPACT_CONTINUE; | 432 | return COMPACT_CONTINUE; |
375 | } | 433 | } |
376 | 434 | ||
435 | /* | ||
436 | * compaction_suitable: Is this suitable to run compaction on this zone now? | ||
437 | * Returns | ||
438 | * COMPACT_SKIPPED - If there are too few free pages for compaction | ||
439 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | ||
440 | * COMPACT_CONTINUE - If compaction should run now | ||
441 | */ | ||
442 | unsigned long compaction_suitable(struct zone *zone, int order) | ||
443 | { | ||
444 | int fragindex; | ||
445 | unsigned long watermark; | ||
446 | |||
447 | /* | ||
448 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | ||
449 | * This is because during migration, copies of pages need to be | ||
450 | * allocated and for a short time, the footprint is higher | ||
451 | */ | ||
452 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
453 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
454 | return COMPACT_SKIPPED; | ||
455 | |||
456 | /* | ||
457 | * fragmentation index determines if allocation failures are due to | ||
458 | * low memory or external fragmentation | ||
459 | * | ||
460 | * index of -1 implies allocations might succeed dependingon watermarks | ||
461 | * index towards 0 implies failure is due to lack of memory | ||
462 | * index towards 1000 implies failure is due to fragmentation | ||
463 | * | ||
464 | * Only compact if a failure would be due to fragmentation. | ||
465 | */ | ||
466 | fragindex = fragmentation_index(zone, order); | ||
467 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
468 | return COMPACT_SKIPPED; | ||
469 | |||
470 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) | ||
471 | return COMPACT_PARTIAL; | ||
472 | |||
473 | return COMPACT_CONTINUE; | ||
474 | } | ||
475 | |||
377 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 476 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
378 | { | 477 | { |
379 | int ret; | 478 | int ret; |
380 | 479 | ||
480 | ret = compaction_suitable(zone, cc->order); | ||
481 | switch (ret) { | ||
482 | case COMPACT_PARTIAL: | ||
483 | case COMPACT_SKIPPED: | ||
484 | /* Compaction is likely to fail */ | ||
485 | return ret; | ||
486 | case COMPACT_CONTINUE: | ||
487 | /* Fall through to compaction */ | ||
488 | ; | ||
489 | } | ||
490 | |||
381 | /* Setup to move all movable pages to the end of the zone */ | 491 | /* Setup to move all movable pages to the end of the zone */ |
382 | cc->migrate_pfn = zone->zone_start_pfn; | 492 | cc->migrate_pfn = zone->zone_start_pfn; |
383 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 493 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; |
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
393 | 503 | ||
394 | nr_migrate = cc->nr_migratepages; | 504 | nr_migrate = cc->nr_migratepages; |
395 | migrate_pages(&cc->migratepages, compaction_alloc, | 505 | migrate_pages(&cc->migratepages, compaction_alloc, |
396 | (unsigned long)cc, 0); | 506 | (unsigned long)cc, false, |
507 | cc->sync); | ||
397 | update_nr_listpages(cc); | 508 | update_nr_listpages(cc); |
398 | nr_remaining = cc->nr_migratepages; | 509 | nr_remaining = cc->nr_migratepages; |
399 | 510 | ||
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
401 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | 512 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); |
402 | if (nr_remaining) | 513 | if (nr_remaining) |
403 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | 514 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); |
515 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | ||
516 | nr_remaining); | ||
404 | 517 | ||
405 | /* Release LRU pages not migrated */ | 518 | /* Release LRU pages not migrated */ |
406 | if (!list_empty(&cc->migratepages)) { | 519 | if (!list_empty(&cc->migratepages)) { |
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
417 | return ret; | 530 | return ret; |
418 | } | 531 | } |
419 | 532 | ||
420 | static unsigned long compact_zone_order(struct zone *zone, | 533 | unsigned long compact_zone_order(struct zone *zone, |
421 | int order, gfp_t gfp_mask) | 534 | int order, gfp_t gfp_mask, |
535 | bool sync, | ||
536 | int compact_mode) | ||
422 | { | 537 | { |
423 | struct compact_control cc = { | 538 | struct compact_control cc = { |
424 | .nr_freepages = 0, | 539 | .nr_freepages = 0, |
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
426 | .order = order, | 541 | .order = order, |
427 | .migratetype = allocflags_to_migratetype(gfp_mask), | 542 | .migratetype = allocflags_to_migratetype(gfp_mask), |
428 | .zone = zone, | 543 | .zone = zone, |
544 | .sync = sync, | ||
545 | .compact_mode = compact_mode, | ||
429 | }; | 546 | }; |
430 | INIT_LIST_HEAD(&cc.freepages); | 547 | INIT_LIST_HEAD(&cc.freepages); |
431 | INIT_LIST_HEAD(&cc.migratepages); | 548 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500; | |||
441 | * @order: The order of the current allocation | 558 | * @order: The order of the current allocation |
442 | * @gfp_mask: The GFP mask of the current allocation | 559 | * @gfp_mask: The GFP mask of the current allocation |
443 | * @nodemask: The allowed nodes to allocate from | 560 | * @nodemask: The allowed nodes to allocate from |
561 | * @sync: Whether migration is synchronous or not | ||
444 | * | 562 | * |
445 | * This is the main entry point for direct page compaction. | 563 | * This is the main entry point for direct page compaction. |
446 | */ | 564 | */ |
447 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 565 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
448 | int order, gfp_t gfp_mask, nodemask_t *nodemask) | 566 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
567 | bool sync) | ||
449 | { | 568 | { |
450 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 569 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
451 | int may_enter_fs = gfp_mask & __GFP_FS; | 570 | int may_enter_fs = gfp_mask & __GFP_FS; |
452 | int may_perform_io = gfp_mask & __GFP_IO; | 571 | int may_perform_io = gfp_mask & __GFP_IO; |
453 | unsigned long watermark; | ||
454 | struct zoneref *z; | 572 | struct zoneref *z; |
455 | struct zone *zone; | 573 | struct zone *zone; |
456 | int rc = COMPACT_SKIPPED; | 574 | int rc = COMPACT_SKIPPED; |
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
460 | * made because an assumption is made that the page allocator can satisfy | 578 | * made because an assumption is made that the page allocator can satisfy |
461 | * the "cheaper" orders without taking special steps | 579 | * the "cheaper" orders without taking special steps |
462 | */ | 580 | */ |
463 | if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) | 581 | if (!order || !may_enter_fs || !may_perform_io) |
464 | return rc; | 582 | return rc; |
465 | 583 | ||
466 | count_vm_event(COMPACTSTALL); | 584 | count_vm_event(COMPACTSTALL); |
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
468 | /* Compact each zone in the list */ | 586 | /* Compact each zone in the list */ |
469 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 587 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
470 | nodemask) { | 588 | nodemask) { |
471 | int fragindex; | ||
472 | int status; | 589 | int status; |
473 | 590 | ||
474 | /* | 591 | status = compact_zone_order(zone, order, gfp_mask, sync, |
475 | * Watermarks for order-0 must be met for compaction. Note | 592 | COMPACT_MODE_DIRECT_RECLAIM); |
476 | * the 2UL. This is because during migration, copies of | ||
477 | * pages need to be allocated and for a short time, the | ||
478 | * footprint is higher | ||
479 | */ | ||
480 | watermark = low_wmark_pages(zone) + (2UL << order); | ||
481 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
482 | continue; | ||
483 | |||
484 | /* | ||
485 | * fragmentation index determines if allocation failures are | ||
486 | * due to low memory or external fragmentation | ||
487 | * | ||
488 | * index of -1 implies allocations might succeed depending | ||
489 | * on watermarks | ||
490 | * index towards 0 implies failure is due to lack of memory | ||
491 | * index towards 1000 implies failure is due to fragmentation | ||
492 | * | ||
493 | * Only compact if a failure would be due to fragmentation. | ||
494 | */ | ||
495 | fragindex = fragmentation_index(zone, order); | ||
496 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | ||
497 | continue; | ||
498 | |||
499 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { | ||
500 | rc = COMPACT_PARTIAL; | ||
501 | break; | ||
502 | } | ||
503 | |||
504 | status = compact_zone_order(zone, order, gfp_mask); | ||
505 | rc = max(status, rc); | 593 | rc = max(status, rc); |
506 | 594 | ||
507 | if (zone_watermark_ok(zone, order, watermark, 0, 0)) | 595 | /* If a normal allocation would succeed, stop compacting */ |
596 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
508 | break; | 597 | break; |
509 | } | 598 | } |
510 | 599 | ||
@@ -531,6 +620,7 @@ static int compact_node(int nid) | |||
531 | .nr_freepages = 0, | 620 | .nr_freepages = 0, |
532 | .nr_migratepages = 0, | 621 | .nr_migratepages = 0, |
533 | .order = -1, | 622 | .order = -1, |
623 | .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, | ||
534 | }; | 624 | }; |
535 | 625 | ||
536 | zone = &pgdat->node_zones[zoneid]; | 626 | zone = &pgdat->node_zones[zoneid]; |
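
A note on the two modes threaded through this file: COMPACT_MODE_DIRECT_RECLAIM and COMPACT_MODE_KSWAPD are not defined in these hunks (presumably they live in the compaction header touched elsewhere in this series), and the only behavioural difference visible here is in compact_finished(): kswapd compacts against the high watermark and keeps going even after producing one suitable page, so a pool remains for high-order GFP_ATOMIC allocations, while direct compaction stops as soon as a free page of the requested order and migratetype exists.
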
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
324 | if (mem_flags & __GFP_WAIT) { | 324 | if (mem_flags & __GFP_WAIT) { |
325 | DECLARE_WAITQUEUE(wait, current); | 325 | DECLARE_WAITQUEUE(wait, current); |
326 | 326 | ||
327 | __set_current_state(TASK_INTERRUPTIBLE); | 327 | __set_current_state(TASK_UNINTERRUPTIBLE); |
328 | __add_wait_queue(&pool->waitq, &wait); | 328 | __add_wait_queue(&pool->waitq, &wait); |
329 | spin_unlock_irqrestore(&pool->lock, flags); | 329 | spin_unlock_irqrestore(&pool->lock, flags); |
330 | 330 | ||
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); | |||
355 | 355 | ||
356 | static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) | 356 | static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) |
357 | { | 357 | { |
358 | unsigned long flags; | ||
359 | struct dma_page *page; | 358 | struct dma_page *page; |
360 | 359 | ||
361 | spin_lock_irqsave(&pool->lock, flags); | ||
362 | list_for_each_entry(page, &pool->page_list, page_list) { | 360 | list_for_each_entry(page, &pool->page_list, page_list) { |
363 | if (dma < page->dma) | 361 | if (dma < page->dma) |
364 | continue; | 362 | continue; |
365 | if (dma < (page->dma + pool->allocation)) | 363 | if (dma < (page->dma + pool->allocation)) |
366 | goto done; | 364 | return page; |
367 | } | 365 | } |
368 | page = NULL; | 366 | return NULL; |
369 | done: | ||
370 | spin_unlock_irqrestore(&pool->lock, flags); | ||
371 | return page; | ||
372 | } | 367 | } |
373 | 368 | ||
374 | /** | 369 | /** |
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
386 | unsigned long flags; | 381 | unsigned long flags; |
387 | unsigned int offset; | 382 | unsigned int offset; |
388 | 383 | ||
384 | spin_lock_irqsave(&pool->lock, flags); | ||
389 | page = pool_find_page(pool, dma); | 385 | page = pool_find_page(pool, dma); |
390 | if (!page) { | 386 | if (!page) { |
387 | spin_unlock_irqrestore(&pool->lock, flags); | ||
391 | if (pool->dev) | 388 | if (pool->dev) |
392 | dev_err(pool->dev, | 389 | dev_err(pool->dev, |
393 | "dma_pool_free %s, %p/%lx (bad dma)\n", | 390 | "dma_pool_free %s, %p/%lx (bad dma)\n", |
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
401 | offset = vaddr - page->vaddr; | 398 | offset = vaddr - page->vaddr; |
402 | #ifdef DMAPOOL_DEBUG | 399 | #ifdef DMAPOOL_DEBUG |
403 | if ((dma - page->dma) != offset) { | 400 | if ((dma - page->dma) != offset) { |
401 | spin_unlock_irqrestore(&pool->lock, flags); | ||
404 | if (pool->dev) | 402 | if (pool->dev) |
405 | dev_err(pool->dev, | 403 | dev_err(pool->dev, |
406 | "dma_pool_free %s, %p (bad vaddr)/%Lx\n", | 404 | "dma_pool_free %s, %p (bad vaddr)/%Lx\n", |
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
418 | chain = *(int *)(page->vaddr + chain); | 416 | chain = *(int *)(page->vaddr + chain); |
419 | continue; | 417 | continue; |
420 | } | 418 | } |
419 | spin_unlock_irqrestore(&pool->lock, flags); | ||
421 | if (pool->dev) | 420 | if (pool->dev) |
422 | dev_err(pool->dev, "dma_pool_free %s, dma %Lx " | 421 | dev_err(pool->dev, "dma_pool_free %s, dma %Lx " |
423 | "already free\n", pool->name, | 422 | "already free\n", pool->name, |
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
432 | memset(vaddr, POOL_POISON_FREED, pool->size); | 431 | memset(vaddr, POOL_POISON_FREED, pool->size); |
433 | #endif | 432 | #endif |
434 | 433 | ||
435 | spin_lock_irqsave(&pool->lock, flags); | ||
436 | page->in_use--; | 434 | page->in_use--; |
437 | *(int *)vaddr = page->offset; | 435 | *(int *)vaddr = page->offset; |
438 | page->offset = offset; | 436 | page->offset = offset; |
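
Taken together, the locking hunks here move the pool->lock acquisition out of pool_find_page() and into dma_pool_free(): the lock is now taken once before the lookup, every early-error path releases it explicitly, and the old lock/unlock pair around the final bookkeeping disappears. The practical effect is that the page returned by pool_find_page() can no longer be freed or reused by another CPU in the window between the lookup and the freelist update.
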
diff --git a/mm/filemap.c b/mm/filemap.c
index 6b9aee20f242..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@ | |||
102 | * ->inode_lock (zap_pte_range->set_page_dirty) | 102 | * ->inode_lock (zap_pte_range->set_page_dirty) |
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 104 | * |
105 | * ->task->proc_lock | ||
106 | * ->dcache_lock (proc_pid_lookup) | ||
107 | * | ||
108 | * (code doesn't rely on that order, so you could switch it around) | 105 | * (code doesn't rely on that order, so you could switch it around) |
109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 106 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
110 | * ->i_mmap_lock | 107 | * ->i_mmap_lock |
@@ -301,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | |||
301 | continue; | 298 | continue; |
302 | 299 | ||
303 | wait_on_page_writeback(page); | 300 | wait_on_page_writeback(page); |
304 | if (PageError(page)) | 301 | if (TestClearPageError(page)) |
305 | ret = -EIO; | 302 | ret = -EIO; |
306 | } | 303 | } |
307 | pagevec_release(&pvec); | 304 | pagevec_release(&pvec); |
@@ -840,9 +837,6 @@ repeat: | |||
840 | if (radix_tree_deref_retry(page)) | 837 | if (radix_tree_deref_retry(page)) |
841 | goto restart; | 838 | goto restart; |
842 | 839 | ||
843 | if (page->mapping == NULL || page->index != index) | ||
844 | break; | ||
845 | |||
846 | if (!page_cache_get_speculative(page)) | 840 | if (!page_cache_get_speculative(page)) |
847 | goto repeat; | 841 | goto repeat; |
848 | 842 | ||
@@ -852,6 +846,16 @@ repeat: | |||
852 | goto repeat; | 846 | goto repeat; |
853 | } | 847 | } |
854 | 848 | ||
849 | /* | ||
850 | * must check mapping and index after taking the ref. | ||
851 | * otherwise we can get both false positives and false | ||
852 | * negatives, which is just confusing to the caller. | ||
853 | */ | ||
854 | if (page->mapping == NULL || page->index != index) { | ||
855 | page_cache_release(page); | ||
856 | break; | ||
857 | } | ||
858 | |||
855 | pages[ret] = page; | 859 | pages[ret] = page; |
856 | ret++; | 860 | ret++; |
857 | index++; | 861 | index++; |
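
The reordering in find_get_pages() follows the lockless pagecache rule spelled out in the new comment: take the speculative reference first, then re-check page->mapping and page->index. Checking before taking the reference, as the removed lines did, can produce both false positives and false negatives for pages that are being truncated or reused concurrently; with the new order a failed re-check simply drops the just-taken reference and ends the batch.
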
@@ -2223,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2223 | gfp_notmask = __GFP_FS; | 2227 | gfp_notmask = __GFP_FS; |
2224 | repeat: | 2228 | repeat: |
2225 | page = find_lock_page(mapping, index); | 2229 | page = find_lock_page(mapping, index); |
2226 | if (likely(page)) | 2230 | if (page) |
2227 | return page; | 2231 | return page; |
2228 | 2232 | ||
2229 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); | 2233 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
5 | * the COPYING file in the top-level directory. | ||
6 | */ | ||
7 | |||
8 | #include <linux/mm.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/highmem.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | #include <linux/mmu_notifier.h> | ||
13 | #include <linux/rmap.h> | ||
14 | #include <linux/swap.h> | ||
15 | #include <linux/mm_inline.h> | ||
16 | #include <linux/kthread.h> | ||
17 | #include <linux/khugepaged.h> | ||
18 | #include <linux/freezer.h> | ||
19 | #include <linux/mman.h> | ||
20 | #include <asm/tlb.h> | ||
21 | #include <asm/pgalloc.h> | ||
22 | #include "internal.h" | ||
23 | |||
24 | /* | ||
25 | * By default transparent hugepage support is enabled for all mappings | ||
26 | * and khugepaged scans all mappings. Defrag is only invoked by | ||
27 | * khugepaged hugepage allocations and by page faults inside | ||
28 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | ||
29 | * allocations. | ||
30 | */ | ||
31 | unsigned long transparent_hugepage_flags __read_mostly = | ||
32 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | ||
33 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| | ||
34 | #endif | ||
35 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE | ||
36 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | ||
37 | #endif | ||
38 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | ||
39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
40 | |||
41 | /* default scan 8*512 pte (or vmas) every 30 second */ | ||
42 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | ||
43 | static unsigned int khugepaged_pages_collapsed; | ||
44 | static unsigned int khugepaged_full_scans; | ||
45 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | ||
46 | /* during fragmentation poll the hugepage allocator once every minute */ | ||
47 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | ||
48 | static struct task_struct *khugepaged_thread __read_mostly; | ||
49 | static DEFINE_MUTEX(khugepaged_mutex); | ||
50 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | ||
51 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | ||
52 | /* | ||
53 | * default collapse hugepages if there is at least one pte mapped like | ||
54 | * it would have happened if the vma was large enough during page | ||
55 | * fault. | ||
56 | */ | ||
57 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | ||
58 | |||
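
In concrete terms these defaults mean khugepaged examines 8 * 512 = 4096 ptes per wakeup, sleeps 10 seconds between scan batches (the "30 second" in the comment does not match the 10000 ms default actually set here), and backs off for a full minute after a failed hugepage allocation.
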
59 | static int khugepaged(void *none); | ||
60 | static int mm_slots_hash_init(void); | ||
61 | static int khugepaged_slab_init(void); | ||
62 | static void khugepaged_slab_free(void); | ||
63 | |||
64 | #define MM_SLOTS_HASH_HEADS 1024 | ||
65 | static struct hlist_head *mm_slots_hash __read_mostly; | ||
66 | static struct kmem_cache *mm_slot_cache __read_mostly; | ||
67 | |||
68 | /** | ||
69 | * struct mm_slot - hash lookup from mm to mm_slot | ||
70 | * @hash: hash collision list | ||
71 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | ||
72 | * @mm: the mm that this information is valid for | ||
73 | */ | ||
74 | struct mm_slot { | ||
75 | struct hlist_node hash; | ||
76 | struct list_head mm_node; | ||
77 | struct mm_struct *mm; | ||
78 | }; | ||
79 | |||
80 | /** | ||
81 | * struct khugepaged_scan - cursor for scanning | ||
82 | * @mm_head: the head of the mm list to scan | ||
83 | * @mm_slot: the current mm_slot we are scanning | ||
84 | * @address: the next address inside that to be scanned | ||
85 | * | ||
86 | * There is only the one khugepaged_scan instance of this cursor structure. | ||
87 | */ | ||
88 | struct khugepaged_scan { | ||
89 | struct list_head mm_head; | ||
90 | struct mm_slot *mm_slot; | ||
91 | unsigned long address; | ||
92 | } khugepaged_scan = { | ||
93 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | ||
94 | }; | ||
95 | |||
96 | |||
97 | static int set_recommended_min_free_kbytes(void) | ||
98 | { | ||
99 | struct zone *zone; | ||
100 | int nr_zones = 0; | ||
101 | unsigned long recommended_min; | ||
102 | extern int min_free_kbytes; | ||
103 | |||
104 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
105 | &transparent_hugepage_flags) && | ||
106 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
107 | &transparent_hugepage_flags)) | ||
108 | return 0; | ||
109 | |||
110 | for_each_populated_zone(zone) | ||
111 | nr_zones++; | ||
112 | |||
113 | /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ | ||
114 | recommended_min = pageblock_nr_pages * nr_zones * 2; | ||
115 | |||
116 | /* | ||
117 | * Make sure that on average at least two pageblocks are almost free | ||
118 | * of another type, one for a migratetype to fall back to and a | ||
119 | * second to avoid subsequent fallbacks of other types There are 3 | ||
120 | * MIGRATE_TYPES we care about. | ||
121 | */ | ||
122 | recommended_min += pageblock_nr_pages * nr_zones * | ||
123 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | ||
124 | |||
125 | /* don't ever allow to reserve more than 5% of the lowmem */ | ||
126 | recommended_min = min(recommended_min, | ||
127 | (unsigned long) nr_free_buffer_pages() / 20); | ||
128 | recommended_min <<= (PAGE_SHIFT-10); | ||
129 | |||
130 | if (recommended_min > min_free_kbytes) | ||
131 | min_free_kbytes = recommended_min; | ||
132 | setup_per_zone_wmarks(); | ||
133 | return 0; | ||
134 | } | ||
135 | late_initcall(set_recommended_min_free_kbytes); | ||
136 | |||
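
To put rough numbers on that reservation (illustrative assumptions: 4KB base pages, 2MB pageblocks so pageblock_nr_pages = 512, and three populated zones): the MIGRATE_RESERVE term is 512 * 3 * 2 = 3072 pages, the fallback term adds 512 * 3 * 3 * 3 = 13824 pages, giving 16896 pages or about 66MB of min_free_kbytes before the 5%-of-lowmem clamp and the comparison against the existing value are applied.
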
137 | static int start_khugepaged(void) | ||
138 | { | ||
139 | int err = 0; | ||
140 | if (khugepaged_enabled()) { | ||
141 | int wakeup; | ||
142 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
143 | err = -ENOMEM; | ||
144 | goto out; | ||
145 | } | ||
146 | mutex_lock(&khugepaged_mutex); | ||
147 | if (!khugepaged_thread) | ||
148 | khugepaged_thread = kthread_run(khugepaged, NULL, | ||
149 | "khugepaged"); | ||
150 | if (unlikely(IS_ERR(khugepaged_thread))) { | ||
151 | printk(KERN_ERR | ||
152 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
153 | err = PTR_ERR(khugepaged_thread); | ||
154 | khugepaged_thread = NULL; | ||
155 | } | ||
156 | wakeup = !list_empty(&khugepaged_scan.mm_head); | ||
157 | mutex_unlock(&khugepaged_mutex); | ||
158 | if (wakeup) | ||
159 | wake_up_interruptible(&khugepaged_wait); | ||
160 | |||
161 | set_recommended_min_free_kbytes(); | ||
162 | } else | ||
163 | /* wakeup to exit */ | ||
164 | wake_up_interruptible(&khugepaged_wait); | ||
165 | out: | ||
166 | return err; | ||
167 | } | ||
168 | |||
169 | #ifdef CONFIG_SYSFS | ||
170 | |||
171 | static ssize_t double_flag_show(struct kobject *kobj, | ||
172 | struct kobj_attribute *attr, char *buf, | ||
173 | enum transparent_hugepage_flag enabled, | ||
174 | enum transparent_hugepage_flag req_madv) | ||
175 | { | ||
176 | if (test_bit(enabled, &transparent_hugepage_flags)) { | ||
177 | VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); | ||
178 | return sprintf(buf, "[always] madvise never\n"); | ||
179 | } else if (test_bit(req_madv, &transparent_hugepage_flags)) | ||
180 | return sprintf(buf, "always [madvise] never\n"); | ||
181 | else | ||
182 | return sprintf(buf, "always madvise [never]\n"); | ||
183 | } | ||
184 | static ssize_t double_flag_store(struct kobject *kobj, | ||
185 | struct kobj_attribute *attr, | ||
186 | const char *buf, size_t count, | ||
187 | enum transparent_hugepage_flag enabled, | ||
188 | enum transparent_hugepage_flag req_madv) | ||
189 | { | ||
190 | if (!memcmp("always", buf, | ||
191 | min(sizeof("always")-1, count))) { | ||
192 | set_bit(enabled, &transparent_hugepage_flags); | ||
193 | clear_bit(req_madv, &transparent_hugepage_flags); | ||
194 | } else if (!memcmp("madvise", buf, | ||
195 | min(sizeof("madvise")-1, count))) { | ||
196 | clear_bit(enabled, &transparent_hugepage_flags); | ||
197 | set_bit(req_madv, &transparent_hugepage_flags); | ||
198 | } else if (!memcmp("never", buf, | ||
199 | min(sizeof("never")-1, count))) { | ||
200 | clear_bit(enabled, &transparent_hugepage_flags); | ||
201 | clear_bit(req_madv, &transparent_hugepage_flags); | ||
202 | } else | ||
203 | return -EINVAL; | ||
204 | |||
205 | return count; | ||
206 | } | ||
207 | |||
208 | static ssize_t enabled_show(struct kobject *kobj, | ||
209 | struct kobj_attribute *attr, char *buf) | ||
210 | { | ||
211 | return double_flag_show(kobj, attr, buf, | ||
212 | TRANSPARENT_HUGEPAGE_FLAG, | ||
213 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
214 | } | ||
215 | static ssize_t enabled_store(struct kobject *kobj, | ||
216 | struct kobj_attribute *attr, | ||
217 | const char *buf, size_t count) | ||
218 | { | ||
219 | ssize_t ret; | ||
220 | |||
221 | ret = double_flag_store(kobj, attr, buf, count, | ||
222 | TRANSPARENT_HUGEPAGE_FLAG, | ||
223 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
224 | |||
225 | if (ret > 0) { | ||
226 | int err = start_khugepaged(); | ||
227 | if (err) | ||
228 | ret = err; | ||
229 | } | ||
230 | |||
231 | if (ret > 0 && | ||
232 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
233 | &transparent_hugepage_flags) || | ||
234 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
235 | &transparent_hugepage_flags))) | ||
236 | set_recommended_min_free_kbytes(); | ||
237 | |||
238 | return ret; | ||
239 | } | ||
240 | static struct kobj_attribute enabled_attr = | ||
241 | __ATTR(enabled, 0644, enabled_show, enabled_store); | ||
242 | |||
243 | static ssize_t single_flag_show(struct kobject *kobj, | ||
244 | struct kobj_attribute *attr, char *buf, | ||
245 | enum transparent_hugepage_flag flag) | ||
246 | { | ||
247 | if (test_bit(flag, &transparent_hugepage_flags)) | ||
248 | return sprintf(buf, "[yes] no\n"); | ||
249 | else | ||
250 | return sprintf(buf, "yes [no]\n"); | ||
251 | } | ||
252 | static ssize_t single_flag_store(struct kobject *kobj, | ||
253 | struct kobj_attribute *attr, | ||
254 | const char *buf, size_t count, | ||
255 | enum transparent_hugepage_flag flag) | ||
256 | { | ||
257 | if (!memcmp("yes", buf, | ||
258 | min(sizeof("yes")-1, count))) { | ||
259 | set_bit(flag, &transparent_hugepage_flags); | ||
260 | } else if (!memcmp("no", buf, | ||
261 | min(sizeof("no")-1, count))) { | ||
262 | clear_bit(flag, &transparent_hugepage_flags); | ||
263 | } else | ||
264 | return -EINVAL; | ||
265 | |||
266 | return count; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Currently defrag only disables __GFP_NOWAIT for allocation. A blind | ||
271 | * __GFP_REPEAT is too aggressive, it's never worth swapping tons of | ||
272 | * memory just to allocate one more hugepage. | ||
273 | */ | ||
274 | static ssize_t defrag_show(struct kobject *kobj, | ||
275 | struct kobj_attribute *attr, char *buf) | ||
276 | { | ||
277 | return double_flag_show(kobj, attr, buf, | ||
278 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | ||
279 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | ||
280 | } | ||
281 | static ssize_t defrag_store(struct kobject *kobj, | ||
282 | struct kobj_attribute *attr, | ||
283 | const char *buf, size_t count) | ||
284 | { | ||
285 | return double_flag_store(kobj, attr, buf, count, | ||
286 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | ||
287 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); | ||
288 | } | ||
289 | static struct kobj_attribute defrag_attr = | ||
290 | __ATTR(defrag, 0644, defrag_show, defrag_store); | ||
291 | |||
292 | #ifdef CONFIG_DEBUG_VM | ||
293 | static ssize_t debug_cow_show(struct kobject *kobj, | ||
294 | struct kobj_attribute *attr, char *buf) | ||
295 | { | ||
296 | return single_flag_show(kobj, attr, buf, | ||
297 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | ||
298 | } | ||
299 | static ssize_t debug_cow_store(struct kobject *kobj, | ||
300 | struct kobj_attribute *attr, | ||
301 | const char *buf, size_t count) | ||
302 | { | ||
303 | return single_flag_store(kobj, attr, buf, count, | ||
304 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | ||
305 | } | ||
306 | static struct kobj_attribute debug_cow_attr = | ||
307 | __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); | ||
308 | #endif /* CONFIG_DEBUG_VM */ | ||
309 | |||
310 | static struct attribute *hugepage_attr[] = { | ||
311 | &enabled_attr.attr, | ||
312 | &defrag_attr.attr, | ||
313 | #ifdef CONFIG_DEBUG_VM | ||
314 | &debug_cow_attr.attr, | ||
315 | #endif | ||
316 | NULL, | ||
317 | }; | ||
318 | |||
319 | static struct attribute_group hugepage_attr_group = { | ||
320 | .attrs = hugepage_attr, | ||
321 | }; | ||
322 | |||
323 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | ||
324 | struct kobj_attribute *attr, | ||
325 | char *buf) | ||
326 | { | ||
327 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | ||
328 | } | ||
329 | |||
330 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | ||
331 | struct kobj_attribute *attr, | ||
332 | const char *buf, size_t count) | ||
333 | { | ||
334 | unsigned long msecs; | ||
335 | int err; | ||
336 | |||
337 | err = strict_strtoul(buf, 10, &msecs); | ||
338 | if (err || msecs > UINT_MAX) | ||
339 | return -EINVAL; | ||
340 | |||
341 | khugepaged_scan_sleep_millisecs = msecs; | ||
342 | wake_up_interruptible(&khugepaged_wait); | ||
343 | |||
344 | return count; | ||
345 | } | ||
346 | static struct kobj_attribute scan_sleep_millisecs_attr = | ||
347 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | ||
348 | scan_sleep_millisecs_store); | ||
349 | |||
350 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | ||
351 | struct kobj_attribute *attr, | ||
352 | char *buf) | ||
353 | { | ||
354 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | ||
355 | } | ||
356 | |||
357 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | ||
358 | struct kobj_attribute *attr, | ||
359 | const char *buf, size_t count) | ||
360 | { | ||
361 | unsigned long msecs; | ||
362 | int err; | ||
363 | |||
364 | err = strict_strtoul(buf, 10, &msecs); | ||
365 | if (err || msecs > UINT_MAX) | ||
366 | return -EINVAL; | ||
367 | |||
368 | khugepaged_alloc_sleep_millisecs = msecs; | ||
369 | wake_up_interruptible(&khugepaged_wait); | ||
370 | |||
371 | return count; | ||
372 | } | ||
373 | static struct kobj_attribute alloc_sleep_millisecs_attr = | ||
374 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | ||
375 | alloc_sleep_millisecs_store); | ||
376 | |||
377 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
378 | struct kobj_attribute *attr, | ||
379 | char *buf) | ||
380 | { | ||
381 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | ||
382 | } | ||
383 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
384 | struct kobj_attribute *attr, | ||
385 | const char *buf, size_t count) | ||
386 | { | ||
387 | int err; | ||
388 | unsigned long pages; | ||
389 | |||
390 | err = strict_strtoul(buf, 10, &pages); | ||
391 | if (err || !pages || pages > UINT_MAX) | ||
392 | return -EINVAL; | ||
393 | |||
394 | khugepaged_pages_to_scan = pages; | ||
395 | |||
396 | return count; | ||
397 | } | ||
398 | static struct kobj_attribute pages_to_scan_attr = | ||
399 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | ||
400 | pages_to_scan_store); | ||
401 | |||
402 | static ssize_t pages_collapsed_show(struct kobject *kobj, | ||
403 | struct kobj_attribute *attr, | ||
404 | char *buf) | ||
405 | { | ||
406 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | ||
407 | } | ||
408 | static struct kobj_attribute pages_collapsed_attr = | ||
409 | __ATTR_RO(pages_collapsed); | ||
410 | |||
411 | static ssize_t full_scans_show(struct kobject *kobj, | ||
412 | struct kobj_attribute *attr, | ||
413 | char *buf) | ||
414 | { | ||
415 | return sprintf(buf, "%u\n", khugepaged_full_scans); | ||
416 | } | ||
417 | static struct kobj_attribute full_scans_attr = | ||
418 | __ATTR_RO(full_scans); | ||
419 | |||
420 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | ||
421 | struct kobj_attribute *attr, char *buf) | ||
422 | { | ||
423 | return single_flag_show(kobj, attr, buf, | ||
424 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
425 | } | ||
426 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | ||
427 | struct kobj_attribute *attr, | ||
428 | const char *buf, size_t count) | ||
429 | { | ||
430 | return single_flag_store(kobj, attr, buf, count, | ||
431 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
432 | } | ||
433 | static struct kobj_attribute khugepaged_defrag_attr = | ||
434 | __ATTR(defrag, 0644, khugepaged_defrag_show, | ||
435 | khugepaged_defrag_store); | ||
436 | |||
437 | /* | ||
438 | * max_ptes_none controls if khugepaged should collapse hugepages over | ||
439 | * any unmapped ptes in turn potentially increasing the memory | ||
440 | * footprint of the vmas. When max_ptes_none is 0 khugepaged will not | ||
441 | * reduce the available free memory in the system as it | ||
442 | * runs. Increasing max_ptes_none will instead potentially reduce the | ||
443 | * free memory in the system during the khugepaged scan. | ||
444 | */ | ||
445 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | ||
446 | struct kobj_attribute *attr, | ||
447 | char *buf) | ||
448 | { | ||
449 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | ||
450 | } | ||
451 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | ||
452 | struct kobj_attribute *attr, | ||
453 | const char *buf, size_t count) | ||
454 | { | ||
455 | int err; | ||
456 | unsigned long max_ptes_none; | ||
457 | |||
458 | err = strict_strtoul(buf, 10, &max_ptes_none); | ||
459 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | ||
460 | return -EINVAL; | ||
461 | |||
462 | khugepaged_max_ptes_none = max_ptes_none; | ||
463 | |||
464 | return count; | ||
465 | } | ||
466 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | ||
467 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | ||
468 | khugepaged_max_ptes_none_store); | ||
469 | |||
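
For scale: on x86, where a pmd maps 512 ptes, the default of HPAGE_PMD_NR - 1 = 511 set earlier means khugepaged may collapse a 2MB range even when only a single pte in it is populated, trading up to 511 newly allocated pages for the huge mapping, whereas max_ptes_none = 0 limits collapse to ranges that are already fully mapped.
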
470 | static struct attribute *khugepaged_attr[] = { | ||
471 | &khugepaged_defrag_attr.attr, | ||
472 | &khugepaged_max_ptes_none_attr.attr, | ||
473 | &pages_to_scan_attr.attr, | ||
474 | &pages_collapsed_attr.attr, | ||
475 | &full_scans_attr.attr, | ||
476 | &scan_sleep_millisecs_attr.attr, | ||
477 | &alloc_sleep_millisecs_attr.attr, | ||
478 | NULL, | ||
479 | }; | ||
480 | |||
481 | static struct attribute_group khugepaged_attr_group = { | ||
482 | .attrs = khugepaged_attr, | ||
483 | .name = "khugepaged", | ||
484 | }; | ||
485 | #endif /* CONFIG_SYSFS */ | ||
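
Since hugepage_init() below creates the "transparent_hugepage" kobject under mm_kobj, these attribute groups surface as /sys/kernel/mm/transparent_hugepage/ and /sys/kernel/mm/transparent_hugepage/khugepaged/. A minimal userspace sketch of flipping the policy (error handling trimmed; the accepted tokens mirror double_flag_store() above):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "w");

	if (!f)
		return 1;
	/* "always", "madvise" or "never", as parsed by enabled_store() */
	fputs("madvise", f);
	return fclose(f) ? 1 : 0;
}
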
486 | |||
487 | static int __init hugepage_init(void) | ||
488 | { | ||
489 | int err; | ||
490 | #ifdef CONFIG_SYSFS | ||
491 | static struct kobject *hugepage_kobj; | ||
492 | #endif | ||
493 | |||
494 | err = -EINVAL; | ||
495 | if (!has_transparent_hugepage()) { | ||
496 | transparent_hugepage_flags = 0; | ||
497 | goto out; | ||
498 | } | ||
499 | |||
500 | #ifdef CONFIG_SYSFS | ||
501 | err = -ENOMEM; | ||
502 | hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | ||
503 | if (unlikely(!hugepage_kobj)) { | ||
504 | printk(KERN_ERR "hugepage: failed kobject create\n"); | ||
505 | goto out; | ||
506 | } | ||
507 | |||
508 | err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); | ||
509 | if (err) { | ||
510 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | ||
511 | goto out; | ||
512 | } | ||
513 | |||
514 | err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); | ||
515 | if (err) { | ||
516 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | ||
517 | goto out; | ||
518 | } | ||
519 | #endif | ||
520 | |||
521 | err = khugepaged_slab_init(); | ||
522 | if (err) | ||
523 | goto out; | ||
524 | |||
525 | err = mm_slots_hash_init(); | ||
526 | if (err) { | ||
527 | khugepaged_slab_free(); | ||
528 | goto out; | ||
529 | } | ||
530 | |||
531 | /* | ||
532 | * By default disable transparent hugepages on smaller systems, | ||
533 | * where the extra memory used could hurt more than TLB overhead | ||
534 | * is likely to save. The admin can still enable it through /sys. | ||
535 | */ | ||
536 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | ||
537 | transparent_hugepage_flags = 0; | ||
538 | |||
539 | start_khugepaged(); | ||
540 | |||
541 | set_recommended_min_free_kbytes(); | ||
542 | |||
543 | out: | ||
544 | return err; | ||
545 | } | ||
546 | module_init(hugepage_init) | ||
547 | |||
548 | static int __init setup_transparent_hugepage(char *str) | ||
549 | { | ||
550 | int ret = 0; | ||
551 | if (!str) | ||
552 | goto out; | ||
553 | if (!strcmp(str, "always")) { | ||
554 | set_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
555 | &transparent_hugepage_flags); | ||
556 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
557 | &transparent_hugepage_flags); | ||
558 | ret = 1; | ||
559 | } else if (!strcmp(str, "madvise")) { | ||
560 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
561 | &transparent_hugepage_flags); | ||
562 | set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
563 | &transparent_hugepage_flags); | ||
564 | ret = 1; | ||
565 | } else if (!strcmp(str, "never")) { | ||
566 | clear_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
567 | &transparent_hugepage_flags); | ||
568 | clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
569 | &transparent_hugepage_flags); | ||
570 | ret = 1; | ||
571 | } | ||
572 | out: | ||
573 | if (!ret) | ||
574 | printk(KERN_WARNING | ||
575 | "transparent_hugepage= cannot parse, ignored\n"); | ||
576 | return ret; | ||
577 | } | ||
578 | __setup("transparent_hugepage=", setup_transparent_hugepage); | ||
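
For example, booting with transparent_hugepage=madvise on the kernel command line selects the same policy as writing "madvise" to the sysfs file, but before any initcall runs; any unrecognised value only triggers the warning above and leaves the compile-time default untouched.
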
579 | |||
580 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
581 | struct mm_struct *mm) | ||
582 | { | ||
583 | assert_spin_locked(&mm->page_table_lock); | ||
584 | |||
585 | /* FIFO */ | ||
586 | if (!mm->pmd_huge_pte) | ||
587 | INIT_LIST_HEAD(&pgtable->lru); | ||
588 | else | ||
589 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
590 | mm->pmd_huge_pte = pgtable; | ||
591 | } | ||
592 | |||
593 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | ||
594 | { | ||
595 | if (likely(vma->vm_flags & VM_WRITE)) | ||
596 | pmd = pmd_mkwrite(pmd); | ||
597 | return pmd; | ||
598 | } | ||
599 | |||
600 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | ||
601 | struct vm_area_struct *vma, | ||
602 | unsigned long haddr, pmd_t *pmd, | ||
603 | struct page *page) | ||
604 | { | ||
605 | int ret = 0; | ||
606 | pgtable_t pgtable; | ||
607 | |||
608 | VM_BUG_ON(!PageCompound(page)); | ||
609 | pgtable = pte_alloc_one(mm, haddr); | ||
610 | if (unlikely(!pgtable)) { | ||
611 | mem_cgroup_uncharge_page(page); | ||
612 | put_page(page); | ||
613 | return VM_FAULT_OOM; | ||
614 | } | ||
615 | |||
616 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | ||
617 | __SetPageUptodate(page); | ||
618 | |||
619 | spin_lock(&mm->page_table_lock); | ||
620 | if (unlikely(!pmd_none(*pmd))) { | ||
621 | spin_unlock(&mm->page_table_lock); | ||
622 | mem_cgroup_uncharge_page(page); | ||
623 | put_page(page); | ||
624 | pte_free(mm, pgtable); | ||
625 | } else { | ||
626 | pmd_t entry; | ||
627 | entry = mk_pmd(page, vma->vm_page_prot); | ||
628 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
629 | entry = pmd_mkhuge(entry); | ||
630 | /* | ||
631 | * The spinlocking to take the lru_lock inside | ||
632 | * page_add_new_anon_rmap() acts as a full memory | ||
633 | * barrier to be sure clear_huge_page writes become | ||
634 | * visible after the set_pmd_at() write. | ||
635 | */ | ||
636 | page_add_new_anon_rmap(page, vma, haddr); | ||
637 | set_pmd_at(mm, haddr, pmd, entry); | ||
638 | prepare_pmd_huge_pte(pgtable, mm); | ||
639 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
640 | spin_unlock(&mm->page_table_lock); | ||
641 | } | ||
642 | |||
643 | return ret; | ||
644 | } | ||
645 | |||
646 | static inline gfp_t alloc_hugepage_gfpmask(int defrag) | ||
647 | { | ||
648 | return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); | ||
649 | } | ||
650 | |||
651 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
652 | struct vm_area_struct *vma, | ||
653 | unsigned long haddr) | ||
654 | { | ||
655 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), | ||
656 | HPAGE_PMD_ORDER, vma, haddr); | ||
657 | } | ||
658 | |||
659 | #ifndef CONFIG_NUMA | ||
660 | static inline struct page *alloc_hugepage(int defrag) | ||
661 | { | ||
662 | return alloc_pages(alloc_hugepage_gfpmask(defrag), | ||
663 | HPAGE_PMD_ORDER); | ||
664 | } | ||
665 | #endif | ||
666 | |||
667 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
668 | unsigned long address, pmd_t *pmd, | ||
669 | unsigned int flags) | ||
670 | { | ||
671 | struct page *page; | ||
672 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
673 | pte_t *pte; | ||
674 | |||
675 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | ||
676 | if (unlikely(anon_vma_prepare(vma))) | ||
677 | return VM_FAULT_OOM; | ||
678 | if (unlikely(khugepaged_enter(vma))) | ||
679 | return VM_FAULT_OOM; | ||
680 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
681 | vma, haddr); | ||
682 | if (unlikely(!page)) | ||
683 | goto out; | ||
684 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | ||
685 | put_page(page); | ||
686 | goto out; | ||
687 | } | ||
688 | |||
689 | return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); | ||
690 | } | ||
691 | out: | ||
692 | /* | ||
693 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
694 | * run pte_offset_map on the pmd while a huge pmd could | ||
695 | * materialize under us from a different thread. | ||
696 | */ | ||
697 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
698 | return VM_FAULT_OOM; | ||
699 | /* if a huge pmd materialized from under us just retry later */ | ||
700 | if (unlikely(pmd_trans_huge(*pmd))) | ||
701 | return 0; | ||
702 | /* | ||
703 | * A regular pmd is established and it can't morph into a huge pmd | ||
704 | * from under us anymore at this point because we hold the mmap_sem | ||
705 | * read mode and khugepaged takes it in write mode. So now it's | ||
706 | * safe to run pte_offset_map(). | ||
707 | */ | ||
708 | pte = pte_offset_map(pmd, address); | ||
709 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | ||
710 | } | ||
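do_huge_pmd_anonymous_page() only attempts a huge fault when the whole pmd-aligned range around the faulting address fits inside the vma. A small userspace sketch of that alignment check, assuming the x86-64 2MB huge pmd size (the DEMO_HPAGE_* names are local stand-ins, not the kernel macros):

	#include <stdio.h>

	#define DEMO_HPAGE_PMD_SIZE (2UL << 20)
	#define DEMO_HPAGE_PMD_MASK (~(DEMO_HPAGE_PMD_SIZE - 1))

	int main(void)
	{
		unsigned long vm_start = 0x7f1234400000UL, vm_end = 0x7f1234a00000UL;
		unsigned long address = 0x7f1234567890UL;
		unsigned long haddr = address & DEMO_HPAGE_PMD_MASK;

		/* same condition as the "haddr >= vma->vm_start ..." test above */
		if (haddr >= vm_start && haddr + DEMO_HPAGE_PMD_SIZE <= vm_end)
			printf("fault at %#lx can use the huge pmd at %#lx\n",
			       address, haddr);
		else
			printf("range doesn't fit, fall back to 4KB ptes\n");
		return 0;
	}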
711 | |||
712 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||
713 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | ||
714 | struct vm_area_struct *vma) | ||
715 | { | ||
716 | struct page *src_page; | ||
717 | pmd_t pmd; | ||
718 | pgtable_t pgtable; | ||
719 | int ret; | ||
720 | |||
721 | ret = -ENOMEM; | ||
722 | pgtable = pte_alloc_one(dst_mm, addr); | ||
723 | if (unlikely(!pgtable)) | ||
724 | goto out; | ||
725 | |||
726 | spin_lock(&dst_mm->page_table_lock); | ||
727 | spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); | ||
728 | |||
729 | ret = -EAGAIN; | ||
730 | pmd = *src_pmd; | ||
731 | if (unlikely(!pmd_trans_huge(pmd))) { | ||
732 | pte_free(dst_mm, pgtable); | ||
733 | goto out_unlock; | ||
734 | } | ||
735 | if (unlikely(pmd_trans_splitting(pmd))) { | ||
736 | /* split huge page running from under us */ | ||
737 | spin_unlock(&src_mm->page_table_lock); | ||
738 | spin_unlock(&dst_mm->page_table_lock); | ||
739 | pte_free(dst_mm, pgtable); | ||
740 | |||
741 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ | ||
742 | goto out; | ||
743 | } | ||
744 | src_page = pmd_page(pmd); | ||
745 | VM_BUG_ON(!PageHead(src_page)); | ||
746 | get_page(src_page); | ||
747 | page_dup_rmap(src_page); | ||
748 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
749 | |||
750 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | ||
751 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | ||
752 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | ||
753 | prepare_pmd_huge_pte(pgtable, dst_mm); | ||
754 | |||
755 | ret = 0; | ||
756 | out_unlock: | ||
757 | spin_unlock(&src_mm->page_table_lock); | ||
758 | spin_unlock(&dst_mm->page_table_lock); | ||
759 | out: | ||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | /* no "address" argument, so this destroys the page coloring of some archs */ | ||
764 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
765 | { | ||
766 | pgtable_t pgtable; | ||
767 | |||
768 | assert_spin_locked(&mm->page_table_lock); | ||
769 | |||
770 | /* FIFO */ | ||
771 | pgtable = mm->pmd_huge_pte; | ||
772 | if (list_empty(&pgtable->lru)) | ||
773 | mm->pmd_huge_pte = NULL; | ||
774 | else { | ||
775 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
776 | struct page, lru); | ||
777 | list_del(&pgtable->lru); | ||
778 | } | ||
779 | return pgtable; | ||
780 | } | ||
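get_pmd_huge_pte() withdraws, in FIFO order, a pagetable that was deposited by prepare_pmd_huge_pte() when the huge pmd was installed, so a later split or zap never has to allocate memory. A userspace sketch of that deposit/withdraw discipline, using a plain ring buffer in place of the page->lru list:

	#include <assert.h>

	#define NSLOTS 8
	static void *fifo[NSLOTS];
	static int head, count;

	static void deposit(void *pgtable)	/* ~ prepare_pmd_huge_pte() */
	{
		assert(count < NSLOTS);
		fifo[(head + count++) % NSLOTS] = pgtable;
	}

	static void *withdraw(void)		/* ~ get_pmd_huge_pte() */
	{
		void *pgtable;

		assert(count > 0);
		pgtable = fifo[head];
		head = (head + 1) % NSLOTS;
		count--;
		return pgtable;
	}

	int main(void)
	{
		int a, b;

		deposit(&a);
		deposit(&b);
		assert(withdraw() == &a);	/* first deposited, first withdrawn */
		assert(withdraw() == &b);
		return 0;
	}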
781 | |||
782 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | ||
783 | struct vm_area_struct *vma, | ||
784 | unsigned long address, | ||
785 | pmd_t *pmd, pmd_t orig_pmd, | ||
786 | struct page *page, | ||
787 | unsigned long haddr) | ||
788 | { | ||
789 | pgtable_t pgtable; | ||
790 | pmd_t _pmd; | ||
791 | int ret = 0, i; | ||
792 | struct page **pages; | ||
793 | |||
794 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | ||
795 | GFP_KERNEL); | ||
796 | if (unlikely(!pages)) { | ||
797 | ret |= VM_FAULT_OOM; | ||
798 | goto out; | ||
799 | } | ||
800 | |||
801 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
802 | pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | ||
803 | vma, address); | ||
804 | if (unlikely(!pages[i] || | ||
805 | mem_cgroup_newpage_charge(pages[i], mm, | ||
806 | GFP_KERNEL))) { | ||
807 | if (pages[i]) | ||
808 | put_page(pages[i]); | ||
809 | mem_cgroup_uncharge_start(); | ||
810 | while (--i >= 0) { | ||
811 | mem_cgroup_uncharge_page(pages[i]); | ||
812 | put_page(pages[i]); | ||
813 | } | ||
814 | mem_cgroup_uncharge_end(); | ||
815 | kfree(pages); | ||
816 | ret |= VM_FAULT_OOM; | ||
817 | goto out; | ||
818 | } | ||
819 | } | ||
820 | |||
821 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
822 | copy_user_highpage(pages[i], page + i, | ||
823 | haddr + PAGE_SIZE*i, vma); | ||
824 | __SetPageUptodate(pages[i]); | ||
825 | cond_resched(); | ||
826 | } | ||
827 | |||
828 | spin_lock(&mm->page_table_lock); | ||
829 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
830 | goto out_free_pages; | ||
831 | VM_BUG_ON(!PageHead(page)); | ||
832 | |||
833 | pmdp_clear_flush_notify(vma, haddr, pmd); | ||
834 | /* leave pmd empty until pte is filled */ | ||
835 | |||
836 | pgtable = get_pmd_huge_pte(mm); | ||
837 | pmd_populate(mm, &_pmd, pgtable); | ||
838 | |||
839 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
840 | pte_t *pte, entry; | ||
841 | entry = mk_pte(pages[i], vma->vm_page_prot); | ||
842 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
843 | page_add_new_anon_rmap(pages[i], vma, haddr); | ||
844 | pte = pte_offset_map(&_pmd, haddr); | ||
845 | VM_BUG_ON(!pte_none(*pte)); | ||
846 | set_pte_at(mm, haddr, pte, entry); | ||
847 | pte_unmap(pte); | ||
848 | } | ||
849 | kfree(pages); | ||
850 | |||
851 | mm->nr_ptes++; | ||
852 | smp_wmb(); /* make pte visible before pmd */ | ||
853 | pmd_populate(mm, pmd, pgtable); | ||
854 | page_remove_rmap(page); | ||
855 | spin_unlock(&mm->page_table_lock); | ||
856 | |||
857 | ret |= VM_FAULT_WRITE; | ||
858 | put_page(page); | ||
859 | |||
860 | out: | ||
861 | return ret; | ||
862 | |||
863 | out_free_pages: | ||
864 | spin_unlock(&mm->page_table_lock); | ||
865 | mem_cgroup_uncharge_start(); | ||
866 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
867 | mem_cgroup_uncharge_page(pages[i]); | ||
868 | put_page(pages[i]); | ||
869 | } | ||
870 | mem_cgroup_uncharge_end(); | ||
871 | kfree(pages); | ||
872 | goto out; | ||
873 | } | ||
874 | |||
875 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
876 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | ||
877 | { | ||
878 | int ret = 0; | ||
879 | struct page *page, *new_page; | ||
880 | unsigned long haddr; | ||
881 | |||
882 | VM_BUG_ON(!vma->anon_vma); | ||
883 | spin_lock(&mm->page_table_lock); | ||
884 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
885 | goto out_unlock; | ||
886 | |||
887 | page = pmd_page(orig_pmd); | ||
888 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | ||
889 | haddr = address & HPAGE_PMD_MASK; | ||
890 | if (page_mapcount(page) == 1) { | ||
891 | pmd_t entry; | ||
892 | entry = pmd_mkyoung(orig_pmd); | ||
893 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
894 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | ||
895 | update_mmu_cache(vma, address, entry); | ||
896 | ret |= VM_FAULT_WRITE; | ||
897 | goto out_unlock; | ||
898 | } | ||
899 | get_page(page); | ||
900 | spin_unlock(&mm->page_table_lock); | ||
901 | |||
902 | if (transparent_hugepage_enabled(vma) && | ||
903 | !transparent_hugepage_debug_cow()) | ||
904 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
905 | vma, haddr); | ||
906 | else | ||
907 | new_page = NULL; | ||
908 | |||
909 | if (unlikely(!new_page)) { | ||
910 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | ||
911 | pmd, orig_pmd, page, haddr); | ||
912 | put_page(page); | ||
913 | goto out; | ||
914 | } | ||
915 | |||
916 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
917 | put_page(new_page); | ||
918 | put_page(page); | ||
919 | ret |= VM_FAULT_OOM; | ||
920 | goto out; | ||
921 | } | ||
922 | |||
923 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
924 | __SetPageUptodate(new_page); | ||
925 | |||
926 | spin_lock(&mm->page_table_lock); | ||
927 | put_page(page); | ||
928 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | ||
929 | mem_cgroup_uncharge_page(new_page); | ||
930 | put_page(new_page); | ||
931 | } else { | ||
932 | pmd_t entry; | ||
933 | VM_BUG_ON(!PageHead(page)); | ||
934 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
936 | entry = pmd_mkhuge(entry); | ||
937 | pmdp_clear_flush_notify(vma, haddr, pmd); | ||
938 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
939 | set_pmd_at(mm, haddr, pmd, entry); | ||
940 | update_mmu_cache(vma, address, entry); | ||
941 | page_remove_rmap(page); | ||
942 | put_page(page); | ||
943 | ret |= VM_FAULT_WRITE; | ||
944 | } | ||
945 | out_unlock: | ||
946 | spin_unlock(&mm->page_table_lock); | ||
947 | out: | ||
948 | return ret; | ||
949 | } | ||
950 | |||
951 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | ||
952 | unsigned long addr, | ||
953 | pmd_t *pmd, | ||
954 | unsigned int flags) | ||
955 | { | ||
956 | struct page *page = NULL; | ||
957 | |||
958 | assert_spin_locked(&mm->page_table_lock); | ||
959 | |||
960 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | ||
961 | goto out; | ||
962 | |||
963 | page = pmd_page(*pmd); | ||
964 | VM_BUG_ON(!PageHead(page)); | ||
965 | if (flags & FOLL_TOUCH) { | ||
966 | pmd_t _pmd; | ||
967 | /* | ||
968 | * We should set the dirty bit only for FOLL_WRITE but | ||
969 | * for now the dirty bit in the pmd is meaningless. | ||
970 | * If the dirty bit ever becomes meaningful and we only | ||
971 | * set it with FOLL_WRITE, an atomic set_bit will be | ||
972 | * required on the pmd to set the young bit, instead | ||
973 | * of the current set_pmd_at. | ||
974 | */ | ||
975 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | ||
976 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | ||
977 | } | ||
978 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | ||
979 | VM_BUG_ON(!PageCompound(page)); | ||
980 | if (flags & FOLL_GET) | ||
981 | get_page(page); | ||
982 | |||
983 | out: | ||
984 | return page; | ||
985 | } | ||
986 | |||
987 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
988 | pmd_t *pmd) | ||
989 | { | ||
990 | int ret = 0; | ||
991 | |||
992 | spin_lock(&tlb->mm->page_table_lock); | ||
993 | if (likely(pmd_trans_huge(*pmd))) { | ||
994 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
995 | spin_unlock(&tlb->mm->page_table_lock); | ||
996 | wait_split_huge_page(vma->anon_vma, | ||
997 | pmd); | ||
998 | } else { | ||
999 | struct page *page; | ||
1000 | pgtable_t pgtable; | ||
1001 | pgtable = get_pmd_huge_pte(tlb->mm); | ||
1002 | page = pmd_page(*pmd); | ||
1003 | pmd_clear(pmd); | ||
1004 | page_remove_rmap(page); | ||
1005 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1006 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1007 | VM_BUG_ON(!PageHead(page)); | ||
1008 | spin_unlock(&tlb->mm->page_table_lock); | ||
1009 | tlb_remove_page(tlb, page); | ||
1010 | pte_free(tlb->mm, pgtable); | ||
1011 | ret = 1; | ||
1012 | } | ||
1013 | } else | ||
1014 | spin_unlock(&tlb->mm->page_table_lock); | ||
1015 | |||
1016 | return ret; | ||
1017 | } | ||
1018 | |||
1019 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
1020 | unsigned long addr, unsigned long end, | ||
1021 | unsigned char *vec) | ||
1022 | { | ||
1023 | int ret = 0; | ||
1024 | |||
1025 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1026 | if (likely(pmd_trans_huge(*pmd))) { | ||
1027 | ret = !pmd_trans_splitting(*pmd); | ||
1028 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1029 | if (unlikely(!ret)) | ||
1030 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1031 | else { | ||
1032 | /* | ||
1033 | * All logical pages in the range are present | ||
1034 | * if backed by a huge page. | ||
1035 | */ | ||
1036 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1037 | } | ||
1038 | } else | ||
1039 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1040 | |||
1041 | return ret; | ||
1042 | } | ||
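mincore_huge_pmd() reports every 4KB page of a pmd-sized range as resident when the range is backed by a transparent hugepage. From userland this is observable with the regular mincore() syscall; a small illustrative sketch (whether the mapping actually gets a hugepage depends on the kernel configuration and on memory availability):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 2UL << 20;			/* one 2MB pmd (x86-64 assumption) */
		long psz = sysconf(_SC_PAGESIZE);
		unsigned char *vec = malloc(len / psz);
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED || !vec)
			return 1;
		memset(buf, 0, len);			/* fault the range in */
		if (mincore(buf, len, vec) == 0)
			printf("first 4KB page resident: %d\n", vec[0] & 1);
		munmap(buf, len);
		free(vec);
		return 0;
	}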
1043 | |||
1044 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
1045 | unsigned long addr, pgprot_t newprot) | ||
1046 | { | ||
1047 | struct mm_struct *mm = vma->vm_mm; | ||
1048 | int ret = 0; | ||
1049 | |||
1050 | spin_lock(&mm->page_table_lock); | ||
1051 | if (likely(pmd_trans_huge(*pmd))) { | ||
1052 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1053 | spin_unlock(&mm->page_table_lock); | ||
1054 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1055 | } else { | ||
1056 | pmd_t entry; | ||
1057 | |||
1058 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1059 | entry = pmd_modify(entry, newprot); | ||
1060 | set_pmd_at(mm, addr, pmd, entry); | ||
1061 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1062 | flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); | ||
1063 | ret = 1; | ||
1064 | } | ||
1065 | } else | ||
1066 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1067 | |||
1068 | return ret; | ||
1069 | } | ||
1070 | |||
1071 | pmd_t *page_check_address_pmd(struct page *page, | ||
1072 | struct mm_struct *mm, | ||
1073 | unsigned long address, | ||
1074 | enum page_check_address_pmd_flag flag) | ||
1075 | { | ||
1076 | pgd_t *pgd; | ||
1077 | pud_t *pud; | ||
1078 | pmd_t *pmd, *ret = NULL; | ||
1079 | |||
1080 | if (address & ~HPAGE_PMD_MASK) | ||
1081 | goto out; | ||
1082 | |||
1083 | pgd = pgd_offset(mm, address); | ||
1084 | if (!pgd_present(*pgd)) | ||
1085 | goto out; | ||
1086 | |||
1087 | pud = pud_offset(pgd, address); | ||
1088 | if (!pud_present(*pud)) | ||
1089 | goto out; | ||
1090 | |||
1091 | pmd = pmd_offset(pud, address); | ||
1092 | if (pmd_none(*pmd)) | ||
1093 | goto out; | ||
1094 | if (pmd_page(*pmd) != page) | ||
1095 | goto out; | ||
1096 | /* | ||
1097 | * split_vma() may create temporary aliased mappings. There is | ||
1098 | * no risk as long as all huge pmd are found and have their | ||
1099 | * splitting bit set before __split_huge_page_refcount | ||
1100 | * runs. Finding the same huge pmd more than once during the | ||
1101 | * same rmap walk is not a problem. | ||
1102 | */ | ||
1103 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | ||
1104 | pmd_trans_splitting(*pmd)) | ||
1105 | goto out; | ||
1106 | if (pmd_trans_huge(*pmd)) { | ||
1107 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | ||
1108 | !pmd_trans_splitting(*pmd)); | ||
1109 | ret = pmd; | ||
1110 | } | ||
1111 | out: | ||
1112 | return ret; | ||
1113 | } | ||
1114 | |||
1115 | static int __split_huge_page_splitting(struct page *page, | ||
1116 | struct vm_area_struct *vma, | ||
1117 | unsigned long address) | ||
1118 | { | ||
1119 | struct mm_struct *mm = vma->vm_mm; | ||
1120 | pmd_t *pmd; | ||
1121 | int ret = 0; | ||
1122 | |||
1123 | spin_lock(&mm->page_table_lock); | ||
1124 | pmd = page_check_address_pmd(page, mm, address, | ||
1125 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | ||
1126 | if (pmd) { | ||
1127 | /* | ||
1128 | * We can't temporarily set the pmd to null in order | ||
1129 | * to split it, the pmd must remain marked huge at all | ||
1130 | * times or the VM won't take the pmd_trans_huge paths | ||
1131 | * and it won't wait on the anon_vma->root->lock to | ||
1132 | * serialize against split_huge_page*. | ||
1133 | */ | ||
1134 | pmdp_splitting_flush_notify(vma, address, pmd); | ||
1135 | ret = 1; | ||
1136 | } | ||
1137 | spin_unlock(&mm->page_table_lock); | ||
1138 | |||
1139 | return ret; | ||
1140 | } | ||
1141 | |||
1142 | static void __split_huge_page_refcount(struct page *page) | ||
1143 | { | ||
1144 | int i; | ||
1145 | unsigned long head_index = page->index; | ||
1146 | struct zone *zone = page_zone(page); | ||
1147 | int zonestat; | ||
1148 | |||
1149 | /* prevent PageLRU from going away from under us, and freeze lru stats */ | ||
1150 | spin_lock_irq(&zone->lru_lock); | ||
1151 | compound_lock(page); | ||
1152 | |||
1153 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
1154 | struct page *page_tail = page + i; | ||
1155 | |||
1156 | /* tail_page->_count cannot change */ | ||
1157 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | ||
1158 | BUG_ON(page_count(page) <= 0); | ||
1159 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | ||
1160 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | ||
1161 | |||
1162 | /* after clearing PageTail the gup refcount can be released */ | ||
1163 | smp_mb(); | ||
1164 | |||
1165 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | ||
1166 | page_tail->flags |= (page->flags & | ||
1167 | ((1L << PG_referenced) | | ||
1168 | (1L << PG_swapbacked) | | ||
1169 | (1L << PG_mlocked) | | ||
1170 | (1L << PG_uptodate))); | ||
1171 | page_tail->flags |= (1L << PG_dirty); | ||
1172 | |||
1173 | /* | ||
1174 | * 1) clear PageTail before overwriting first_page | ||
1175 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
1176 | */ | ||
1177 | smp_wmb(); | ||
1178 | |||
1179 | /* | ||
1180 | * __split_huge_page_splitting() already set the | ||
1181 | * splitting bit in all pmd that could map this | ||
1182 | * hugepage, that will ensure no CPU can alter the | ||
1183 | * mapcount on the head page. The mapcount is only | ||
1184 | * accounted in the head page and it has to be | ||
1185 | * transferred to all tail pages in the below code. So | ||
1186 | * for this code to be safe, the split the mapcount | ||
1187 | * can't change. But that doesn't mean userland can't | ||
1188 | * keep changing and reading the page contents while | ||
1189 | * we transfer the mapcount, so the pmd splitting | ||
1190 | * status is achieved setting a reserved bit in the | ||
1191 | * pmd, not by clearing the present bit. | ||
1192 | */ | ||
1193 | BUG_ON(page_mapcount(page_tail)); | ||
1194 | page_tail->_mapcount = page->_mapcount; | ||
1195 | |||
1196 | BUG_ON(page_tail->mapping); | ||
1197 | page_tail->mapping = page->mapping; | ||
1198 | |||
1199 | page_tail->index = ++head_index; | ||
1200 | |||
1201 | BUG_ON(!PageAnon(page_tail)); | ||
1202 | BUG_ON(!PageUptodate(page_tail)); | ||
1203 | BUG_ON(!PageDirty(page_tail)); | ||
1204 | BUG_ON(!PageSwapBacked(page_tail)); | ||
1205 | |||
1206 | lru_add_page_tail(zone, page, page_tail); | ||
1207 | } | ||
1208 | |||
1209 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
1210 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | ||
1211 | |||
1212 | /* | ||
1213 | * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, | ||
1214 | * so adjust those appropriately if this page is on the LRU. | ||
1215 | */ | ||
1216 | if (PageLRU(page)) { | ||
1217 | zonestat = NR_LRU_BASE + page_lru(page); | ||
1218 | __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); | ||
1219 | } | ||
1220 | |||
1221 | ClearPageCompound(page); | ||
1222 | compound_unlock(page); | ||
1223 | spin_unlock_irq(&zone->lru_lock); | ||
1224 | |||
1225 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
1226 | struct page *page_tail = page + i; | ||
1227 | BUG_ON(page_count(page_tail) <= 0); | ||
1228 | /* | ||
1229 | * Tail pages may be freed if there wasn't any mapping, | ||
1230 | * e.g. if add_to_swap() is running on a lru page that | ||
1231 | * had its mapping zapped. And freeing these pages | ||
1232 | * requires taking the lru_lock so we do the put_page | ||
1233 | * of the tail pages after the split is complete. | ||
1234 | */ | ||
1235 | put_page(page_tail); | ||
1236 | } | ||
1237 | |||
1238 | /* | ||
1239 | * Only the head page (now become a regular page) is required | ||
1240 | * to be pinned by the caller. | ||
1241 | */ | ||
1242 | BUG_ON(page_count(page) <= 0); | ||
1243 | } | ||
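The refcount transfer in __split_huge_page_refcount() can be summarised as plain arithmetic: the head gives up the extra references that were really taken on each tail (the gup pins recorded in tail->_count), and every tail gains mapcount+1 references so it can survive as an independent page. A sketch with invented numbers, only to show the bookkeeping:

	#include <assert.h>

	int main(void)
	{
		int head_count = 10;	/* page_count(head) before the split */
		int tail_pins = 2;	/* atomic_read(&tail->_count): gup pins on this tail */
		int head_mapcount = 3;	/* page_mapcount(head) */
		int tail_count = tail_pins;

		head_count -= tail_pins;		/* the atomic_sub() on the head */
		tail_count += head_mapcount + 1;	/* the atomic_add() on the tail */

		assert(head_count > 0 && tail_count > 0);
		return 0;
	}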
1244 | |||
1245 | static int __split_huge_page_map(struct page *page, | ||
1246 | struct vm_area_struct *vma, | ||
1247 | unsigned long address) | ||
1248 | { | ||
1249 | struct mm_struct *mm = vma->vm_mm; | ||
1250 | pmd_t *pmd, _pmd; | ||
1251 | int ret = 0, i; | ||
1252 | pgtable_t pgtable; | ||
1253 | unsigned long haddr; | ||
1254 | |||
1255 | spin_lock(&mm->page_table_lock); | ||
1256 | pmd = page_check_address_pmd(page, mm, address, | ||
1257 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | ||
1258 | if (pmd) { | ||
1259 | pgtable = get_pmd_huge_pte(mm); | ||
1260 | pmd_populate(mm, &_pmd, pgtable); | ||
1261 | |||
1262 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | ||
1263 | i++, haddr += PAGE_SIZE) { | ||
1264 | pte_t *pte, entry; | ||
1265 | BUG_ON(PageCompound(page+i)); | ||
1266 | entry = mk_pte(page + i, vma->vm_page_prot); | ||
1267 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1268 | if (!pmd_write(*pmd)) | ||
1269 | entry = pte_wrprotect(entry); | ||
1270 | else | ||
1271 | BUG_ON(page_mapcount(page) != 1); | ||
1272 | if (!pmd_young(*pmd)) | ||
1273 | entry = pte_mkold(entry); | ||
1274 | pte = pte_offset_map(&_pmd, haddr); | ||
1275 | BUG_ON(!pte_none(*pte)); | ||
1276 | set_pte_at(mm, haddr, pte, entry); | ||
1277 | pte_unmap(pte); | ||
1278 | } | ||
1279 | |||
1280 | mm->nr_ptes++; | ||
1281 | smp_wmb(); /* make pte visible before pmd */ | ||
1282 | /* | ||
1283 | * Up to this point the pmd is present and huge and | ||
1284 | * userland has full access to the hugepage | ||
1285 | * during the split (which happens in place). If we | ||
1286 | * overwrite the pmd with the not-huge version | ||
1287 | * pointing to the pte here (which of course we could | ||
1288 | * if all CPUs were bug free), userland could trigger | ||
1289 | * a small page size TLB miss on the small sized TLB | ||
1290 | * while the hugepage TLB entry is still established | ||
1291 | * in the huge TLB. Some CPUs don't like that. See | ||
1292 | * http://support.amd.com/us/Processor_TechDocs/41322.pdf, | ||
1293 | * Erratum 383 on page 93. Intel should be safe but it | ||
1294 | * also warns that it's only safe if the permission | ||
1295 | * and cache attributes of the two entries loaded in | ||
1296 | * the two TLBs are identical (which should be the case | ||
1297 | * here). But it is generally safer to never allow | ||
1298 | * small and huge TLB entries for the same virtual | ||
1299 | * address to be loaded simultaneously. So instead of | ||
1300 | * doing "pmd_populate(); flush_tlb_range();" we first | ||
1301 | * mark the current pmd notpresent (atomically because | ||
1302 | * here the pmd_trans_huge and pmd_trans_splitting | ||
1303 | * must remain set at all times on the pmd until the | ||
1304 | * split is complete for this pmd), then we flush the | ||
1305 | * SMP TLB and finally we write the non-huge version | ||
1306 | * of the pmd entry with pmd_populate. | ||
1307 | */ | ||
1308 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | ||
1309 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
1310 | pmd_populate(mm, pmd, pgtable); | ||
1311 | ret = 1; | ||
1312 | } | ||
1313 | spin_unlock(&mm->page_table_lock); | ||
1314 | |||
1315 | return ret; | ||
1316 | } | ||
1317 | |||
1318 | /* must be called with anon_vma->root->lock held */ | ||
1319 | static void __split_huge_page(struct page *page, | ||
1320 | struct anon_vma *anon_vma) | ||
1321 | { | ||
1322 | int mapcount, mapcount2; | ||
1323 | struct anon_vma_chain *avc; | ||
1324 | |||
1325 | BUG_ON(!PageHead(page)); | ||
1326 | BUG_ON(PageTail(page)); | ||
1327 | |||
1328 | mapcount = 0; | ||
1329 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1330 | struct vm_area_struct *vma = avc->vma; | ||
1331 | unsigned long addr = vma_address(page, vma); | ||
1332 | BUG_ON(is_vma_temporary_stack(vma)); | ||
1333 | if (addr == -EFAULT) | ||
1334 | continue; | ||
1335 | mapcount += __split_huge_page_splitting(page, vma, addr); | ||
1336 | } | ||
1337 | /* | ||
1338 | * It is critical that new vmas are added to the tail of the | ||
1339 | * anon_vma list. This guarantees that if copy_huge_pmd() runs | ||
1340 | * and establishes a child pmd before | ||
1341 | * __split_huge_page_splitting() freezes the parent pmd (so if | ||
1342 | * we fail to prevent copy_huge_pmd() from running until the | ||
1343 | * whole __split_huge_page() is complete), we will still see | ||
1344 | * the newly established pmd of the child later during the | ||
1345 | * walk, to be able to set it as pmd_trans_splitting too. | ||
1346 | */ | ||
1347 | if (mapcount != page_mapcount(page)) | ||
1348 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | ||
1349 | mapcount, page_mapcount(page)); | ||
1350 | BUG_ON(mapcount != page_mapcount(page)); | ||
1351 | |||
1352 | __split_huge_page_refcount(page); | ||
1353 | |||
1354 | mapcount2 = 0; | ||
1355 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1356 | struct vm_area_struct *vma = avc->vma; | ||
1357 | unsigned long addr = vma_address(page, vma); | ||
1358 | BUG_ON(is_vma_temporary_stack(vma)); | ||
1359 | if (addr == -EFAULT) | ||
1360 | continue; | ||
1361 | mapcount2 += __split_huge_page_map(page, vma, addr); | ||
1362 | } | ||
1363 | if (mapcount != mapcount2) | ||
1364 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | ||
1365 | mapcount, mapcount2, page_mapcount(page)); | ||
1366 | BUG_ON(mapcount != mapcount2); | ||
1367 | } | ||
1368 | |||
1369 | int split_huge_page(struct page *page) | ||
1370 | { | ||
1371 | struct anon_vma *anon_vma; | ||
1372 | int ret = 1; | ||
1373 | |||
1374 | BUG_ON(!PageAnon(page)); | ||
1375 | anon_vma = page_lock_anon_vma(page); | ||
1376 | if (!anon_vma) | ||
1377 | goto out; | ||
1378 | ret = 0; | ||
1379 | if (!PageCompound(page)) | ||
1380 | goto out_unlock; | ||
1381 | |||
1382 | BUG_ON(!PageSwapBacked(page)); | ||
1383 | __split_huge_page(page, anon_vma); | ||
1384 | |||
1385 | BUG_ON(PageCompound(page)); | ||
1386 | out_unlock: | ||
1387 | page_unlock_anon_vma(anon_vma); | ||
1388 | out: | ||
1389 | return ret; | ||
1390 | } | ||
1391 | |||
1392 | int hugepage_madvise(struct vm_area_struct *vma, | ||
1393 | unsigned long *vm_flags, int advice) | ||
1394 | { | ||
1395 | switch (advice) { | ||
1396 | case MADV_HUGEPAGE: | ||
1397 | /* | ||
1398 | * Be somewhat over-protective like KSM for now! | ||
1399 | */ | ||
1400 | if (*vm_flags & (VM_HUGEPAGE | | ||
1401 | VM_SHARED | VM_MAYSHARE | | ||
1402 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
1403 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
1404 | VM_MIXEDMAP | VM_SAO)) | ||
1405 | return -EINVAL; | ||
1406 | *vm_flags &= ~VM_NOHUGEPAGE; | ||
1407 | *vm_flags |= VM_HUGEPAGE; | ||
1408 | /* | ||
1409 | * If the vma become good for khugepaged to scan, | ||
1410 | * register it here without waiting a page fault that | ||
1411 | * may not happen any time soon. | ||
1412 | */ | ||
1413 | if (unlikely(khugepaged_enter_vma_merge(vma))) | ||
1414 | return -ENOMEM; | ||
1415 | break; | ||
1416 | case MADV_NOHUGEPAGE: | ||
1417 | /* | ||
1418 | * Be somewhat over-protective like KSM for now! | ||
1419 | */ | ||
1420 | if (*vm_flags & (VM_NOHUGEPAGE | | ||
1421 | VM_SHARED | VM_MAYSHARE | | ||
1422 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
1423 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
1424 | VM_MIXEDMAP | VM_SAO)) | ||
1425 | return -EINVAL; | ||
1426 | *vm_flags &= ~VM_HUGEPAGE; | ||
1427 | *vm_flags |= VM_NOHUGEPAGE; | ||
1428 | /* | ||
1429 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | ||
1430 | * this vma even if we leave the mm registered in khugepaged if | ||
1431 | * it got registered before VM_NOHUGEPAGE was set. | ||
1432 | */ | ||
1433 | break; | ||
1434 | } | ||
1435 | |||
1436 | return 0; | ||
1437 | } | ||
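hugepage_madvise() is what backs madvise(MADV_HUGEPAGE) and madvise(MADV_NOHUGEPAGE) for anonymous vmas. A userspace usage sketch (it assumes a kernel built with CONFIG_TRANSPARENT_HUGEPAGE and libc headers that expose the two advice values):

	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 16UL << 20;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		if (madvise(buf, len, MADV_HUGEPAGE))	/* sets VM_HUGEPAGE */
			perror("madvise(MADV_HUGEPAGE)");
		/* ... use the memory ... */
		if (madvise(buf, len, MADV_NOHUGEPAGE))	/* sets VM_NOHUGEPAGE */
			perror("madvise(MADV_NOHUGEPAGE)");
		munmap(buf, len);
		return 0;
	}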
1438 | |||
1439 | static int __init khugepaged_slab_init(void) | ||
1440 | { | ||
1441 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | ||
1442 | sizeof(struct mm_slot), | ||
1443 | __alignof__(struct mm_slot), 0, NULL); | ||
1444 | if (!mm_slot_cache) | ||
1445 | return -ENOMEM; | ||
1446 | |||
1447 | return 0; | ||
1448 | } | ||
1449 | |||
1450 | static void __init khugepaged_slab_free(void) | ||
1451 | { | ||
1452 | kmem_cache_destroy(mm_slot_cache); | ||
1453 | mm_slot_cache = NULL; | ||
1454 | } | ||
1455 | |||
1456 | static inline struct mm_slot *alloc_mm_slot(void) | ||
1457 | { | ||
1458 | if (!mm_slot_cache) /* initialization failed */ | ||
1459 | return NULL; | ||
1460 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
1461 | } | ||
1462 | |||
1463 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
1464 | { | ||
1465 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
1466 | } | ||
1467 | |||
1468 | static int __init mm_slots_hash_init(void) | ||
1469 | { | ||
1470 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
1471 | GFP_KERNEL); | ||
1472 | if (!mm_slots_hash) | ||
1473 | return -ENOMEM; | ||
1474 | return 0; | ||
1475 | } | ||
1476 | |||
1477 | #if 0 | ||
1478 | static void __init mm_slots_hash_free(void) | ||
1479 | { | ||
1480 | kfree(mm_slots_hash); | ||
1481 | mm_slots_hash = NULL; | ||
1482 | } | ||
1483 | #endif | ||
1484 | |||
1485 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
1486 | { | ||
1487 | struct mm_slot *mm_slot; | ||
1488 | struct hlist_head *bucket; | ||
1489 | struct hlist_node *node; | ||
1490 | |||
1491 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1492 | % MM_SLOTS_HASH_HEADS]; | ||
1493 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
1494 | if (mm == mm_slot->mm) | ||
1495 | return mm_slot; | ||
1496 | } | ||
1497 | return NULL; | ||
1498 | } | ||
1499 | |||
1500 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
1501 | struct mm_slot *mm_slot) | ||
1502 | { | ||
1503 | struct hlist_head *bucket; | ||
1504 | |||
1505 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1506 | % MM_SLOTS_HASH_HEADS]; | ||
1507 | mm_slot->mm = mm; | ||
1508 | hlist_add_head(&mm_slot->hash, bucket); | ||
1509 | } | ||
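get_mm_slot() and insert_to_mm_slots_hash() pick a bucket by dividing the mm_struct pointer by the structure size and taking the result modulo the number of hash heads. A standalone sketch of that computation; the table size and structure size below are assumptions for the demo, not the kernel's values:

	#include <stdio.h>

	#define DEMO_HASH_HEADS		1024	/* assumed number of hash heads */
	#define DEMO_MM_STRUCT_SIZE	896	/* made-up sizeof(struct mm_struct) */

	static unsigned long bucket_of(const void *mm)
	{
		return ((unsigned long)mm / DEMO_MM_STRUCT_SIZE) % DEMO_HASH_HEADS;
	}

	int main(void)
	{
		int object;

		printf("object at %p hashes to bucket %lu\n",
		       (void *)&object, bucket_of(&object));
		return 0;
	}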
1510 | |||
1511 | static inline int khugepaged_test_exit(struct mm_struct *mm) | ||
1512 | { | ||
1513 | return atomic_read(&mm->mm_users) == 0; | ||
1514 | } | ||
1515 | |||
1516 | int __khugepaged_enter(struct mm_struct *mm) | ||
1517 | { | ||
1518 | struct mm_slot *mm_slot; | ||
1519 | int wakeup; | ||
1520 | |||
1521 | mm_slot = alloc_mm_slot(); | ||
1522 | if (!mm_slot) | ||
1523 | return -ENOMEM; | ||
1524 | |||
1525 | /* __khugepaged_exit() must not run from under us */ | ||
1526 | VM_BUG_ON(khugepaged_test_exit(mm)); | ||
1527 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | ||
1528 | free_mm_slot(mm_slot); | ||
1529 | return 0; | ||
1530 | } | ||
1531 | |||
1532 | spin_lock(&khugepaged_mm_lock); | ||
1533 | insert_to_mm_slots_hash(mm, mm_slot); | ||
1534 | /* | ||
1535 | * Insert just behind the scanning cursor, to let the area settle | ||
1536 | * down a little. | ||
1537 | */ | ||
1538 | wakeup = list_empty(&khugepaged_scan.mm_head); | ||
1539 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | ||
1540 | spin_unlock(&khugepaged_mm_lock); | ||
1541 | |||
1542 | atomic_inc(&mm->mm_count); | ||
1543 | if (wakeup) | ||
1544 | wake_up_interruptible(&khugepaged_wait); | ||
1545 | |||
1546 | return 0; | ||
1547 | } | ||
1548 | |||
1549 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | ||
1550 | { | ||
1551 | unsigned long hstart, hend; | ||
1552 | if (!vma->anon_vma) | ||
1553 | /* | ||
1554 | * Not yet faulted in so we will register later in the | ||
1555 | * page fault if needed. | ||
1556 | */ | ||
1557 | return 0; | ||
1558 | if (vma->vm_file || vma->vm_ops) | ||
1559 | /* khugepaged not yet working on file or special mappings */ | ||
1560 | return 0; | ||
1561 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1562 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1563 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1564 | if (hstart < hend) | ||
1565 | return khugepaged_enter(vma); | ||
1566 | return 0; | ||
1567 | } | ||
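khugepaged_enter_vma_merge() (and the scanner further down) only consider a vma if, after rounding vm_start up and vm_end down to the huge-pmd boundary, at least one full huge pmd remains. A userspace sketch of that rounding, again assuming the 2MB x86-64 huge pmd size:

	#include <stdio.h>

	#define DEMO_HPAGE_PMD_SIZE (2UL << 20)
	#define DEMO_HPAGE_PMD_MASK (~(DEMO_HPAGE_PMD_SIZE - 1))

	int main(void)
	{
		unsigned long vm_start = 0x7f0000123000UL, vm_end = 0x7f0000923000UL;
		unsigned long hstart = (vm_start + ~DEMO_HPAGE_PMD_MASK) & DEMO_HPAGE_PMD_MASK;
		unsigned long hend = vm_end & DEMO_HPAGE_PMD_MASK;

		printf("hstart %#lx hend %#lx -> %s\n", hstart, hend,
		       hstart < hend ? "vma can host a huge pmd" : "too small, skipped");
		return 0;
	}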
1568 | |||
1569 | void __khugepaged_exit(struct mm_struct *mm) | ||
1570 | { | ||
1571 | struct mm_slot *mm_slot; | ||
1572 | int free = 0; | ||
1573 | |||
1574 | spin_lock(&khugepaged_mm_lock); | ||
1575 | mm_slot = get_mm_slot(mm); | ||
1576 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | ||
1577 | hlist_del(&mm_slot->hash); | ||
1578 | list_del(&mm_slot->mm_node); | ||
1579 | free = 1; | ||
1580 | } | ||
1581 | |||
1582 | if (free) { | ||
1583 | spin_unlock(&khugepaged_mm_lock); | ||
1584 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1585 | free_mm_slot(mm_slot); | ||
1586 | mmdrop(mm); | ||
1587 | } else if (mm_slot) { | ||
1588 | spin_unlock(&khugepaged_mm_lock); | ||
1589 | /* | ||
1590 | * This is required to serialize against | ||
1591 | * khugepaged_test_exit() (which is guaranteed to run | ||
1592 | * under mmap_sem read mode). Stop here (after we | ||
1593 | * return all pagetables will be destroyed) until | ||
1594 | * khugepaged has finished working on the pagetables | ||
1595 | * under the mmap_sem. | ||
1596 | */ | ||
1597 | down_write(&mm->mmap_sem); | ||
1598 | up_write(&mm->mmap_sem); | ||
1599 | } else | ||
1600 | spin_unlock(&khugepaged_mm_lock); | ||
1601 | } | ||
1602 | |||
1603 | static void release_pte_page(struct page *page) | ||
1604 | { | ||
1605 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1606 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1607 | unlock_page(page); | ||
1608 | putback_lru_page(page); | ||
1609 | } | ||
1610 | |||
1611 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | ||
1612 | { | ||
1613 | while (--_pte >= pte) { | ||
1614 | pte_t pteval = *_pte; | ||
1615 | if (!pte_none(pteval)) | ||
1616 | release_pte_page(pte_page(pteval)); | ||
1617 | } | ||
1618 | } | ||
1619 | |||
1620 | static void release_all_pte_pages(pte_t *pte) | ||
1621 | { | ||
1622 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
1623 | } | ||
1624 | |||
1625 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | ||
1626 | unsigned long address, | ||
1627 | pte_t *pte) | ||
1628 | { | ||
1629 | struct page *page; | ||
1630 | pte_t *_pte; | ||
1631 | int referenced = 0, isolated = 0, none = 0; | ||
1632 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1633 | _pte++, address += PAGE_SIZE) { | ||
1634 | pte_t pteval = *_pte; | ||
1635 | if (pte_none(pteval)) { | ||
1636 | if (++none <= khugepaged_max_ptes_none) | ||
1637 | continue; | ||
1638 | else { | ||
1639 | release_pte_pages(pte, _pte); | ||
1640 | goto out; | ||
1641 | } | ||
1642 | } | ||
1643 | if (!pte_present(pteval) || !pte_write(pteval)) { | ||
1644 | release_pte_pages(pte, _pte); | ||
1645 | goto out; | ||
1646 | } | ||
1647 | page = vm_normal_page(vma, address, pteval); | ||
1648 | if (unlikely(!page)) { | ||
1649 | release_pte_pages(pte, _pte); | ||
1650 | goto out; | ||
1651 | } | ||
1652 | VM_BUG_ON(PageCompound(page)); | ||
1653 | BUG_ON(!PageAnon(page)); | ||
1654 | VM_BUG_ON(!PageSwapBacked(page)); | ||
1655 | |||
1656 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
1657 | if (page_count(page) != 1) { | ||
1658 | release_pte_pages(pte, _pte); | ||
1659 | goto out; | ||
1660 | } | ||
1661 | /* | ||
1662 | * We can do it before isolate_lru_page because the | ||
1663 | * page can't be freed from under us. NOTE: PG_lock | ||
1664 | * is needed to serialize against split_huge_page | ||
1665 | * when invoked from the VM. | ||
1666 | */ | ||
1667 | if (!trylock_page(page)) { | ||
1668 | release_pte_pages(pte, _pte); | ||
1669 | goto out; | ||
1670 | } | ||
1671 | /* | ||
1672 | * Isolate the page to avoid collapsing a hugepage | ||
1673 | * currently in use by the VM. | ||
1674 | */ | ||
1675 | if (isolate_lru_page(page)) { | ||
1676 | unlock_page(page); | ||
1677 | release_pte_pages(pte, _pte); | ||
1678 | goto out; | ||
1679 | } | ||
1680 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1681 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1682 | VM_BUG_ON(!PageLocked(page)); | ||
1683 | VM_BUG_ON(PageLRU(page)); | ||
1684 | |||
1685 | /* If no mapped pte is young, don't collapse the page */ | ||
1686 | if (pte_young(pteval) || PageReferenced(page) || | ||
1687 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
1688 | referenced = 1; | ||
1689 | } | ||
1690 | if (unlikely(!referenced)) | ||
1691 | release_all_pte_pages(pte); | ||
1692 | else | ||
1693 | isolated = 1; | ||
1694 | out: | ||
1695 | return isolated; | ||
1696 | } | ||
1697 | |||
1698 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | ||
1699 | struct vm_area_struct *vma, | ||
1700 | unsigned long address, | ||
1701 | spinlock_t *ptl) | ||
1702 | { | ||
1703 | pte_t *_pte; | ||
1704 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | ||
1705 | pte_t pteval = *_pte; | ||
1706 | struct page *src_page; | ||
1707 | |||
1708 | if (pte_none(pteval)) { | ||
1709 | clear_user_highpage(page, address); | ||
1710 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | ||
1711 | } else { | ||
1712 | src_page = pte_page(pteval); | ||
1713 | copy_user_highpage(page, src_page, address, vma); | ||
1714 | VM_BUG_ON(page_mapcount(src_page) != 1); | ||
1715 | VM_BUG_ON(page_count(src_page) != 2); | ||
1716 | release_pte_page(src_page); | ||
1717 | /* | ||
1718 | * ptl mostly unnecessary, but preempt has to | ||
1719 | * be disabled to update the per-cpu stats | ||
1720 | * inside page_remove_rmap(). | ||
1721 | */ | ||
1722 | spin_lock(ptl); | ||
1723 | /* | ||
1724 | * paravirt calls inside pte_clear here are | ||
1725 | * superfluous. | ||
1726 | */ | ||
1727 | pte_clear(vma->vm_mm, address, _pte); | ||
1728 | page_remove_rmap(src_page); | ||
1729 | spin_unlock(ptl); | ||
1730 | free_page_and_swap_cache(src_page); | ||
1731 | } | ||
1732 | |||
1733 | address += PAGE_SIZE; | ||
1734 | page++; | ||
1735 | } | ||
1736 | } | ||
1737 | |||
1738 | static void collapse_huge_page(struct mm_struct *mm, | ||
1739 | unsigned long address, | ||
1740 | struct page **hpage, | ||
1741 | struct vm_area_struct *vma) | ||
1742 | { | ||
1743 | pgd_t *pgd; | ||
1744 | pud_t *pud; | ||
1745 | pmd_t *pmd, _pmd; | ||
1746 | pte_t *pte; | ||
1747 | pgtable_t pgtable; | ||
1748 | struct page *new_page; | ||
1749 | spinlock_t *ptl; | ||
1750 | int isolated; | ||
1751 | unsigned long hstart, hend; | ||
1752 | |||
1753 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1754 | #ifndef CONFIG_NUMA | ||
1755 | VM_BUG_ON(!*hpage); | ||
1756 | new_page = *hpage; | ||
1757 | #else | ||
1758 | VM_BUG_ON(*hpage); | ||
1759 | /* | ||
1760 | * Allocate the page while the vma is still valid and under | ||
1761 | * the mmap_sem read mode so there is no memory allocation | ||
1762 | * later when we take the mmap_sem in write mode. This is | ||
1763 | * friendlier behavior (OTOH it may actually hide bugs) towards | ||
1764 | * userland filesystems whose daemons allocate memory in | ||
1765 | * the userland I/O paths. Allocating memory with the | ||
1766 | * mmap_sem held in read mode is also a good idea to allow greater | ||
1767 | * scalability. | ||
1768 | */ | ||
1769 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); | ||
1770 | if (unlikely(!new_page)) { | ||
1771 | up_read(&mm->mmap_sem); | ||
1772 | *hpage = ERR_PTR(-ENOMEM); | ||
1773 | return; | ||
1774 | } | ||
1775 | #endif | ||
1776 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
1777 | up_read(&mm->mmap_sem); | ||
1778 | put_page(new_page); | ||
1779 | return; | ||
1780 | } | ||
1781 | |||
1782 | /* after allocating the hugepage upgrade to mmap_sem write mode */ | ||
1783 | up_read(&mm->mmap_sem); | ||
1784 | |||
1785 | /* | ||
1786 | * Prevent all access to pagetables with the exception of | ||
1787 | * gup_fast later handled by the ptep_clear_flush and the VM | ||
1788 | * handled by the anon_vma lock + PG_lock. | ||
1789 | */ | ||
1790 | down_write(&mm->mmap_sem); | ||
1791 | if (unlikely(khugepaged_test_exit(mm))) | ||
1792 | goto out; | ||
1793 | |||
1794 | vma = find_vma(mm, address); | ||
1795 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1796 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1797 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | ||
1798 | goto out; | ||
1799 | |||
1800 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
1801 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
1802 | goto out; | ||
1803 | |||
1804 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
1805 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) | ||
1806 | goto out; | ||
1807 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1808 | |||
1809 | pgd = pgd_offset(mm, address); | ||
1810 | if (!pgd_present(*pgd)) | ||
1811 | goto out; | ||
1812 | |||
1813 | pud = pud_offset(pgd, address); | ||
1814 | if (!pud_present(*pud)) | ||
1815 | goto out; | ||
1816 | |||
1817 | pmd = pmd_offset(pud, address); | ||
1818 | /* pmd can't go away or become huge under us */ | ||
1819 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1820 | goto out; | ||
1821 | |||
1822 | anon_vma_lock(vma->anon_vma); | ||
1823 | |||
1824 | pte = pte_offset_map(pmd, address); | ||
1825 | ptl = pte_lockptr(mm, pmd); | ||
1826 | |||
1827 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | ||
1828 | /* | ||
1829 | * After this gup_fast can't run anymore. This also removes | ||
1830 | * any huge TLB entry from the CPU so we won't allow | ||
1831 | * huge and small TLB entries for the same virtual address | ||
1832 | * to avoid the risk of CPU bugs in that area. | ||
1833 | */ | ||
1834 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | ||
1835 | spin_unlock(&mm->page_table_lock); | ||
1836 | |||
1837 | spin_lock(ptl); | ||
1838 | isolated = __collapse_huge_page_isolate(vma, address, pte); | ||
1839 | spin_unlock(ptl); | ||
1840 | pte_unmap(pte); | ||
1841 | |||
1842 | if (unlikely(!isolated)) { | ||
1843 | spin_lock(&mm->page_table_lock); | ||
1844 | BUG_ON(!pmd_none(*pmd)); | ||
1845 | set_pmd_at(mm, address, pmd, _pmd); | ||
1846 | spin_unlock(&mm->page_table_lock); | ||
1847 | anon_vma_unlock(vma->anon_vma); | ||
1848 | mem_cgroup_uncharge_page(new_page); | ||
1849 | goto out; | ||
1850 | } | ||
1851 | |||
1852 | /* | ||
1853 | * All pages are isolated and locked so anon_vma rmap | ||
1854 | * can't run anymore. | ||
1855 | */ | ||
1856 | anon_vma_unlock(vma->anon_vma); | ||
1857 | |||
1858 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | ||
1859 | __SetPageUptodate(new_page); | ||
1860 | pgtable = pmd_pgtable(_pmd); | ||
1861 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1862 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1863 | |||
1864 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | ||
1865 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
1866 | _pmd = pmd_mkhuge(_pmd); | ||
1867 | |||
1868 | /* | ||
1869 | * spin_lock() below is not the equivalent of smp_wmb(), so | ||
1870 | * this is needed to prevent the copy_huge_page writes from | ||
1871 | * becoming visible after the set_pmd_at() write. | ||
1872 | */ | ||
1873 | smp_wmb(); | ||
1874 | |||
1875 | spin_lock(&mm->page_table_lock); | ||
1876 | BUG_ON(!pmd_none(*pmd)); | ||
1877 | page_add_new_anon_rmap(new_page, vma, address); | ||
1878 | set_pmd_at(mm, address, pmd, _pmd); | ||
1879 | update_mmu_cache(vma, address, _pmd); | ||
1880 | prepare_pmd_huge_pte(pgtable, mm); | ||
1881 | mm->nr_ptes--; | ||
1882 | spin_unlock(&mm->page_table_lock); | ||
1883 | |||
1884 | #ifndef CONFIG_NUMA | ||
1885 | *hpage = NULL; | ||
1886 | #endif | ||
1887 | khugepaged_pages_collapsed++; | ||
1888 | out_up_write: | ||
1889 | up_write(&mm->mmap_sem); | ||
1890 | return; | ||
1891 | |||
1892 | out: | ||
1893 | #ifdef CONFIG_NUMA | ||
1894 | put_page(new_page); | ||
1895 | #endif | ||
1896 | goto out_up_write; | ||
1897 | } | ||
1898 | |||
1899 | static int khugepaged_scan_pmd(struct mm_struct *mm, | ||
1900 | struct vm_area_struct *vma, | ||
1901 | unsigned long address, | ||
1902 | struct page **hpage) | ||
1903 | { | ||
1904 | pgd_t *pgd; | ||
1905 | pud_t *pud; | ||
1906 | pmd_t *pmd; | ||
1907 | pte_t *pte, *_pte; | ||
1908 | int ret = 0, referenced = 0, none = 0; | ||
1909 | struct page *page; | ||
1910 | unsigned long _address; | ||
1911 | spinlock_t *ptl; | ||
1912 | |||
1913 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1914 | |||
1915 | pgd = pgd_offset(mm, address); | ||
1916 | if (!pgd_present(*pgd)) | ||
1917 | goto out; | ||
1918 | |||
1919 | pud = pud_offset(pgd, address); | ||
1920 | if (!pud_present(*pud)) | ||
1921 | goto out; | ||
1922 | |||
1923 | pmd = pmd_offset(pud, address); | ||
1924 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1925 | goto out; | ||
1926 | |||
1927 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1928 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1929 | _pte++, _address += PAGE_SIZE) { | ||
1930 | pte_t pteval = *_pte; | ||
1931 | if (pte_none(pteval)) { | ||
1932 | if (++none <= khugepaged_max_ptes_none) | ||
1933 | continue; | ||
1934 | else | ||
1935 | goto out_unmap; | ||
1936 | } | ||
1937 | if (!pte_present(pteval) || !pte_write(pteval)) | ||
1938 | goto out_unmap; | ||
1939 | page = vm_normal_page(vma, _address, pteval); | ||
1940 | if (unlikely(!page)) | ||
1941 | goto out_unmap; | ||
1942 | VM_BUG_ON(PageCompound(page)); | ||
1943 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | ||
1944 | goto out_unmap; | ||
1945 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
1946 | if (page_count(page) != 1) | ||
1947 | goto out_unmap; | ||
1948 | if (pte_young(pteval) || PageReferenced(page) || | ||
1949 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
1950 | referenced = 1; | ||
1951 | } | ||
1952 | if (referenced) | ||
1953 | ret = 1; | ||
1954 | out_unmap: | ||
1955 | pte_unmap_unlock(pte, ptl); | ||
1956 | if (ret) | ||
1957 | /* collapse_huge_page will return with the mmap_sem released */ | ||
1958 | collapse_huge_page(mm, address, hpage, vma); | ||
1959 | out: | ||
1960 | return ret; | ||
1961 | } | ||
1962 | |||
1963 | static void collect_mm_slot(struct mm_slot *mm_slot) | ||
1964 | { | ||
1965 | struct mm_struct *mm = mm_slot->mm; | ||
1966 | |||
1967 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
1968 | |||
1969 | if (khugepaged_test_exit(mm)) { | ||
1970 | /* free mm_slot */ | ||
1971 | hlist_del(&mm_slot->hash); | ||
1972 | list_del(&mm_slot->mm_node); | ||
1973 | |||
1974 | /* | ||
1975 | * Not strictly needed because the mm exited already. | ||
1976 | * | ||
1977 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1978 | */ | ||
1979 | |||
1980 | /* khugepaged_mm_lock actually not necessary for the below */ | ||
1981 | free_mm_slot(mm_slot); | ||
1982 | mmdrop(mm); | ||
1983 | } | ||
1984 | } | ||
1985 | |||
1986 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | ||
1987 | struct page **hpage) | ||
1988 | { | ||
1989 | struct mm_slot *mm_slot; | ||
1990 | struct mm_struct *mm; | ||
1991 | struct vm_area_struct *vma; | ||
1992 | int progress = 0; | ||
1993 | |||
1994 | VM_BUG_ON(!pages); | ||
1995 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
1996 | |||
1997 | if (khugepaged_scan.mm_slot) | ||
1998 | mm_slot = khugepaged_scan.mm_slot; | ||
1999 | else { | ||
2000 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | ||
2001 | struct mm_slot, mm_node); | ||
2002 | khugepaged_scan.address = 0; | ||
2003 | khugepaged_scan.mm_slot = mm_slot; | ||
2004 | } | ||
2005 | spin_unlock(&khugepaged_mm_lock); | ||
2006 | |||
2007 | mm = mm_slot->mm; | ||
2008 | down_read(&mm->mmap_sem); | ||
2009 | if (unlikely(khugepaged_test_exit(mm))) | ||
2010 | vma = NULL; | ||
2011 | else | ||
2012 | vma = find_vma(mm, khugepaged_scan.address); | ||
2013 | |||
2014 | progress++; | ||
2015 | for (; vma; vma = vma->vm_next) { | ||
2016 | unsigned long hstart, hend; | ||
2017 | |||
2018 | cond_resched(); | ||
2019 | if (unlikely(khugepaged_test_exit(mm))) { | ||
2020 | progress++; | ||
2021 | break; | ||
2022 | } | ||
2023 | |||
2024 | if ((!(vma->vm_flags & VM_HUGEPAGE) && | ||
2025 | !khugepaged_always()) || | ||
2026 | (vma->vm_flags & VM_NOHUGEPAGE)) { | ||
2027 | progress++; | ||
2028 | continue; | ||
2029 | } | ||
2030 | |||
2031 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
2032 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { | ||
2033 | khugepaged_scan.address = vma->vm_end; | ||
2034 | progress++; | ||
2035 | continue; | ||
2036 | } | ||
2037 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
2038 | |||
2039 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
2040 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
2041 | if (hstart >= hend) { | ||
2042 | progress++; | ||
2043 | continue; | ||
2044 | } | ||
2045 | if (khugepaged_scan.address < hstart) | ||
2046 | khugepaged_scan.address = hstart; | ||
2047 | if (khugepaged_scan.address > hend) { | ||
2048 | khugepaged_scan.address = hend + HPAGE_PMD_SIZE; | ||
2049 | progress++; | ||
2050 | continue; | ||
2051 | } | ||
2052 | BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | ||
2053 | |||
2054 | while (khugepaged_scan.address < hend) { | ||
2055 | int ret; | ||
2056 | cond_resched(); | ||
2057 | if (unlikely(khugepaged_test_exit(mm))) | ||
2058 | goto breakouterloop; | ||
2059 | |||
2060 | VM_BUG_ON(khugepaged_scan.address < hstart || | ||
2061 | khugepaged_scan.address + HPAGE_PMD_SIZE > | ||
2062 | hend); | ||
2063 | ret = khugepaged_scan_pmd(mm, vma, | ||
2064 | khugepaged_scan.address, | ||
2065 | hpage); | ||
2066 | /* move to next address */ | ||
2067 | khugepaged_scan.address += HPAGE_PMD_SIZE; | ||
2068 | progress += HPAGE_PMD_NR; | ||
2069 | if (ret) | ||
2070 | /* we released mmap_sem so break loop */ | ||
2071 | goto breakouterloop_mmap_sem; | ||
2072 | if (progress >= pages) | ||
2073 | goto breakouterloop; | ||
2074 | } | ||
2075 | } | ||
2076 | breakouterloop: | ||
2077 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | ||
2078 | breakouterloop_mmap_sem: | ||
2079 | |||
2080 | spin_lock(&khugepaged_mm_lock); | ||
2081 | BUG_ON(khugepaged_scan.mm_slot != mm_slot); | ||
2082 | /* | ||
2083 | * Release the current mm_slot if this mm is about to die, or | ||
2084 | * if we scanned all vmas of this mm. | ||
2085 | */ | ||
2086 | if (khugepaged_test_exit(mm) || !vma) { | ||
2087 | /* | ||
2088 | * Make sure that if mm_users is reaching zero while | ||
2089 | * khugepaged runs here, khugepaged_exit will find | ||
2090 | * mm_slot not pointing to the exiting mm. | ||
2091 | */ | ||
2092 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | ||
2093 | khugepaged_scan.mm_slot = list_entry( | ||
2094 | mm_slot->mm_node.next, | ||
2095 | struct mm_slot, mm_node); | ||
2096 | khugepaged_scan.address = 0; | ||
2097 | } else { | ||
2098 | khugepaged_scan.mm_slot = NULL; | ||
2099 | khugepaged_full_scans++; | ||
2100 | } | ||
2101 | |||
2102 | collect_mm_slot(mm_slot); | ||
2103 | } | ||
2104 | |||
2105 | return progress; | ||
2106 | } | ||
2107 | |||
2108 | static int khugepaged_has_work(void) | ||
2109 | { | ||
2110 | return !list_empty(&khugepaged_scan.mm_head) && | ||
2111 | khugepaged_enabled(); | ||
2112 | } | ||
2113 | |||
2114 | static int khugepaged_wait_event(void) | ||
2115 | { | ||
2116 | return !list_empty(&khugepaged_scan.mm_head) || | ||
2117 | !khugepaged_enabled(); | ||
2118 | } | ||
2119 | |||
2120 | static void khugepaged_do_scan(struct page **hpage) | ||
2121 | { | ||
2122 | unsigned int progress = 0, pass_through_head = 0; | ||
2123 | unsigned int pages = khugepaged_pages_to_scan; | ||
2124 | |||
2125 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | ||
2126 | |||
2127 | while (progress < pages) { | ||
2128 | cond_resched(); | ||
2129 | |||
2130 | #ifndef CONFIG_NUMA | ||
2131 | if (!*hpage) { | ||
2132 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
2133 | if (unlikely(!*hpage)) | ||
2134 | break; | ||
2135 | } | ||
2136 | #else | ||
2137 | if (IS_ERR(*hpage)) | ||
2138 | break; | ||
2139 | #endif | ||
2140 | |||
2141 | if (unlikely(kthread_should_stop() || freezing(current))) | ||
2142 | break; | ||
2143 | |||
2144 | spin_lock(&khugepaged_mm_lock); | ||
2145 | if (!khugepaged_scan.mm_slot) | ||
2146 | pass_through_head++; | ||
2147 | if (khugepaged_has_work() && | ||
2148 | pass_through_head < 2) | ||
2149 | progress += khugepaged_scan_mm_slot(pages - progress, | ||
2150 | hpage); | ||
2151 | else | ||
2152 | progress = pages; | ||
2153 | spin_unlock(&khugepaged_mm_lock); | ||
2154 | } | ||
2155 | } | ||
2156 | |||
2157 | static void khugepaged_alloc_sleep(void) | ||
2158 | { | ||
2159 | DEFINE_WAIT(wait); | ||
2160 | add_wait_queue(&khugepaged_wait, &wait); | ||
2161 | schedule_timeout_interruptible( | ||
2162 | msecs_to_jiffies( | ||
2163 | khugepaged_alloc_sleep_millisecs)); | ||
2164 | remove_wait_queue(&khugepaged_wait, &wait); | ||
2165 | } | ||
2166 | |||
2167 | #ifndef CONFIG_NUMA | ||
2168 | static struct page *khugepaged_alloc_hugepage(void) | ||
2169 | { | ||
2170 | struct page *hpage; | ||
2171 | |||
2172 | do { | ||
2173 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
2174 | if (!hpage) | ||
2175 | khugepaged_alloc_sleep(); | ||
2176 | } while (unlikely(!hpage) && | ||
2177 | likely(khugepaged_enabled())); | ||
2178 | return hpage; | ||
2179 | } | ||
2180 | #endif | ||
2181 | |||
2182 | static void khugepaged_loop(void) | ||
2183 | { | ||
2184 | struct page *hpage; | ||
2185 | |||
2186 | #ifdef CONFIG_NUMA | ||
2187 | hpage = NULL; | ||
2188 | #endif | ||
2189 | while (likely(khugepaged_enabled())) { | ||
2190 | #ifndef CONFIG_NUMA | ||
2191 | hpage = khugepaged_alloc_hugepage(); | ||
2192 | if (unlikely(!hpage)) | ||
2193 | break; | ||
2194 | #else | ||
2195 | if (IS_ERR(hpage)) { | ||
2196 | khugepaged_alloc_sleep(); | ||
2197 | hpage = NULL; | ||
2198 | } | ||
2199 | #endif | ||
2200 | |||
2201 | khugepaged_do_scan(&hpage); | ||
2202 | #ifndef CONFIG_NUMA | ||
2203 | if (hpage) | ||
2204 | put_page(hpage); | ||
2205 | #endif | ||
2206 | try_to_freeze(); | ||
2207 | if (unlikely(kthread_should_stop())) | ||
2208 | break; | ||
2209 | if (khugepaged_has_work()) { | ||
2210 | DEFINE_WAIT(wait); | ||
2211 | if (!khugepaged_scan_sleep_millisecs) | ||
2212 | continue; | ||
2213 | add_wait_queue(&khugepaged_wait, &wait); | ||
2214 | schedule_timeout_interruptible( | ||
2215 | msecs_to_jiffies( | ||
2216 | khugepaged_scan_sleep_millisecs)); | ||
2217 | remove_wait_queue(&khugepaged_wait, &wait); | ||
2218 | } else if (khugepaged_enabled()) | ||
2219 | wait_event_freezable(khugepaged_wait, | ||
2220 | khugepaged_wait_event()); | ||
2221 | } | ||
2222 | } | ||
2223 | |||
2224 | static int khugepaged(void *none) | ||
2225 | { | ||
2226 | struct mm_slot *mm_slot; | ||
2227 | |||
2228 | set_freezable(); | ||
2229 | set_user_nice(current, 19); | ||
2230 | |||
2231 | /* serialize with start_khugepaged() */ | ||
2232 | mutex_lock(&khugepaged_mutex); | ||
2233 | |||
2234 | for (;;) { | ||
2235 | mutex_unlock(&khugepaged_mutex); | ||
2236 | BUG_ON(khugepaged_thread != current); | ||
2237 | khugepaged_loop(); | ||
2238 | BUG_ON(khugepaged_thread != current); | ||
2239 | |||
2240 | mutex_lock(&khugepaged_mutex); | ||
2241 | if (!khugepaged_enabled()) | ||
2242 | break; | ||
2243 | if (unlikely(kthread_should_stop())) | ||
2244 | break; | ||
2245 | } | ||
2246 | |||
2247 | spin_lock(&khugepaged_mm_lock); | ||
2248 | mm_slot = khugepaged_scan.mm_slot; | ||
2249 | khugepaged_scan.mm_slot = NULL; | ||
2250 | if (mm_slot) | ||
2251 | collect_mm_slot(mm_slot); | ||
2252 | spin_unlock(&khugepaged_mm_lock); | ||
2253 | |||
2254 | khugepaged_thread = NULL; | ||
2255 | mutex_unlock(&khugepaged_mutex); | ||
2256 | |||
2257 | return 0; | ||
2258 | } | ||
2259 | |||
2260 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | ||
2261 | { | ||
2262 | struct page *page; | ||
2263 | |||
2264 | spin_lock(&mm->page_table_lock); | ||
2265 | if (unlikely(!pmd_trans_huge(*pmd))) { | ||
2266 | spin_unlock(&mm->page_table_lock); | ||
2267 | return; | ||
2268 | } | ||
2269 | page = pmd_page(*pmd); | ||
2270 | VM_BUG_ON(!page_count(page)); | ||
2271 | get_page(page); | ||
2272 | spin_unlock(&mm->page_table_lock); | ||
2273 | |||
2274 | split_huge_page(page); | ||
2275 | |||
2276 | put_page(page); | ||
2277 | BUG_ON(pmd_trans_huge(*pmd)); | ||
2278 | } | ||
2279 | |||
2280 | static void split_huge_page_address(struct mm_struct *mm, | ||
2281 | unsigned long address) | ||
2282 | { | ||
2283 | pgd_t *pgd; | ||
2284 | pud_t *pud; | ||
2285 | pmd_t *pmd; | ||
2286 | |||
2287 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | ||
2288 | |||
2289 | pgd = pgd_offset(mm, address); | ||
2290 | if (!pgd_present(*pgd)) | ||
2291 | return; | ||
2292 | |||
2293 | pud = pud_offset(pgd, address); | ||
2294 | if (!pud_present(*pud)) | ||
2295 | return; | ||
2296 | |||
2297 | pmd = pmd_offset(pud, address); | ||
2298 | if (!pmd_present(*pmd)) | ||
2299 | return; | ||
2300 | /* | ||
2301 | * The caller holds mmap_sem in write mode, so a huge pmd cannot | ||
2302 | * materialize from under us. | ||
2303 | */ | ||
2304 | split_huge_page_pmd(mm, pmd); | ||
2305 | } | ||
2306 | |||
2307 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | ||
2308 | unsigned long start, | ||
2309 | unsigned long end, | ||
2310 | long adjust_next) | ||
2311 | { | ||
2312 | /* | ||
2313 | * If the new start address isn't hpage aligned and it could | ||
2314 | * previously contain a hugepage: check if we need to split | ||
2315 | * a huge pmd. | ||
2316 | */ | ||
2317 | if (start & ~HPAGE_PMD_MASK && | ||
2318 | (start & HPAGE_PMD_MASK) >= vma->vm_start && | ||
2319 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | ||
2320 | split_huge_page_address(vma->vm_mm, start); | ||
2321 | |||
2322 | /* | ||
2323 | * If the new end address isn't hpage aligned and it could | ||
2324 | * previously contain a hugepage: check if we need to split | ||
2325 | * a huge pmd. | ||
2326 | */ | ||
2327 | if (end & ~HPAGE_PMD_MASK && | ||
2328 | (end & HPAGE_PMD_MASK) >= vma->vm_start && | ||
2329 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | ||
2330 | split_huge_page_address(vma->vm_mm, end); | ||
2331 | |||
2332 | /* | ||
2333 | * If we're also updating vma->vm_next->vm_start, and the new | ||
2334 | * vm_next->vm_start isn't hpage aligned and it could previously | ||
2335 | * contain a hugepage: check if we need to split a huge pmd. | ||
2336 | */ | ||
2337 | if (adjust_next > 0) { | ||
2338 | struct vm_area_struct *next = vma->vm_next; | ||
2339 | unsigned long nstart = next->vm_start; | ||
2340 | nstart += adjust_next << PAGE_SHIFT; | ||
2341 | if (nstart & ~HPAGE_PMD_MASK && | ||
2342 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && | ||
2343 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) | ||
2344 | split_huge_page_address(next->vm_mm, nstart); | ||
2345 | } | ||
2346 | } | ||
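
All three checks above apply the same test to start, end and the adjusted vm_next->vm_start: if a boundary is not huge-pmd aligned, but the huge-page range it falls inside still fits entirely within the VMA, the adjustment may cut through a mapped huge pmd, so split_huge_page_address() is called for it. A small sketch of that arithmetic, runnable in userspace, where the 2 MB HPAGE_PMD_SIZE is an assumption for illustration:

/* Sketch of the alignment test used above; a 2 MB HPAGE_PMD_SIZE is assumed. */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

/* Would a new boundary at addr cut a huge pmd inside [vm_start, vm_end)? */
static int may_cut_huge_pmd(unsigned long addr, unsigned long vm_start,
			    unsigned long vm_end)
{
	return (addr & ~HPAGE_PMD_MASK) &&
	       (addr & HPAGE_PMD_MASK) >= vm_start &&
	       (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;
}

int main(void)
{
	/* A boundary 4 KB past a 2 MB-aligned address, inside a 4 MB VMA: split needed. */
	printf("%d\n", may_cut_huge_pmd(0x40201000, 0x40000000, 0x40400000));
	/* An aligned boundary never needs a split. */
	printf("%d\n", may_cut_huge_pmd(0x40200000, 0x40000000, 0x40400000));
	return 0;
}
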
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85855240933d..bb0b7c128015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) | |||
394 | return 0; | 394 | return 0; |
395 | } | 395 | } |
396 | 396 | ||
397 | static void clear_gigantic_page(struct page *page, | ||
398 | unsigned long addr, unsigned long sz) | ||
399 | { | ||
400 | int i; | ||
401 | struct page *p = page; | ||
402 | |||
403 | might_sleep(); | ||
404 | for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { | ||
405 | cond_resched(); | ||
406 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
407 | } | ||
408 | } | ||
409 | static void clear_huge_page(struct page *page, | ||
410 | unsigned long addr, unsigned long sz) | ||
411 | { | ||
412 | int i; | ||
413 | |||
414 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { | ||
415 | clear_gigantic_page(page, addr, sz); | ||
416 | return; | ||
417 | } | ||
418 | |||
419 | might_sleep(); | ||
420 | for (i = 0; i < sz/PAGE_SIZE; i++) { | ||
421 | cond_resched(); | ||
422 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
423 | } | ||
424 | } | ||
425 | |||
426 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
427 | unsigned long addr, struct vm_area_struct *vma) | ||
428 | { | ||
429 | int i; | ||
430 | struct hstate *h = hstate_vma(vma); | ||
431 | struct page *dst_base = dst; | ||
432 | struct page *src_base = src; | ||
433 | |||
434 | for (i = 0; i < pages_per_huge_page(h); ) { | ||
435 | cond_resched(); | ||
436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
437 | |||
438 | i++; | ||
439 | dst = mem_map_next(dst, dst_base, i); | ||
440 | src = mem_map_next(src, src_base, i); | ||
441 | } | ||
442 | } | ||
443 | |||
444 | static void copy_user_huge_page(struct page *dst, struct page *src, | ||
445 | unsigned long addr, struct vm_area_struct *vma) | ||
446 | { | ||
447 | int i; | ||
448 | struct hstate *h = hstate_vma(vma); | ||
449 | |||
450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | ||
451 | copy_user_gigantic_page(dst, src, addr, vma); | ||
452 | return; | ||
453 | } | ||
454 | |||
455 | might_sleep(); | ||
456 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
457 | cond_resched(); | ||
458 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
459 | } | ||
460 | } | ||
461 | |||
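
The per-page clear and copy helpers deleted here are not lost; they come back as generic helpers that take a page count (pages_per_huge_page) instead of a byte size, in common code per the mm/memory.c growth in the diffstat, so hugetlbfs and transparent hugepages can share them, as the updated callers later in this file show. The underlying pattern is simply to touch one base page at a time and reschedule in between, so clearing or copying a multi-megabyte page does not monopolise the CPU. A rough userspace analogue, for illustration only:

/* Sketch: clear a large buffer one 4 KB chunk at a time, yielding in between. */
#include <sched.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK 4096

static void clear_huge_buffer(void *buf, size_t pages)
{
	size_t i;

	for (i = 0; i < pages; i++) {
		memset((char *)buf + i * CHUNK, 0, CHUNK);
		sched_yield();	/* stand-in for cond_resched() */
	}
}

int main(void)
{
	void *buf = malloc(512 * CHUNK);	/* 2 MB, one huge page's worth */

	if (buf)
		clear_huge_buffer(buf, 512);
	free(buf);
	return 0;
}
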
462 | static void copy_gigantic_page(struct page *dst, struct page *src) | 397 | static void copy_gigantic_page(struct page *dst, struct page *src) |
463 | { | 398 | { |
464 | int i; | 399 | int i; |
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, | |||
1428 | 1363 | ||
1429 | return sprintf(buf, "%lu\n", nr_huge_pages); | 1364 | return sprintf(buf, "%lu\n", nr_huge_pages); |
1430 | } | 1365 | } |
1366 | |||
1431 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | 1367 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1432 | struct kobject *kobj, struct kobj_attribute *attr, | 1368 | struct kobject *kobj, struct kobj_attribute *attr, |
1433 | const char *buf, size_t len) | 1369 | const char *buf, size_t len) |
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1440 | 1376 | ||
1441 | err = strict_strtoul(buf, 10, &count); | 1377 | err = strict_strtoul(buf, 10, &count); |
1442 | if (err) | 1378 | if (err) |
1443 | return 0; | 1379 | goto out; |
1444 | 1380 | ||
1445 | h = kobj_to_hstate(kobj, &nid); | 1381 | h = kobj_to_hstate(kobj, &nid); |
1382 | if (h->order >= MAX_ORDER) { | ||
1383 | err = -EINVAL; | ||
1384 | goto out; | ||
1385 | } | ||
1386 | |||
1446 | if (nid == NUMA_NO_NODE) { | 1387 | if (nid == NUMA_NO_NODE) { |
1447 | /* | 1388 | /* |
1448 | * global hstate attribute | 1389 | * global hstate attribute |
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1468 | NODEMASK_FREE(nodes_allowed); | 1409 | NODEMASK_FREE(nodes_allowed); |
1469 | 1410 | ||
1470 | return len; | 1411 | return len; |
1412 | out: | ||
1413 | NODEMASK_FREE(nodes_allowed); | ||
1414 | return err; | ||
1471 | } | 1415 | } |
1472 | 1416 | ||
1473 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1417 | static ssize_t nr_hugepages_show(struct kobject *kobj, |
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | |||
1510 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1454 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1511 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1455 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1512 | } | 1456 | } |
1457 | |||
1513 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1458 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
1514 | struct kobj_attribute *attr, const char *buf, size_t count) | 1459 | struct kobj_attribute *attr, const char *buf, size_t count) |
1515 | { | 1460 | { |
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1517 | unsigned long input; | 1462 | unsigned long input; |
1518 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1463 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1519 | 1464 | ||
1465 | if (h->order >= MAX_ORDER) | ||
1466 | return -EINVAL; | ||
1467 | |||
1520 | err = strict_strtoul(buf, 10, &input); | 1468 | err = strict_strtoul(buf, 10, &input); |
1521 | if (err) | 1469 | if (err) |
1522 | return 0; | 1470 | return err; |
1523 | 1471 | ||
1524 | spin_lock(&hugetlb_lock); | 1472 | spin_lock(&hugetlb_lock); |
1525 | h->nr_overcommit_huge_pages = input; | 1473 | h->nr_overcommit_huge_pages = input; |
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
1922 | { | 1870 | { |
1923 | struct hstate *h = &default_hstate; | 1871 | struct hstate *h = &default_hstate; |
1924 | unsigned long tmp; | 1872 | unsigned long tmp; |
1873 | int ret; | ||
1925 | 1874 | ||
1926 | if (!write) | 1875 | if (!write) |
1927 | tmp = h->max_huge_pages; | 1876 | tmp = h->max_huge_pages; |
1928 | 1877 | ||
1878 | if (write && h->order >= MAX_ORDER) | ||
1879 | return -EINVAL; | ||
1880 | |||
1929 | table->data = &tmp; | 1881 | table->data = &tmp; |
1930 | table->maxlen = sizeof(unsigned long); | 1882 | table->maxlen = sizeof(unsigned long); |
1931 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1883 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1884 | if (ret) | ||
1885 | goto out; | ||
1932 | 1886 | ||
1933 | if (write) { | 1887 | if (write) { |
1934 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, | 1888 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
1943 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1897 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
1944 | NODEMASK_FREE(nodes_allowed); | 1898 | NODEMASK_FREE(nodes_allowed); |
1945 | } | 1899 | } |
1946 | 1900 | out: | |
1947 | return 0; | 1901 | return ret; |
1948 | } | 1902 | } |
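
With the change above, and the matching ones in the sysfs store functions earlier, parse and copy errors are propagated instead of swallowed, and pool adjustments for gigantic hstates (order >= MAX_ORDER) are rejected with EINVAL because those pages cannot be allocated at runtime. From userspace that turns a silent no-op into a visible error. A minimal sketch of driving the knob through /proc/sys/vm/nr_hugepages, the standard path for the default hstate (needs root):

/* Sketch: request 16 huge pages via the nr_hugepages sysctl and report failure. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/nr_hugepages", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "16\n", 3) < 0)
		fprintf(stderr, "write failed: %s\n", strerror(errno));
	close(fd);
	return 0;
}
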
1949 | 1903 | ||
1950 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1904 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1982 | { | 1936 | { |
1983 | struct hstate *h = &default_hstate; | 1937 | struct hstate *h = &default_hstate; |
1984 | unsigned long tmp; | 1938 | unsigned long tmp; |
1939 | int ret; | ||
1985 | 1940 | ||
1986 | if (!write) | 1941 | if (!write) |
1987 | tmp = h->nr_overcommit_huge_pages; | 1942 | tmp = h->nr_overcommit_huge_pages; |
1988 | 1943 | ||
1944 | if (write && h->order >= MAX_ORDER) | ||
1945 | return -EINVAL; | ||
1946 | |||
1989 | table->data = &tmp; | 1947 | table->data = &tmp; |
1990 | table->maxlen = sizeof(unsigned long); | 1948 | table->maxlen = sizeof(unsigned long); |
1991 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1949 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1950 | if (ret) | ||
1951 | goto out; | ||
1992 | 1952 | ||
1993 | if (write) { | 1953 | if (write) { |
1994 | spin_lock(&hugetlb_lock); | 1954 | spin_lock(&hugetlb_lock); |
1995 | h->nr_overcommit_huge_pages = tmp; | 1955 | h->nr_overcommit_huge_pages = tmp; |
1996 | spin_unlock(&hugetlb_lock); | 1956 | spin_unlock(&hugetlb_lock); |
1997 | } | 1957 | } |
1998 | 1958 | out: | |
1999 | return 0; | 1959 | return ret; |
2000 | } | 1960 | } |
2001 | 1961 | ||
2002 | #endif /* CONFIG_SYSCTL */ | 1962 | #endif /* CONFIG_SYSCTL */ |
@@ -2454,7 +2414,8 @@ retry_avoidcopy: | |||
2454 | return VM_FAULT_OOM; | 2414 | return VM_FAULT_OOM; |
2455 | } | 2415 | } |
2456 | 2416 | ||
2457 | copy_user_huge_page(new_page, old_page, address, vma); | 2417 | copy_user_huge_page(new_page, old_page, address, vma, |
2418 | pages_per_huge_page(h)); | ||
2458 | __SetPageUptodate(new_page); | 2419 | __SetPageUptodate(new_page); |
2459 | 2420 | ||
2460 | /* | 2421 | /* |
@@ -2558,7 +2519,7 @@ retry: | |||
2558 | ret = -PTR_ERR(page); | 2519 | ret = -PTR_ERR(page); |
2559 | goto out; | 2520 | goto out; |
2560 | } | 2521 | } |
2561 | clear_huge_page(page, address, huge_page_size(h)); | 2522 | clear_huge_page(page, address, pages_per_huge_page(h)); |
2562 | __SetPageUptodate(page); | 2523 | __SetPageUptodate(page); |
2563 | 2524 | ||
2564 | if (vma->vm_flags & VM_MAYSHARE) { | 2525 | if (vma->vm_flags & VM_MAYSHARE) { |
diff --git a/mm/internal.h b/mm/internal.h index dedb0aff673f..4c98630f0f77 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) | |||
39 | 39 | ||
40 | extern unsigned long highest_memmap_pfn; | 40 | extern unsigned long highest_memmap_pfn; |
41 | 41 | ||
42 | #ifdef CONFIG_SMP | ||
43 | extern int putback_active_lru_page(struct zone *zone, struct page *page); | ||
44 | #else | ||
45 | static inline int putback_active_lru_page(struct zone *zone, struct page *page) | ||
46 | { | ||
47 | return 0; | ||
48 | } | ||
49 | #endif | ||
50 | |||
42 | /* | 51 | /* |
43 | * in mm/vmscan.c: | 52 | * in mm/vmscan.c: |
44 | */ | 53 | */ |
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
134 | } | 143 | } |
135 | } | 144 | } |
136 | 145 | ||
146 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
147 | extern unsigned long vma_address(struct page *page, | ||
148 | struct vm_area_struct *vma); | ||
149 | #endif | ||
137 | #else /* !CONFIG_MMU */ | 150 | #else /* !CONFIG_MMU */ |
138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 151 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
139 | { | 152 | { |
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
243 | 256 | ||
244 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 257 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
245 | unsigned long start, int len, unsigned int foll_flags, | 258 | unsigned long start, int len, unsigned int foll_flags, |
246 | struct page **pages, struct vm_area_struct **vmas); | 259 | struct page **pages, struct vm_area_struct **vmas, |
260 | int *nonblocking); | ||
247 | 261 | ||
248 | #define ZONE_RECLAIM_NOSCAN -2 | 262 | #define ZONE_RECLAIM_NOSCAN -2 |
249 | #define ZONE_RECLAIM_FULL -1 | 263 | #define ZONE_RECLAIM_FULL -1 |
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
37 | #include <linux/freezer.h> | ||
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
39 | #include "internal.h" | 40 | #include "internal.h" |
@@ -411,6 +412,20 @@ out: | |||
411 | up_read(&mm->mmap_sem); | 412 | up_read(&mm->mmap_sem); |
412 | } | 413 | } |
413 | 414 | ||
415 | static struct page *page_trans_compound_anon(struct page *page) | ||
416 | { | ||
417 | if (PageTransCompound(page)) { | ||
418 | struct page *head = compound_trans_head(page); | ||
419 | /* | ||
419 | * head may actually be split and freed from under | ||
420 | * us, but that's ok here. | ||
422 | */ | ||
423 | if (PageAnon(head)) | ||
424 | return head; | ||
425 | } | ||
426 | return NULL; | ||
427 | } | ||
428 | |||
414 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) | 429 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) |
415 | { | 430 | { |
416 | struct mm_struct *mm = rmap_item->mm; | 431 | struct mm_struct *mm = rmap_item->mm; |
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
430 | page = follow_page(vma, addr, FOLL_GET); | 445 | page = follow_page(vma, addr, FOLL_GET); |
431 | if (IS_ERR_OR_NULL(page)) | 446 | if (IS_ERR_OR_NULL(page)) |
432 | goto out; | 447 | goto out; |
433 | if (PageAnon(page)) { | 448 | if (PageAnon(page) || page_trans_compound_anon(page)) { |
434 | flush_anon_page(vma, page, addr); | 449 | flush_anon_page(vma, page, addr); |
435 | flush_dcache_page(page); | 450 | flush_dcache_page(page); |
436 | } else { | 451 | } else { |
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
708 | if (addr == -EFAULT) | 723 | if (addr == -EFAULT) |
709 | goto out; | 724 | goto out; |
710 | 725 | ||
726 | BUG_ON(PageTransCompound(page)); | ||
711 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 727 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
712 | if (!ptep) | 728 | if (!ptep) |
713 | goto out; | 729 | goto out; |
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
783 | goto out; | 799 | goto out; |
784 | 800 | ||
785 | pmd = pmd_offset(pud, addr); | 801 | pmd = pmd_offset(pud, addr); |
802 | BUG_ON(pmd_trans_huge(*pmd)); | ||
786 | if (!pmd_present(*pmd)) | 803 | if (!pmd_present(*pmd)) |
787 | goto out; | 804 | goto out; |
788 | 805 | ||
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
800 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
801 | 818 | ||
802 | page_remove_rmap(page); | 819 | page_remove_rmap(page); |
820 | if (!page_mapped(page)) | ||
821 | try_to_free_swap(page); | ||
803 | put_page(page); | 822 | put_page(page); |
804 | 823 | ||
805 | pte_unmap_unlock(ptep, ptl); | 824 | pte_unmap_unlock(ptep, ptl); |
@@ -808,6 +827,33 @@ out: | |||
808 | return err; | 827 | return err; |
809 | } | 828 | } |
810 | 829 | ||
830 | static int page_trans_compound_anon_split(struct page *page) | ||
831 | { | ||
832 | int ret = 0; | ||
833 | struct page *transhuge_head = page_trans_compound_anon(page); | ||
834 | if (transhuge_head) { | ||
835 | /* Get the reference on the head to split it. */ | ||
836 | if (get_page_unless_zero(transhuge_head)) { | ||
837 | /* | ||
838 | * Recheck we got the reference while the head | ||
839 | * was still anonymous. | ||
840 | */ | ||
841 | if (PageAnon(transhuge_head)) | ||
842 | ret = split_huge_page(transhuge_head); | ||
843 | else | ||
844 | /* | ||
845 | * Retry later if split_huge_page ran | ||
846 | * from under us. | ||
847 | */ | ||
848 | ret = 1; | ||
849 | put_page(transhuge_head); | ||
850 | } else | ||
851 | /* Retry later if split_huge_page ran from under us. */ | ||
852 | ret = 1; | ||
853 | } | ||
854 | return ret; | ||
855 | } | ||
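
page_trans_compound_anon_split() pins the compound head with get_page_unless_zero(), rechecks that it is still anonymous, and only then calls split_huge_page(); any race simply returns 1 so the caller skips the page and retries it on a later pass. The upshot is that KSM can merge pages out of a transparent-hugepage-backed region by splitting the huge page first. A minimal userspace sketch of setting up such a region; the fallback madvise constants are assumptions for older headers, taken from this patch series:

/* Sketch: an anonymous region marked both huge-page eligible and KSM mergeable. */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* assumed value */
#endif
#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE	12
#endif

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MB */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("MADV_HUGEPAGE");
	if (madvise(p, len, MADV_MERGEABLE))
		perror("MADV_MERGEABLE");
	munmap(p, len);
	return 0;
}
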
856 | |||
811 | /* | 857 | /* |
812 | * try_to_merge_one_page - take two pages and merge them into one | 858 | * try_to_merge_one_page - take two pages and merge them into one |
813 | * @vma: the vma that holds the pte pointing to page | 859 | * @vma: the vma that holds the pte pointing to page |
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
828 | 874 | ||
829 | if (!(vma->vm_flags & VM_MERGEABLE)) | 875 | if (!(vma->vm_flags & VM_MERGEABLE)) |
830 | goto out; | 876 | goto out; |
877 | if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | ||
878 | goto out; | ||
879 | BUG_ON(PageTransCompound(page)); | ||
831 | if (!PageAnon(page)) | 880 | if (!PageAnon(page)) |
832 | goto out; | 881 | goto out; |
833 | 882 | ||
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1247 | 1296 | ||
1248 | slot = ksm_scan.mm_slot; | 1297 | slot = ksm_scan.mm_slot; |
1249 | if (slot == &ksm_mm_head) { | 1298 | if (slot == &ksm_mm_head) { |
1299 | /* | ||
1300 | * A number of pages can hang around indefinitely on per-cpu | ||
1301 | * pagevecs, raised page count preventing write_protect_page | ||
1302 | * from merging them. Though it doesn't really matter much, | ||
1303 | * it is puzzling to see some stuck in pages_volatile until | ||
1304 | * other activity jostles them out, and they also prevented | ||
1305 | * LTP's KSM test from succeeding deterministically; so drain | ||
1306 | * them here (here rather than on entry to ksm_do_scan(), | ||
1307 | * so we don't IPI too often when pages_to_scan is set low). | ||
1308 | */ | ||
1309 | lru_add_drain_all(); | ||
1310 | |||
1250 | root_unstable_tree = RB_ROOT; | 1311 | root_unstable_tree = RB_ROOT; |
1251 | 1312 | ||
1252 | spin_lock(&ksm_mmlist_lock); | 1313 | spin_lock(&ksm_mmlist_lock); |
@@ -1277,7 +1338,13 @@ next_mm: | |||
1277 | if (ksm_test_exit(mm)) | 1338 | if (ksm_test_exit(mm)) |
1278 | break; | 1339 | break; |
1279 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | 1340 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); |
1280 | if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { | 1341 | if (IS_ERR_OR_NULL(*page)) { |
1342 | ksm_scan.address += PAGE_SIZE; | ||
1343 | cond_resched(); | ||
1344 | continue; | ||
1345 | } | ||
1346 | if (PageAnon(*page) || | ||
1347 | page_trans_compound_anon(*page)) { | ||
1281 | flush_anon_page(vma, *page, ksm_scan.address); | 1348 | flush_anon_page(vma, *page, ksm_scan.address); |
1282 | flush_dcache_page(*page); | 1349 | flush_dcache_page(*page); |
1283 | rmap_item = get_next_rmap_item(slot, | 1350 | rmap_item = get_next_rmap_item(slot, |
@@ -1291,8 +1358,7 @@ next_mm: | |||
1291 | up_read(&mm->mmap_sem); | 1358 | up_read(&mm->mmap_sem); |
1292 | return rmap_item; | 1359 | return rmap_item; |
1293 | } | 1360 | } |
1294 | if (!IS_ERR_OR_NULL(*page)) | 1361 | put_page(*page); |
1295 | put_page(*page); | ||
1296 | ksm_scan.address += PAGE_SIZE; | 1362 | ksm_scan.address += PAGE_SIZE; |
1297 | cond_resched(); | 1363 | cond_resched(); |
1298 | } | 1364 | } |
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1352 | struct rmap_item *rmap_item; | 1418 | struct rmap_item *rmap_item; |
1353 | struct page *uninitialized_var(page); | 1419 | struct page *uninitialized_var(page); |
1354 | 1420 | ||
1355 | while (scan_npages--) { | 1421 | while (scan_npages-- && likely(!freezing(current))) { |
1356 | cond_resched(); | 1422 | cond_resched(); |
1357 | rmap_item = scan_get_next_rmap_item(&page); | 1423 | rmap_item = scan_get_next_rmap_item(&page); |
1358 | if (!rmap_item) | 1424 | if (!rmap_item) |
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void) | |||
1370 | 1436 | ||
1371 | static int ksm_scan_thread(void *nothing) | 1437 | static int ksm_scan_thread(void *nothing) |
1372 | { | 1438 | { |
1439 | set_freezable(); | ||
1373 | set_user_nice(current, 5); | 1440 | set_user_nice(current, 5); |
1374 | 1441 | ||
1375 | while (!kthread_should_stop()) { | 1442 | while (!kthread_should_stop()) { |
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing) | |||
1378 | ksm_do_scan(ksm_thread_pages_to_scan); | 1445 | ksm_do_scan(ksm_thread_pages_to_scan); |
1379 | mutex_unlock(&ksm_thread_mutex); | 1446 | mutex_unlock(&ksm_thread_mutex); |
1380 | 1447 | ||
1448 | try_to_freeze(); | ||
1449 | |||
1381 | if (ksmd_should_run()) { | 1450 | if (ksmd_should_run()) { |
1382 | schedule_timeout_interruptible( | 1451 | schedule_timeout_interruptible( |
1383 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); | 1452 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); |
1384 | } else { | 1453 | } else { |
1385 | wait_event_interruptible(ksm_thread_wait, | 1454 | wait_event_freezable(ksm_thread_wait, |
1386 | ksmd_should_run() || kthread_should_stop()); | 1455 | ksmd_should_run() || kthread_should_stop()); |
1387 | } | 1456 | } |
1388 | } | 1457 | } |
diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
71 | if (error) | 71 | if (error) |
72 | goto out; | 72 | goto out; |
73 | break; | 73 | break; |
74 | case MADV_HUGEPAGE: | ||
75 | case MADV_NOHUGEPAGE: | ||
76 | error = hugepage_madvise(vma, &new_flags, behavior); | ||
77 | if (error) | ||
78 | goto out; | ||
79 | break; | ||
74 | } | 80 | } |
75 | 81 | ||
76 | if (new_flags == vma->vm_flags) { | 82 | if (new_flags == vma->vm_flags) { |
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior) | |||
283 | case MADV_MERGEABLE: | 289 | case MADV_MERGEABLE: |
284 | case MADV_UNMERGEABLE: | 290 | case MADV_UNMERGEABLE: |
285 | #endif | 291 | #endif |
292 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
293 | case MADV_HUGEPAGE: | ||
294 | case MADV_NOHUGEPAGE: | ||
295 | #endif | ||
286 | return 1; | 296 | return 1; |
287 | 297 | ||
288 | default: | 298 | default: |
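
The madvise() hunks above add MADV_HUGEPAGE and MADV_NOHUGEPAGE as per-VMA hints routed through hugepage_madvise(), and only accept them when the kernel was built with CONFIG_TRANSPARENT_HUGEPAGE; otherwise madvise() fails with EINVAL. A minimal sketch that opts a mapping in and back out; the fallback constant values are assumptions taken from this patch series:

/* Sketch: toggle the per-VMA transparent hugepage hint on an anonymous mapping. */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* assumed values */
#define MADV_NOHUGEPAGE	15
#endif

int main(void)
{
	size_t len = 2UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("MADV_HUGEPAGE");	/* EINVAL on kernels without THP */
	if (madvise(p, len, MADV_NOHUGEPAGE))
		perror("MADV_NOHUGEPAGE");
	munmap(p, len);
	return 0;
}
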
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bb8a64d028..8ab841031436 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -292,7 +292,6 @@ static struct move_charge_struct { | |||
292 | unsigned long moved_charge; | 292 | unsigned long moved_charge; |
293 | unsigned long moved_swap; | 293 | unsigned long moved_swap; |
294 | struct task_struct *moving_task; /* a task moving charges */ | 294 | struct task_struct *moving_task; /* a task moving charges */ |
295 | struct mm_struct *mm; | ||
296 | wait_queue_head_t waitq; /* a waitq for other context */ | 295 | wait_queue_head_t waitq; /* a waitq for other context */ |
297 | } mc = { | 296 | } mc = { |
298 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | 297 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | |||
821 | return; | 820 | return; |
822 | VM_BUG_ON(list_empty(&pc->lru)); | 821 | VM_BUG_ON(list_empty(&pc->lru)); |
823 | list_del_init(&pc->lru); | 822 | list_del_init(&pc->lru); |
824 | return; | ||
825 | } | 823 | } |
826 | 824 | ||
827 | void mem_cgroup_del_lru(struct page *page) | 825 | void mem_cgroup_del_lru(struct page *page) |
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1087 | case 0: | 1085 | case 0: |
1088 | list_move(&page->lru, dst); | 1086 | list_move(&page->lru, dst); |
1089 | mem_cgroup_del_lru(page); | 1087 | mem_cgroup_del_lru(page); |
1090 | nr_taken++; | 1088 | nr_taken += hpage_nr_pages(page); |
1091 | break; | 1089 | break; |
1092 | case -EBUSY: | 1090 | case -EBUSY: |
1093 | /* we don't affect global LRU but rotate in our LRU */ | 1091 | /* we don't affect global LRU but rotate in our LRU */ |
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1312 | u64 limit; | 1310 | u64 limit; |
1313 | u64 memsw; | 1311 | u64 memsw; |
1314 | 1312 | ||
1315 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + | 1313 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
1316 | total_swap_pages; | 1314 | limit += total_swap_pages << PAGE_SHIFT; |
1315 | |||
1317 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 1316 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
1318 | /* | 1317 | /* |
1319 | * If memsw is finite and limits the amount of swap space available | 1318 | * If memsw is finite and limits the amount of swap space available |
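
The two-line fix above corrects a unit mismatch: the res_counter limit is kept in bytes while total_swap_pages counts pages, so the old sum added pages to bytes and understated the mem+swap ceiling by roughly a factor of the page size. A quick arithmetic check, assuming 4 KB pages:

/* Sketch: bytes vs. pages in the mem+swap limit calculation (4 KB pages assumed). */
#include <stdio.h>

int main(void)
{
	unsigned long long limit_bytes = 512ULL << 20;	/* 512 MB memory limit */
	unsigned long long total_swap_pages = 262144;	/* 1 GB of swap, in pages */
	int page_shift = 12;

	printf("buggy:   %llu\n", limit_bytes + total_swap_pages);
	printf("correct: %llu\n", limit_bytes + (total_swap_pages << page_shift));
	return 0;
}
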
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1600 | * possibility of race condition. If there is, we take a lock. | 1599 | * possibility of race condition. If there is, we take a lock. |
1601 | */ | 1600 | */ |
1602 | 1601 | ||
1603 | static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | 1602 | void mem_cgroup_update_page_stat(struct page *page, |
1603 | enum mem_cgroup_page_stat_item idx, int val) | ||
1604 | { | 1604 | { |
1605 | struct mem_cgroup *mem; | 1605 | struct mem_cgroup *mem; |
1606 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1606 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1607 | bool need_unlock = false; | 1607 | bool need_unlock = false; |
1608 | unsigned long uninitialized_var(flags); | ||
1608 | 1609 | ||
1609 | if (unlikely(!pc)) | 1610 | if (unlikely(!pc)) |
1610 | return; | 1611 | return; |
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | |||
1616 | /* pc->mem_cgroup is unstable ? */ | 1617 | /* pc->mem_cgroup is unstable ? */ |
1617 | if (unlikely(mem_cgroup_stealed(mem))) { | 1618 | if (unlikely(mem_cgroup_stealed(mem))) { |
1618 | /* take a lock against to access pc->mem_cgroup */ | 1619 | /* take a lock against to access pc->mem_cgroup */ |
1619 | lock_page_cgroup(pc); | 1620 | move_lock_page_cgroup(pc, &flags); |
1620 | need_unlock = true; | 1621 | need_unlock = true; |
1621 | mem = pc->mem_cgroup; | 1622 | mem = pc->mem_cgroup; |
1622 | if (!mem || !PageCgroupUsed(pc)) | 1623 | if (!mem || !PageCgroupUsed(pc)) |
1623 | goto out; | 1624 | goto out; |
1624 | } | 1625 | } |
1625 | 1626 | ||
1626 | this_cpu_add(mem->stat->count[idx], val); | ||
1627 | |||
1628 | switch (idx) { | 1627 | switch (idx) { |
1629 | case MEM_CGROUP_STAT_FILE_MAPPED: | 1628 | case MEMCG_NR_FILE_MAPPED: |
1630 | if (val > 0) | 1629 | if (val > 0) |
1631 | SetPageCgroupFileMapped(pc); | 1630 | SetPageCgroupFileMapped(pc); |
1632 | else if (!page_mapped(page)) | 1631 | else if (!page_mapped(page)) |
1633 | ClearPageCgroupFileMapped(pc); | 1632 | ClearPageCgroupFileMapped(pc); |
1633 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | ||
1634 | break; | 1634 | break; |
1635 | default: | 1635 | default: |
1636 | BUG(); | 1636 | BUG(); |
1637 | } | 1637 | } |
1638 | 1638 | ||
1639 | this_cpu_add(mem->stat->count[idx], val); | ||
1640 | |||
1639 | out: | 1641 | out: |
1640 | if (unlikely(need_unlock)) | 1642 | if (unlikely(need_unlock)) |
1641 | unlock_page_cgroup(pc); | 1643 | move_unlock_page_cgroup(pc, &flags); |
1642 | rcu_read_unlock(); | 1644 | rcu_read_unlock(); |
1643 | return; | 1645 | return; |
1644 | } | 1646 | } |
1645 | 1647 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | |
1646 | void mem_cgroup_update_file_mapped(struct page *page, int val) | ||
1647 | { | ||
1648 | mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
1649 | } | ||
1650 | 1648 | ||
1651 | /* | 1649 | /* |
1652 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1650 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1887 | * oom-killer can be invoked. | 1885 | * oom-killer can be invoked. |
1888 | */ | 1886 | */ |
1889 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1887 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1890 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | 1888 | gfp_t gfp_mask, |
1889 | struct mem_cgroup **memcg, bool oom, | ||
1890 | int page_size) | ||
1891 | { | 1891 | { |
1892 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1892 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1893 | struct mem_cgroup *mem = NULL; | 1893 | struct mem_cgroup *mem = NULL; |
1894 | int ret; | 1894 | int ret; |
1895 | int csize = CHARGE_SIZE; | 1895 | int csize = max(CHARGE_SIZE, (unsigned long) page_size); |
1896 | 1896 | ||
1897 | /* | 1897 | /* |
1898 | * Unlike global VM's OOM-kill, we're not in memory shortage | 1898 | * Unlike global VM's OOM-kill, we're not in memory shortage |
@@ -1917,7 +1917,7 @@ again: | |||
1917 | VM_BUG_ON(css_is_removed(&mem->css)); | 1917 | VM_BUG_ON(css_is_removed(&mem->css)); |
1918 | if (mem_cgroup_is_root(mem)) | 1918 | if (mem_cgroup_is_root(mem)) |
1919 | goto done; | 1919 | goto done; |
1920 | if (consume_stock(mem)) | 1920 | if (page_size == PAGE_SIZE && consume_stock(mem)) |
1921 | goto done; | 1921 | goto done; |
1922 | css_get(&mem->css); | 1922 | css_get(&mem->css); |
1923 | } else { | 1923 | } else { |
@@ -1940,7 +1940,7 @@ again: | |||
1940 | rcu_read_unlock(); | 1940 | rcu_read_unlock(); |
1941 | goto done; | 1941 | goto done; |
1942 | } | 1942 | } |
1943 | if (consume_stock(mem)) { | 1943 | if (page_size == PAGE_SIZE && consume_stock(mem)) { |
1944 | /* | 1944 | /* |
1945 | * It seems dangerous to access memcg without css_get(). | 1945 | * It seems dangerous to access memcg without css_get(). |
1946 | * But considering how consume_stock works, it's not | 1946 | * But considering how consume_stock works, it's not |
@@ -1981,7 +1981,7 @@ again: | |||
1981 | case CHARGE_OK: | 1981 | case CHARGE_OK: |
1982 | break; | 1982 | break; |
1983 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 1983 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
1984 | csize = PAGE_SIZE; | 1984 | csize = page_size; |
1985 | css_put(&mem->css); | 1985 | css_put(&mem->css); |
1986 | mem = NULL; | 1986 | mem = NULL; |
1987 | goto again; | 1987 | goto again; |
@@ -2002,8 +2002,8 @@ again: | |||
2002 | } | 2002 | } |
2003 | } while (ret != CHARGE_OK); | 2003 | } while (ret != CHARGE_OK); |
2004 | 2004 | ||
2005 | if (csize > PAGE_SIZE) | 2005 | if (csize > page_size) |
2006 | refill_stock(mem, csize - PAGE_SIZE); | 2006 | refill_stock(mem, csize - page_size); |
2007 | css_put(&mem->css); | 2007 | css_put(&mem->css); |
2008 | done: | 2008 | done: |
2009 | *memcg = mem; | 2009 | *memcg = mem; |
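
__mem_cgroup_try_charge() now carries the charge size explicitly: the per-cpu stock is only consumed for base-page charges, the batch size becomes max(CHARGE_SIZE, page_size) so a huge page is charged in a single res_counter transaction, and anything charged beyond page_size is handed back through refill_stock(). A small sketch of that bookkeeping; the 32-page CHARGE_SIZE batch is taken from the "size of first charge trial" comment earlier in this file and the 2 MB huge page size is an assumption:

/* Sketch: charge batch sizing for a base page vs. a 2 MB huge page (assumed constants). */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define CHARGE_SIZE	(32 * PAGE_SIZE)	/* assumed batch, as in memcontrol.c */

int main(void)
{
	unsigned long sizes[] = { PAGE_SIZE, 512 * PAGE_SIZE };	/* 4 KB, 2 MB */
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long csize = CHARGE_SIZE > sizes[i] ? CHARGE_SIZE : sizes[i];

		printf("page_size %7lu: charge %7lu, refill_stock %lu\n",
		       sizes[i], csize, csize - sizes[i]);
	}
	return 0;
}
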
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | |||
2031 | } | 2031 | } |
2032 | } | 2032 | } |
2033 | 2033 | ||
2034 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 2034 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
2035 | int page_size) | ||
2035 | { | 2036 | { |
2036 | __mem_cgroup_cancel_charge(mem, 1); | 2037 | __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); |
2037 | } | 2038 | } |
2038 | 2039 | ||
2039 | /* | 2040 | /* |
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2087 | * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be | 2088 | * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be |
2088 | * USED state. If already USED, uncharge and return. | 2089 | * USED state. If already USED, uncharge and return. |
2089 | */ | 2090 | */ |
2090 | 2091 | static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, | |
2091 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2092 | struct page_cgroup *pc, |
2092 | struct page_cgroup *pc, | 2093 | enum charge_type ctype) |
2093 | enum charge_type ctype) | ||
2094 | { | 2094 | { |
2095 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
2096 | if (!mem) | ||
2097 | return; | ||
2098 | |||
2099 | lock_page_cgroup(pc); | ||
2100 | if (unlikely(PageCgroupUsed(pc))) { | ||
2101 | unlock_page_cgroup(pc); | ||
2102 | mem_cgroup_cancel_charge(mem); | ||
2103 | return; | ||
2104 | } | ||
2105 | |||
2106 | pc->mem_cgroup = mem; | 2095 | pc->mem_cgroup = mem; |
2107 | /* | 2096 | /* |
2108 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | 2097 | * We access a page_cgroup asynchronously without lock_page_cgroup(). |
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2127 | } | 2116 | } |
2128 | 2117 | ||
2129 | mem_cgroup_charge_statistics(mem, pc, true); | 2118 | mem_cgroup_charge_statistics(mem, pc, true); |
2119 | } | ||
2120 | |||
2121 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | ||
2122 | struct page_cgroup *pc, | ||
2123 | enum charge_type ctype, | ||
2124 | int page_size) | ||
2125 | { | ||
2126 | int i; | ||
2127 | int count = page_size >> PAGE_SHIFT; | ||
2128 | |||
2129 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
2130 | if (!mem) | ||
2131 | return; | ||
2132 | |||
2133 | lock_page_cgroup(pc); | ||
2134 | if (unlikely(PageCgroupUsed(pc))) { | ||
2135 | unlock_page_cgroup(pc); | ||
2136 | mem_cgroup_cancel_charge(mem, page_size); | ||
2137 | return; | ||
2138 | } | ||
2139 | |||
2140 | /* | ||
2141 | * we don't need page_cgroup_lock on tail pages, because they are not | ||
2142 | * accessed by any other context at this point. | ||
2143 | */ | ||
2144 | for (i = 0; i < count; i++) | ||
2145 | ____mem_cgroup_commit_charge(mem, pc + i, ctype); | ||
2130 | 2146 | ||
2131 | unlock_page_cgroup(pc); | 2147 | unlock_page_cgroup(pc); |
2132 | /* | 2148 | /* |
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2173 | mem_cgroup_charge_statistics(from, pc, false); | 2189 | mem_cgroup_charge_statistics(from, pc, false); |
2174 | if (uncharge) | 2190 | if (uncharge) |
2175 | /* This is not "cancel", but cancel_charge does all we need. */ | 2191 | /* This is not "cancel", but cancel_charge does all we need. */ |
2176 | mem_cgroup_cancel_charge(from); | 2192 | mem_cgroup_cancel_charge(from, PAGE_SIZE); |
2177 | 2193 | ||
2178 | /* caller should have done css_get */ | 2194 | /* caller should have done css_get */ |
2179 | pc->mem_cgroup = to; | 2195 | pc->mem_cgroup = to; |
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
2195 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | 2211 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
2196 | { | 2212 | { |
2197 | int ret = -EINVAL; | 2213 | int ret = -EINVAL; |
2214 | unsigned long flags; | ||
2215 | |||
2198 | lock_page_cgroup(pc); | 2216 | lock_page_cgroup(pc); |
2199 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 2217 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
2218 | move_lock_page_cgroup(pc, &flags); | ||
2200 | __mem_cgroup_move_account(pc, from, to, uncharge); | 2219 | __mem_cgroup_move_account(pc, from, to, uncharge); |
2220 | move_unlock_page_cgroup(pc, &flags); | ||
2201 | ret = 0; | 2221 | ret = 0; |
2202 | } | 2222 | } |
2203 | unlock_page_cgroup(pc); | 2223 | unlock_page_cgroup(pc); |
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
2234 | goto put; | 2254 | goto put; |
2235 | 2255 | ||
2236 | parent = mem_cgroup_from_cont(pcg); | 2256 | parent = mem_cgroup_from_cont(pcg); |
2237 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 2257 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, |
2258 | PAGE_SIZE); | ||
2238 | if (ret || !parent) | 2259 | if (ret || !parent) |
2239 | goto put_back; | 2260 | goto put_back; |
2240 | 2261 | ||
2241 | ret = mem_cgroup_move_account(pc, child, parent, true); | 2262 | ret = mem_cgroup_move_account(pc, child, parent, true); |
2242 | if (ret) | 2263 | if (ret) |
2243 | mem_cgroup_cancel_charge(parent); | 2264 | mem_cgroup_cancel_charge(parent, PAGE_SIZE); |
2244 | put_back: | 2265 | put_back: |
2245 | putback_lru_page(page); | 2266 | putback_lru_page(page); |
2246 | put: | 2267 | put: |
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2261 | struct mem_cgroup *mem = NULL; | 2282 | struct mem_cgroup *mem = NULL; |
2262 | struct page_cgroup *pc; | 2283 | struct page_cgroup *pc; |
2263 | int ret; | 2284 | int ret; |
2285 | int page_size = PAGE_SIZE; | ||
2286 | |||
2287 | if (PageTransHuge(page)) { | ||
2288 | page_size <<= compound_order(page); | ||
2289 | VM_BUG_ON(!PageTransHuge(page)); | ||
2290 | } | ||
2264 | 2291 | ||
2265 | pc = lookup_page_cgroup(page); | 2292 | pc = lookup_page_cgroup(page); |
2266 | /* can happen at boot */ | 2293 | /* can happen at boot */ |
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2268 | return 0; | 2295 | return 0; |
2269 | prefetchw(pc); | 2296 | prefetchw(pc); |
2270 | 2297 | ||
2271 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 2298 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); |
2272 | if (ret || !mem) | 2299 | if (ret || !mem) |
2273 | return ret; | 2300 | return ret; |
2274 | 2301 | ||
2275 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2302 | __mem_cgroup_commit_charge(mem, pc, ctype, page_size); |
2276 | return 0; | 2303 | return 0; |
2277 | } | 2304 | } |
2278 | 2305 | ||
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2281 | { | 2308 | { |
2282 | if (mem_cgroup_disabled()) | 2309 | if (mem_cgroup_disabled()) |
2283 | return 0; | 2310 | return 0; |
2284 | if (PageCompound(page)) | ||
2285 | return 0; | ||
2286 | /* | 2311 | /* |
2287 | * If already mapped, we don't have to account. | 2312 | * If already mapped, we don't have to account. |
2288 | * If page cache, page->mapping has address_space. | 2313 | * If page cache, page->mapping has address_space. |
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2388 | if (!mem) | 2413 | if (!mem) |
2389 | goto charge_cur_mm; | 2414 | goto charge_cur_mm; |
2390 | *ptr = mem; | 2415 | *ptr = mem; |
2391 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 2416 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); |
2392 | css_put(&mem->css); | 2417 | css_put(&mem->css); |
2393 | return ret; | 2418 | return ret; |
2394 | charge_cur_mm: | 2419 | charge_cur_mm: |
2395 | if (unlikely(!mm)) | 2420 | if (unlikely(!mm)) |
2396 | mm = &init_mm; | 2421 | mm = &init_mm; |
2397 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 2422 | return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); |
2398 | } | 2423 | } |
2399 | 2424 | ||
2400 | static void | 2425 | static void |
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2410 | cgroup_exclude_rmdir(&ptr->css); | 2435 | cgroup_exclude_rmdir(&ptr->css); |
2411 | pc = lookup_page_cgroup(page); | 2436 | pc = lookup_page_cgroup(page); |
2412 | mem_cgroup_lru_del_before_commit_swapcache(page); | 2437 | mem_cgroup_lru_del_before_commit_swapcache(page); |
2413 | __mem_cgroup_commit_charge(ptr, pc, ctype); | 2438 | __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); |
2414 | mem_cgroup_lru_add_after_commit_swapcache(page); | 2439 | mem_cgroup_lru_add_after_commit_swapcache(page); |
2415 | /* | 2440 | /* |
2416 | * Now swap is on-memory. This means this page may be | 2441 | * Now swap is on-memory. This means this page may be |
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
2459 | return; | 2484 | return; |
2460 | if (!mem) | 2485 | if (!mem) |
2461 | return; | 2486 | return; |
2462 | mem_cgroup_cancel_charge(mem); | 2487 | mem_cgroup_cancel_charge(mem, PAGE_SIZE); |
2463 | } | 2488 | } |
2464 | 2489 | ||
2465 | static void | 2490 | static void |
2466 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | 2491 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, |
2492 | int page_size) | ||
2467 | { | 2493 | { |
2468 | struct memcg_batch_info *batch = NULL; | 2494 | struct memcg_batch_info *batch = NULL; |
2469 | bool uncharge_memsw = true; | 2495 | bool uncharge_memsw = true; |
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2490 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2516 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
2491 | goto direct_uncharge; | 2517 | goto direct_uncharge; |
2492 | 2518 | ||
2519 | if (page_size != PAGE_SIZE) | ||
2520 | goto direct_uncharge; | ||
2521 | |||
2493 | /* | 2522 | /* |
2494 | * In typical case, batch->memcg == mem. This means we can | 2523 | * In typical case, batch->memcg == mem. This means we can |
2495 | * merge a series of uncharges to an uncharge of res_counter. | 2524 | * merge a series of uncharges to an uncharge of res_counter. |
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2503 | batch->memsw_bytes += PAGE_SIZE; | 2532 | batch->memsw_bytes += PAGE_SIZE; |
2504 | return; | 2533 | return; |
2505 | direct_uncharge: | 2534 | direct_uncharge: |
2506 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2535 | res_counter_uncharge(&mem->res, page_size); |
2507 | if (uncharge_memsw) | 2536 | if (uncharge_memsw) |
2508 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2537 | res_counter_uncharge(&mem->memsw, page_size); |
2509 | if (unlikely(batch->memcg != mem)) | 2538 | if (unlikely(batch->memcg != mem)) |
2510 | memcg_oom_recover(mem); | 2539 | memcg_oom_recover(mem); |
2511 | return; | 2540 | return; |
@@ -2517,8 +2546,11 @@ direct_uncharge: | |||
2517 | static struct mem_cgroup * | 2546 | static struct mem_cgroup * |
2518 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2547 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
2519 | { | 2548 | { |
2549 | int i; | ||
2550 | int count; | ||
2520 | struct page_cgroup *pc; | 2551 | struct page_cgroup *pc; |
2521 | struct mem_cgroup *mem = NULL; | 2552 | struct mem_cgroup *mem = NULL; |
2553 | int page_size = PAGE_SIZE; | ||
2522 | 2554 | ||
2523 | if (mem_cgroup_disabled()) | 2555 | if (mem_cgroup_disabled()) |
2524 | return NULL; | 2556 | return NULL; |
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2526 | if (PageSwapCache(page)) | 2558 | if (PageSwapCache(page)) |
2527 | return NULL; | 2559 | return NULL; |
2528 | 2560 | ||
2561 | if (PageTransHuge(page)) { | ||
2562 | page_size <<= compound_order(page); | ||
2563 | VM_BUG_ON(!PageTransHuge(page)); | ||
2564 | } | ||
2565 | |||
2566 | count = page_size >> PAGE_SHIFT; | ||
2529 | /* | 2567 | /* |
2530 | * Check if our page_cgroup is valid | 2568 | * Check if our page_cgroup is valid |
2531 | */ | 2569 | */ |
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2558 | break; | 2596 | break; |
2559 | } | 2597 | } |
2560 | 2598 | ||
2561 | mem_cgroup_charge_statistics(mem, pc, false); | 2599 | for (i = 0; i < count; i++) |
2600 | mem_cgroup_charge_statistics(mem, pc + i, false); | ||
2562 | 2601 | ||
2563 | ClearPageCgroupUsed(pc); | 2602 | ClearPageCgroupUsed(pc); |
2564 | /* | 2603 | /* |
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2579 | mem_cgroup_get(mem); | 2618 | mem_cgroup_get(mem); |
2580 | } | 2619 | } |
2581 | if (!mem_cgroup_is_root(mem)) | 2620 | if (!mem_cgroup_is_root(mem)) |
2582 | __do_uncharge(mem, ctype); | 2621 | __do_uncharge(mem, ctype, page_size); |
2583 | 2622 | ||
2584 | return mem; | 2623 | return mem; |
2585 | 2624 | ||
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2774 | enum charge_type ctype; | 2813 | enum charge_type ctype; |
2775 | int ret = 0; | 2814 | int ret = 0; |
2776 | 2815 | ||
2816 | VM_BUG_ON(PageTransHuge(page)); | ||
2777 | if (mem_cgroup_disabled()) | 2817 | if (mem_cgroup_disabled()) |
2778 | return 0; | 2818 | return 0; |
2779 | 2819 | ||
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2823 | return 0; | 2863 | return 0; |
2824 | 2864 | ||
2825 | *ptr = mem; | 2865 | *ptr = mem; |
2826 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2866 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); |
2827 | css_put(&mem->css);/* drop extra refcnt */ | 2867 | css_put(&mem->css);/* drop extra refcnt */ |
2828 | if (ret || *ptr == NULL) { | 2868 | if (ret || *ptr == NULL) { |
2829 | if (PageAnon(page)) { | 2869 | if (PageAnon(page)) { |
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2850 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 2890 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
2851 | else | 2891 | else |
2852 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 2892 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
2853 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2893 | __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); |
2854 | return ret; | 2894 | return ret; |
2855 | } | 2895 | } |
2856 | 2896 | ||
2857 | /* remove redundant charge if migration failed*/ | 2897 | /* remove redundant charge if migration failed*/ |
2858 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2898 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
2859 | struct page *oldpage, struct page *newpage) | 2899 | struct page *oldpage, struct page *newpage, bool migration_ok) |
2860 | { | 2900 | { |
2861 | struct page *used, *unused; | 2901 | struct page *used, *unused; |
2862 | struct page_cgroup *pc; | 2902 | struct page_cgroup *pc; |
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
2865 | return; | 2905 | return; |
2866 | /* blocks rmdir() */ | 2906 | /* blocks rmdir() */ |
2867 | cgroup_exclude_rmdir(&mem->css); | 2907 | cgroup_exclude_rmdir(&mem->css); |
2868 | /* at migration success, oldpage->mapping is NULL. */ | 2908 | if (!migration_ok) { |
2869 | if (oldpage->mapping) { | ||
2870 | used = oldpage; | 2909 | used = oldpage; |
2871 | unused = newpage; | 2910 | unused = newpage; |
2872 | } else { | 2911 | } else { |
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
4176 | */ | 4215 | */ |
4177 | if (!node_state(node, N_NORMAL_MEMORY)) | 4216 | if (!node_state(node, N_NORMAL_MEMORY)) |
4178 | tmp = -1; | 4217 | tmp = -1; |
4179 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 4218 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
4180 | if (!pn) | 4219 | if (!pn) |
4181 | return 1; | 4220 | return 1; |
4182 | 4221 | ||
4183 | mem->info.nodeinfo[node] = pn; | 4222 | mem->info.nodeinfo[node] = pn; |
4184 | memset(pn, 0, sizeof(*pn)); | ||
4185 | |||
4186 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4223 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4187 | mz = &pn->zoneinfo[zone]; | 4224 | mz = &pn->zoneinfo[zone]; |
4188 | for_each_lru(l) | 4225 | for_each_lru(l) |
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
4206 | 4243 | ||
4207 | /* Can be very big if MAX_NUMNODES is very big */ | 4244 | /* Can be very big if MAX_NUMNODES is very big */ |
4208 | if (size < PAGE_SIZE) | 4245 | if (size < PAGE_SIZE) |
4209 | mem = kmalloc(size, GFP_KERNEL); | 4246 | mem = kzalloc(size, GFP_KERNEL); |
4210 | else | 4247 | else |
4211 | mem = vmalloc(size); | 4248 | mem = vzalloc(size); |
4212 | 4249 | ||
4213 | if (!mem) | 4250 | if (!mem) |
4214 | return NULL; | 4251 | return NULL; |
4215 | 4252 | ||
4216 | memset(mem, 0, size); | ||
4217 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4253 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
4218 | if (!mem->stat) | 4254 | if (!mem->stat) |
4219 | goto out_free; | 4255 | goto out_free; |
@@ -4461,7 +4497,8 @@ one_by_one: | |||
4461 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4497 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
4462 | cond_resched(); | 4498 | cond_resched(); |
4463 | } | 4499 | } |
4464 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 4500 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
4501 | PAGE_SIZE); | ||
4465 | if (ret || !mem) | 4502 | if (ret || !mem) |
4466 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4503 | /* mem_cgroup_clear_mc() will do uncharge later */ |
4467 | return -ENOMEM; | 4504 | return -ENOMEM; |
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
4623 | pte_t *pte; | 4660 | pte_t *pte; |
4624 | spinlock_t *ptl; | 4661 | spinlock_t *ptl; |
4625 | 4662 | ||
4663 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
4626 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4664 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4627 | for (; addr != end; pte++, addr += PAGE_SIZE) | 4665 | for (; addr != end; pte++, addr += PAGE_SIZE) |
4628 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 4666 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) |
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4638 | unsigned long precharge; | 4676 | unsigned long precharge; |
4639 | struct vm_area_struct *vma; | 4677 | struct vm_area_struct *vma; |
4640 | 4678 | ||
4641 | /* We've already held the mmap_sem */ | 4679 | down_read(&mm->mmap_sem); |
4642 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4680 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4643 | struct mm_walk mem_cgroup_count_precharge_walk = { | 4681 | struct mm_walk mem_cgroup_count_precharge_walk = { |
4644 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | 4682 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4650 | walk_page_range(vma->vm_start, vma->vm_end, | 4688 | walk_page_range(vma->vm_start, vma->vm_end, |
4651 | &mem_cgroup_count_precharge_walk); | 4689 | &mem_cgroup_count_precharge_walk); |
4652 | } | 4690 | } |
4691 | up_read(&mm->mmap_sem); | ||
4653 | 4692 | ||
4654 | precharge = mc.precharge; | 4693 | precharge = mc.precharge; |
4655 | mc.precharge = 0; | 4694 | mc.precharge = 0; |
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4659 | 4698 | ||
4660 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | 4699 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) |
4661 | { | 4700 | { |
4662 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | 4701 | unsigned long precharge = mem_cgroup_count_precharge(mm); |
4702 | |||
4703 | VM_BUG_ON(mc.moving_task); | ||
4704 | mc.moving_task = current; | ||
4705 | return mem_cgroup_do_precharge(precharge); | ||
4663 | } | 4706 | } |
4664 | 4707 | ||
4665 | static void mem_cgroup_clear_mc(void) | 4708 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ |
4709 | static void __mem_cgroup_clear_mc(void) | ||
4666 | { | 4710 | { |
4667 | struct mem_cgroup *from = mc.from; | 4711 | struct mem_cgroup *from = mc.from; |
4668 | struct mem_cgroup *to = mc.to; | 4712 | struct mem_cgroup *to = mc.to; |
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void) | |||
4697 | PAGE_SIZE * mc.moved_swap); | 4741 | PAGE_SIZE * mc.moved_swap); |
4698 | } | 4742 | } |
4699 | /* we've already done mem_cgroup_get(mc.to) */ | 4743 | /* we've already done mem_cgroup_get(mc.to) */ |
4700 | |||
4701 | mc.moved_swap = 0; | 4744 | mc.moved_swap = 0; |
4702 | } | 4745 | } |
4703 | if (mc.mm) { | 4746 | memcg_oom_recover(from); |
4704 | up_read(&mc.mm->mmap_sem); | 4747 | memcg_oom_recover(to); |
4705 | mmput(mc.mm); | 4748 | wake_up_all(&mc.waitq); |
4706 | } | 4749 | } |
4750 | |||
4751 | static void mem_cgroup_clear_mc(void) | ||
4752 | { | ||
4753 | struct mem_cgroup *from = mc.from; | ||
4754 | |||
4755 | /* | ||
4756 | * we must clear moving_task before waking up waiters at the end of | ||
4757 | * task migration. | ||
4758 | */ | ||
4759 | mc.moving_task = NULL; | ||
4760 | __mem_cgroup_clear_mc(); | ||
4707 | spin_lock(&mc.lock); | 4761 | spin_lock(&mc.lock); |
4708 | mc.from = NULL; | 4762 | mc.from = NULL; |
4709 | mc.to = NULL; | 4763 | mc.to = NULL; |
4710 | spin_unlock(&mc.lock); | 4764 | spin_unlock(&mc.lock); |
4711 | mc.moving_task = NULL; | ||
4712 | mc.mm = NULL; | ||
4713 | mem_cgroup_end_move(from); | 4765 | mem_cgroup_end_move(from); |
4714 | memcg_oom_recover(from); | ||
4715 | memcg_oom_recover(to); | ||
4716 | wake_up_all(&mc.waitq); | ||
4717 | } | 4766 | } |
4718 | 4767 | ||
4719 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 4768 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4735 | return 0; | 4784 | return 0; |
4736 | /* We move charges only when we move an owner of the mm */ | 4785 | /* We move charges only when we move an owner of the mm */ |
4737 | if (mm->owner == p) { | 4786 | if (mm->owner == p) { |
4738 | /* | ||
4739 | * We do all the move charge works under one mmap_sem to | ||
4740 | * avoid deadlock with down_write(&mmap_sem) | ||
4741 | * -> try_charge() -> if (mc.moving_task) -> sleep. | ||
4742 | */ | ||
4743 | down_read(&mm->mmap_sem); | ||
4744 | |||
4745 | VM_BUG_ON(mc.from); | 4787 | VM_BUG_ON(mc.from); |
4746 | VM_BUG_ON(mc.to); | 4788 | VM_BUG_ON(mc.to); |
4747 | VM_BUG_ON(mc.precharge); | 4789 | VM_BUG_ON(mc.precharge); |
4748 | VM_BUG_ON(mc.moved_charge); | 4790 | VM_BUG_ON(mc.moved_charge); |
4749 | VM_BUG_ON(mc.moved_swap); | 4791 | VM_BUG_ON(mc.moved_swap); |
4750 | VM_BUG_ON(mc.moving_task); | ||
4751 | VM_BUG_ON(mc.mm); | ||
4752 | |||
4753 | mem_cgroup_start_move(from); | 4792 | mem_cgroup_start_move(from); |
4754 | spin_lock(&mc.lock); | 4793 | spin_lock(&mc.lock); |
4755 | mc.from = from; | 4794 | mc.from = from; |
4756 | mc.to = mem; | 4795 | mc.to = mem; |
4757 | mc.precharge = 0; | ||
4758 | mc.moved_charge = 0; | ||
4759 | mc.moved_swap = 0; | ||
4760 | spin_unlock(&mc.lock); | 4796 | spin_unlock(&mc.lock); |
4761 | mc.moving_task = current; | 4797 | /* We set mc.moving_task later */ |
4762 | mc.mm = mm; | ||
4763 | 4798 | ||
4764 | ret = mem_cgroup_precharge_mc(mm); | 4799 | ret = mem_cgroup_precharge_mc(mm); |
4765 | if (ret) | 4800 | if (ret) |
4766 | mem_cgroup_clear_mc(); | 4801 | mem_cgroup_clear_mc(); |
4767 | /* We call up_read() and mmput() in clear_mc(). */ | 4802 | } |
4768 | } else | 4803 | mmput(mm); |
4769 | mmput(mm); | ||
4770 | } | 4804 | } |
4771 | return ret; | 4805 | return ret; |
4772 | } | 4806 | } |
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
4789 | spinlock_t *ptl; | 4823 | spinlock_t *ptl; |
4790 | 4824 | ||
4791 | retry: | 4825 | retry: |
4826 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
4792 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 4827 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
4793 | for (; addr != end; addr += PAGE_SIZE) { | 4828 | for (; addr != end; addr += PAGE_SIZE) { |
4794 | pte_t ptent = *(pte++); | 4829 | pte_t ptent = *(pte++); |
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4854 | struct vm_area_struct *vma; | 4889 | struct vm_area_struct *vma; |
4855 | 4890 | ||
4856 | lru_add_drain_all(); | 4891 | lru_add_drain_all(); |
4857 | /* We've already held the mmap_sem */ | 4892 | retry: |
4893 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | ||
4894 | /* | ||
4895 | * Someone who are holding the mmap_sem might be waiting in | ||
4896 | * waitq. So we cancel all extra charges, wake up all waiters, | ||
4897 | * and retry. Because we cancel precharges, we might not be able | ||
4898 | * to move enough charges, but moving charge is a best-effort | ||
4899 | * feature anyway, so it wouldn't be a big problem. | ||
4900 | */ | ||
4901 | __mem_cgroup_clear_mc(); | ||
4902 | cond_resched(); | ||
4903 | goto retry; | ||
4904 | } | ||
4858 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4905 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4859 | int ret; | 4906 | int ret; |
4860 | struct mm_walk mem_cgroup_move_charge_walk = { | 4907 | struct mm_walk mem_cgroup_move_charge_walk = { |
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4873 | */ | 4920 | */ |
4874 | break; | 4921 | break; |
4875 | } | 4922 | } |
4923 | up_read(&mm->mmap_sem); | ||
4876 | } | 4924 | } |
4877 | 4925 | ||
4878 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4926 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
4881 | struct task_struct *p, | 4929 | struct task_struct *p, |
4882 | bool threadgroup) | 4930 | bool threadgroup) |
4883 | { | 4931 | { |
4884 | if (!mc.mm) | 4932 | struct mm_struct *mm; |
4933 | |||
4934 | if (!mc.to) | ||
4885 | /* no need to move charge */ | 4935 | /* no need to move charge */ |
4886 | return; | 4936 | return; |
4887 | 4937 | ||
4888 | mem_cgroup_move_charge(mc.mm); | 4938 | mm = get_task_mm(p); |
4939 | if (mm) { | ||
4940 | mem_cgroup_move_charge(mm); | ||
4941 | mmput(mm); | ||
4942 | } | ||
4889 | mem_cgroup_clear_mc(); | 4943 | mem_cgroup_clear_mc(); |
4890 | } | 4944 | } |
4891 | #else /* !CONFIG_MMU */ | 4945 | #else /* !CONFIG_MMU */ |
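
The memcontrol.c hunks above stop holding mmap_sem across the whole cgroup attach: mem_cgroup_move_charge() now takes it with down_read_trylock() and, when that fails, cancels its precharges, wakes any waiters and retries, exactly as the comment in the new hunk explains. The sketch below is a userspace analogue of that trylock/drain/retry shape, not kernel code; the lock, drain_precharges() and move_charges_locked() names are invented for illustration.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_rwlock_t addr_space_lock = PTHREAD_RWLOCK_INITIALIZER;
static int precharge = 16;                 /* stand-in for mc.precharge */

static void drain_precharges(void)         /* stand-in for __mem_cgroup_clear_mc() */
{
        precharge = 0;                     /* give back everything we reserved ... */
        /* ... and wake anyone sleeping on the mover's waitqueue (not modelled) */
}

static void move_charges_locked(void)
{
        printf("walking the address space with %d precharges\n", precharge);
}

static void move_charge(void)
{
        /* Never sleep on the lock while holding precharges: a holder of the
         * lock may itself be waiting for the mover to finish. */
        while (pthread_rwlock_tryrdlock(&addr_space_lock) != 0) {
                drain_precharges();
                sched_yield();             /* analogue of cond_resched() */
        }
        move_charges_locked();
        pthread_rwlock_unlock(&addr_space_lock);
}

int main(void)
{
        move_charge();
        return 0;
}

Draining before retrying means the mover may move fewer charges than it precharged, which matches the hunk's own note that charge moving is best-effort.
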
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c044b0e..548fbd70f026 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
203 | #ifdef __ARCH_SI_TRAPNO | 203 | #ifdef __ARCH_SI_TRAPNO |
204 | si.si_trapno = trapno; | 204 | si.si_trapno = trapno; |
205 | #endif | 205 | #endif |
206 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 206 | si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; |
207 | /* | 207 | /* |
208 | * Don't use force here, it's convenient if the signal | 208 | * Don't use force here, it's convenient if the signal |
209 | * can be temporarily blocked. | 209 | * can be temporarily blocked. |
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
386 | struct task_struct *tsk; | 386 | struct task_struct *tsk; |
387 | struct anon_vma *av; | 387 | struct anon_vma *av; |
388 | 388 | ||
389 | if (!PageHuge(page) && unlikely(split_huge_page(page))) | ||
390 | return; | ||
389 | read_lock(&tasklist_lock); | 391 | read_lock(&tasklist_lock); |
390 | av = page_lock_anon_vma(page); | 392 | av = page_lock_anon_vma(page); |
391 | if (av == NULL) /* Not actually mapped anymore */ | 393 | if (av == NULL) /* Not actually mapped anymore */ |
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
928 | static void set_page_hwpoison_huge_page(struct page *hpage) | 930 | static void set_page_hwpoison_huge_page(struct page *hpage) |
929 | { | 931 | { |
930 | int i; | 932 | int i; |
931 | int nr_pages = 1 << compound_order(hpage); | 933 | int nr_pages = 1 << compound_trans_order(hpage); |
932 | for (i = 0; i < nr_pages; i++) | 934 | for (i = 0; i < nr_pages; i++) |
933 | SetPageHWPoison(hpage + i); | 935 | SetPageHWPoison(hpage + i); |
934 | } | 936 | } |
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) | |||
936 | static void clear_page_hwpoison_huge_page(struct page *hpage) | 938 | static void clear_page_hwpoison_huge_page(struct page *hpage) |
937 | { | 939 | { |
938 | int i; | 940 | int i; |
939 | int nr_pages = 1 << compound_order(hpage); | 941 | int nr_pages = 1 << compound_trans_order(hpage); |
940 | for (i = 0; i < nr_pages; i++) | 942 | for (i = 0; i < nr_pages; i++) |
941 | ClearPageHWPoison(hpage + i); | 943 | ClearPageHWPoison(hpage + i); |
942 | } | 944 | } |
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
966 | return 0; | 968 | return 0; |
967 | } | 969 | } |
968 | 970 | ||
969 | nr_pages = 1 << compound_order(hpage); | 971 | nr_pages = 1 << compound_trans_order(hpage); |
970 | atomic_long_add(nr_pages, &mce_bad_pages); | 972 | atomic_long_add(nr_pages, &mce_bad_pages); |
971 | 973 | ||
972 | /* | 974 | /* |
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) | |||
1164 | return 0; | 1166 | return 0; |
1165 | } | 1167 | } |
1166 | 1168 | ||
1167 | nr_pages = 1 << compound_order(page); | 1169 | nr_pages = 1 << compound_trans_order(page); |
1168 | 1170 | ||
1169 | if (!get_page_unless_zero(page)) { | 1171 | if (!get_page_unless_zero(page)) { |
1170 | /* | 1172 | /* |
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1290 | /* Keep page count to indicate a given hugepage is isolated. */ | 1292 | /* Keep page count to indicate a given hugepage is isolated. */ |
1291 | 1293 | ||
1292 | list_add(&hpage->lru, &pagelist); | 1294 | list_add(&hpage->lru, &pagelist); |
1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1295 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, |
1296 | true); | ||
1294 | if (ret) { | 1297 | if (ret) { |
1295 | putback_lru_pages(&pagelist); | 1298 | putback_lru_pages(&pagelist); |
1296 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1299 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", |
1297 | pfn, ret, page->flags); | 1300 | pfn, ret, page->flags); |
1298 | if (ret > 0) | 1301 | if (ret > 0) |
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1301 | } | 1304 | } |
1302 | done: | 1305 | done: |
1303 | if (!PageHWPoison(hpage)) | 1306 | if (!PageHWPoison(hpage)) |
1304 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | 1307 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); |
1305 | set_page_hwpoison_huge_page(hpage); | 1308 | set_page_hwpoison_huge_page(hpage); |
1306 | dequeue_hwpoisoned_huge_page(hpage); | 1309 | dequeue_hwpoisoned_huge_page(hpage); |
1307 | /* keep elevated page count for bad page */ | 1310 | /* keep elevated page count for bad page */ |
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1413 | LIST_HEAD(pagelist); | 1416 | LIST_HEAD(pagelist); |
1414 | 1417 | ||
1415 | list_add(&page->lru, &pagelist); | 1418 | list_add(&page->lru, &pagelist); |
1416 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1419 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1420 | 0, true); | ||
1417 | if (ret) { | 1421 | if (ret) { |
1418 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1422 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1419 | pfn, ret, page->flags); | 1423 | pfn, ret, page->flags); |
diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed13..31250faff390 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
394 | } | 394 | } |
395 | } | 395 | } |
396 | 396 | ||
397 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 397 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
398 | pmd_t *pmd, unsigned long address) | ||
398 | { | 399 | { |
399 | pgtable_t new = pte_alloc_one(mm, address); | 400 | pgtable_t new = pte_alloc_one(mm, address); |
401 | int wait_split_huge_page; | ||
400 | if (!new) | 402 | if (!new) |
401 | return -ENOMEM; | 403 | return -ENOMEM; |
402 | 404 | ||
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
416 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 418 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
417 | 419 | ||
418 | spin_lock(&mm->page_table_lock); | 420 | spin_lock(&mm->page_table_lock); |
419 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 421 | wait_split_huge_page = 0; |
422 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | ||
420 | mm->nr_ptes++; | 423 | mm->nr_ptes++; |
421 | pmd_populate(mm, pmd, new); | 424 | pmd_populate(mm, pmd, new); |
422 | new = NULL; | 425 | new = NULL; |
423 | } | 426 | } else if (unlikely(pmd_trans_splitting(*pmd))) |
427 | wait_split_huge_page = 1; | ||
424 | spin_unlock(&mm->page_table_lock); | 428 | spin_unlock(&mm->page_table_lock); |
425 | if (new) | 429 | if (new) |
426 | pte_free(mm, new); | 430 | pte_free(mm, new); |
431 | if (wait_split_huge_page) | ||
432 | wait_split_huge_page(vma->anon_vma, pmd); | ||
427 | return 0; | 433 | return 0; |
428 | } | 434 | } |
429 | 435 | ||
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
436 | smp_wmb(); /* See comment in __pte_alloc */ | 442 | smp_wmb(); /* See comment in __pte_alloc */ |
437 | 443 | ||
438 | spin_lock(&init_mm.page_table_lock); | 444 | spin_lock(&init_mm.page_table_lock); |
439 | if (!pmd_present(*pmd)) { /* Has another populated it ? */ | 445 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
440 | pmd_populate_kernel(&init_mm, pmd, new); | 446 | pmd_populate_kernel(&init_mm, pmd, new); |
441 | new = NULL; | 447 | new = NULL; |
442 | } | 448 | } else |
449 | VM_BUG_ON(pmd_trans_splitting(*pmd)); | ||
443 | spin_unlock(&init_mm.page_table_lock); | 450 | spin_unlock(&init_mm.page_table_lock); |
444 | if (new) | 451 | if (new) |
445 | pte_free_kernel(&init_mm, new); | 452 | pte_free_kernel(&init_mm, new); |
@@ -719,9 +726,9 @@ out_set_pte: | |||
719 | return 0; | 726 | return 0; |
720 | } | 727 | } |
721 | 728 | ||
722 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 729 | int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
723 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 730 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
724 | unsigned long addr, unsigned long end) | 731 | unsigned long addr, unsigned long end) |
725 | { | 732 | { |
726 | pte_t *orig_src_pte, *orig_dst_pte; | 733 | pte_t *orig_src_pte, *orig_dst_pte; |
727 | pte_t *src_pte, *dst_pte; | 734 | pte_t *src_pte, *dst_pte; |
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src | |||
795 | src_pmd = pmd_offset(src_pud, addr); | 802 | src_pmd = pmd_offset(src_pud, addr); |
796 | do { | 803 | do { |
797 | next = pmd_addr_end(addr, end); | 804 | next = pmd_addr_end(addr, end); |
805 | if (pmd_trans_huge(*src_pmd)) { | ||
806 | int err; | ||
807 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); | ||
808 | err = copy_huge_pmd(dst_mm, src_mm, | ||
809 | dst_pmd, src_pmd, addr, vma); | ||
810 | if (err == -ENOMEM) | ||
811 | return -ENOMEM; | ||
812 | if (!err) | ||
813 | continue; | ||
814 | /* fall through */ | ||
815 | } | ||
798 | if (pmd_none_or_clear_bad(src_pmd)) | 816 | if (pmd_none_or_clear_bad(src_pmd)) |
799 | continue; | 817 | continue; |
800 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | 818 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, |
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
997 | pmd = pmd_offset(pud, addr); | 1015 | pmd = pmd_offset(pud, addr); |
998 | do { | 1016 | do { |
999 | next = pmd_addr_end(addr, end); | 1017 | next = pmd_addr_end(addr, end); |
1018 | if (pmd_trans_huge(*pmd)) { | ||
1019 | if (next-addr != HPAGE_PMD_SIZE) { | ||
1020 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | ||
1021 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
1022 | } else if (zap_huge_pmd(tlb, vma, pmd)) { | ||
1023 | (*zap_work)--; | ||
1024 | continue; | ||
1025 | } | ||
1026 | /* fall through */ | ||
1027 | } | ||
1000 | if (pmd_none_or_clear_bad(pmd)) { | 1028 | if (pmd_none_or_clear_bad(pmd)) { |
1001 | (*zap_work)--; | 1029 | (*zap_work)--; |
1002 | continue; | 1030 | continue; |
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1262 | pud = pud_offset(pgd, address); | 1290 | pud = pud_offset(pgd, address); |
1263 | if (pud_none(*pud)) | 1291 | if (pud_none(*pud)) |
1264 | goto no_page_table; | 1292 | goto no_page_table; |
1265 | if (pud_huge(*pud)) { | 1293 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
1266 | BUG_ON(flags & FOLL_GET); | 1294 | BUG_ON(flags & FOLL_GET); |
1267 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 1295 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); |
1268 | goto out; | 1296 | goto out; |
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1273 | pmd = pmd_offset(pud, address); | 1301 | pmd = pmd_offset(pud, address); |
1274 | if (pmd_none(*pmd)) | 1302 | if (pmd_none(*pmd)) |
1275 | goto no_page_table; | 1303 | goto no_page_table; |
1276 | if (pmd_huge(*pmd)) { | 1304 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
1277 | BUG_ON(flags & FOLL_GET); | 1305 | BUG_ON(flags & FOLL_GET); |
1278 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1306 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1279 | goto out; | 1307 | goto out; |
1280 | } | 1308 | } |
1309 | if (pmd_trans_huge(*pmd)) { | ||
1310 | if (flags & FOLL_SPLIT) { | ||
1311 | split_huge_page_pmd(mm, pmd); | ||
1312 | goto split_fallthrough; | ||
1313 | } | ||
1314 | spin_lock(&mm->page_table_lock); | ||
1315 | if (likely(pmd_trans_huge(*pmd))) { | ||
1316 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1317 | spin_unlock(&mm->page_table_lock); | ||
1318 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1319 | } else { | ||
1320 | page = follow_trans_huge_pmd(mm, address, | ||
1321 | pmd, flags); | ||
1322 | spin_unlock(&mm->page_table_lock); | ||
1323 | goto out; | ||
1324 | } | ||
1325 | } else | ||
1326 | spin_unlock(&mm->page_table_lock); | ||
1327 | /* fall through */ | ||
1328 | } | ||
1329 | split_fallthrough: | ||
1281 | if (unlikely(pmd_bad(*pmd))) | 1330 | if (unlikely(pmd_bad(*pmd))) |
1282 | goto no_page_table; | 1331 | goto no_page_table; |
1283 | 1332 | ||
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1310 | */ | 1359 | */ |
1311 | mark_page_accessed(page); | 1360 | mark_page_accessed(page); |
1312 | } | 1361 | } |
1362 | if (flags & FOLL_MLOCK) { | ||
1363 | /* | ||
1364 | * The preliminary mapping check is mainly to avoid the | ||
1365 | * pointless overhead of lock_page on the ZERO_PAGE | ||
1366 | * which might bounce very badly if there is contention. | ||
1367 | * | ||
1368 | * If the page is already locked, we don't need to | ||
1369 | * handle it now - vmscan will handle it later if and | ||
1370 | * when it attempts to reclaim the page. | ||
1371 | */ | ||
1372 | if (page->mapping && trylock_page(page)) { | ||
1373 | lru_add_drain(); /* push cached pages to LRU */ | ||
1374 | /* | ||
1375 | * Because we lock page here and migration is | ||
1376 | * blocked by the pte's page reference, we need | ||
1377 | * only check for file-cache page truncation. | ||
1378 | */ | ||
1379 | if (page->mapping) | ||
1380 | mlock_vma_page(page); | ||
1381 | unlock_page(page); | ||
1382 | } | ||
1383 | } | ||
1313 | unlock: | 1384 | unlock: |
1314 | pte_unmap_unlock(ptep, ptl); | 1385 | pte_unmap_unlock(ptep, ptl); |
1315 | out: | 1386 | out: |
@@ -1341,7 +1412,8 @@ no_page_table: | |||
1341 | 1412 | ||
1342 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1413 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1343 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1414 | unsigned long start, int nr_pages, unsigned int gup_flags, |
1344 | struct page **pages, struct vm_area_struct **vmas) | 1415 | struct page **pages, struct vm_area_struct **vmas, |
1416 | int *nonblocking) | ||
1345 | { | 1417 | { |
1346 | int i; | 1418 | int i; |
1347 | unsigned long vm_flags; | 1419 | unsigned long vm_flags; |
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1386 | pmd = pmd_offset(pud, pg); | 1458 | pmd = pmd_offset(pud, pg); |
1387 | if (pmd_none(*pmd)) | 1459 | if (pmd_none(*pmd)) |
1388 | return i ? : -EFAULT; | 1460 | return i ? : -EFAULT; |
1461 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1389 | pte = pte_offset_map(pmd, pg); | 1462 | pte = pte_offset_map(pmd, pg); |
1390 | if (pte_none(*pte)) { | 1463 | if (pte_none(*pte)) { |
1391 | pte_unmap(pte); | 1464 | pte_unmap(pte); |
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1441 | cond_resched(); | 1514 | cond_resched(); |
1442 | while (!(page = follow_page(vma, start, foll_flags))) { | 1515 | while (!(page = follow_page(vma, start, foll_flags))) { |
1443 | int ret; | 1516 | int ret; |
1517 | unsigned int fault_flags = 0; | ||
1518 | |||
1519 | if (foll_flags & FOLL_WRITE) | ||
1520 | fault_flags |= FAULT_FLAG_WRITE; | ||
1521 | if (nonblocking) | ||
1522 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
1444 | 1523 | ||
1445 | ret = handle_mm_fault(mm, vma, start, | 1524 | ret = handle_mm_fault(mm, vma, start, |
1446 | (foll_flags & FOLL_WRITE) ? | 1525 | fault_flags); |
1447 | FAULT_FLAG_WRITE : 0); | ||
1448 | 1526 | ||
1449 | if (ret & VM_FAULT_ERROR) { | 1527 | if (ret & VM_FAULT_ERROR) { |
1450 | if (ret & VM_FAULT_OOM) | 1528 | if (ret & VM_FAULT_OOM) |
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1460 | else | 1538 | else |
1461 | tsk->min_flt++; | 1539 | tsk->min_flt++; |
1462 | 1540 | ||
1541 | if (ret & VM_FAULT_RETRY) { | ||
1542 | *nonblocking = 0; | ||
1543 | return i; | ||
1544 | } | ||
1545 | |||
1463 | /* | 1546 | /* |
1464 | * The VM_FAULT_WRITE bit tells us that | 1547 | * The VM_FAULT_WRITE bit tells us that |
1465 | * do_wp_page has broken COW when necessary, | 1548 | * do_wp_page has broken COW when necessary, |
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1559 | if (force) | 1642 | if (force) |
1560 | flags |= FOLL_FORCE; | 1643 | flags |= FOLL_FORCE; |
1561 | 1644 | ||
1562 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 1645 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
1646 | NULL); | ||
1563 | } | 1647 | } |
1564 | EXPORT_SYMBOL(get_user_pages); | 1648 | EXPORT_SYMBOL(get_user_pages); |
1565 | 1649 | ||
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr) | |||
1584 | struct page *page; | 1668 | struct page *page; |
1585 | 1669 | ||
1586 | if (__get_user_pages(current, current->mm, addr, 1, | 1670 | if (__get_user_pages(current, current->mm, addr, 1, |
1587 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) | 1671 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, |
1672 | NULL) < 1) | ||
1588 | return NULL; | 1673 | return NULL; |
1589 | flush_cache_page(vma, addr, page_to_pfn(page)); | 1674 | flush_cache_page(vma, addr, page_to_pfn(page)); |
1590 | return page; | 1675 | return page; |
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | |||
1598 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1683 | pud_t * pud = pud_alloc(mm, pgd, addr); |
1599 | if (pud) { | 1684 | if (pud) { |
1600 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | 1685 | pmd_t * pmd = pmd_alloc(mm, pud, addr); |
1601 | if (pmd) | 1686 | if (pmd) { |
1687 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1602 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | 1688 | return pte_alloc_map_lock(mm, pmd, addr, ptl); |
1689 | } | ||
1603 | } | 1690 | } |
1604 | return NULL; | 1691 | return NULL; |
1605 | } | 1692 | } |
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1818 | pmd = pmd_alloc(mm, pud, addr); | 1905 | pmd = pmd_alloc(mm, pud, addr); |
1819 | if (!pmd) | 1906 | if (!pmd) |
1820 | return -ENOMEM; | 1907 | return -ENOMEM; |
1908 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1821 | do { | 1909 | do { |
1822 | next = pmd_addr_end(addr, end); | 1910 | next = pmd_addr_end(addr, end); |
1823 | if (remap_pte_range(mm, pmd, addr, next, | 1911 | if (remap_pte_range(mm, pmd, addr, next, |
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2048 | return same; | 2136 | return same; |
2049 | } | 2137 | } |
2050 | 2138 | ||
2051 | /* | ||
2052 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | ||
2053 | * servicing faults for write access. In the normal case, do always want | ||
2054 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | ||
2055 | * that do not have writing enabled, when used by access_process_vm. | ||
2056 | */ | ||
2057 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | ||
2058 | { | ||
2059 | if (likely(vma->vm_flags & VM_WRITE)) | ||
2060 | pte = pte_mkwrite(pte); | ||
2061 | return pte; | ||
2062 | } | ||
2063 | |||
2064 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2139 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2065 | { | 2140 | { |
2066 | /* | 2141 | /* |
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2112 | { | 2187 | { |
2113 | struct page *old_page, *new_page; | 2188 | struct page *old_page, *new_page; |
2114 | pte_t entry; | 2189 | pte_t entry; |
2115 | int reuse = 0, ret = 0; | 2190 | int ret = 0; |
2116 | int page_mkwrite = 0; | 2191 | int page_mkwrite = 0; |
2117 | struct page *dirty_page = NULL; | 2192 | struct page *dirty_page = NULL; |
2118 | 2193 | ||
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2149 | } | 2224 | } |
2150 | page_cache_release(old_page); | 2225 | page_cache_release(old_page); |
2151 | } | 2226 | } |
2152 | reuse = reuse_swap_page(old_page); | 2227 | if (reuse_swap_page(old_page)) { |
2153 | if (reuse) | ||
2154 | /* | 2228 | /* |
2155 | * The page is all ours. Move it to our anon_vma so | 2229 | * The page is all ours. Move it to our anon_vma so |
2156 | * the rmap code will not search our parent or siblings. | 2230 | * the rmap code will not search our parent or siblings. |
2157 | * Protected against the rmap code by the page lock. | 2231 | * Protected against the rmap code by the page lock. |
2158 | */ | 2232 | */ |
2159 | page_move_anon_rmap(old_page, vma, address); | 2233 | page_move_anon_rmap(old_page, vma, address); |
2234 | unlock_page(old_page); | ||
2235 | goto reuse; | ||
2236 | } | ||
2160 | unlock_page(old_page); | 2237 | unlock_page(old_page); |
2161 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2238 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2162 | (VM_WRITE|VM_SHARED))) { | 2239 | (VM_WRITE|VM_SHARED))) { |
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2220 | } | 2297 | } |
2221 | dirty_page = old_page; | 2298 | dirty_page = old_page; |
2222 | get_page(dirty_page); | 2299 | get_page(dirty_page); |
2223 | reuse = 1; | ||
2224 | } | ||
2225 | 2300 | ||
2226 | if (reuse) { | ||
2227 | reuse: | 2301 | reuse: |
2228 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2302 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2229 | entry = pte_mkyoung(orig_pte); | 2303 | entry = pte_mkyoung(orig_pte); |
2230 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2304 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2231 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2305 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2232 | update_mmu_cache(vma, address, page_table); | 2306 | update_mmu_cache(vma, address, page_table); |
2307 | pte_unmap_unlock(page_table, ptl); | ||
2233 | ret |= VM_FAULT_WRITE; | 2308 | ret |= VM_FAULT_WRITE; |
2234 | goto unlock; | 2309 | |
2310 | if (!dirty_page) | ||
2311 | return ret; | ||
2312 | |||
2313 | /* | ||
2314 | * Yes, Virginia, this is actually required to prevent a race | ||
2315 | * with clear_page_dirty_for_io() from clearing the page dirty | ||
2316 | * bit after it clears all dirty ptes, but before a racing | ||
2317 | * do_wp_page installs a dirty pte. | ||
2318 | * | ||
2319 | * do_no_page is protected similarly. | ||
2320 | */ | ||
2321 | if (!page_mkwrite) { | ||
2322 | wait_on_page_locked(dirty_page); | ||
2323 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
2324 | } | ||
2325 | put_page(dirty_page); | ||
2326 | if (page_mkwrite) { | ||
2327 | struct address_space *mapping = dirty_page->mapping; | ||
2328 | |||
2329 | set_page_dirty(dirty_page); | ||
2330 | unlock_page(dirty_page); | ||
2331 | page_cache_release(dirty_page); | ||
2332 | if (mapping) { | ||
2333 | /* | ||
2334 | * Some device drivers do not set page.mapping | ||
2335 | * but still dirty their pages | ||
2336 | */ | ||
2337 | balance_dirty_pages_ratelimited(mapping); | ||
2338 | } | ||
2339 | } | ||
2340 | |||
2341 | /* file_update_time outside page_lock */ | ||
2342 | if (vma->vm_file) | ||
2343 | file_update_time(vma->vm_file); | ||
2344 | |||
2345 | return ret; | ||
2235 | } | 2346 | } |
2236 | 2347 | ||
2237 | /* | 2348 | /* |
@@ -2337,39 +2448,6 @@ gotten: | |||
2337 | page_cache_release(old_page); | 2448 | page_cache_release(old_page); |
2338 | unlock: | 2449 | unlock: |
2339 | pte_unmap_unlock(page_table, ptl); | 2450 | pte_unmap_unlock(page_table, ptl); |
2340 | if (dirty_page) { | ||
2341 | /* | ||
2342 | * Yes, Virginia, this is actually required to prevent a race | ||
2343 | * with clear_page_dirty_for_io() from clearing the page dirty | ||
2344 | * bit after it clear all dirty ptes, but before a racing | ||
2345 | * do_wp_page installs a dirty pte. | ||
2346 | * | ||
2347 | * do_no_page is protected similarly. | ||
2348 | */ | ||
2349 | if (!page_mkwrite) { | ||
2350 | wait_on_page_locked(dirty_page); | ||
2351 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
2352 | } | ||
2353 | put_page(dirty_page); | ||
2354 | if (page_mkwrite) { | ||
2355 | struct address_space *mapping = dirty_page->mapping; | ||
2356 | |||
2357 | set_page_dirty(dirty_page); | ||
2358 | unlock_page(dirty_page); | ||
2359 | page_cache_release(dirty_page); | ||
2360 | if (mapping) { | ||
2361 | /* | ||
2362 | * Some device drivers do not set page.mapping | ||
2363 | * but still dirty their pages | ||
2364 | */ | ||
2365 | balance_dirty_pages_ratelimited(mapping); | ||
2366 | } | ||
2367 | } | ||
2368 | |||
2369 | /* file_update_time outside page_lock */ | ||
2370 | if (vma->vm_file) | ||
2371 | file_update_time(vma->vm_file); | ||
2372 | } | ||
2373 | return ret; | 2451 | return ret; |
2374 | oom_free_new: | 2452 | oom_free_new: |
2375 | page_cache_release(new_page); | 2453 | page_cache_release(new_page); |
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3147 | * but allow concurrent faults), and pte mapped but not yet locked. | 3225 | * but allow concurrent faults), and pte mapped but not yet locked. |
3148 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3226 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
3149 | */ | 3227 | */ |
3150 | static inline int handle_pte_fault(struct mm_struct *mm, | 3228 | int handle_pte_fault(struct mm_struct *mm, |
3151 | struct vm_area_struct *vma, unsigned long address, | 3229 | struct vm_area_struct *vma, unsigned long address, |
3152 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3230 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
3153 | { | 3231 | { |
3154 | pte_t entry; | 3232 | pte_t entry; |
3155 | spinlock_t *ptl; | 3233 | spinlock_t *ptl; |
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3228 | pmd = pmd_alloc(mm, pud, address); | 3306 | pmd = pmd_alloc(mm, pud, address); |
3229 | if (!pmd) | 3307 | if (!pmd) |
3230 | return VM_FAULT_OOM; | 3308 | return VM_FAULT_OOM; |
3231 | pte = pte_alloc_map(mm, pmd, address); | 3309 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
3232 | if (!pte) | 3310 | if (!vma->vm_ops) |
3311 | return do_huge_pmd_anonymous_page(mm, vma, address, | ||
3312 | pmd, flags); | ||
3313 | } else { | ||
3314 | pmd_t orig_pmd = *pmd; | ||
3315 | barrier(); | ||
3316 | if (pmd_trans_huge(orig_pmd)) { | ||
3317 | if (flags & FAULT_FLAG_WRITE && | ||
3318 | !pmd_write(orig_pmd) && | ||
3319 | !pmd_trans_splitting(orig_pmd)) | ||
3320 | return do_huge_pmd_wp_page(mm, vma, address, | ||
3321 | pmd, orig_pmd); | ||
3322 | return 0; | ||
3323 | } | ||
3324 | } | ||
3325 | |||
3326 | /* | ||
3327 | * Use __pte_alloc instead of pte_alloc_map, because we can't | ||
3328 | * run pte_offset_map on the pmd, if a huge pmd could | ||
3329 | * materialize from under us from a different thread. | ||
3330 | */ | ||
3331 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3233 | return VM_FAULT_OOM; | 3332 | return VM_FAULT_OOM; |
3333 | /* if a huge pmd materialized from under us, just retry later */ | ||
3334 | if (unlikely(pmd_trans_huge(*pmd))) | ||
3335 | return 0; | ||
3336 | /* | ||
3337 | * A regular pmd is established and it can't morph into a huge pmd | ||
3338 | * from under us anymore at this point because we hold the mmap_sem | ||
3339 | * in read mode and khugepaged takes it in write mode. So now it's | ||
3340 | * safe to run pte_offset_map(). | ||
3341 | */ | ||
3342 | pte = pte_offset_map(pmd, address); | ||
3234 | 3343 | ||
3235 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3344 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3236 | } | 3345 | } |
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
3296 | vma = find_vma(current->mm, addr); | 3405 | vma = find_vma(current->mm, addr); |
3297 | if (!vma) | 3406 | if (!vma) |
3298 | return -ENOMEM; | 3407 | return -ENOMEM; |
3299 | write = (vma->vm_flags & VM_WRITE) != 0; | 3408 | /* |
3409 | * We want to touch writable mappings with a write fault in order | ||
3410 | * to break COW, except for shared mappings because these don't COW | ||
3411 | * and we would not want to dirty them for nothing. | ||
3412 | */ | ||
3413 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
3300 | BUG_ON(addr >= end); | 3414 | BUG_ON(addr >= end); |
3301 | BUG_ON(end > vma->vm_end); | 3415 | BUG_ON(end > vma->vm_end); |
3302 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 3416 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, | |||
3368 | goto out; | 3482 | goto out; |
3369 | 3483 | ||
3370 | pmd = pmd_offset(pud, address); | 3484 | pmd = pmd_offset(pud, address); |
3485 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
3371 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 3486 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
3372 | goto out; | 3487 | goto out; |
3373 | 3488 | ||
@@ -3608,3 +3723,74 @@ void might_fault(void) | |||
3608 | } | 3723 | } |
3609 | EXPORT_SYMBOL(might_fault); | 3724 | EXPORT_SYMBOL(might_fault); |
3610 | #endif | 3725 | #endif |
3726 | |||
3727 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) | ||
3728 | static void clear_gigantic_page(struct page *page, | ||
3729 | unsigned long addr, | ||
3730 | unsigned int pages_per_huge_page) | ||
3731 | { | ||
3732 | int i; | ||
3733 | struct page *p = page; | ||
3734 | |||
3735 | might_sleep(); | ||
3736 | for (i = 0; i < pages_per_huge_page; | ||
3737 | i++, p = mem_map_next(p, page, i)) { | ||
3738 | cond_resched(); | ||
3739 | clear_user_highpage(p, addr + i * PAGE_SIZE); | ||
3740 | } | ||
3741 | } | ||
3742 | void clear_huge_page(struct page *page, | ||
3743 | unsigned long addr, unsigned int pages_per_huge_page) | ||
3744 | { | ||
3745 | int i; | ||
3746 | |||
3747 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
3748 | clear_gigantic_page(page, addr, pages_per_huge_page); | ||
3749 | return; | ||
3750 | } | ||
3751 | |||
3752 | might_sleep(); | ||
3753 | for (i = 0; i < pages_per_huge_page; i++) { | ||
3754 | cond_resched(); | ||
3755 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | ||
3756 | } | ||
3757 | } | ||
3758 | |||
3759 | static void copy_user_gigantic_page(struct page *dst, struct page *src, | ||
3760 | unsigned long addr, | ||
3761 | struct vm_area_struct *vma, | ||
3762 | unsigned int pages_per_huge_page) | ||
3763 | { | ||
3764 | int i; | ||
3765 | struct page *dst_base = dst; | ||
3766 | struct page *src_base = src; | ||
3767 | |||
3768 | for (i = 0; i < pages_per_huge_page; ) { | ||
3769 | cond_resched(); | ||
3770 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | ||
3771 | |||
3772 | i++; | ||
3773 | dst = mem_map_next(dst, dst_base, i); | ||
3774 | src = mem_map_next(src, src_base, i); | ||
3775 | } | ||
3776 | } | ||
3777 | |||
3778 | void copy_user_huge_page(struct page *dst, struct page *src, | ||
3779 | unsigned long addr, struct vm_area_struct *vma, | ||
3780 | unsigned int pages_per_huge_page) | ||
3781 | { | ||
3782 | int i; | ||
3783 | |||
3784 | if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { | ||
3785 | copy_user_gigantic_page(dst, src, addr, vma, | ||
3786 | pages_per_huge_page); | ||
3787 | return; | ||
3788 | } | ||
3789 | |||
3790 | might_sleep(); | ||
3791 | for (i = 0; i < pages_per_huge_page; i++) { | ||
3792 | cond_resched(); | ||
3793 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | ||
3794 | } | ||
3795 | } | ||
3796 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | ||
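
clear_huge_page() and copy_user_huge_page() added above touch one 4K subpage per iteration with a cond_resched() in between, so clearing or copying a 2M (or gigantic) page cannot monopolise the CPU; the gigantic variants additionally step through the mem_map with mem_map_next(). The same chunk-and-yield idea in a runnable userspace form (buffer sizes and names here are purely illustrative):

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SUBPAGE_SIZE   4096UL
#define HUGE_SIZE      (2UL * 1024 * 1024)     /* pretend 2M "huge page" */

/* Clear a large region in subpage-sized chunks, yielding between chunks. */
static void clear_huge_region(unsigned char *buf, size_t size)
{
        for (size_t off = 0; off < size; off += SUBPAGE_SIZE) {
                memset(buf + off, 0, SUBPAGE_SIZE);
                sched_yield();          /* analogue of cond_resched() */
        }
}

int main(void)
{
        unsigned char *buf = malloc(HUGE_SIZE);

        if (!buf)
                return 1;
        clear_huge_region(buf, HUGE_SIZE);
        printf("cleared %lu bytes in %lu-byte chunks\n", HUGE_SIZE, SUBPAGE_SIZE);
        free(buf);
        return 0;
}
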
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af5473..e92f04749fcb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) | |||
82 | 82 | ||
83 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 83 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
84 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 84 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
85 | static void get_page_bootmem(unsigned long info, struct page *page, int type) | 85 | static void get_page_bootmem(unsigned long info, struct page *page, |
86 | unsigned long type) | ||
86 | { | 87 | { |
87 | atomic_set(&page->_mapcount, type); | 88 | page->lru.next = (struct list_head *) type; |
88 | SetPagePrivate(page); | 89 | SetPagePrivate(page); |
89 | set_page_private(page, info); | 90 | set_page_private(page, info); |
90 | atomic_inc(&page->_count); | 91 | atomic_inc(&page->_count); |
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
94 | * so use __ref to tell modpost not to generate a warning */ | 95 | * so use __ref to tell modpost not to generate a warning */ |
95 | void __ref put_page_bootmem(struct page *page) | 96 | void __ref put_page_bootmem(struct page *page) |
96 | { | 97 | { |
97 | int type; | 98 | unsigned long type; |
98 | 99 | ||
99 | type = atomic_read(&page->_mapcount); | 100 | type = (unsigned long) page->lru.next; |
100 | BUG_ON(type >= -1); | 101 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
102 | type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); | ||
101 | 103 | ||
102 | if (atomic_dec_return(&page->_count) == 1) { | 104 | if (atomic_dec_return(&page->_count) == 1) { |
103 | ClearPagePrivate(page); | 105 | ClearPagePrivate(page); |
104 | set_page_private(page, 0); | 106 | set_page_private(page, 0); |
105 | reset_page_mapcount(page); | 107 | INIT_LIST_HEAD(&page->lru); |
106 | __free_pages_bootmem(page, 0); | 108 | __free_pages_bootmem(page, 0); |
107 | } | 109 | } |
108 | 110 | ||
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
733 | goto out; | 735 | goto out; |
734 | } | 736 | } |
735 | /* this function returns # of failed pages */ | 737 | /* this function returns # of failed pages */ |
736 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); | 738 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, |
739 | true, true); | ||
737 | if (ret) | 740 | if (ret) |
738 | putback_lru_pages(&source); | 741 | putback_lru_pages(&source); |
739 | } | 742 | } |
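
The memory_hotplug.c hunk stops abusing page->_mapcount for the bootmem section type and instead stores the type, cast to unsigned long, in the otherwise unused page->lru.next pointer, range-checking it again in put_page_bootmem(). The underlying trick is just parking a small integer tag in a pointer-sized field; a userspace illustration with an invented struct and invented bounds:

#include <assert.h>
#include <stdio.h>

#define MIN_BOOTMEM_TYPE 1UL     /* illustrative bounds, not the kernel constants */
#define MAX_BOOTMEM_TYPE 4UL

struct fake_page {
        void *lru_next;          /* unused link pointer doubling as a tag slot */
};

static void set_bootmem_type(struct fake_page *p, unsigned long type)
{
        p->lru_next = (void *)type;            /* store the tag in the pointer field */
}

static unsigned long get_bootmem_type(const struct fake_page *p)
{
        unsigned long type = (unsigned long)p->lru_next;

        assert(type >= MIN_BOOTMEM_TYPE && type <= MAX_BOOTMEM_TYPE);
        return type;
}

int main(void)
{
        struct fake_page page;

        set_bootmem_type(&page, 3);
        printf("recovered type %lu\n", get_bootmem_type(&page));
        return 0;
}
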
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260fb282..368fc9d23610 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
514 | pmd = pmd_offset(pud, addr); | 514 | pmd = pmd_offset(pud, addr); |
515 | do { | 515 | do { |
516 | next = pmd_addr_end(addr, end); | 516 | next = pmd_addr_end(addr, end); |
517 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
517 | if (pmd_none_or_clear_bad(pmd)) | 518 | if (pmd_none_or_clear_bad(pmd)) |
518 | continue; | 519 | continue; |
519 | if (check_pte_range(vma, pmd, addr, next, nodes, | 520 | if (check_pte_range(vma, pmd, addr, next, nodes, |
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
935 | return PTR_ERR(vma); | 936 | return PTR_ERR(vma); |
936 | 937 | ||
937 | if (!list_empty(&pagelist)) { | 938 | if (!list_empty(&pagelist)) { |
938 | err = migrate_pages(&pagelist, new_node_page, dest, 0); | 939 | err = migrate_pages(&pagelist, new_node_page, dest, |
940 | false, true); | ||
939 | if (err) | 941 | if (err) |
940 | putback_lru_pages(&pagelist); | 942 | putback_lru_pages(&pagelist); |
941 | } | 943 | } |
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1155 | 1157 | ||
1156 | if (!list_empty(&pagelist)) { | 1158 | if (!list_empty(&pagelist)) { |
1157 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1159 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1158 | (unsigned long)vma, 0); | 1160 | (unsigned long)vma, |
1161 | false, true); | ||
1159 | if (nr_failed) | 1162 | if (nr_failed) |
1160 | putback_lru_pages(&pagelist); | 1163 | putback_lru_pages(&pagelist); |
1161 | } | 1164 | } |
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1308 | 1311 | ||
1309 | /* Find the mm_struct */ | 1312 | /* Find the mm_struct */ |
1310 | rcu_read_lock(); | 1313 | rcu_read_lock(); |
1311 | read_lock(&tasklist_lock); | ||
1312 | task = pid ? find_task_by_vpid(pid) : current; | 1314 | task = pid ? find_task_by_vpid(pid) : current; |
1313 | if (!task) { | 1315 | if (!task) { |
1314 | read_unlock(&tasklist_lock); | ||
1315 | rcu_read_unlock(); | 1316 | rcu_read_unlock(); |
1316 | err = -ESRCH; | 1317 | err = -ESRCH; |
1317 | goto out; | 1318 | goto out; |
1318 | } | 1319 | } |
1319 | mm = get_task_mm(task); | 1320 | mm = get_task_mm(task); |
1320 | read_unlock(&tasklist_lock); | ||
1321 | rcu_read_unlock(); | 1321 | rcu_read_unlock(); |
1322 | 1322 | ||
1323 | err = -EINVAL; | 1323 | err = -EINVAL; |
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1796 | } | 1796 | } |
1797 | 1797 | ||
1798 | /** | 1798 | /** |
1799 | * alloc_page_vma - Allocate a page for a VMA. | 1799 | * alloc_pages_vma - Allocate a page for a VMA. |
1800 | * | 1800 | * |
1801 | * @gfp: | 1801 | * @gfp: |
1802 | * %GFP_USER user allocation. | 1802 | * %GFP_USER user allocation. |
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1805 | * %GFP_FS allocation should not call back into a file system. | 1805 | * %GFP_FS allocation should not call back into a file system. |
1806 | * %GFP_ATOMIC don't sleep. | 1806 | * %GFP_ATOMIC don't sleep. |
1807 | * | 1807 | * |
1808 | * @order: Order of the GFP allocation. | ||
1808 | * @vma: Pointer to VMA or NULL if not available. | 1809 | * @vma: Pointer to VMA or NULL if not available. |
1809 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1810 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
1810 | * | 1811 | * |
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1818 | * Should be called with the mmap_sem of the vma held. | 1819 | * Should be called with the mmap_sem of the vma held. |
1819 | */ | 1820 | */ |
1820 | struct page * | 1821 | struct page * |
1821 | alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | 1822 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1823 | unsigned long addr) | ||
1822 | { | 1824 | { |
1823 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1825 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
1824 | struct zonelist *zl; | 1826 | struct zonelist *zl; |
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1830 | 1832 | ||
1831 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1833 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
1832 | mpol_cond_put(pol); | 1834 | mpol_cond_put(pol); |
1833 | page = alloc_page_interleave(gfp, 0, nid); | 1835 | page = alloc_page_interleave(gfp, order, nid); |
1834 | put_mems_allowed(); | 1836 | put_mems_allowed(); |
1835 | return page; | 1837 | return page; |
1836 | } | 1838 | } |
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1839 | /* | 1841 | /* |
1840 | * slow path: ref counted shared policy | 1842 | * slow path: ref counted shared policy |
1841 | */ | 1843 | */ |
1842 | struct page *page = __alloc_pages_nodemask(gfp, 0, | 1844 | struct page *page = __alloc_pages_nodemask(gfp, order, |
1843 | zl, policy_nodemask(gfp, pol)); | 1845 | zl, policy_nodemask(gfp, pol)); |
1844 | __mpol_put(pol); | 1846 | __mpol_put(pol); |
1845 | put_mems_allowed(); | 1847 | put_mems_allowed(); |
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1848 | /* | 1850 | /* |
1849 | * fast path: default or task policy | 1851 | * fast path: default or task policy |
1850 | */ | 1852 | */ |
1851 | page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); | 1853 | page = __alloc_pages_nodemask(gfp, order, zl, |
1854 | policy_nodemask(gfp, pol)); | ||
1852 | put_mems_allowed(); | 1855 | put_mems_allowed(); |
1853 | return page; | 1856 | return page; |
1854 | } | 1857 | } |
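
alloc_page_vma() becomes alloc_pages_vma() with an explicit order argument, so the policy-aware allocator can hand out higher-order pages (the THP code presumably passes the huge-page order) while order 0 preserves the old behaviour. The hunk only shows the new signature; the sketch below illustrates the same API-widening pattern in plain C, and the thin wrapper that keeps the old single-page name is an assumption, not something visible in this diff:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Order-aware allocator: returns a block of (PAGE_SIZE << order) bytes. */
static void *alloc_pages_buf(unsigned int order)
{
        return aligned_alloc(PAGE_SIZE, PAGE_SIZE << order);
}

/* Old single-page entry point preserved as a wrapper around the new one
 * (assumed here for illustration). */
static void *alloc_page_buf(void)
{
        return alloc_pages_buf(0);
}

int main(void)
{
        void *small = alloc_page_buf();         /* 4K, as before */
        void *huge  = alloc_pages_buf(9);       /* 2M worth of pages in one call */

        printf("small=%p huge=%p\n", small, huge);
        free(small);
        free(huge);
        return 0;
}

Keeping order as an ordinary parameter rather than adding a second entry point means every policy decision (interleave node, zonelist, nodemask) is written once and shared by the 4K and huge paths.
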
diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66a7045..46fe8cc13d67 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
113 | goto out; | 113 | goto out; |
114 | 114 | ||
115 | pmd = pmd_offset(pud, addr); | 115 | pmd = pmd_offset(pud, addr); |
116 | if (pmd_trans_huge(*pmd)) | ||
117 | goto out; | ||
116 | if (!pmd_present(*pmd)) | 118 | if (!pmd_present(*pmd)) |
117 | goto out; | 119 | goto out; |
118 | 120 | ||
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
246 | 248 | ||
247 | expected_count = 2 + page_has_private(page); | 249 | expected_count = 2 + page_has_private(page); |
248 | if (page_count(page) != expected_count || | 250 | if (page_count(page) != expected_count || |
249 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 251 | radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { |
250 | spin_unlock_irq(&mapping->tree_lock); | 252 | spin_unlock_irq(&mapping->tree_lock); |
251 | return -EAGAIN; | 253 | return -EAGAIN; |
252 | } | 254 | } |
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
318 | 320 | ||
319 | expected_count = 2 + page_has_private(page); | 321 | expected_count = 2 + page_has_private(page); |
320 | if (page_count(page) != expected_count || | 322 | if (page_count(page) != expected_count || |
321 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 323 | radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { |
322 | spin_unlock_irq(&mapping->tree_lock); | 324 | spin_unlock_irq(&mapping->tree_lock); |
323 | return -EAGAIN; | 325 | return -EAGAIN; |
324 | } | 326 | } |
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
614 | * to the newly allocated page in newpage. | 616 | * to the newly allocated page in newpage. |
615 | */ | 617 | */ |
616 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 618 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
617 | struct page *page, int force, int offlining) | 619 | struct page *page, int force, bool offlining, bool sync) |
618 | { | 620 | { |
619 | int rc = 0; | 621 | int rc = 0; |
620 | int *result = NULL; | 622 | int *result = NULL; |
621 | struct page *newpage = get_new_page(page, private, &result); | 623 | struct page *newpage = get_new_page(page, private, &result); |
622 | int remap_swapcache = 1; | 624 | int remap_swapcache = 1; |
623 | int rcu_locked = 0; | ||
624 | int charge = 0; | 625 | int charge = 0; |
625 | struct mem_cgroup *mem = NULL; | 626 | struct mem_cgroup *mem = NULL; |
626 | struct anon_vma *anon_vma = NULL; | 627 | struct anon_vma *anon_vma = NULL; |
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
632 | /* page was freed from under us. So we are done. */ | 633 | /* page was freed from under us. So we are done. */ |
633 | goto move_newpage; | 634 | goto move_newpage; |
634 | } | 635 | } |
636 | if (unlikely(PageTransHuge(page))) | ||
637 | if (unlikely(split_huge_page(page))) | ||
638 | goto move_newpage; | ||
635 | 639 | ||
636 | /* prepare cgroup just returns 0 or -ENOMEM */ | 640 | /* prepare cgroup just returns 0 or -ENOMEM */ |
637 | rc = -EAGAIN; | 641 | rc = -EAGAIN; |
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
639 | if (!trylock_page(page)) { | 643 | if (!trylock_page(page)) { |
640 | if (!force) | 644 | if (!force) |
641 | goto move_newpage; | 645 | goto move_newpage; |
646 | |||
647 | /* | ||
648 | * It's not safe for direct compaction to call lock_page. | ||
649 | * For example, during page readahead pages are added locked | ||
650 | * to the LRU. Later, when the IO completes the pages are | ||
651 | * marked uptodate and unlocked. However, the queueing | ||
652 | * could be merging multiple pages for one bio (e.g. | ||
653 | * mpage_readpages). If an allocation happens for the | ||
654 | * second or third page, the process can end up locking | ||
655 | * the same page twice and deadlocking. Rather than | ||
656 | * trying to be clever about what pages can be locked, | ||
657 | * avoid the use of lock_page for direct compaction | ||
658 | * altogether. | ||
659 | */ | ||
660 | if (current->flags & PF_MEMALLOC) | ||
661 | goto move_newpage; | ||
662 | |||
642 | lock_page(page); | 663 | lock_page(page); |
643 | } | 664 | } |
644 | 665 | ||
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
665 | BUG_ON(charge); | 686 | BUG_ON(charge); |
666 | 687 | ||
667 | if (PageWriteback(page)) { | 688 | if (PageWriteback(page)) { |
668 | if (!force) | 689 | if (!force || !sync) |
669 | goto uncharge; | 690 | goto uncharge; |
670 | wait_on_page_writeback(page); | 691 | wait_on_page_writeback(page); |
671 | } | 692 | } |
672 | /* | 693 | /* |
673 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, | 694 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, |
674 | * we cannot notice that anon_vma is freed while we migrate a page. | 695 | * we cannot notice that anon_vma is freed while we migrate a page. |
675 | * This rcu_read_lock() delays freeing anon_vma pointer until the end | 696 | * This get_anon_vma() delays freeing anon_vma pointer until the end |
676 | * of migration. File cache pages are no problem because of page_lock() | 697 | * of migration. File cache pages are no problem because of page_lock() |
677 | * File cache pages may use write_page() or lock_page() in migration, so | 698 | * File cache pages may use write_page() or lock_page() in migration, so |
678 | * we only need to care about anon pages here. | 699 | * we only need to care about anon pages here. |
679 | */ | 700 | */ |
680 | if (PageAnon(page)) { | 701 | if (PageAnon(page)) { |
681 | rcu_read_lock(); | 702 | /* |
682 | rcu_locked = 1; | 703 | * Only page_lock_anon_vma() understands the subtleties of |
683 | 704 | * getting a hold on an anon_vma from outside one of its mms. | |
684 | /* Determine how to safely use anon_vma */ | 705 | */ |
685 | if (!page_mapped(page)) { | 706 | anon_vma = page_lock_anon_vma(page); |
686 | if (!PageSwapCache(page)) | 707 | if (anon_vma) { |
687 | goto rcu_unlock; | 708 | /* |
688 | 709 | * Take a reference count on the anon_vma if the | |
710 | * page is mapped so that it is guaranteed to | ||
711 | * exist when the page is remapped later | ||
712 | */ | ||
713 | get_anon_vma(anon_vma); | ||
714 | page_unlock_anon_vma(anon_vma); | ||
715 | } else if (PageSwapCache(page)) { | ||
689 | /* | 716 | /* |
690 | * We cannot be sure that the anon_vma of an unmapped | 717 | * We cannot be sure that the anon_vma of an unmapped |
691 | * swapcache page is safe to use because we don't | 718 | * swapcache page is safe to use because we don't |
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
700 | */ | 727 | */ |
701 | remap_swapcache = 0; | 728 | remap_swapcache = 0; |
702 | } else { | 729 | } else { |
703 | /* | 730 | goto uncharge; |
704 | * Take a reference count on the anon_vma if the | ||
705 | * page is mapped so that it is guaranteed to | ||
706 | * exist when the page is remapped later | ||
707 | */ | ||
708 | anon_vma = page_anon_vma(page); | ||
709 | get_anon_vma(anon_vma); | ||
710 | } | 731 | } |
711 | } | 732 | } |
712 | 733 | ||
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
723 | * free the metadata, so the page can be freed. | 744 | * free the metadata, so the page can be freed. |
724 | */ | 745 | */ |
725 | if (!page->mapping) { | 746 | if (!page->mapping) { |
726 | if (!PageAnon(page) && page_has_private(page)) { | 747 | VM_BUG_ON(PageAnon(page)); |
727 | /* | 748 | if (page_has_private(page)) { |
728 | * Go direct to try_to_free_buffers() here because | ||
729 | * a) that's what try_to_release_page() would do anyway | ||
730 | * b) we may be under rcu_read_lock() here, so we can't | ||
731 | * use GFP_KERNEL which is what try_to_release_page() | ||
732 | * needs to be effective. | ||
733 | */ | ||
734 | try_to_free_buffers(page); | 749 | try_to_free_buffers(page); |
735 | goto rcu_unlock; | 750 | goto uncharge; |
736 | } | 751 | } |
737 | goto skip_unmap; | 752 | goto skip_unmap; |
738 | } | 753 | } |
@@ -746,17 +761,14 @@ skip_unmap: | |||
746 | 761 | ||
747 | if (rc && remap_swapcache) | 762 | if (rc && remap_swapcache) |
748 | remove_migration_ptes(page, page); | 763 | remove_migration_ptes(page, page); |
749 | rcu_unlock: | ||
750 | 764 | ||
751 | /* Drop an anon_vma reference if we took one */ | 765 | /* Drop an anon_vma reference if we took one */ |
752 | if (anon_vma) | 766 | if (anon_vma) |
753 | drop_anon_vma(anon_vma); | 767 | drop_anon_vma(anon_vma); |
754 | 768 | ||
755 | if (rcu_locked) | ||
756 | rcu_read_unlock(); | ||
757 | uncharge: | 769 | uncharge: |
758 | if (!charge) | 770 | if (!charge) |
759 | mem_cgroup_end_migration(mem, page, newpage); | 771 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
760 | unlock: | 772 | unlock: |
761 | unlock_page(page); | 773 | unlock_page(page); |
762 | 774 | ||
@@ -810,12 +822,11 @@ move_newpage: | |||
810 | */ | 822 | */ |
811 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 823 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
812 | unsigned long private, struct page *hpage, | 824 | unsigned long private, struct page *hpage, |
813 | int force, int offlining) | 825 | int force, bool offlining, bool sync) |
814 | { | 826 | { |
815 | int rc = 0; | 827 | int rc = 0; |
816 | int *result = NULL; | 828 | int *result = NULL; |
817 | struct page *new_hpage = get_new_page(hpage, private, &result); | 829 | struct page *new_hpage = get_new_page(hpage, private, &result); |
818 | int rcu_locked = 0; | ||
819 | struct anon_vma *anon_vma = NULL; | 830 | struct anon_vma *anon_vma = NULL; |
820 | 831 | ||
821 | if (!new_hpage) | 832 | if (!new_hpage) |
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
824 | rc = -EAGAIN; | 835 | rc = -EAGAIN; |
825 | 836 | ||
826 | if (!trylock_page(hpage)) { | 837 | if (!trylock_page(hpage)) { |
827 | if (!force) | 838 | if (!force || !sync) |
828 | goto out; | 839 | goto out; |
829 | lock_page(hpage); | 840 | lock_page(hpage); |
830 | } | 841 | } |
831 | 842 | ||
832 | if (PageAnon(hpage)) { | 843 | if (PageAnon(hpage)) { |
833 | rcu_read_lock(); | 844 | anon_vma = page_lock_anon_vma(hpage); |
834 | rcu_locked = 1; | 845 | if (anon_vma) { |
835 | 846 | get_anon_vma(anon_vma); | |
836 | if (page_mapped(hpage)) { | 847 | page_unlock_anon_vma(anon_vma); |
837 | anon_vma = page_anon_vma(hpage); | ||
838 | atomic_inc(&anon_vma->external_refcount); | ||
839 | } | 848 | } |
840 | } | 849 | } |
841 | 850 | ||
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
847 | if (rc) | 856 | if (rc) |
848 | remove_migration_ptes(hpage, hpage); | 857 | remove_migration_ptes(hpage, hpage); |
849 | 858 | ||
850 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | 859 | if (anon_vma) |
851 | &anon_vma->lock)) { | 860 | drop_anon_vma(anon_vma); |
852 | int empty = list_empty(&anon_vma->head); | ||
853 | spin_unlock(&anon_vma->lock); | ||
854 | if (empty) | ||
855 | anon_vma_free(anon_vma); | ||
856 | } | ||
857 | |||
858 | if (rcu_locked) | ||
859 | rcu_read_unlock(); | ||
860 | out: | 861 | out: |
861 | unlock_page(hpage); | 862 | unlock_page(hpage); |
862 | 863 | ||
@@ -892,7 +893,8 @@ out: | |||
892 | * Return: Number of pages not migrated or error code. | 893 | * Return: Number of pages not migrated or error code. |
893 | */ | 894 | */ |
894 | int migrate_pages(struct list_head *from, | 895 | int migrate_pages(struct list_head *from, |
895 | new_page_t get_new_page, unsigned long private, int offlining) | 896 | new_page_t get_new_page, unsigned long private, bool offlining, |
897 | bool sync) | ||
896 | { | 898 | { |
897 | int retry = 1; | 899 | int retry = 1; |
898 | int nr_failed = 0; | 900 | int nr_failed = 0; |
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from, | |||
912 | cond_resched(); | 914 | cond_resched(); |
913 | 915 | ||
914 | rc = unmap_and_move(get_new_page, private, | 916 | rc = unmap_and_move(get_new_page, private, |
915 | page, pass > 2, offlining); | 917 | page, pass > 2, offlining, |
918 | sync); | ||
916 | 919 | ||
917 | switch(rc) { | 920 | switch(rc) { |
918 | case -ENOMEM: | 921 | case -ENOMEM: |
@@ -941,7 +944,8 @@ out: | |||
941 | } | 944 | } |
942 | 945 | ||
943 | int migrate_huge_pages(struct list_head *from, | 946 | int migrate_huge_pages(struct list_head *from, |
944 | new_page_t get_new_page, unsigned long private, int offlining) | 947 | new_page_t get_new_page, unsigned long private, bool offlining, |
948 | bool sync) | ||
945 | { | 949 | { |
946 | int retry = 1; | 950 | int retry = 1; |
947 | int nr_failed = 0; | 951 | int nr_failed = 0; |
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from, | |||
957 | cond_resched(); | 961 | cond_resched(); |
958 | 962 | ||
959 | rc = unmap_and_move_huge_page(get_new_page, | 963 | rc = unmap_and_move_huge_page(get_new_page, |
960 | private, page, pass > 2, offlining); | 964 | private, page, pass > 2, offlining, |
965 | sync); | ||
961 | 966 | ||
962 | switch(rc) { | 967 | switch(rc) { |
963 | case -ENOMEM: | 968 | case -ENOMEM: |
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1042 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) | 1047 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) |
1043 | goto set_status; | 1048 | goto set_status; |
1044 | 1049 | ||
1045 | page = follow_page(vma, pp->addr, FOLL_GET); | 1050 | page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); |
1046 | 1051 | ||
1047 | err = PTR_ERR(page); | 1052 | err = PTR_ERR(page); |
1048 | if (IS_ERR(page)) | 1053 | if (IS_ERR(page)) |
@@ -1090,7 +1095,7 @@ set_status: | |||
1090 | err = 0; | 1095 | err = 0; |
1091 | if (!list_empty(&pagelist)) { | 1096 | if (!list_empty(&pagelist)) { |
1092 | err = migrate_pages(&pagelist, new_page_node, | 1097 | err = migrate_pages(&pagelist, new_page_node, |
1093 | (unsigned long)pm, 0); | 1098 | (unsigned long)pm, 0, true); |
1094 | if (err) | 1099 | if (err) |
1095 | putback_lru_pages(&pagelist); | 1100 | putback_lru_pages(&pagelist); |
1096 | } | 1101 | } |
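The hunks above thread a new sync flag through migrate_pages() and migrate_huge_pages(), and make do_move_page_to_node_array() pass FOLL_SPLIT so transparent huge pages are split before an explicit migration. That array path services the move_pages(2) system call; a minimal userspace sketch of that call (assumes libnuma's <numaif.h> wrapper, a NUMA-enabled kernel, and uses node 0 purely as an example target):

  /* build: gcc move_pages_demo.c -lnuma (libnuma provides the move_pages() wrapper) */
  #include <numaif.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
      long page_size = sysconf(_SC_PAGESIZE);
      void *buf;

      if (posix_memalign(&buf, page_size, page_size))
          return 1;
      memset(buf, 0, page_size);              /* fault the page in first */

      void *pages[1] = { buf };
      int nodes[1]   = { 0 };                 /* example target node */
      int status[1];

      /* pid 0 means the calling process; MPOL_MF_MOVE migrates only private pages */
      if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE))
          perror("move_pages");
      else
          printf("page is now on node %d (a negative value is -errno)\n", status[0]);

      free(buf);
      return 0;
  }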
diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b6..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
154 | pmd = pmd_offset(pud, addr); | 154 | pmd = pmd_offset(pud, addr); |
155 | do { | 155 | do { |
156 | next = pmd_addr_end(addr, end); | 156 | next = pmd_addr_end(addr, end); |
157 | if (pmd_trans_huge(*pmd)) { | ||
158 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
159 | vec += (next - addr) >> PAGE_SHIFT; | ||
160 | continue; | ||
161 | } | ||
162 | /* fall through */ | ||
163 | } | ||
157 | if (pmd_none_or_clear_bad(pmd)) | 164 | if (pmd_none_or_clear_bad(pmd)) |
158 | mincore_unmapped_range(vma, addr, next, vec); | 165 | mincore_unmapped_range(vma, addr, next, vec); |
159 | else | 166 | else |
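mincore_pmd_range() now recognises a huge pmd and lets mincore_huge_pmd() fill the whole vector span in one step instead of falling through to the bad-pmd check. The userspace interface is unchanged; a small sketch that inspects residency of an anonymous mapping (nothing THP-specific is assumed):

  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
      size_t page = sysconf(_SC_PAGESIZE);
      size_t len  = 16 * page;
      unsigned char vec[16];

      char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (p == MAP_FAILED)
          return 1;

      memset(p, 1, 4 * page);                 /* fault in only the first four pages */

      if (mincore(p, len, vec))               /* one byte per page, bit 0 = resident */
          return 1;

      for (int i = 0; i < 16; i++)
          printf("page %2d: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

      munmap(p, len);
      return 0;
  }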
diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f72..13e81ee8be9d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add | |||
155 | * vma->vm_mm->mmap_sem must be held for at least read. | 155 | * vma->vm_mm->mmap_sem must be held for at least read. |
156 | */ | 156 | */ |
157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, |
158 | unsigned long start, unsigned long end) | 158 | unsigned long start, unsigned long end, |
159 | int *nonblocking) | ||
159 | { | 160 | { |
160 | struct mm_struct *mm = vma->vm_mm; | 161 | struct mm_struct *mm = vma->vm_mm; |
161 | unsigned long addr = start; | 162 | unsigned long addr = start; |
162 | struct page *pages[16]; /* 16 gives a reasonable batch */ | ||
163 | int nr_pages = (end - start) / PAGE_SIZE; | 163 | int nr_pages = (end - start) / PAGE_SIZE; |
164 | int ret = 0; | ||
165 | int gup_flags; | 164 | int gup_flags; |
166 | 165 | ||
167 | VM_BUG_ON(start & ~PAGE_MASK); | 166 | VM_BUG_ON(start & ~PAGE_MASK); |
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
170 | VM_BUG_ON(end > vma->vm_end); | 169 | VM_BUG_ON(end > vma->vm_end); |
171 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 170 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
172 | 171 | ||
173 | gup_flags = FOLL_TOUCH | FOLL_GET; | 172 | gup_flags = FOLL_TOUCH; |
174 | if (vma->vm_flags & VM_WRITE) | 173 | /* |
174 | * We want to touch writable mappings with a write fault in order | ||
175 | * to break COW, except for shared mappings because these don't COW | ||
176 | * and we would not want to dirty them for nothing. | ||
177 | */ | ||
178 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
175 | gup_flags |= FOLL_WRITE; | 179 | gup_flags |= FOLL_WRITE; |
176 | 180 | ||
181 | if (vma->vm_flags & VM_LOCKED) | ||
182 | gup_flags |= FOLL_MLOCK; | ||
183 | |||
177 | /* We don't try to access the guard page of a stack vma */ | 184 | /* We don't try to access the guard page of a stack vma */ |
178 | if (stack_guard_page(vma, start)) { | 185 | if (stack_guard_page(vma, start)) { |
179 | addr += PAGE_SIZE; | 186 | addr += PAGE_SIZE; |
180 | nr_pages--; | 187 | nr_pages--; |
181 | } | 188 | } |
182 | 189 | ||
183 | while (nr_pages > 0) { | 190 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
184 | int i; | 191 | NULL, NULL, nonblocking); |
185 | |||
186 | cond_resched(); | ||
187 | |||
188 | /* | ||
189 | * get_user_pages makes pages present if we are | ||
190 | * setting mlock. and this extra reference count will | ||
191 | * disable migration of this page. However, page may | ||
192 | * still be truncated out from under us. | ||
193 | */ | ||
194 | ret = __get_user_pages(current, mm, addr, | ||
195 | min_t(int, nr_pages, ARRAY_SIZE(pages)), | ||
196 | gup_flags, pages, NULL); | ||
197 | /* | ||
198 | * This can happen for, e.g., VM_NONLINEAR regions before | ||
199 | * a page has been allocated and mapped at a given offset, | ||
200 | * or for addresses that map beyond end of a file. | ||
201 | * We'll mlock the pages if/when they get faulted in. | ||
202 | */ | ||
203 | if (ret < 0) | ||
204 | break; | ||
205 | |||
206 | lru_add_drain(); /* push cached pages to LRU */ | ||
207 | |||
208 | for (i = 0; i < ret; i++) { | ||
209 | struct page *page = pages[i]; | ||
210 | |||
211 | if (page->mapping) { | ||
212 | /* | ||
213 | * That preliminary check is mainly to avoid | ||
214 | * the pointless overhead of lock_page on the | ||
215 | * ZERO_PAGE: which might bounce very badly if | ||
216 | * there is contention. However, we're still | ||
217 | * dirtying its cacheline with get/put_page: | ||
218 | * we'll add another __get_user_pages flag to | ||
219 | * avoid it if that case turns out to matter. | ||
220 | */ | ||
221 | lock_page(page); | ||
222 | /* | ||
223 | * Because we lock page here and migration is | ||
224 | * blocked by the elevated reference, we need | ||
225 | * only check for file-cache page truncation. | ||
226 | */ | ||
227 | if (page->mapping) | ||
228 | mlock_vma_page(page); | ||
229 | unlock_page(page); | ||
230 | } | ||
231 | put_page(page); /* ref from get_user_pages() */ | ||
232 | } | ||
233 | |||
234 | addr += ret * PAGE_SIZE; | ||
235 | nr_pages -= ret; | ||
236 | ret = 0; | ||
237 | } | ||
238 | |||
239 | return ret; /* 0 or negative error code */ | ||
240 | } | 192 | } |
241 | 193 | ||
242 | /* | 194 | /* |
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
280 | is_vm_hugetlb_page(vma) || | 232 | is_vm_hugetlb_page(vma) || |
281 | vma == get_gate_vma(current))) { | 233 | vma == get_gate_vma(current))) { |
282 | 234 | ||
283 | __mlock_vma_pages_range(vma, start, end); | 235 | __mlock_vma_pages_range(vma, start, end, NULL); |
284 | 236 | ||
285 | /* Hide errors from mmap() and other callers */ | 237 | /* Hide errors from mmap() and other callers */ |
286 | return 0; | 238 | return 0; |
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
372 | int ret = 0; | 324 | int ret = 0; |
373 | int lock = newflags & VM_LOCKED; | 325 | int lock = newflags & VM_LOCKED; |
374 | 326 | ||
375 | if (newflags == vma->vm_flags || | 327 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
376 | (vma->vm_flags & (VM_IO | VM_PFNMAP))) | 328 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) |
377 | goto out; /* don't set VM_LOCKED, don't count */ | 329 | goto out; /* don't set VM_LOCKED, don't count */ |
378 | 330 | ||
379 | if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | ||
380 | is_vm_hugetlb_page(vma) || | ||
381 | vma == get_gate_vma(current)) { | ||
382 | if (lock) | ||
383 | make_pages_present(start, end); | ||
384 | goto out; /* don't set VM_LOCKED, don't count */ | ||
385 | } | ||
386 | |||
387 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 331 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
388 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, | 332 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, |
389 | vma->vm_file, pgoff, vma_policy(vma)); | 333 | vma->vm_file, pgoff, vma_policy(vma)); |
@@ -419,14 +363,10 @@ success: | |||
419 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. | 363 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. |
420 | */ | 364 | */ |
421 | 365 | ||
422 | if (lock) { | 366 | if (lock) |
423 | vma->vm_flags = newflags; | 367 | vma->vm_flags = newflags; |
424 | ret = __mlock_vma_pages_range(vma, start, end); | 368 | else |
425 | if (ret < 0) | ||
426 | ret = __mlock_posix_error_return(ret); | ||
427 | } else { | ||
428 | munlock_vma_pages_range(vma, start, end); | 369 | munlock_vma_pages_range(vma, start, end); |
429 | } | ||
430 | 370 | ||
431 | out: | 371 | out: |
432 | *prev = vma; | 372 | *prev = vma; |
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
439 | struct vm_area_struct * vma, * prev; | 379 | struct vm_area_struct * vma, * prev; |
440 | int error; | 380 | int error; |
441 | 381 | ||
442 | len = PAGE_ALIGN(len); | 382 | VM_BUG_ON(start & ~PAGE_MASK); |
383 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
443 | end = start + len; | 384 | end = start + len; |
444 | if (end < start) | 385 | if (end < start) |
445 | return -EINVAL; | 386 | return -EINVAL; |
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
482 | return error; | 423 | return error; |
483 | } | 424 | } |
484 | 425 | ||
426 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | ||
427 | { | ||
428 | struct mm_struct *mm = current->mm; | ||
429 | unsigned long end, nstart, nend; | ||
430 | struct vm_area_struct *vma = NULL; | ||
431 | int locked = 0; | ||
432 | int ret = 0; | ||
433 | |||
434 | VM_BUG_ON(start & ~PAGE_MASK); | ||
435 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
436 | end = start + len; | ||
437 | |||
438 | for (nstart = start; nstart < end; nstart = nend) { | ||
439 | /* | ||
440 | * We want to fault in pages for [nstart; end) address range. | ||
441 | * Find first corresponding VMA. | ||
442 | */ | ||
443 | if (!locked) { | ||
444 | locked = 1; | ||
445 | down_read(&mm->mmap_sem); | ||
446 | vma = find_vma(mm, nstart); | ||
447 | } else if (nstart >= vma->vm_end) | ||
448 | vma = vma->vm_next; | ||
449 | if (!vma || vma->vm_start >= end) | ||
450 | break; | ||
451 | /* | ||
452 | * Set [nstart; nend) to intersection of desired address | ||
453 | * range with the first VMA. Also, skip undesirable VMA types. | ||
454 | */ | ||
455 | nend = min(end, vma->vm_end); | ||
456 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
457 | continue; | ||
458 | if (nstart < vma->vm_start) | ||
459 | nstart = vma->vm_start; | ||
460 | /* | ||
461 | * Now fault in a range of pages. __mlock_vma_pages_range() | ||
462 | * double checks the vma flags, so that it won't mlock pages | ||
463 | * if the vma was already munlocked. | ||
464 | */ | ||
465 | ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); | ||
466 | if (ret < 0) { | ||
467 | if (ignore_errors) { | ||
468 | ret = 0; | ||
469 | continue; /* continue at next VMA */ | ||
470 | } | ||
471 | ret = __mlock_posix_error_return(ret); | ||
472 | break; | ||
473 | } | ||
474 | nend = nstart + ret * PAGE_SIZE; | ||
475 | ret = 0; | ||
476 | } | ||
477 | if (locked) | ||
478 | up_read(&mm->mmap_sem); | ||
479 | return ret; /* 0 or negative error code */ | ||
480 | } | ||
481 | |||
485 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | 482 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) |
486 | { | 483 | { |
487 | unsigned long locked; | 484 | unsigned long locked; |
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
507 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 504 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
508 | error = do_mlock(start, len, 1); | 505 | error = do_mlock(start, len, 1); |
509 | up_write(¤t->mm->mmap_sem); | 506 | up_write(¤t->mm->mmap_sem); |
507 | if (!error) | ||
508 | error = do_mlock_pages(start, len, 0); | ||
510 | return error; | 509 | return error; |
511 | } | 510 | } |
512 | 511 | ||
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
571 | capable(CAP_IPC_LOCK)) | 570 | capable(CAP_IPC_LOCK)) |
572 | ret = do_mlockall(flags); | 571 | ret = do_mlockall(flags); |
573 | up_write(¤t->mm->mmap_sem); | 572 | up_write(¤t->mm->mmap_sem); |
573 | if (!ret && (flags & MCL_CURRENT)) { | ||
574 | /* Ignore errors */ | ||
575 | do_mlock_pages(0, TASK_SIZE, 1); | ||
576 | } | ||
574 | out: | 577 | out: |
575 | return ret; | 578 | return ret; |
576 | } | 579 | } |
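With this rework, do_mlock() only flips VM_LOCKED while mmap_sem is held for write; the pages are then faulted in by the new do_mlock_pages(), which walks the VMAs with mmap_sem held for read and relies on __get_user_pages() plus FOLL_MLOCK to mark each page. None of this changes the syscall contract; a minimal usage sketch (assumes RLIMIT_MEMLOCK or CAP_IPC_LOCK is sufficient for the sizes used):

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
      size_t len = 1 << 20;                   /* 1 MiB */
      char *buf = malloc(len);

      if (!buf)
          return 1;

      /* Faults the pages in and pins them; the fault-in now happens after do_mlock(). */
      if (mlock(buf, len))
          perror("mlock");

      memset(buf, 0, len);                    /* no major faults once the range is locked */
      munlock(buf, len);

      /* Or pin everything mapped now and in the future: */
      if (mlockall(MCL_CURRENT | MCL_FUTURE))
          perror("mlockall");
      munlockall();

      free(buf);
      return 0;
  }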
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/mmu_notifier.h> | 29 | #include <linux/mmu_notifier.h> |
30 | #include <linux/perf_event.h> | 30 | #include <linux/perf_event.h> |
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | ||
32 | 33 | ||
33 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
34 | #include <asm/cacheflush.h> | 35 | #include <asm/cacheflush.h> |
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
253 | down_write(&mm->mmap_sem); | 254 | down_write(&mm->mmap_sem); |
254 | 255 | ||
255 | #ifdef CONFIG_COMPAT_BRK | 256 | #ifdef CONFIG_COMPAT_BRK |
256 | min_brk = mm->end_code; | 257 | /* |
258 | * CONFIG_COMPAT_BRK can still be overridden by setting | ||
259 | * randomize_va_space to 2, which will still cause mm->start_brk | ||
260 | * to be arbitrarily shifted | ||
261 | */ | ||
262 | if (mm->start_brk > PAGE_ALIGN(mm->end_data)) | ||
263 | min_brk = mm->start_brk; | ||
264 | else | ||
265 | min_brk = mm->end_data; | ||
257 | #else | 266 | #else |
258 | min_brk = mm->start_brk; | 267 | min_brk = mm->start_brk; |
259 | #endif | 268 | #endif |
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
588 | } | 597 | } |
589 | } | 598 | } |
590 | 599 | ||
600 | vma_adjust_trans_huge(vma, start, end, adjust_next); | ||
601 | |||
591 | /* | 602 | /* |
592 | * When changing only vma->vm_end, we don't really need anon_vma | 603 | * When changing only vma->vm_end, we don't really need anon_vma |
593 | * lock. This is a fairly rare case by itself, but the anon_vma | 604 | * lock. This is a fairly rare case by itself, but the anon_vma |
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
815 | end, prev->vm_pgoff, NULL); | 826 | end, prev->vm_pgoff, NULL); |
816 | if (err) | 827 | if (err) |
817 | return NULL; | 828 | return NULL; |
829 | khugepaged_enter_vma_merge(prev); | ||
818 | return prev; | 830 | return prev; |
819 | } | 831 | } |
820 | 832 | ||
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
833 | next->vm_pgoff - pglen, NULL); | 845 | next->vm_pgoff - pglen, NULL); |
834 | if (err) | 846 | if (err) |
835 | return NULL; | 847 | return NULL; |
848 | khugepaged_enter_vma_merge(area); | ||
836 | return area; | 849 | return area; |
837 | } | 850 | } |
838 | 851 | ||
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1761 | } | 1774 | } |
1762 | } | 1775 | } |
1763 | vma_unlock_anon_vma(vma); | 1776 | vma_unlock_anon_vma(vma); |
1777 | khugepaged_enter_vma_merge(vma); | ||
1764 | return error; | 1778 | return error; |
1765 | } | 1779 | } |
1766 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1780 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma, | |||
1808 | } | 1822 | } |
1809 | } | 1823 | } |
1810 | vma_unlock_anon_vma(vma); | 1824 | vma_unlock_anon_vma(vma); |
1825 | khugepaged_enter_vma_merge(vma); | ||
1811 | return error; | 1826 | return error; |
1812 | } | 1827 | } |
1813 | 1828 | ||
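The khugepaged_enter_vma_merge() calls register a VMA with khugepaged whenever it is merged or grown, so the daemon can later collapse its small pages into huge pages. From userspace, the usual way to make a region eligible when THP is configured to "madvise" is an madvise(MADV_HUGEPAGE) hint on an anonymous mapping; a sketch assuming a 2 MiB PMD size and a kernel built with transparent hugepage support:

  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  #ifndef MADV_HUGEPAGE
  #define MADV_HUGEPAGE 14                    /* value assumed for older userspace headers */
  #endif

  int main(void)
  {
      size_t len = 8UL << 20;                 /* 8 MiB, a multiple of the 2 MiB PMD size */
      char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (p == MAP_FAILED)
          return 1;

      if (madvise(p, len, MADV_HUGEPAGE))     /* a hint; the mapping stays usable on failure */
          perror("madvise(MADV_HUGEPAGE)");

      /* Touched pages can be huge at fault time or collapsed later by khugepaged. */
      memset(p, 0, len);

      munmap(p, len);
      return 0;
  }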
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
100 | return young; | 100 | return young; |
101 | } | 101 | } |
102 | 102 | ||
103 | int __mmu_notifier_test_young(struct mm_struct *mm, | ||
104 | unsigned long address) | ||
105 | { | ||
106 | struct mmu_notifier *mn; | ||
107 | struct hlist_node *n; | ||
108 | int young = 0; | ||
109 | |||
110 | rcu_read_lock(); | ||
111 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
112 | if (mn->ops->test_young) { | ||
113 | young = mn->ops->test_young(mn, mm, address); | ||
114 | if (young) | ||
115 | break; | ||
116 | } | ||
117 | } | ||
118 | rcu_read_unlock(); | ||
119 | |||
120 | return young; | ||
121 | } | ||
122 | |||
103 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | 123 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, |
104 | pte_t pte) | 124 | pte_t pte) |
105 | { | 125 | { |
diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, | |||
87 | return 1; | 87 | return 1; |
88 | } | 88 | } |
89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
90 | |||
91 | #ifdef CONFIG_SMP | ||
92 | /* Called when a more accurate view of NR_FREE_PAGES is needed */ | ||
93 | unsigned long zone_nr_free_pages(struct zone *zone) | ||
94 | { | ||
95 | unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
96 | |||
97 | /* | ||
98 | * While kswapd is awake, it is considered the zone is under some | ||
99 | * memory pressure. Under pressure, there is a risk that | ||
100 | * per-cpu-counter-drift will allow the min watermark to be breached | ||
101 | * potentially causing a live-lock. While kswapd is awake and | ||
102 | * free pages are low, get a better estimate for free pages | ||
103 | */ | ||
104 | if (nr_free_pages < zone->percpu_drift_mark && | ||
105 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
106 | return zone_page_state_snapshot(zone, NR_FREE_PAGES); | ||
107 | |||
108 | return nr_free_pages; | ||
109 | } | ||
110 | #endif /* CONFIG_SMP */ | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c5133873097..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
78 | pte_unmap_unlock(pte - 1, ptl); | 78 | pte_unmap_unlock(pte - 1, ptl); |
79 | } | 79 | } |
80 | 80 | ||
81 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 82 | unsigned long addr, unsigned long end, pgprot_t newprot, |
83 | int dirty_accountable) | 83 | int dirty_accountable) |
84 | { | 84 | { |
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
88 | pmd = pmd_offset(pud, addr); | 88 | pmd = pmd_offset(pud, addr); |
89 | do { | 89 | do { |
90 | next = pmd_addr_end(addr, end); | 90 | next = pmd_addr_end(addr, end); |
91 | if (pmd_trans_huge(*pmd)) { | ||
92 | if (next - addr != HPAGE_PMD_SIZE) | ||
93 | split_huge_page_pmd(vma->vm_mm, pmd); | ||
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | ||
95 | continue; | ||
96 | /* fall through */ | ||
97 | } | ||
91 | if (pmd_none_or_clear_bad(pmd)) | 98 | if (pmd_none_or_clear_bad(pmd)) |
92 | continue; | 99 | continue; |
93 | change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); | 100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, |
101 | dirty_accountable); | ||
94 | } while (pmd++, addr = next, addr != end); | 102 | } while (pmd++, addr = next, addr != end); |
95 | } | 103 | } |
96 | 104 | ||
97 | static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | 105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
98 | unsigned long addr, unsigned long end, pgprot_t newprot, | 106 | unsigned long addr, unsigned long end, pgprot_t newprot, |
99 | int dirty_accountable) | 107 | int dirty_accountable) |
100 | { | 108 | { |
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
106 | next = pud_addr_end(addr, end); | 114 | next = pud_addr_end(addr, end); |
107 | if (pud_none_or_clear_bad(pud)) | 115 | if (pud_none_or_clear_bad(pud)) |
108 | continue; | 116 | continue; |
109 | change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); | 117 | change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | ||
110 | } while (pud++, addr = next, addr != end); | 119 | } while (pud++, addr = next, addr != end); |
111 | } | 120 | } |
112 | 121 | ||
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, | |||
126 | next = pgd_addr_end(addr, end); | 135 | next = pgd_addr_end(addr, end); |
127 | if (pgd_none_or_clear_bad(pgd)) | 136 | if (pgd_none_or_clear_bad(pgd)) |
128 | continue; | 137 | continue; |
129 | change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); | 138 | change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | ||
130 | } while (pgd++, addr = next, addr != end); | 140 | } while (pgd++, addr = next, addr != end); |
131 | flush_tlb_range(vma, start, end); | 141 | flush_tlb_range(vma, start, end); |
132 | } | 142 | } |
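change_pmd_range() now changes a huge pmd in place via change_huge_pmd() when the request spans the full HPAGE_PMD_SIZE, and splits the huge page first otherwise. The two paths can be exercised from userspace with mprotect(); the sketch below only illustrates the distinction (the mapping is not guaranteed to be 2 MiB aligned or THP-backed, so treat it as a demonstration rather than a test):

  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #define HPAGE_SIZE (2UL << 20)              /* assumed PMD huge page size */

  int main(void)
  {
      size_t len = 2 * HPAGE_SIZE;
      char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (p == MAP_FAILED)
          return 1;

      memset(p, 0, len);                      /* may be THP-backed if enabled and aligned */

      /* Covers a full PMD-sized extent: the kernel can change the huge pmd in place. */
      mprotect(p, HPAGE_SIZE, PROT_READ);

      /* Covers one base page only: the kernel must split the huge page first. */
      mprotect(p + HPAGE_SIZE, sysconf(_SC_PAGESIZE), PROT_READ);

      munmap(p, len);
      return 0;
  }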
diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293a..9925b6391b80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) | |||
41 | return NULL; | 41 | return NULL; |
42 | 42 | ||
43 | pmd = pmd_offset(pud, addr); | 43 | pmd = pmd_offset(pud, addr); |
44 | split_huge_page_pmd(mm, pmd); | ||
44 | if (pmd_none_or_clear_bad(pmd)) | 45 | if (pmd_none_or_clear_bad(pmd)) |
45 | return NULL; | 46 | return NULL; |
46 | 47 | ||
47 | return pmd; | 48 | return pmd; |
48 | } | 49 | } |
49 | 50 | ||
50 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) | 51 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, |
52 | unsigned long addr) | ||
51 | { | 53 | { |
52 | pgd_t *pgd; | 54 | pgd_t *pgd; |
53 | pud_t *pud; | 55 | pud_t *pud; |
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) | |||
62 | if (!pmd) | 64 | if (!pmd) |
63 | return NULL; | 65 | return NULL; |
64 | 66 | ||
65 | if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) | 67 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
68 | if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) | ||
66 | return NULL; | 69 | return NULL; |
67 | 70 | ||
68 | return pmd; | 71 | return pmd; |
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
147 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | 150 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); |
148 | if (!old_pmd) | 151 | if (!old_pmd) |
149 | continue; | 152 | continue; |
150 | new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); | 153 | new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); |
151 | if (!new_pmd) | 154 | if (!new_pmd) |
152 | break; | 155 | break; |
153 | next = (new_addr + PMD_SIZE) & PMD_MASK; | 156 | next = (new_addr + PMD_SIZE) & PMD_MASK; |
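get_old_pmd() now splits any huge pmd before move_page_tables() relocates the range, alloc_new_pmd() passes the vma down to __pte_alloc(), and a VM_BUG_ON() asserts the destination pmd is never huge. Userspace sees none of this; an ordinary mremap() grow-and-move sketch:

  #define _GNU_SOURCE                         /* for mremap() and MREMAP_MAYMOVE */
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
      size_t old_len = 4UL << 20, new_len = 8UL << 20;

      char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (p == MAP_FAILED)
          return 1;
      memset(p, 0xab, old_len);

      /* The kernel may move the mapping, splitting any huge pages it has to relocate. */
      char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
      if (q == MAP_FAILED) {
          perror("mremap");
          return 1;
      }
      printf("moved from %p to %p, first byte still 0x%02x\n",
             (void *)p, (void *)q, (unsigned char)q[0]);

      munmap(q, new_len);
      return 0;
  }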
diff --git a/mm/nommu.c b/mm/nommu.c index ef4045d010d5..f59e1424d3db 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) | |||
127 | 127 | ||
128 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 128 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
129 | unsigned long start, int nr_pages, unsigned int foll_flags, | 129 | unsigned long start, int nr_pages, unsigned int foll_flags, |
130 | struct page **pages, struct vm_area_struct **vmas) | 130 | struct page **pages, struct vm_area_struct **vmas, |
131 | int *retry) | ||
131 | { | 132 | { |
132 | struct vm_area_struct *vma; | 133 | struct vm_area_struct *vma; |
133 | unsigned long vm_flags; | 134 | unsigned long vm_flags; |
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
185 | if (force) | 186 | if (force) |
186 | flags |= FOLL_FORCE; | 187 | flags |= FOLL_FORCE; |
187 | 188 | ||
188 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); | 189 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, |
190 | NULL); | ||
189 | } | 191 | } |
190 | EXPORT_SYMBOL(get_user_pages); | 192 | EXPORT_SYMBOL(get_user_pages); |
191 | 193 | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b4edfe7ce06c..2cb01f6ec5d0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void) | |||
404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | 404 | * - vm.dirty_background_ratio or vm.dirty_background_bytes |
405 | * - vm.dirty_ratio or vm.dirty_bytes | 405 | * - vm.dirty_ratio or vm.dirty_bytes |
406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | 406 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and |
407 | * runtime tasks. | 407 | * real-time tasks. |
408 | */ | 408 | */ |
409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | 409 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) |
410 | { | 410 | { |
411 | unsigned long background; | 411 | unsigned long background; |
412 | unsigned long dirty; | 412 | unsigned long dirty; |
413 | unsigned long available_memory = determine_dirtyable_memory(); | 413 | unsigned long uninitialized_var(available_memory); |
414 | struct task_struct *tsk; | 414 | struct task_struct *tsk; |
415 | 415 | ||
416 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
417 | available_memory = determine_dirtyable_memory(); | ||
418 | |||
416 | if (vm_dirty_bytes) | 419 | if (vm_dirty_bytes) |
417 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | 420 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
418 | else | 421 | else |
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page); | |||
1103 | int __set_page_dirty_no_writeback(struct page *page) | 1106 | int __set_page_dirty_no_writeback(struct page *page) |
1104 | { | 1107 | { |
1105 | if (!PageDirty(page)) | 1108 | if (!PageDirty(page)) |
1106 | SetPageDirty(page); | 1109 | return !TestSetPageDirty(page); |
1107 | return 0; | 1110 | return 0; |
1108 | } | 1111 | } |
1109 | 1112 | ||
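global_dirty_limits() derives the background and hard dirty thresholds either from vm.dirty_*_bytes (rounded up to whole pages) or from vm.dirty_*_ratio applied to the dirtyable memory, and after this change it only computes the dirtyable-memory figure when a ratio-based limit is actually in use. A standalone model of that arithmetic (it omits the PF_LESS_THROTTLE/real-time task boost; all inputs are example values):

  #include <stdio.h>

  #define PAGE_SIZE          4096UL
  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  static void dirty_limits(unsigned long dirtyable_pages,
                           unsigned long dirty_bytes, unsigned long dirty_ratio,
                           unsigned long bg_bytes, unsigned long bg_ratio,
                           unsigned long *background, unsigned long *dirty)
  {
      *dirty = dirty_bytes ? DIV_ROUND_UP(dirty_bytes, PAGE_SIZE)
                           : dirtyable_pages * dirty_ratio / 100;
      *background = bg_bytes ? DIV_ROUND_UP(bg_bytes, PAGE_SIZE)
                             : dirtyable_pages * bg_ratio / 100;
      if (*background >= *dirty)              /* background must stay below the hard limit */
          *background = *dirty / 2;
  }

  int main(void)
  {
      unsigned long background, dirty;

      /* 4 GiB dirtyable, vm.dirty_ratio=20, vm.dirty_background_ratio=10 */
      dirty_limits((4UL << 30) / PAGE_SIZE, 0, 20, 0, 10, &background, &dirty);
      printf("background: %lu pages, dirty: %lu pages\n", background, dirty);
      return 0;
  }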
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff7e15872398..90c1439549fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
357 | } | 357 | } |
358 | } | 358 | } |
359 | 359 | ||
360 | /* update __split_huge_page_refcount if you change this function */ | ||
360 | static int destroy_compound_page(struct page *page, unsigned long order) | 361 | static int destroy_compound_page(struct page *page, unsigned long order) |
361 | { | 362 | { |
362 | int i; | 363 | int i; |
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page) | |||
426 | * | 427 | * |
427 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 428 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
428 | */ | 429 | */ |
429 | static inline struct page * | ||
430 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | ||
431 | { | ||
432 | unsigned long buddy_idx = page_idx ^ (1 << order); | ||
433 | |||
434 | return page + (buddy_idx - page_idx); | ||
435 | } | ||
436 | |||
437 | static inline unsigned long | 430 | static inline unsigned long |
438 | __find_combined_index(unsigned long page_idx, unsigned int order) | 431 | __find_buddy_index(unsigned long page_idx, unsigned int order) |
439 | { | 432 | { |
440 | return (page_idx & ~(1 << order)); | 433 | return page_idx ^ (1 << order); |
441 | } | 434 | } |
442 | 435 | ||
443 | /* | 436 | /* |
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
448 | * (c) a page and its buddy have the same order && | 441 | * (c) a page and its buddy have the same order && |
449 | * (d) a page and its buddy are in the same zone. | 442 | * (d) a page and its buddy are in the same zone. |
450 | * | 443 | * |
451 | * For recording whether a page is in the buddy system, we use PG_buddy. | 444 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. |
452 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 445 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. |
453 | * | 446 | * |
454 | * For recording page's order, we use page_private(page). | 447 | * For recording page's order, we use page_private(page). |
455 | */ | 448 | */ |
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
482 | * as necessary, plus some accounting needed to play nicely with other | 475 | * as necessary, plus some accounting needed to play nicely with other |
483 | * parts of the VM system. | 476 | * parts of the VM system. |
484 | * At each level, we keep a list of pages, which are heads of continuous | 477 | * At each level, we keep a list of pages, which are heads of continuous |
485 | * free pages of length of (1 << order) and marked with PG_buddy. Page's | 478 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
486 | * order is recorded in page_private(page) field. | 479 | * order is recorded in page_private(page) field. |
487 | * So when we are allocating or freeing one, we can derive the state of the | 480 | * So when we are allocating or freeing one, we can derive the state of the |
488 | * other. That is, if we allocate a small block, and both were | 481 | * other. That is, if we allocate a small block, and both were |
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page, | |||
499 | { | 492 | { |
500 | unsigned long page_idx; | 493 | unsigned long page_idx; |
501 | unsigned long combined_idx; | 494 | unsigned long combined_idx; |
495 | unsigned long uninitialized_var(buddy_idx); | ||
502 | struct page *buddy; | 496 | struct page *buddy; |
503 | 497 | ||
504 | if (unlikely(PageCompound(page))) | 498 | if (unlikely(PageCompound(page))) |
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page, | |||
513 | VM_BUG_ON(bad_range(zone, page)); | 507 | VM_BUG_ON(bad_range(zone, page)); |
514 | 508 | ||
515 | while (order < MAX_ORDER-1) { | 509 | while (order < MAX_ORDER-1) { |
516 | buddy = __page_find_buddy(page, page_idx, order); | 510 | buddy_idx = __find_buddy_index(page_idx, order); |
511 | buddy = page + (buddy_idx - page_idx); | ||
517 | if (!page_is_buddy(page, buddy, order)) | 512 | if (!page_is_buddy(page, buddy, order)) |
518 | break; | 513 | break; |
519 | 514 | ||
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page, | |||
521 | list_del(&buddy->lru); | 516 | list_del(&buddy->lru); |
522 | zone->free_area[order].nr_free--; | 517 | zone->free_area[order].nr_free--; |
523 | rmv_page_order(buddy); | 518 | rmv_page_order(buddy); |
524 | combined_idx = __find_combined_index(page_idx, order); | 519 | combined_idx = buddy_idx & page_idx; |
525 | page = page + (combined_idx - page_idx); | 520 | page = page + (combined_idx - page_idx); |
526 | page_idx = combined_idx; | 521 | page_idx = combined_idx; |
527 | order++; | 522 | order++; |
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page, | |||
538 | */ | 533 | */ |
539 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { | 534 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
540 | struct page *higher_page, *higher_buddy; | 535 | struct page *higher_page, *higher_buddy; |
541 | combined_idx = __find_combined_index(page_idx, order); | 536 | combined_idx = buddy_idx & page_idx; |
542 | higher_page = page + combined_idx - page_idx; | 537 | higher_page = page + (combined_idx - page_idx); |
543 | higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); | 538 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
539 | higher_buddy = page + (buddy_idx - combined_idx); | ||
544 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 540 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
545 | list_add_tail(&page->lru, | 541 | list_add_tail(&page->lru, |
546 | &zone->free_area[order].free_list[migratetype]); | 542 | &zone->free_area[order].free_list[migratetype]); |
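With __page_find_buddy() folded away, the buddy of a block is found purely by index arithmetic: buddy_idx = page_idx ^ (1 << order), the buddy page is page + (buddy_idx - page_idx), and the merged block starts at combined_idx = buddy_idx & page_idx. A quick userspace check that the new expressions agree with the removed __find_combined_index():

  #include <assert.h>
  #include <stdio.h>

  static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
  {
      return page_idx ^ (1UL << order);
  }

  int main(void)
  {
      /* A free block at index 8, order 2 (four pages): its buddy starts at index 12. */
      unsigned long page_idx = 8, buddy_idx, combined_idx;
      unsigned int order = 2;

      buddy_idx    = find_buddy_index(page_idx, order);
      combined_idx = buddy_idx & page_idx;    /* start of the merged order-3 block */

      printf("page %lu, order %u -> buddy %lu, merged block at %lu\n",
             page_idx, order, buddy_idx, combined_idx);

      /* __find_combined_index() returned page_idx & ~(1 << order); the new form agrees. */
      assert(combined_idx == (page_idx & ~(1UL << order)));
      assert(find_buddy_index(buddy_idx, order) == page_idx);   /* buddies are symmetric */
      return 0;
  }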
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
651 | trace_mm_page_free_direct(page, order); | 647 | trace_mm_page_free_direct(page, order); |
652 | kmemcheck_free_shadow(page, order); | 648 | kmemcheck_free_shadow(page, order); |
653 | 649 | ||
654 | for (i = 0; i < (1 << order); i++) { | 650 | if (PageAnon(page)) |
655 | struct page *pg = page + i; | 651 | page->mapping = NULL; |
656 | 652 | for (i = 0; i < (1 << order); i++) | |
657 | if (PageAnon(pg)) | 653 | bad += free_pages_check(page + i); |
658 | pg->mapping = NULL; | ||
659 | bad += free_pages_check(pg); | ||
660 | } | ||
661 | if (bad) | 654 | if (bad) |
662 | return false; | 655 | return false; |
663 | 656 | ||
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1460 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1453 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1461 | 1454 | ||
1462 | /* | 1455 | /* |
1463 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1456 | * Return true if free pages are above 'mark'. This takes into account the order |
1464 | * of the allocation. | 1457 | * of the allocation. |
1465 | */ | 1458 | */ |
1466 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1459 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1467 | int classzone_idx, int alloc_flags) | 1460 | int classzone_idx, int alloc_flags, long free_pages) |
1468 | { | 1461 | { |
1469 | /* free_pages may go negative - that's OK */ | 1462 | /* free_pages may go negative - that's OK */ |
1470 | long min = mark; | 1463 | long min = mark; |
1471 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; | ||
1472 | int o; | 1464 | int o; |
1473 | 1465 | ||
1466 | free_pages -= (1 << order) + 1; | ||
1474 | if (alloc_flags & ALLOC_HIGH) | 1467 | if (alloc_flags & ALLOC_HIGH) |
1475 | min -= min / 2; | 1468 | min -= min / 2; |
1476 | if (alloc_flags & ALLOC_HARDER) | 1469 | if (alloc_flags & ALLOC_HARDER) |
1477 | min -= min / 4; | 1470 | min -= min / 4; |
1478 | 1471 | ||
1479 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1472 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1480 | return 0; | 1473 | return false; |
1481 | for (o = 0; o < order; o++) { | 1474 | for (o = 0; o < order; o++) { |
1482 | /* At the next order, this order's pages become unavailable */ | 1475 | /* At the next order, this order's pages become unavailable */ |
1483 | free_pages -= z->free_area[o].nr_free << o; | 1476 | free_pages -= z->free_area[o].nr_free << o; |
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1486 | min >>= 1; | 1479 | min >>= 1; |
1487 | 1480 | ||
1488 | if (free_pages <= min) | 1481 | if (free_pages <= min) |
1489 | return 0; | 1482 | return false; |
1490 | } | 1483 | } |
1491 | return 1; | 1484 | return true; |
1485 | } | ||
1486 | |||
1487 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
1488 | int classzone_idx, int alloc_flags) | ||
1489 | { | ||
1490 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1491 | zone_page_state(z, NR_FREE_PAGES)); | ||
1492 | } | ||
1493 | |||
1494 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | ||
1495 | int classzone_idx, int alloc_flags) | ||
1496 | { | ||
1497 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | ||
1498 | |||
1499 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | ||
1500 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | ||
1501 | |||
1502 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1503 | free_pages); | ||
1492 | } | 1504 | } |
1493 | 1505 | ||
1494 | #ifdef CONFIG_NUMA | 1506 | #ifdef CONFIG_NUMA |
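zone_watermark_ok() now reads the cheap NR_FREE_PAGES counter, while zone_watermark_ok_safe() switches to the drift-corrected zone_page_state_snapshot() once free pages drop below percpu_drift_mark, replacing the removed zone_nr_free_pages() helper. The shared check discounts the request itself and then walks the lower orders, halving the required reserve at each step. A standalone model of __zone_watermark_ok() (the free-list distribution and reserve are made-up inputs):

  #include <stdbool.h>
  #include <stdio.h>

  #define MAX_ORDER 11

  /* Model of __zone_watermark_ok(): nr_free[o] counts free blocks of order o. */
  static bool watermark_ok(int order, long mark, long lowmem_reserve, long free_pages,
                           const unsigned long nr_free[MAX_ORDER],
                           bool alloc_high, bool alloc_harder)
  {
      long min = mark;

      free_pages -= (1L << order) + 1;        /* discount the request itself */
      if (alloc_high)
          min -= min / 2;
      if (alloc_harder)
          min -= min / 4;

      if (free_pages <= min + lowmem_reserve)
          return false;

      for (int o = 0; o < order; o++) {
          free_pages -= nr_free[o] << o;      /* this order's pages become unavailable */
          min >>= 1;                          /* require fewer higher-order pages free */
          if (free_pages <= min)
              return false;
      }
      return true;
  }

  int main(void)
  {
      unsigned long nr_free[MAX_ORDER] = { 400, 100, 20, 5 };   /* example distribution */

      printf("order-0 ok: %d\n", watermark_ok(0, 256, 0, 1000, nr_free, false, false));
      printf("order-3 ok: %d\n", watermark_ok(3, 256, 0, 1000, nr_free, false, false));
      return 0;
  }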
@@ -1793,15 +1805,18 @@ static struct page * | |||
1793 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1805 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1794 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1806 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1795 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1807 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1796 | int migratetype, unsigned long *did_some_progress) | 1808 | int migratetype, unsigned long *did_some_progress, |
1809 | bool sync_migration) | ||
1797 | { | 1810 | { |
1798 | struct page *page; | 1811 | struct page *page; |
1799 | 1812 | ||
1800 | if (!order || compaction_deferred(preferred_zone)) | 1813 | if (!order || compaction_deferred(preferred_zone)) |
1801 | return NULL; | 1814 | return NULL; |
1802 | 1815 | ||
1816 | current->flags |= PF_MEMALLOC; | ||
1803 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1817 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
1804 | nodemask); | 1818 | nodemask, sync_migration); |
1819 | current->flags &= ~PF_MEMALLOC; | ||
1805 | if (*did_some_progress != COMPACT_SKIPPED) { | 1820 | if (*did_some_progress != COMPACT_SKIPPED) { |
1806 | 1821 | ||
1807 | /* Page migration frees to the PCP lists but we want merging */ | 1822 | /* Page migration frees to the PCP lists but we want merging */ |
@@ -1837,7 +1852,8 @@ static inline struct page * | |||
1837 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1852 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1838 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1853 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1839 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1854 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1840 | int migratetype, unsigned long *did_some_progress) | 1855 | int migratetype, unsigned long *did_some_progress, |
1856 | bool sync_migration) | ||
1841 | { | 1857 | { |
1842 | return NULL; | 1858 | return NULL; |
1843 | } | 1859 | } |
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1852 | { | 1868 | { |
1853 | struct page *page = NULL; | 1869 | struct page *page = NULL; |
1854 | struct reclaim_state reclaim_state; | 1870 | struct reclaim_state reclaim_state; |
1855 | struct task_struct *p = current; | ||
1856 | bool drained = false; | 1871 | bool drained = false; |
1857 | 1872 | ||
1858 | cond_resched(); | 1873 | cond_resched(); |
1859 | 1874 | ||
1860 | /* We now go into synchronous reclaim */ | 1875 | /* We now go into synchronous reclaim */ |
1861 | cpuset_memory_pressure_bump(); | 1876 | cpuset_memory_pressure_bump(); |
1862 | p->flags |= PF_MEMALLOC; | 1877 | current->flags |= PF_MEMALLOC; |
1863 | lockdep_set_current_reclaim_state(gfp_mask); | 1878 | lockdep_set_current_reclaim_state(gfp_mask); |
1864 | reclaim_state.reclaimed_slab = 0; | 1879 | reclaim_state.reclaimed_slab = 0; |
1865 | p->reclaim_state = &reclaim_state; | 1880 | current->reclaim_state = &reclaim_state; |
1866 | 1881 | ||
1867 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 1882 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
1868 | 1883 | ||
1869 | p->reclaim_state = NULL; | 1884 | current->reclaim_state = NULL; |
1870 | lockdep_clear_current_reclaim_state(); | 1885 | lockdep_clear_current_reclaim_state(); |
1871 | p->flags &= ~PF_MEMALLOC; | 1886 | current->flags &= ~PF_MEMALLOC; |
1872 | 1887 | ||
1873 | cond_resched(); | 1888 | cond_resched(); |
1874 | 1889 | ||
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
1920 | 1935 | ||
1921 | static inline | 1936 | static inline |
1922 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 1937 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, |
1923 | enum zone_type high_zoneidx) | 1938 | enum zone_type high_zoneidx, |
1939 | enum zone_type classzone_idx) | ||
1924 | { | 1940 | { |
1925 | struct zoneref *z; | 1941 | struct zoneref *z; |
1926 | struct zone *zone; | 1942 | struct zone *zone; |
1927 | 1943 | ||
1928 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1944 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1929 | wakeup_kswapd(zone, order); | 1945 | wakeup_kswapd(zone, order, classzone_idx); |
1930 | } | 1946 | } |
1931 | 1947 | ||
1932 | static inline int | 1948 | static inline int |
1933 | gfp_to_alloc_flags(gfp_t gfp_mask) | 1949 | gfp_to_alloc_flags(gfp_t gfp_mask) |
1934 | { | 1950 | { |
1935 | struct task_struct *p = current; | ||
1936 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | 1951 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
1937 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1952 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1938 | 1953 | ||
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1948 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); | 1963 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
1949 | 1964 | ||
1950 | if (!wait) { | 1965 | if (!wait) { |
1951 | alloc_flags |= ALLOC_HARDER; | 1966 | /* |
1967 | * Not worth trying to allocate harder for | ||
1968 | * __GFP_NOMEMALLOC even if it can't schedule. | ||
1969 | */ | ||
1970 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
1971 | alloc_flags |= ALLOC_HARDER; | ||
1952 | /* | 1972 | /* |
1953 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1973 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1954 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1974 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1955 | */ | 1975 | */ |
1956 | alloc_flags &= ~ALLOC_CPUSET; | 1976 | alloc_flags &= ~ALLOC_CPUSET; |
1957 | } else if (unlikely(rt_task(p)) && !in_interrupt()) | 1977 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
1958 | alloc_flags |= ALLOC_HARDER; | 1978 | alloc_flags |= ALLOC_HARDER; |
1959 | 1979 | ||
1960 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 1980 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
1961 | if (!in_interrupt() && | 1981 | if (!in_interrupt() && |
1962 | ((p->flags & PF_MEMALLOC) || | 1982 | ((current->flags & PF_MEMALLOC) || |
1963 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 1983 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
1964 | alloc_flags |= ALLOC_NO_WATERMARKS; | 1984 | alloc_flags |= ALLOC_NO_WATERMARKS; |
1965 | } | 1985 | } |
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1978 | int alloc_flags; | 1998 | int alloc_flags; |
1979 | unsigned long pages_reclaimed = 0; | 1999 | unsigned long pages_reclaimed = 0; |
1980 | unsigned long did_some_progress; | 2000 | unsigned long did_some_progress; |
1981 | struct task_struct *p = current; | 2001 | bool sync_migration = false; |
1982 | 2002 | ||
1983 | /* | 2003 | /* |
1984 | * In the slowpath, we sanity check order to avoid ever trying to | 2004 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2003 | goto nopage; | 2023 | goto nopage; |
2004 | 2024 | ||
2005 | restart: | 2025 | restart: |
2006 | wake_all_kswapd(order, zonelist, high_zoneidx); | 2026 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2027 | wake_all_kswapd(order, zonelist, high_zoneidx, | ||
2028 | zone_idx(preferred_zone)); | ||
2007 | 2029 | ||
2008 | /* | 2030 | /* |
2009 | * OK, we're below the kswapd watermark and have kicked background | 2031 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2034,21 +2056,26 @@ rebalance: | |||
2034 | goto nopage; | 2056 | goto nopage; |
2035 | 2057 | ||
2036 | /* Avoid recursion of direct reclaim */ | 2058 | /* Avoid recursion of direct reclaim */ |
2037 | if (p->flags & PF_MEMALLOC) | 2059 | if (current->flags & PF_MEMALLOC) |
2038 | goto nopage; | 2060 | goto nopage; |
2039 | 2061 | ||
2040 | /* Avoid allocations with no watermarks from looping endlessly */ | 2062 | /* Avoid allocations with no watermarks from looping endlessly */ |
2041 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2063 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
2042 | goto nopage; | 2064 | goto nopage; |
2043 | 2065 | ||
2044 | /* Try direct compaction */ | 2066 | /* |
2067 | * Try direct compaction. The first pass is asynchronous. Subsequent | ||
2068 | * attempts after direct reclaim are synchronous | ||
2069 | */ | ||
2045 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2070 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2046 | zonelist, high_zoneidx, | 2071 | zonelist, high_zoneidx, |
2047 | nodemask, | 2072 | nodemask, |
2048 | alloc_flags, preferred_zone, | 2073 | alloc_flags, preferred_zone, |
2049 | migratetype, &did_some_progress); | 2074 | migratetype, &did_some_progress, |
2075 | sync_migration); | ||
2050 | if (page) | 2076 | if (page) |
2051 | goto got_pg; | 2077 | goto got_pg; |
2078 | sync_migration = true; | ||
2052 | 2079 | ||
2053 | /* Try direct reclaim and then allocating */ | 2080 | /* Try direct reclaim and then allocating */ |
2054 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2081 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
@@ -2102,13 +2129,27 @@ rebalance: | |||
2102 | /* Wait for some write requests to complete then retry */ | 2129 | /* Wait for some write requests to complete then retry */ |
2103 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2130 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2104 | goto rebalance; | 2131 | goto rebalance; |
2132 | } else { | ||
2133 | /* | ||
2134 | * High-order allocations do not necessarily loop after | ||
2135 | * direct reclaim and reclaim/compaction depends on compaction | ||
2136 | * being called after reclaim so call directly if necessary | ||
2137 | */ | ||
2138 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
2139 | zonelist, high_zoneidx, | ||
2140 | nodemask, | ||
2141 | alloc_flags, preferred_zone, | ||
2142 | migratetype, &did_some_progress, | ||
2143 | sync_migration); | ||
2144 | if (page) | ||
2145 | goto got_pg; | ||
2105 | } | 2146 | } |
2106 | 2147 | ||
2107 | nopage: | 2148 | nopage: |
2108 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 2149 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
2109 | printk(KERN_WARNING "%s: page allocation failure." | 2150 | printk(KERN_WARNING "%s: page allocation failure." |
2110 | " order:%d, mode:0x%x\n", | 2151 | " order:%d, mode:0x%x\n", |
2111 | p->comm, order, gfp_mask); | 2152 | current->comm, order, gfp_mask); |
2112 | dump_stack(); | 2153 | dump_stack(); |
2113 | show_mem(); | 2154 | show_mem(); |
2114 | } | 2155 | } |
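The slowpath now attempts direct compaction twice: an asynchronous pass before direct reclaim, and a synchronous pass afterwards for high-order requests that would otherwise fall through to "nopage", with PF_MEMALLOC set around try_to_compact_pages() to keep compaction's own allocations from recursing into reclaim. For experiments, compaction can also be triggered by hand through procfs; a sketch that assumes CONFIG_COMPACTION and root privileges:

  #include <stdio.h>

  int main(void)
  {
      /* Writing any value compacts all zones on all nodes. */
      FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

      if (!f) {
          perror("compact_memory (needs CONFIG_COMPACTION and root)");
          return 1;
      }
      fputs("1\n", f);
      fclose(f);

      /* Comparing /proc/buddyinfo before and after shows how free-block orders changed. */
      return 0;
  }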
@@ -2442,7 +2483,7 @@ void show_free_areas(void) | |||
2442 | " all_unreclaimable? %s" | 2483 | " all_unreclaimable? %s" |
2443 | "\n", | 2484 | "\n", |
2444 | zone->name, | 2485 | zone->name, |
2445 | K(zone_nr_free_pages(zone)), | 2486 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2446 | K(min_wmark_pages(zone)), | 2487 | K(min_wmark_pages(zone)), |
2447 | K(low_wmark_pages(zone)), | 2488 | K(low_wmark_pages(zone)), |
2448 | K(high_wmark_pages(zone)), | 2489 | K(high_wmark_pages(zone)), |
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s) | |||
2585 | 2626 | ||
2586 | static __init int setup_numa_zonelist_order(char *s) | 2627 | static __init int setup_numa_zonelist_order(char *s) |
2587 | { | 2628 | { |
2588 | if (s) | 2629 | int ret; |
2589 | return __parse_numa_zonelist_order(s); | 2630 | |
2590 | return 0; | 2631 | if (!s) |
2632 | return 0; | ||
2633 | |||
2634 | ret = __parse_numa_zonelist_order(s); | ||
2635 | if (ret == 0) | ||
2636 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | ||
2637 | |||
2638 | return ret; | ||
2591 | } | 2639 | } |
2592 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 2640 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
2593 | 2641 | ||
@@ -4014,7 +4062,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4014 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 4062 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
4015 | } | 4063 | } |
4016 | #else | 4064 | #else |
4017 | static void inline setup_usemap(struct pglist_data *pgdat, | 4065 | static inline void setup_usemap(struct pglist_data *pgdat, |
4018 | struct zone *zone, unsigned long zonesize) {} | 4066 | struct zone *zone, unsigned long zonesize) {} |
4019 | #endif /* CONFIG_SPARSEMEM */ | 4067 | #endif /* CONFIG_SPARSEMEM */ |
4020 | 4068 | ||
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = { | |||
5517 | {1UL << PG_swapcache, "swapcache" }, | 5565 | {1UL << PG_swapcache, "swapcache" }, |
5518 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | 5566 | {1UL << PG_mappedtodisk, "mappedtodisk" }, |
5519 | {1UL << PG_reclaim, "reclaim" }, | 5567 | {1UL << PG_reclaim, "reclaim" }, |
5520 | {1UL << PG_buddy, "buddy" }, | ||
5521 | {1UL << PG_swapbacked, "swapbacked" }, | 5568 | {1UL << PG_swapbacked, "swapbacked" }, |
5522 | {1UL << PG_unevictable, "unevictable" }, | 5569 | {1UL << PG_unevictable, "unevictable" }, |
5523 | #ifdef CONFIG_MMU | 5570 | #ifdef CONFIG_MMU |
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page) | |||
5565 | { | 5612 | { |
5566 | printk(KERN_ALERT | 5613 | printk(KERN_ALERT |
5567 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 5614 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
5568 | page, page_count(page), page_mapcount(page), | 5615 | page, atomic_read(&page->_count), page_mapcount(page), |
5569 | page->mapping, page->index); | 5616 | page->mapping, page->index); |
5570 | dump_page_flags(page->flags); | 5617 | dump_page_flags(page->flags); |
5571 | } | 5618 | } |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b8b2b0..7cfa6ae02303 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
34 | pmd = pmd_offset(pud, addr); | 34 | pmd = pmd_offset(pud, addr); |
35 | do { | 35 | do { |
36 | next = pmd_addr_end(addr, end); | 36 | next = pmd_addr_end(addr, end); |
37 | split_huge_page_pmd(walk->mm, pmd); | ||
37 | if (pmd_none_or_clear_bad(pmd)) { | 38 | if (pmd_none_or_clear_bad(pmd)) { |
38 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
39 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) | |||
421 | return NULL; | 421 | return NULL; |
422 | 422 | ||
423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | 423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
424 | pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); | 424 | pcpu_nr_groups, pcpu_atom_size); |
425 | if (!vms) { | 425 | if (!vms) { |
426 | pcpu_free_chunk(chunk); | 426 | pcpu_free_chunk(chunk); |
427 | return NULL; | 427 | return NULL; |
diff --git a/mm/percpu.c b/mm/percpu.c index 02ba91230b99..3f930018aa60 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, | |||
258 | 258 | ||
259 | /* | 259 | /* |
260 | * (Un)populated page region iterators. Iterate over (un)populated | 260 | * (Un)populated page region iterators. Iterate over (un)populated |
261 | * page regions betwen @start and @end in @chunk. @rs and @re should | 261 | * page regions between @start and @end in @chunk. @rs and @re should |
262 | * be integer variables and will be set to start and end page index of | 262 | * be integer variables and will be set to start and end page index of |
263 | * the current region. | 263 | * the current region. |
264 | */ | 264 | */ |
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size) | |||
293 | 293 | ||
294 | if (size <= PAGE_SIZE) | 294 | if (size <= PAGE_SIZE) |
295 | return kzalloc(size, GFP_KERNEL); | 295 | return kzalloc(size, GFP_KERNEL); |
296 | else { | 296 | else |
297 | void *ptr = vmalloc(size); | 297 | return vzalloc(size); |
298 | if (ptr) | ||
299 | memset(ptr, 0, size); | ||
300 | return ptr; | ||
301 | } | ||
302 | } | 298 | } |
303 | 299 | ||
304 | /** | 300 | /** |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 000000000000..d030548047e2 --- /dev/null +++ b/mm/pgtable-generic.c | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * mm/pgtable-generic.c | ||
3 | * | ||
4 | * Generic pgtable methods declared in asm-generic/pgtable.h | ||
5 | * | ||
6 | * Copyright (C) 2010 Linus Torvalds | ||
7 | */ | ||
8 | |||
9 | #include <asm/tlb.h> | ||
10 | #include <asm-generic/pgtable.h> | ||
11 | |||
12 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | ||
13 | /* | ||
14 | * Only sets the access flags (dirty, accessed, and | ||
15 | * writable). Furthermore, we know it always gets set to a "more | ||
16 | * permissive" setting, which allows most architectures to optimize | ||
17 | * this. We return whether the PTE actually changed, which in turn | ||
18 | * instructs the caller to do things like update_mmu_cache(). This | ||
19 | * used to be done in the caller, but sparc needs minor faults to | ||
20 | * force that call on sun4c so we changed this macro slightly | ||
21 | */ | ||
22 | int ptep_set_access_flags(struct vm_area_struct *vma, | ||
23 | unsigned long address, pte_t *ptep, | ||
24 | pte_t entry, int dirty) | ||
25 | { | ||
26 | int changed = !pte_same(*ptep, entry); | ||
27 | if (changed) { | ||
28 | set_pte_at(vma->vm_mm, address, ptep, entry); | ||
29 | flush_tlb_page(vma, address); | ||
30 | } | ||
31 | return changed; | ||
32 | } | ||
33 | #endif | ||
34 | |||
35 | #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS | ||
36 | int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
37 | unsigned long address, pmd_t *pmdp, | ||
38 | pmd_t entry, int dirty) | ||
39 | { | ||
40 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
41 | int changed = !pmd_same(*pmdp, entry); | ||
42 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
43 | if (changed) { | ||
44 | set_pmd_at(vma->vm_mm, address, pmdp, entry); | ||
45 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
46 | } | ||
47 | return changed; | ||
48 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
49 | BUG(); | ||
50 | return 0; | ||
51 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
52 | } | ||
53 | #endif | ||
54 | |||
55 | #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH | ||
56 | int ptep_clear_flush_young(struct vm_area_struct *vma, | ||
57 | unsigned long address, pte_t *ptep) | ||
58 | { | ||
59 | int young; | ||
60 | young = ptep_test_and_clear_young(vma, address, ptep); | ||
61 | if (young) | ||
62 | flush_tlb_page(vma, address); | ||
63 | return young; | ||
64 | } | ||
65 | #endif | ||
66 | |||
67 | #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH | ||
68 | int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
69 | unsigned long address, pmd_t *pmdp) | ||
70 | { | ||
71 | int young; | ||
72 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
73 | BUG(); | ||
74 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
75 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
76 | young = pmdp_test_and_clear_young(vma, address, pmdp); | ||
77 | if (young) | ||
78 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
79 | return young; | ||
80 | } | ||
81 | #endif | ||
82 | |||
83 | #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH | ||
84 | pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
85 | pte_t *ptep) | ||
86 | { | ||
87 | pte_t pte; | ||
88 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | ||
89 | flush_tlb_page(vma, address); | ||
90 | return pte; | ||
91 | } | ||
92 | #endif | ||
93 | |||
94 | #ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH | ||
95 | pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, | ||
96 | pmd_t *pmdp) | ||
97 | { | ||
98 | pmd_t pmd; | ||
99 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
100 | BUG(); | ||
101 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
102 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
103 | pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); | ||
104 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
105 | return pmd; | ||
106 | } | ||
107 | #endif | ||
108 | |||
109 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
110 | pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
111 | pmd_t *pmdp) | ||
112 | { | ||
113 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
114 | pmd_t pmd = pmd_mksplitting(*pmdp); | ||
115 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
116 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); | ||
117 | /* tlb flush only to serialize against gup-fast */ | ||
118 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
119 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
120 | BUG(); | ||
121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
122 | } | ||
123 | #endif | ||
@@ -94,7 +94,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
94 | * anonymous pages mapped into it with that anon_vma. | 94 | * anonymous pages mapped into it with that anon_vma. |
95 | * | 95 | * |
96 | * The common case will be that we already have one, but if | 96 | * The common case will be that we already have one, but if |
97 | * if not we either need to find an adjacent mapping that we | 97 | * not we either need to find an adjacent mapping that we |
98 | * can re-use the anon_vma from (very common when the only | 98 | * can re-use the anon_vma from (very common when the only |
99 | * reason for splitting a vma has been mprotect()), or we | 99 | * reason for splitting a vma has been mprotect()), or we |
100 | * allocate a new one. | 100 | * allocate a new one. |
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
177 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 177 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
178 | 178 | ||
179 | anon_vma_lock(anon_vma); | 179 | anon_vma_lock(anon_vma); |
180 | /* | ||
181 | * It's critical to add new vmas to the tail of the anon_vma, | ||
182 | * see comment in huge_memory.c:__split_huge_page(). | ||
183 | */ | ||
180 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 184 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
181 | anon_vma_unlock(anon_vma); | 185 | anon_vma_unlock(anon_vma); |
182 | } | 186 | } |
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
360 | * Returns virtual address or -EFAULT if page's index/offset is not | 364 | * Returns virtual address or -EFAULT if page's index/offset is not |
361 | * within the range mapped the @vma. | 365 | * within the range mapped the @vma. |
362 | */ | 366 | */ |
363 | static inline unsigned long | 367 | inline unsigned long |
364 | vma_address(struct page *page, struct vm_area_struct *vma) | 368 | vma_address(struct page *page, struct vm_area_struct *vma) |
365 | { | 369 | { |
366 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 370 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
435 | pmd = pmd_offset(pud, address); | 439 | pmd = pmd_offset(pud, address); |
436 | if (!pmd_present(*pmd)) | 440 | if (!pmd_present(*pmd)) |
437 | return NULL; | 441 | return NULL; |
442 | if (pmd_trans_huge(*pmd)) | ||
443 | return NULL; | ||
438 | 444 | ||
439 | pte = pte_offset_map(pmd, address); | 445 | pte = pte_offset_map(pmd, address); |
440 | /* Make a quick check before getting the lock */ | 446 | /* Make a quick check before getting the lock */ |
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
489 | unsigned long *vm_flags) | 495 | unsigned long *vm_flags) |
490 | { | 496 | { |
491 | struct mm_struct *mm = vma->vm_mm; | 497 | struct mm_struct *mm = vma->vm_mm; |
492 | pte_t *pte; | ||
493 | spinlock_t *ptl; | ||
494 | int referenced = 0; | 498 | int referenced = 0; |
495 | 499 | ||
496 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
497 | if (!pte) | ||
498 | goto out; | ||
499 | |||
500 | /* | 500 | /* |
501 | * Don't want to elevate referenced for mlocked page that gets this far, | 501 | * Don't want to elevate referenced for mlocked page that gets this far, |
502 | * in order that it progresses to try_to_unmap and is moved to the | 502 | * in order that it progresses to try_to_unmap and is moved to the |
503 | * unevictable list. | 503 | * unevictable list. |
504 | */ | 504 | */ |
505 | if (vma->vm_flags & VM_LOCKED) { | 505 | if (vma->vm_flags & VM_LOCKED) { |
506 | *mapcount = 1; /* break early from loop */ | 506 | *mapcount = 0; /* break early from loop */ |
507 | *vm_flags |= VM_LOCKED; | 507 | *vm_flags |= VM_LOCKED; |
508 | goto out_unmap; | 508 | goto out; |
509 | } | ||
510 | |||
511 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
512 | /* | ||
513 | * Don't treat a reference through a sequentially read | ||
514 | * mapping as such. If the page has been used in | ||
515 | * another mapping, we will catch it; if this other | ||
516 | * mapping is already gone, the unmap path will have | ||
517 | * set PG_referenced or activated the page. | ||
518 | */ | ||
519 | if (likely(!VM_SequentialReadHint(vma))) | ||
520 | referenced++; | ||
521 | } | 509 | } |
522 | 510 | ||
523 | /* Pretend the page is referenced if the task has the | 511 | /* Pretend the page is referenced if the task has the |
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
526 | rwsem_is_locked(&mm->mmap_sem)) | 514 | rwsem_is_locked(&mm->mmap_sem)) |
527 | referenced++; | 515 | referenced++; |
528 | 516 | ||
529 | out_unmap: | 517 | if (unlikely(PageTransHuge(page))) { |
518 | pmd_t *pmd; | ||
519 | |||
520 | spin_lock(&mm->page_table_lock); | ||
521 | pmd = page_check_address_pmd(page, mm, address, | ||
522 | PAGE_CHECK_ADDRESS_PMD_FLAG); | ||
523 | if (pmd && !pmd_trans_splitting(*pmd) && | ||
524 | pmdp_clear_flush_young_notify(vma, address, pmd)) | ||
525 | referenced++; | ||
526 | spin_unlock(&mm->page_table_lock); | ||
527 | } else { | ||
528 | pte_t *pte; | ||
529 | spinlock_t *ptl; | ||
530 | |||
531 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
532 | if (!pte) | ||
533 | goto out; | ||
534 | |||
535 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
536 | /* | ||
537 | * Don't treat a reference through a sequentially read | ||
538 | * mapping as such. If the page has been used in | ||
539 | * another mapping, we will catch it; if this other | ||
540 | * mapping is already gone, the unmap path will have | ||
541 | * set PG_referenced or activated the page. | ||
542 | */ | ||
543 | if (likely(!VM_SequentialReadHint(vma))) | ||
544 | referenced++; | ||
545 | } | ||
546 | pte_unmap_unlock(pte, ptl); | ||
547 | } | ||
548 | |||
530 | (*mapcount)--; | 549 | (*mapcount)--; |
531 | pte_unmap_unlock(pte, ptl); | ||
532 | 550 | ||
533 | if (referenced) | 551 | if (referenced) |
534 | *vm_flags |= vma->vm_flags; | 552 | *vm_flags |= vma->vm_flags; |
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, | |||
864 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 882 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
865 | { | 883 | { |
866 | int first = atomic_inc_and_test(&page->_mapcount); | 884 | int first = atomic_inc_and_test(&page->_mapcount); |
867 | if (first) | 885 | if (first) { |
868 | __inc_zone_page_state(page, NR_ANON_PAGES); | 886 | if (!PageTransHuge(page)) |
887 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
888 | else | ||
889 | __inc_zone_page_state(page, | ||
890 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
891 | } | ||
869 | if (unlikely(PageKsm(page))) | 892 | if (unlikely(PageKsm(page))) |
870 | return; | 893 | return; |
871 | 894 | ||
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
893 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 916 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
894 | SetPageSwapBacked(page); | 917 | SetPageSwapBacked(page); |
895 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 918 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
896 | __inc_zone_page_state(page, NR_ANON_PAGES); | 919 | if (!PageTransHuge(page)) |
920 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
921 | else | ||
922 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
897 | __page_set_anon_rmap(page, vma, address, 1); | 923 | __page_set_anon_rmap(page, vma, address, 1); |
898 | if (page_evictable(page, vma)) | 924 | if (page_evictable(page, vma)) |
899 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 925 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page) | |||
911 | { | 937 | { |
912 | if (atomic_inc_and_test(&page->_mapcount)) { | 938 | if (atomic_inc_and_test(&page->_mapcount)) { |
913 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 939 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
914 | mem_cgroup_update_file_mapped(page, 1); | 940 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
915 | } | 941 | } |
916 | } | 942 | } |
917 | 943 | ||
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page) | |||
946 | return; | 972 | return; |
947 | if (PageAnon(page)) { | 973 | if (PageAnon(page)) { |
948 | mem_cgroup_uncharge_page(page); | 974 | mem_cgroup_uncharge_page(page); |
949 | __dec_zone_page_state(page, NR_ANON_PAGES); | 975 | if (!PageTransHuge(page)) |
976 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
977 | else | ||
978 | __dec_zone_page_state(page, | ||
979 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
950 | } else { | 980 | } else { |
951 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 981 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
952 | mem_cgroup_update_file_mapped(page, -1); | 982 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
953 | } | 983 | } |
954 | /* | 984 | /* |
955 | * It would be tidy to reset the PageAnon mapping here, | 985 | * It would be tidy to reset the PageAnon mapping here, |
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1202 | return ret; | 1232 | return ret; |
1203 | } | 1233 | } |
1204 | 1234 | ||
1205 | static bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1235 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
1206 | { | 1236 | { |
1207 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1237 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
1208 | 1238 | ||
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1400 | int ret; | 1430 | int ret; |
1401 | 1431 | ||
1402 | BUG_ON(!PageLocked(page)); | 1432 | BUG_ON(!PageLocked(page)); |
1433 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | ||
1403 | 1434 | ||
1404 | if (unlikely(PageKsm(page))) | 1435 | if (unlikely(PageKsm(page))) |
1405 | ret = try_to_unmap_ksm(page, flags); | 1436 | ret = try_to_unmap_ksm(page, flags); |
diff --git a/mm/shmem.c b/mm/shmem.c index 47fdeeb9d636..5ee67c990602 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) | |||
2415 | return &p->vfs_inode; | 2415 | return &p->vfs_inode; |
2416 | } | 2416 | } |
2417 | 2417 | ||
2418 | static void shmem_i_callback(struct rcu_head *head) | ||
2419 | { | ||
2420 | struct inode *inode = container_of(head, struct inode, i_rcu); | ||
2421 | INIT_LIST_HEAD(&inode->i_dentry); | ||
2422 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | ||
2423 | } | ||
2424 | |||
2418 | static void shmem_destroy_inode(struct inode *inode) | 2425 | static void shmem_destroy_inode(struct inode *inode) |
2419 | { | 2426 | { |
2420 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2427 | if ((inode->i_mode & S_IFMT) == S_IFREG) { |
2421 | /* only struct inode is valid if it's an inline symlink */ | 2428 | /* only struct inode is valid if it's an inline symlink */ |
2422 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2429 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2423 | } | 2430 | } |
2424 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2431 | call_rcu(&inode->i_rcu, shmem_i_callback); |
2425 | } | 2432 | } |
2426 | 2433 | ||
2427 | static void init_once(void *foo) | 2434 | static void init_once(void *foo) |
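
shmem inodes are now freed from an RCU callback; shmem_i_callback() recovers the inode from the rcu_head embedded in it via container_of(), so lockless lookups never see the memory disappear under them. The container_of() trick itself is plain C: subtract the member's offset from the member pointer to get back the enclosing object. A self-contained sketch with invented struct names (the RCU deferral is only hinted at in a comment):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cb_head {
	void (*func)(struct cb_head *head);
};

struct fake_inode {
	long ino;
	struct cb_head i_rcu;		/* embedded, like inode->i_rcu */
};

/* The callback only gets the embedded head and recovers the inode from it. */
static void fake_i_callback(struct cb_head *head)
{
	struct fake_inode *inode = container_of(head, struct fake_inode, i_rcu);

	printf("freeing inode %ld\n", inode->ino);
}

int main(void)
{
	struct fake_inode inode = { .ino = 42 };

	inode.i_rcu.func = fake_i_callback;
	/* A real call_rcu() would invoke this only after a grace period. */
	inode.i_rcu.func(&inode.i_rcu);
	return 0;
}
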
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu) | |||
829 | 829 | ||
830 | static void next_reap_node(void) | 830 | static void next_reap_node(void) |
831 | { | 831 | { |
832 | int node = __get_cpu_var(slab_reap_node); | 832 | int node = __this_cpu_read(slab_reap_node); |
833 | 833 | ||
834 | node = next_node(node, node_online_map); | 834 | node = next_node(node, node_online_map); |
835 | if (unlikely(node >= MAX_NUMNODES)) | 835 | if (unlikely(node >= MAX_NUMNODES)) |
836 | node = first_node(node_online_map); | 836 | node = first_node(node_online_map); |
837 | __get_cpu_var(slab_reap_node) = node; | 837 | __this_cpu_write(slab_reap_node, node); |
838 | } | 838 | } |
839 | 839 | ||
840 | #else | 840 | #else |
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
1012 | */ | 1012 | */ |
1013 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | 1013 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) |
1014 | { | 1014 | { |
1015 | int node = __get_cpu_var(slab_reap_node); | 1015 | int node = __this_cpu_read(slab_reap_node); |
1016 | 1016 | ||
1017 | if (l3->alien) { | 1017 | if (l3->alien) { |
1018 | struct array_cache *ac = l3->alien[node]; | 1018 | struct array_cache *ac = l3->alien[node]; |
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1293 | * anything expensive but will only modify reap_work | 1293 | * anything expensive but will only modify reap_work |
1294 | * and reschedule the timer. | 1294 | * and reschedule the timer. |
1295 | */ | 1295 | */ |
1296 | cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); | 1296 | cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); |
1297 | /* Now the cache_reaper is guaranteed to be not running. */ | 1297 | /* Now the cache_reaper is guaranteed to be not running. */ |
1298 | per_cpu(slab_reap_work, cpu).work.func = NULL; | 1298 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
1299 | break; | 1299 | break; |
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
2781 | /* | 2781 | /* |
2782 | * Map pages beginning at addr to the given cache and slab. This is required | 2782 | * Map pages beginning at addr to the given cache and slab. This is required |
2783 | * for the slab allocator to be able to lookup the cache and slab of a | 2783 | * for the slab allocator to be able to lookup the cache and slab of a |
2784 | * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. | 2784 | * virtual address for kfree, ksize, and slab debugging. |
2785 | */ | 2785 | */ |
2786 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | 2786 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, |
2787 | void *addr) | 2787 | void *addr) |
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3653 | EXPORT_SYMBOL(kmem_cache_alloc); | 3653 | EXPORT_SYMBOL(kmem_cache_alloc); |
3654 | 3654 | ||
3655 | #ifdef CONFIG_TRACING | 3655 | #ifdef CONFIG_TRACING |
3656 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3656 | void * |
3657 | kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) | ||
3657 | { | 3658 | { |
3658 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3659 | void *ret; |
3659 | } | ||
3660 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | ||
3661 | #endif | ||
3662 | 3660 | ||
3663 | /** | 3661 | ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
3664 | * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. | ||
3665 | * @cachep: the cache we're checking against | ||
3666 | * @ptr: pointer to validate | ||
3667 | * | ||
3668 | * This verifies that the untrusted pointer looks sane; | ||
3669 | * it is _not_ a guarantee that the pointer is actually | ||
3670 | * part of the slab cache in question, but it at least | ||
3671 | * validates that the pointer can be dereferenced and | ||
3672 | * looks half-way sane. | ||
3673 | * | ||
3674 | * Currently only used for dentry validation. | ||
3675 | */ | ||
3676 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) | ||
3677 | { | ||
3678 | unsigned long size = cachep->buffer_size; | ||
3679 | struct page *page; | ||
3680 | 3662 | ||
3681 | if (unlikely(!kern_ptr_validate(ptr, size))) | 3663 | trace_kmalloc(_RET_IP_, ret, |
3682 | goto out; | 3664 | size, slab_buffer_size(cachep), flags); |
3683 | page = virt_to_page(ptr); | 3665 | return ret; |
3684 | if (unlikely(!PageSlab(page))) | ||
3685 | goto out; | ||
3686 | if (unlikely(page_get_cache(page) != cachep)) | ||
3687 | goto out; | ||
3688 | return 1; | ||
3689 | out: | ||
3690 | return 0; | ||
3691 | } | 3666 | } |
3667 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | ||
3668 | #endif | ||
3692 | 3669 | ||
3693 | #ifdef CONFIG_NUMA | 3670 | #ifdef CONFIG_NUMA |
3694 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3671 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3705 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3682 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3706 | 3683 | ||
3707 | #ifdef CONFIG_TRACING | 3684 | #ifdef CONFIG_TRACING |
3708 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3685 | void *kmem_cache_alloc_node_trace(size_t size, |
3709 | gfp_t flags, | 3686 | struct kmem_cache *cachep, |
3710 | int nodeid) | 3687 | gfp_t flags, |
3688 | int nodeid) | ||
3711 | { | 3689 | { |
3712 | return __cache_alloc_node(cachep, flags, nodeid, | 3690 | void *ret; |
3691 | |||
3692 | ret = __cache_alloc_node(cachep, flags, nodeid, | ||
3713 | __builtin_return_address(0)); | 3693 | __builtin_return_address(0)); |
3694 | trace_kmalloc_node(_RET_IP_, ret, | ||
3695 | size, slab_buffer_size(cachep), | ||
3696 | flags, nodeid); | ||
3697 | return ret; | ||
3714 | } | 3698 | } |
3715 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | 3699 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
3716 | #endif | 3700 | #endif |
3717 | 3701 | ||
3718 | static __always_inline void * | 3702 | static __always_inline void * |
3719 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | 3703 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) |
3720 | { | 3704 | { |
3721 | struct kmem_cache *cachep; | 3705 | struct kmem_cache *cachep; |
3722 | void *ret; | ||
3723 | 3706 | ||
3724 | cachep = kmem_find_general_cachep(size, flags); | 3707 | cachep = kmem_find_general_cachep(size, flags); |
3725 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3708 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3726 | return cachep; | 3709 | return cachep; |
3727 | ret = kmem_cache_alloc_node_notrace(cachep, flags, node); | 3710 | return kmem_cache_alloc_node_trace(size, cachep, flags, node); |
3728 | |||
3729 | trace_kmalloc_node((unsigned long) caller, ret, | ||
3730 | size, cachep->buffer_size, flags, node); | ||
3731 | |||
3732 | return ret; | ||
3733 | } | 3711 | } |
3734 | 3712 | ||
3735 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3713 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
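
The *_notrace allocation helpers become *_trace variants that also receive the originally requested size, so the tracepoint fires inside the helper with both the requested and the rounded-up slab size instead of being open-coded at every call site. A userspace sketch of that wrapper shape around malloc(); the size-class rounding and the logging are illustrative only, not the slab tracepoint:

#include <stdio.h>
#include <stdlib.h>

/* Round up to the next power of two, imitating a size-class allocator. */
static size_t size_class(size_t size)
{
	size_t c = 16;

	while (c < size)
		c <<= 1;
	return c;
}

/* "Traced" allocation: one place records requested vs. allocated bytes. */
static void *alloc_trace(size_t size, const void *caller)
{
	size_t bytes = size_class(size);
	void *ret = malloc(bytes);

	fprintf(stderr, "alloc %p: requested=%zu allocated=%zu caller=%p\n",
		ret, size, bytes, caller);
	return ret;
}

int main(void)
{
	void *p = alloc_trace(100, __builtin_return_address(0));

	free(p);
	return 0;
}
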
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d) | |||
678 | } | 678 | } |
679 | EXPORT_SYMBOL(kmem_cache_shrink); | 679 | EXPORT_SYMBOL(kmem_cache_shrink); |
680 | 680 | ||
681 | int kmem_ptr_validate(struct kmem_cache *a, const void *b) | ||
682 | { | ||
683 | return 0; | ||
684 | } | ||
685 | |||
686 | static unsigned int slob_ready __read_mostly; | 681 | static unsigned int slob_ready __read_mostly; |
687 | 682 | ||
688 | int slab_is_available(void) | 683 | int slab_is_available(void) |
@@ -28,6 +28,8 @@ | |||
28 | #include <linux/math64.h> | 28 | #include <linux/math64.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | 30 | ||
31 | #include <trace/events/kmem.h> | ||
32 | |||
31 | /* | 33 | /* |
32 | * Lock order: | 34 | * Lock order: |
33 | * 1. slab_lock(page) | 35 | * 1. slab_lock(page) |
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
1774 | EXPORT_SYMBOL(kmem_cache_alloc); | 1776 | EXPORT_SYMBOL(kmem_cache_alloc); |
1775 | 1777 | ||
1776 | #ifdef CONFIG_TRACING | 1778 | #ifdef CONFIG_TRACING |
1777 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1779 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) |
1780 | { | ||
1781 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | ||
1782 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | ||
1783 | return ret; | ||
1784 | } | ||
1785 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | ||
1786 | |||
1787 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | ||
1778 | { | 1788 | { |
1779 | return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 1789 | void *ret = kmalloc_order(size, flags, order); |
1790 | trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); | ||
1791 | return ret; | ||
1780 | } | 1792 | } |
1781 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | 1793 | EXPORT_SYMBOL(kmalloc_order_trace); |
1782 | #endif | 1794 | #endif |
1783 | 1795 | ||
1784 | #ifdef CONFIG_NUMA | 1796 | #ifdef CONFIG_NUMA |
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
1794 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1806 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
1795 | 1807 | ||
1796 | #ifdef CONFIG_TRACING | 1808 | #ifdef CONFIG_TRACING |
1797 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1809 | void *kmem_cache_alloc_node_trace(struct kmem_cache *s, |
1798 | gfp_t gfpflags, | 1810 | gfp_t gfpflags, |
1799 | int node) | 1811 | int node, size_t size) |
1800 | { | 1812 | { |
1801 | return slab_alloc(s, gfpflags, node, _RET_IP_); | 1813 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
1814 | |||
1815 | trace_kmalloc_node(_RET_IP_, ret, | ||
1816 | size, s->size, gfpflags, node); | ||
1817 | return ret; | ||
1802 | } | 1818 | } |
1803 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | 1819 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
1804 | #endif | 1820 | #endif |
1805 | #endif | 1821 | #endif |
1806 | 1822 | ||
@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
1917 | } | 1933 | } |
1918 | EXPORT_SYMBOL(kmem_cache_free); | 1934 | EXPORT_SYMBOL(kmem_cache_free); |
1919 | 1935 | ||
1920 | /* Figure out on which slab page the object resides */ | ||
1921 | static struct page *get_object_page(const void *x) | ||
1922 | { | ||
1923 | struct page *page = virt_to_head_page(x); | ||
1924 | |||
1925 | if (!PageSlab(page)) | ||
1926 | return NULL; | ||
1927 | |||
1928 | return page; | ||
1929 | } | ||
1930 | |||
1931 | /* | 1936 | /* |
1932 | * Object placement in a slab is made very easy because we always start at | 1937 | * Object placement in a slab is made very easy because we always start at |
1933 | * offset 0. If we tune the size of the object to the alignment then we can | 1938 | * offset 0. If we tune the size of the object to the alignment then we can |
@@ -2386,35 +2391,6 @@ error: | |||
2386 | } | 2391 | } |
2387 | 2392 | ||
2388 | /* | 2393 | /* |
2389 | * Check if a given pointer is valid | ||
2390 | */ | ||
2391 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | ||
2392 | { | ||
2393 | struct page *page; | ||
2394 | |||
2395 | if (!kern_ptr_validate(object, s->size)) | ||
2396 | return 0; | ||
2397 | |||
2398 | page = get_object_page(object); | ||
2399 | |||
2400 | if (!page || s != page->slab) | ||
2401 | /* No slab or wrong slab */ | ||
2402 | return 0; | ||
2403 | |||
2404 | if (!check_valid_pointer(s, page, object)) | ||
2405 | return 0; | ||
2406 | |||
2407 | /* | ||
2408 | * We could also check if the object is on the slabs freelist. | ||
2409 | * But this would be too expensive and it seems that the main | ||
2410 | * purpose of kmem_ptr_valid() is to check if the object belongs | ||
2411 | * to a certain slab. | ||
2412 | */ | ||
2413 | return 1; | ||
2414 | } | ||
2415 | EXPORT_SYMBOL(kmem_ptr_validate); | ||
2416 | |||
2417 | /* | ||
2418 | * Determine the size of a slab object | 2394 | * Determine the size of a slab object |
2419 | */ | 2395 | */ |
2420 | unsigned int kmem_cache_size(struct kmem_cache *s) | 2396 | unsigned int kmem_cache_size(struct kmem_cache *s) |
@@ -3660,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3660 | len += sprintf(buf + len, "%7ld ", l->count); | 3636 | len += sprintf(buf + len, "%7ld ", l->count); |
3661 | 3637 | ||
3662 | if (l->addr) | 3638 | if (l->addr) |
3663 | len += sprint_symbol(buf + len, (unsigned long)l->addr); | 3639 | len += sprintf(buf + len, "%pS", (void *)l->addr); |
3664 | else | 3640 | else |
3665 | len += sprintf(buf + len, "<not-available>"); | 3641 | len += sprintf(buf + len, "<not-available>"); |
3666 | 3642 | ||
@@ -3970,12 +3946,9 @@ SLAB_ATTR(min_partial); | |||
3970 | 3946 | ||
3971 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) | 3947 | static ssize_t ctor_show(struct kmem_cache *s, char *buf) |
3972 | { | 3948 | { |
3973 | if (s->ctor) { | 3949 | if (!s->ctor) |
3974 | int n = sprint_symbol(buf, (unsigned long)s->ctor); | 3950 | return 0; |
3975 | 3951 | return sprintf(buf, "%pS\n", s->ctor); | |
3976 | return n + sprintf(buf + n, "\n"); | ||
3977 | } | ||
3978 | return 0; | ||
3979 | } | 3952 | } |
3980 | SLAB_ATTR_RO(ctor); | 3953 | SLAB_ATTR_RO(ctor); |
3981 | 3954 | ||
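
sprint_symbol() into a temporary buffer is replaced by the %pS format specifier, which resolves the symbol in one step. Userspace printf has no %pS, but dladdr() from <dlfcn.h> offers a comparable address-to-symbol lookup; a small sketch (build with -rdynamic and link with -ldl so the symbol is visible to the dynamic linker):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

void sample_ctor(void)
{
}

/* Print a code address as "name+offset", roughly what the kernel's %pS does. */
static void print_symbol(void *addr)
{
	Dl_info info;

	if (dladdr(addr, &info) && info.dli_sname)
		printf("%s+0x%lx\n", info.dli_sname,
		       (unsigned long)((char *)addr - (char *)info.dli_saddr));
	else
		printf("%p\n", addr);
}

int main(void)
{
	print_symbol((void *)sample_ctor);
	return 0;
}
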
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 29d6cbffb283..64b984091edb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * However, virtual mappings need a page table and TLBs. Many Linux | 10 | * However, virtual mappings need a page table and TLBs. Many Linux |
11 | * architectures already map their physical space using 1-1 mappings | 11 | * architectures already map their physical space using 1-1 mappings |
12 | * via TLBs. For those arches the virtual memmory map is essentially | 12 | * via TLBs. For those arches the virtual memory map is essentially |
13 | * for free if we use the same page size as the 1-1 mappings. In that | 13 | * for free if we use the same page size as the 1-1 mappings. In that |
14 | * case the overhead consists of a few additional pages that are | 14 | * case the overhead consists of a few additional pages that are |
15 | * allocated to create a view of memory for vmemmap. | 15 | * allocated to create a view of memory for vmemmap. |
diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..93250207c5cf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
671 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | 671 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) |
672 | { | 672 | { |
673 | unsigned long maps_section_nr, removing_section_nr, i; | 673 | unsigned long maps_section_nr, removing_section_nr, i; |
674 | int magic; | 674 | unsigned long magic; |
675 | 675 | ||
676 | for (i = 0; i < nr_pages; i++, page++) { | 676 | for (i = 0; i < nr_pages; i++, page++) { |
677 | magic = atomic_read(&page->_mapcount); | 677 | magic = (unsigned long) page->lru.next; |
678 | 678 | ||
679 | BUG_ON(magic == NODE_INFO); | 679 | BUG_ON(magic == NODE_INFO); |
680 | 680 | ||
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page) | |||
56 | del_page_from_lru(zone, page); | 56 | del_page_from_lru(zone, page); |
57 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
58 | } | 58 | } |
59 | } | ||
60 | |||
61 | static void __put_single_page(struct page *page) | ||
62 | { | ||
63 | __page_cache_release(page); | ||
59 | free_hot_cold_page(page, 0); | 64 | free_hot_cold_page(page, 0); |
60 | } | 65 | } |
61 | 66 | ||
62 | static void put_compound_page(struct page *page) | 67 | static void __put_compound_page(struct page *page) |
63 | { | 68 | { |
64 | page = compound_head(page); | 69 | compound_page_dtor *dtor; |
65 | if (put_page_testzero(page)) { | ||
66 | compound_page_dtor *dtor; | ||
67 | 70 | ||
68 | dtor = get_compound_page_dtor(page); | 71 | __page_cache_release(page); |
69 | (*dtor)(page); | 72 | dtor = get_compound_page_dtor(page); |
73 | (*dtor)(page); | ||
74 | } | ||
75 | |||
76 | static void put_compound_page(struct page *page) | ||
77 | { | ||
78 | if (unlikely(PageTail(page))) { | ||
79 | /* __split_huge_page_refcount can run under us */ | ||
80 | struct page *page_head = page->first_page; | ||
81 | smp_rmb(); | ||
82 | /* | ||
83 | * If PageTail is still set after smp_rmb() we can be sure | ||
84 | * that the page->first_page we read wasn't a dangling pointer. | ||
85 | * See __split_huge_page_refcount() smp_wmb(). | ||
86 | */ | ||
87 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
88 | unsigned long flags; | ||
89 | /* | ||
90 | * Verify that our page_head wasn't converted | ||
91 | * to a regular page before we got a | ||
92 | * reference on it. | ||
93 | */ | ||
94 | if (unlikely(!PageHead(page_head))) { | ||
95 | /* PageHead is cleared after PageTail */ | ||
96 | smp_rmb(); | ||
97 | VM_BUG_ON(PageTail(page)); | ||
98 | goto out_put_head; | ||
99 | } | ||
100 | /* | ||
101 | * Only run compound_lock on a valid PageHead, | ||
102 | * after having it pinned with | ||
103 | * get_page_unless_zero() above. | ||
104 | */ | ||
105 | smp_mb(); | ||
106 | /* page_head wasn't a dangling pointer */ | ||
107 | flags = compound_lock_irqsave(page_head); | ||
108 | if (unlikely(!PageTail(page))) { | ||
109 | /* __split_huge_page_refcount ran before us */ | ||
110 | compound_unlock_irqrestore(page_head, flags); | ||
111 | VM_BUG_ON(PageHead(page_head)); | ||
112 | out_put_head: | ||
113 | if (put_page_testzero(page_head)) | ||
114 | __put_single_page(page_head); | ||
115 | out_put_single: | ||
116 | if (put_page_testzero(page)) | ||
117 | __put_single_page(page); | ||
118 | return; | ||
119 | } | ||
120 | VM_BUG_ON(page_head != page->first_page); | ||
121 | /* | ||
122 | * We can release the refcount taken by | ||
123 | * get_page_unless_zero now that | ||
124 | * split_huge_page_refcount is blocked on the | ||
125 | * compound_lock. | ||
126 | */ | ||
127 | if (put_page_testzero(page_head)) | ||
128 | VM_BUG_ON(1); | ||
129 | /* __split_huge_page_refcount will wait now */ | ||
130 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
131 | atomic_dec(&page->_count); | ||
132 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
133 | compound_unlock_irqrestore(page_head, flags); | ||
134 | if (put_page_testzero(page_head)) { | ||
135 | if (PageHead(page_head)) | ||
136 | __put_compound_page(page_head); | ||
137 | else | ||
138 | __put_single_page(page_head); | ||
139 | } | ||
140 | } else { | ||
141 | /* page_head is a dangling pointer */ | ||
142 | VM_BUG_ON(PageTail(page)); | ||
143 | goto out_put_single; | ||
144 | } | ||
145 | } else if (put_page_testzero(page)) { | ||
146 | if (PageHead(page)) | ||
147 | __put_compound_page(page); | ||
148 | else | ||
149 | __put_single_page(page); | ||
70 | } | 150 | } |
71 | } | 151 | } |
72 | 152 | ||
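
The new tail-page path in put_compound_page() pins page_head speculatively with get_page_unless_zero() and then revalidates PageTail/PageHead under compound_lock, backing out if __split_huge_page_refcount() won the race. The underlying "take a reference only while the count is non-zero, then recheck" pattern can be sketched with C11 atomics; the object and its flag below are invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_obj {
	atomic_int refcount;
	atomic_bool is_head;	/* stand-in for the PageHead/PageTail check */
};

/* Take a reference only if the count has not already dropped to zero. */
static bool get_unless_zero(struct fake_obj *o)
{
	int c = atomic_load(&o->refcount);

	while (c > 0)
		if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
			return true;	/* pinned */
	return false;			/* object is already on its way out */
}

static void put_ref(struct fake_obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		printf("last reference dropped\n");
}

int main(void)
{
	struct fake_obj o = { ATOMIC_VAR_INIT(1), ATOMIC_VAR_INIT(true) };

	if (get_unless_zero(&o)) {
		/* Revalidate after pinning; a racing writer may have changed it. */
		if (!atomic_load(&o.is_head))
			printf("lost the race, backing out\n");
		put_ref(&o);
	}
	put_ref(&o);
	return 0;
}
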
@@ -75,7 +155,7 @@ void put_page(struct page *page) | |||
75 | if (unlikely(PageCompound(page))) | 155 | if (unlikely(PageCompound(page))) |
76 | put_compound_page(page); | 156 | put_compound_page(page); |
77 | else if (put_page_testzero(page)) | 157 | else if (put_page_testzero(page)) |
78 | __page_cache_release(page); | 158 | __put_single_page(page); |
79 | } | 159 | } |
80 | EXPORT_SYMBOL(put_page); | 160 | EXPORT_SYMBOL(put_page); |
81 | 161 | ||
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages) | |||
98 | } | 178 | } |
99 | EXPORT_SYMBOL(put_pages_list); | 179 | EXPORT_SYMBOL(put_pages_list); |
100 | 180 | ||
101 | /* | 181 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
102 | * pagevec_move_tail() must be called with IRQ disabled. | 182 | void (*move_fn)(struct page *page, void *arg), |
103 | * Otherwise this may cause nasty races. | 183 | void *arg) |
104 | */ | ||
105 | static void pagevec_move_tail(struct pagevec *pvec) | ||
106 | { | 184 | { |
107 | int i; | 185 | int i; |
108 | int pgmoved = 0; | ||
109 | struct zone *zone = NULL; | 186 | struct zone *zone = NULL; |
187 | unsigned long flags = 0; | ||
110 | 188 | ||
111 | for (i = 0; i < pagevec_count(pvec); i++) { | 189 | for (i = 0; i < pagevec_count(pvec); i++) { |
112 | struct page *page = pvec->pages[i]; | 190 | struct page *page = pvec->pages[i]; |
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
114 | 192 | ||
115 | if (pagezone != zone) { | 193 | if (pagezone != zone) { |
116 | if (zone) | 194 | if (zone) |
117 | spin_unlock(&zone->lru_lock); | 195 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
118 | zone = pagezone; | 196 | zone = pagezone; |
119 | spin_lock(&zone->lru_lock); | 197 | spin_lock_irqsave(&zone->lru_lock, flags); |
120 | } | ||
121 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
122 | int lru = page_lru_base_type(page); | ||
123 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
124 | pgmoved++; | ||
125 | } | 198 | } |
199 | |||
200 | (*move_fn)(page, arg); | ||
126 | } | 201 | } |
127 | if (zone) | 202 | if (zone) |
128 | spin_unlock(&zone->lru_lock); | 203 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
129 | __count_vm_events(PGROTATED, pgmoved); | 204 | release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); |
130 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
131 | pagevec_reinit(pvec); | 205 | pagevec_reinit(pvec); |
132 | } | 206 | } |
133 | 207 | ||
208 | static void pagevec_move_tail_fn(struct page *page, void *arg) | ||
209 | { | ||
210 | int *pgmoved = arg; | ||
211 | struct zone *zone = page_zone(page); | ||
212 | |||
213 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
214 | int lru = page_lru_base_type(page); | ||
215 | list_move_tail(&page->lru, &zone->lru[lru].list); | ||
216 | (*pgmoved)++; | ||
217 | } | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * pagevec_move_tail() must be called with IRQ disabled. | ||
222 | * Otherwise this may cause nasty races. | ||
223 | */ | ||
224 | static void pagevec_move_tail(struct pagevec *pvec) | ||
225 | { | ||
226 | int pgmoved = 0; | ||
227 | |||
228 | pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); | ||
229 | __count_vm_events(PGROTATED, pgmoved); | ||
230 | } | ||
231 | |||
134 | /* | 232 | /* |
135 | * Writeback is about to end against a page which has been marked for immediate | 233 | * Writeback is about to end against a page which has been marked for immediate |
136 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 234 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
137 | * inactive list. | 235 | * inactive list. |
138 | */ | 236 | */ |
139 | void rotate_reclaimable_page(struct page *page) | 237 | void rotate_reclaimable_page(struct page *page) |
140 | { | 238 | { |
141 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && | 239 | if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && |
142 | !PageUnevictable(page) && PageLRU(page)) { | 240 | !PageUnevictable(page) && PageLRU(page)) { |
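
pagevec_lru_move_fn() factors the batching boilerplate out of pagevec_move_tail() and friends: walk the vector, take a zone's lru_lock only when the zone changes, and delegate the per-page work to a callback. A compressed userspace analogue with pthread mutexes, where the zones, pages, and callback are all invented (build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct fake_zone {
	pthread_mutex_t lock;
	int moved;
};

struct fake_page {
	struct fake_zone *zone;
};

/* Apply move_fn to each page, re-taking the lock only when the zone changes. */
static void batch_move(struct fake_page *pages, int n,
		       void (*move_fn)(struct fake_page *))
{
	struct fake_zone *locked = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct fake_zone *zone = pages[i].zone;

		if (zone != locked) {
			if (locked)
				pthread_mutex_unlock(&locked->lock);
			locked = zone;
			pthread_mutex_lock(&locked->lock);
		}
		move_fn(&pages[i]);	/* per-page work, done under the lock */
	}
	if (locked)
		pthread_mutex_unlock(&locked->lock);
}

static void move_to_tail(struct fake_page *page)
{
	page->zone->moved++;	/* a real callback would splice the LRU list */
}

int main(void)
{
	struct fake_zone zones[2] = {
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
	};
	struct fake_page batch[] = {
		{ &zones[0] }, { &zones[0] }, { &zones[1] },
		{ &zones[1] }, { &zones[0] },
	};

	batch_move(batch, 5, move_to_tail);
	printf("zone0=%d zone1=%d\n", zones[0].moved, zones[1].moved);
	return 0;
}
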
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, | |||
173 | } | 271 | } |
174 | 272 | ||
175 | /* | 273 | /* |
176 | * FIXME: speed this up? | 274 | * A page will go to the active list either via activate_page or putback_lru_page. |
275 | * In the activate_page case, the page does not have the active bit set yet. The | ||
276 | * page might not be on the LRU list because it was isolated before it got a chance | ||
277 | * to be moved to the active list. The window is small because a pagevec only holds | ||
278 | * a few pages; in that case we do nothing for the page. | ||
279 | * In the putback_lru_page case, the page is not on the LRU list but does have the | ||
280 | * active bit set. | ||
177 | */ | 281 | */ |
178 | void activate_page(struct page *page) | 282 | static void __activate_page(struct page *page, void *arg) |
179 | { | 283 | { |
180 | struct zone *zone = page_zone(page); | 284 | struct zone *zone = page_zone(page); |
285 | int file = page_is_file_cache(page); | ||
286 | int lru = page_lru_base_type(page); | ||
287 | bool putback = !PageLRU(page); | ||
181 | 288 | ||
182 | spin_lock_irq(&zone->lru_lock); | 289 | /* The page is isolated before it's moved to active list */ |
183 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 290 | if (!PageLRU(page) && !PageActive(page)) |
184 | int file = page_is_file_cache(page); | 291 | return; |
185 | int lru = page_lru_base_type(page); | 292 | if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) |
293 | return; | ||
294 | |||
295 | if (!putback) | ||
186 | del_page_from_lru_list(zone, page, lru); | 296 | del_page_from_lru_list(zone, page, lru); |
297 | else | ||
298 | SetPageLRU(page); | ||
187 | 299 | ||
188 | SetPageActive(page); | 300 | SetPageActive(page); |
189 | lru += LRU_ACTIVE; | 301 | lru += LRU_ACTIVE; |
190 | add_page_to_lru_list(zone, page, lru); | 302 | add_page_to_lru_list(zone, page, lru); |
191 | __count_vm_event(PGACTIVATE); | ||
192 | 303 | ||
193 | update_page_reclaim_stat(zone, page, file, 1); | 304 | if (putback) |
305 | return; | ||
306 | __count_vm_event(PGACTIVATE); | ||
307 | update_page_reclaim_stat(zone, page, file, 1); | ||
308 | } | ||
309 | |||
310 | #ifdef CONFIG_SMP | ||
311 | static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); | ||
312 | |||
313 | static void activate_page_drain(int cpu) | ||
314 | { | ||
315 | struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); | ||
316 | |||
317 | if (pagevec_count(pvec)) | ||
318 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
319 | } | ||
320 | |||
321 | void activate_page(struct page *page) | ||
322 | { | ||
323 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
324 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); | ||
325 | |||
326 | page_cache_get(page); | ||
327 | if (!pagevec_add(pvec, page)) | ||
328 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
329 | put_cpu_var(activate_page_pvecs); | ||
330 | } | ||
331 | } | ||
332 | |||
333 | /* Caller should hold zone->lru_lock */ | ||
334 | int putback_active_lru_page(struct zone *zone, struct page *page) | ||
335 | { | ||
336 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); | ||
337 | |||
338 | if (!pagevec_add(pvec, page)) { | ||
339 | spin_unlock_irq(&zone->lru_lock); | ||
340 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
341 | spin_lock_irq(&zone->lru_lock); | ||
194 | } | 342 | } |
343 | put_cpu_var(activate_page_pvecs); | ||
344 | return 1; | ||
345 | } | ||
346 | |||
347 | #else | ||
348 | static inline void activate_page_drain(int cpu) | ||
349 | { | ||
350 | } | ||
351 | |||
352 | void activate_page(struct page *page) | ||
353 | { | ||
354 | struct zone *zone = page_zone(page); | ||
355 | |||
356 | spin_lock_irq(&zone->lru_lock); | ||
357 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) | ||
358 | __activate_page(page, NULL); | ||
195 | spin_unlock_irq(&zone->lru_lock); | 359 | spin_unlock_irq(&zone->lru_lock); |
196 | } | 360 | } |
361 | #endif | ||
197 | 362 | ||
198 | /* | 363 | /* |
199 | * Mark a page as having seen activity. | 364 | * Mark a page as having seen activity. |
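
On SMP, activate_page() now queues the page into a per-CPU pagevec and only takes zone->lru_lock when that vector fills up or is drained, instead of once per page. The basic "collect into a small fixed-size batch, flush when full" shape, with the per-CPU and locking details stripped away (all names here are illustrative):

#include <stdio.h>

#define BATCH_SIZE 14			/* roughly the size of a pagevec */

struct fake_batch {
	int nr;
	int items[BATCH_SIZE];
};

/* Returns the space left after adding, mirroring pagevec_add()'s contract. */
static int batch_add(struct fake_batch *b, int item)
{
	b->items[b->nr++] = item;
	return BATCH_SIZE - b->nr;
}

static void batch_drain(struct fake_batch *b)
{
	/* A real drain would take the zone lock once and process every entry. */
	printf("draining %d items\n", b->nr);
	b->nr = 0;
}

int main(void)
{
	struct fake_batch b = { 0, { 0 } };
	int i;

	for (i = 0; i < 30; i++)
		if (!batch_add(&b, i))	/* batch is full: flush it in one go */
			batch_drain(&b);
	batch_drain(&b);		/* leftover entries */
	return 0;
}
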
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu) | |||
292 | pagevec_move_tail(pvec); | 457 | pagevec_move_tail(pvec); |
293 | local_irq_restore(flags); | 458 | local_irq_restore(flags); |
294 | } | 459 | } |
460 | activate_page_drain(cpu); | ||
295 | } | 461 | } |
296 | 462 | ||
297 | void lru_add_drain(void) | 463 | void lru_add_drain(void) |
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec) | |||
399 | 565 | ||
400 | EXPORT_SYMBOL(__pagevec_release); | 566 | EXPORT_SYMBOL(__pagevec_release); |
401 | 567 | ||
568 | /* used by __split_huge_page_refcount() */ | ||
569 | void lru_add_page_tail(struct zone* zone, | ||
570 | struct page *page, struct page *page_tail) | ||
571 | { | ||
572 | int active; | ||
573 | enum lru_list lru; | ||
574 | const int file = 0; | ||
575 | struct list_head *head; | ||
576 | |||
577 | VM_BUG_ON(!PageHead(page)); | ||
578 | VM_BUG_ON(PageCompound(page_tail)); | ||
579 | VM_BUG_ON(PageLRU(page_tail)); | ||
580 | VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); | ||
581 | |||
582 | SetPageLRU(page_tail); | ||
583 | |||
584 | if (page_evictable(page_tail, NULL)) { | ||
585 | if (PageActive(page)) { | ||
586 | SetPageActive(page_tail); | ||
587 | active = 1; | ||
588 | lru = LRU_ACTIVE_ANON; | ||
589 | } else { | ||
590 | active = 0; | ||
591 | lru = LRU_INACTIVE_ANON; | ||
592 | } | ||
593 | update_page_reclaim_stat(zone, page_tail, file, active); | ||
594 | if (likely(PageLRU(page))) | ||
595 | head = page->lru.prev; | ||
596 | else | ||
597 | head = &zone->lru[lru].list; | ||
598 | __add_page_to_lru_list(zone, page_tail, lru, head); | ||
599 | } else { | ||
600 | SetPageUnevictable(page_tail); | ||
601 | add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); | ||
602 | } | ||
603 | } | ||
604 | |||
605 | static void ____pagevec_lru_add_fn(struct page *page, void *arg) | ||
606 | { | ||
607 | enum lru_list lru = (enum lru_list)arg; | ||
608 | struct zone *zone = page_zone(page); | ||
609 | int file = is_file_lru(lru); | ||
610 | int active = is_active_lru(lru); | ||
611 | |||
612 | VM_BUG_ON(PageActive(page)); | ||
613 | VM_BUG_ON(PageUnevictable(page)); | ||
614 | VM_BUG_ON(PageLRU(page)); | ||
615 | |||
616 | SetPageLRU(page); | ||
617 | if (active) | ||
618 | SetPageActive(page); | ||
619 | update_page_reclaim_stat(zone, page, file, active); | ||
620 | add_page_to_lru_list(zone, page, lru); | ||
621 | } | ||
622 | |||
402 | /* | 623 | /* |
403 | * Add the passed pages to the LRU, then drop the caller's refcount | 624 | * Add the passed pages to the LRU, then drop the caller's refcount |
404 | * on them. Reinitialises the caller's pagevec. | 625 | * on them. Reinitialises the caller's pagevec. |
405 | */ | 626 | */ |
406 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | 627 | void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) |
407 | { | 628 | { |
408 | int i; | ||
409 | struct zone *zone = NULL; | ||
410 | |||
411 | VM_BUG_ON(is_unevictable_lru(lru)); | 629 | VM_BUG_ON(is_unevictable_lru(lru)); |
412 | 630 | ||
413 | for (i = 0; i < pagevec_count(pvec); i++) { | 631 | pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); |
414 | struct page *page = pvec->pages[i]; | ||
415 | struct zone *pagezone = page_zone(page); | ||
416 | int file; | ||
417 | int active; | ||
418 | |||
419 | if (pagezone != zone) { | ||
420 | if (zone) | ||
421 | spin_unlock_irq(&zone->lru_lock); | ||
422 | zone = pagezone; | ||
423 | spin_lock_irq(&zone->lru_lock); | ||
424 | } | ||
425 | VM_BUG_ON(PageActive(page)); | ||
426 | VM_BUG_ON(PageUnevictable(page)); | ||
427 | VM_BUG_ON(PageLRU(page)); | ||
428 | SetPageLRU(page); | ||
429 | active = is_active_lru(lru); | ||
430 | file = is_file_lru(lru); | ||
431 | if (active) | ||
432 | SetPageActive(page); | ||
433 | update_page_reclaim_stat(zone, page, file, active); | ||
434 | add_page_to_lru_list(zone, page, lru); | ||
435 | } | ||
436 | if (zone) | ||
437 | spin_unlock_irq(&zone->lru_lock); | ||
438 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
439 | pagevec_reinit(pvec); | ||
440 | } | 632 | } |
441 | 633 | ||
442 | EXPORT_SYMBOL(____pagevec_lru_add); | 634 | EXPORT_SYMBOL(____pagevec_lru_add); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167f..5c8cfabbc9bc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page) | |||
157 | if (!entry.val) | 157 | if (!entry.val) |
158 | return 0; | 158 | return 0; |
159 | 159 | ||
160 | if (unlikely(PageTransHuge(page))) | ||
161 | if (unlikely(split_huge_page(page))) { | ||
162 | swapcache_free(entry, NULL); | ||
163 | return 0; | ||
164 | } | ||
165 | |||
160 | /* | 166 | /* |
161 | * Radix-tree node allocations from PF_MEMALLOC contexts could | 167 | * Radix-tree node allocations from PF_MEMALLOC contexts could |
162 | * completely exhaust the page allocator. __GFP_NOMEMALLOC | 168 | * completely exhaust the page allocator. __GFP_NOMEMALLOC |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..07a458d72fa8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
964 | pmd = pmd_offset(pud, addr); | 964 | pmd = pmd_offset(pud, addr); |
965 | do { | 965 | do { |
966 | next = pmd_addr_end(addr, end); | 966 | next = pmd_addr_end(addr, end); |
967 | if (unlikely(pmd_trans_huge(*pmd))) | ||
968 | continue; | ||
967 | if (pmd_none_or_clear_bad(pmd)) | 969 | if (pmd_none_or_clear_bad(pmd)) |
968 | continue; | 970 | continue; |
969 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 971 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
@@ -1677,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1677 | if (S_ISBLK(inode->i_mode)) { | 1679 | if (S_ISBLK(inode->i_mode)) { |
1678 | struct block_device *bdev = I_BDEV(inode); | 1680 | struct block_device *bdev = I_BDEV(inode); |
1679 | set_blocksize(bdev, p->old_block_size); | 1681 | set_blocksize(bdev, p->old_block_size); |
1680 | bd_release(bdev); | 1682 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1681 | } else { | 1683 | } else { |
1682 | mutex_lock(&inode->i_mutex); | 1684 | mutex_lock(&inode->i_mutex); |
1683 | inode->i_flags &= ~S_SWAPFILE; | 1685 | inode->i_flags &= ~S_SWAPFILE; |
@@ -1939,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1939 | error = -EINVAL; | 1941 | error = -EINVAL; |
1940 | if (S_ISBLK(inode->i_mode)) { | 1942 | if (S_ISBLK(inode->i_mode)) { |
1941 | bdev = I_BDEV(inode); | 1943 | bdev = I_BDEV(inode); |
1942 | error = bd_claim(bdev, sys_swapon); | 1944 | error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
1945 | sys_swapon); | ||
1943 | if (error < 0) { | 1946 | if (error < 0) { |
1944 | bdev = NULL; | 1947 | bdev = NULL; |
1945 | error = -EINVAL; | 1948 | error = -EINVAL; |
@@ -2136,7 +2139,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2136 | bad_swap: | 2139 | bad_swap: |
2137 | if (bdev) { | 2140 | if (bdev) { |
2138 | set_blocksize(bdev, p->old_block_size); | 2141 | set_blocksize(bdev, p->old_block_size); |
2139 | bd_release(bdev); | 2142 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2140 | } | 2143 | } |
2141 | destroy_swap_extents(p); | 2144 | destroy_swap_extents(p); |
2142 | swap_cgroup_swapoff(type); | 2145 | swap_cgroup_swapoff(type); |
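
bd_claim()/bd_release() are replaced by blkdev_get()/blkdev_put() with FMODE_EXCL, so swapon holds an exclusive claim on the backing block device for as long as the swap area is active. The closest userspace analogue is opening a block device with O_EXCL, which Linux rejects with EBUSY while the device is mounted or otherwise exclusively held; a hedged sketch, with an example device path only:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Example path only; point this at an otherwise unused block device. */
	const char *dev = "/dev/sdb1";
	int fd = open(dev, O_RDWR | O_EXCL);

	if (fd < 0) {
		/* EBUSY means someone else (a mount, a swap area) holds it. */
		fprintf(stderr, "open %s: %s\n", dev, strerror(errno));
		return 1;
	}
	printf("exclusive claim on %s acquired\n", dev);
	close(fd);	/* drops the claim, like blkdev_put() on swapoff */
	return 0;
}
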
@@ -186,27 +186,6 @@ void kzfree(const void *p) | |||
186 | } | 186 | } |
187 | EXPORT_SYMBOL(kzfree); | 187 | EXPORT_SYMBOL(kzfree); |
188 | 188 | ||
189 | int kern_ptr_validate(const void *ptr, unsigned long size) | ||
190 | { | ||
191 | unsigned long addr = (unsigned long)ptr; | ||
192 | unsigned long min_addr = PAGE_OFFSET; | ||
193 | unsigned long align_mask = sizeof(void *) - 1; | ||
194 | |||
195 | if (unlikely(addr < min_addr)) | ||
196 | goto out; | ||
197 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
198 | goto out; | ||
199 | if (unlikely(addr & align_mask)) | ||
200 | goto out; | ||
201 | if (unlikely(!kern_addr_valid(addr))) | ||
202 | goto out; | ||
203 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
204 | goto out; | ||
205 | return 1; | ||
206 | out: | ||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | /* | 189 | /* |
211 | * strndup_user - duplicate an existing string from user space | 190 | * strndup_user - duplicate an existing string from user space |
212 | * @s: The string to duplicate | 191 | * @s: The string to duplicate |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 816f074fb4e1..f9b166732e70 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
748 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | 748 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, |
749 | VMALLOC_START, VMALLOC_END, | 749 | VMALLOC_START, VMALLOC_END, |
750 | node, gfp_mask); | 750 | node, gfp_mask); |
751 | if (unlikely(IS_ERR(va))) { | 751 | if (IS_ERR(va)) { |
752 | kfree(vb); | 752 | kfree(vb); |
753 | return ERR_CAST(va); | 753 | return ERR_CAST(va); |
754 | } | 754 | } |
@@ -1316,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1316 | -1, GFP_KERNEL, caller); | 1316 | -1, GFP_KERNEL, caller); |
1317 | } | 1317 | } |
1318 | 1318 | ||
1319 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | ||
1320 | int node, gfp_t gfp_mask) | ||
1321 | { | ||
1322 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | ||
1323 | node, gfp_mask, __builtin_return_address(0)); | ||
1324 | } | ||
1325 | |||
1326 | static struct vm_struct *find_vm_area(const void *addr) | 1319 | static struct vm_struct *find_vm_area(const void *addr) |
1327 | { | 1320 | { |
1328 | struct vmap_area *va; | 1321 | struct vmap_area *va; |
@@ -1538,25 +1531,12 @@ fail: | |||
1538 | return NULL; | 1531 | return NULL; |
1539 | } | 1532 | } |
1540 | 1533 | ||
1541 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | ||
1542 | { | ||
1543 | void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, | ||
1544 | __builtin_return_address(0)); | ||
1545 | |||
1546 | /* | ||
1547 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
1548 | * structures allocated in the __get_vm_area_node() function contain | ||
1549 | * references to the virtual address of the vmalloc'ed block. | ||
1550 | */ | ||
1551 | kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); | ||
1552 | |||
1553 | return addr; | ||
1554 | } | ||
1555 | |||
1556 | /** | 1534 | /** |
1557 | * __vmalloc_node - allocate virtually contiguous memory | 1535 | * __vmalloc_node_range - allocate virtually contiguous memory |
1558 | * @size: allocation size | 1536 | * @size: allocation size |
1559 | * @align: desired alignment | 1537 | * @align: desired alignment |
1538 | * @start: vm area range start | ||
1539 | * @end: vm area range end | ||
1560 | * @gfp_mask: flags for the page level allocator | 1540 | * @gfp_mask: flags for the page level allocator |
1561 | * @prot: protection mask for the allocated pages | 1541 | * @prot: protection mask for the allocated pages |
1562 | * @node: node to use for allocation or -1 | 1542 | * @node: node to use for allocation or -1 |
@@ -1566,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
1566 | * allocator with @gfp_mask flags. Map them into contiguous | 1546 | * allocator with @gfp_mask flags. Map them into contiguous |
1567 | * kernel virtual space, using a pagetable protection of @prot. | 1547 | * kernel virtual space, using a pagetable protection of @prot. |
1568 | */ | 1548 | */ |
1569 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1549 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1570 | gfp_t gfp_mask, pgprot_t prot, | 1550 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
1571 | int node, void *caller) | 1551 | pgprot_t prot, int node, void *caller) |
1572 | { | 1552 | { |
1573 | struct vm_struct *area; | 1553 | struct vm_struct *area; |
1574 | void *addr; | 1554 | void *addr; |
@@ -1578,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1578 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1558 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1579 | return NULL; | 1559 | return NULL; |
1580 | 1560 | ||
1581 | area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, | 1561 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, |
1582 | VMALLOC_END, node, gfp_mask, caller); | 1562 | gfp_mask, caller); |
1583 | 1563 | ||
1584 | if (!area) | 1564 | if (!area) |
1585 | return NULL; | 1565 | return NULL; |
@@ -1596,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1596 | return addr; | 1576 | return addr; |
1597 | } | 1577 | } |
1598 | 1578 | ||
1579 | /** | ||
1580 | * __vmalloc_node - allocate virtually contiguous memory | ||
1581 | * @size: allocation size | ||
1582 | * @align: desired alignment | ||
1583 | * @gfp_mask: flags for the page level allocator | ||
1584 | * @prot: protection mask for the allocated pages | ||
1585 | * @node: node to use for allocation or -1 | ||
1586 | * @caller: caller's return address | ||
1587 | * | ||
1588 | * Allocate enough pages to cover @size from the page level | ||
1589 | * allocator with @gfp_mask flags. Map them into contiguous | ||
1590 | * kernel virtual space, using a pagetable protection of @prot. | ||
1591 | */ | ||
1592 | static void *__vmalloc_node(unsigned long size, unsigned long align, | ||
1593 | gfp_t gfp_mask, pgprot_t prot, | ||
1594 | int node, void *caller) | ||
1595 | { | ||
1596 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | ||
1597 | gfp_mask, prot, node, caller); | ||
1598 | } | ||
1599 | |||
1599 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1600 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
1600 | { | 1601 | { |
1601 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, | 1602 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, |
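
__vmalloc_node() shrinks to a thin wrapper around the new __vmalloc_node_range(), which takes the virtual range explicitly; callers that need a non-default range can use the range variant directly. The refactoring shape is the usual one: move every policy knob into one core function and keep the old signature as a wrapper supplying defaults. A tiny illustration with invented names:

#include <stdio.h>

#define RANGE_START 0x1000UL		/* stand-ins for VMALLOC_START/END */
#define RANGE_END   0x9000UL

/* Core: every policy knob, including the range, is an explicit argument. */
static unsigned long reserve_range(unsigned long size,
				   unsigned long start, unsigned long end)
{
	if (size > end - start)
		return 0;
	return start;	/* a real allocator would search for a free hole */
}

/* The old entry point survives as a wrapper passing the default range. */
static unsigned long reserve(unsigned long size)
{
	return reserve_range(size, RANGE_START, RANGE_END);
}

int main(void)
{
	printf("default range: %#lx\n", reserve(0x2000UL));
	printf("custom range:  %#lx\n", reserve_range(0x2000UL, 0x4000UL, 0x8000UL));
	return 0;
}
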
@@ -2204,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
2204 | * @sizes: array containing size of each area | 2205 | * @sizes: array containing size of each area |
2205 | * @nr_vms: the number of areas to allocate | 2206 | * @nr_vms: the number of areas to allocate |
2206 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this | 2207 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this |
2207 | * @gfp_mask: allocation mask | ||
2208 | * | 2208 | * |
2209 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated | 2209 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated |
2210 | * vm_structs on success, %NULL on failure | 2210 | * vm_structs on success, %NULL on failure |
2211 | * | 2211 | * |
2212 | * Percpu allocator wants to use congruent vm areas so that it can | 2212 | * Percpu allocator wants to use congruent vm areas so that it can |
2213 | * maintain the offsets among percpu areas. This function allocates | 2213 | * maintain the offsets among percpu areas. This function allocates |
2214 | * congruent vmalloc areas for it. These areas tend to be scattered | 2214 | * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to |
2215 | * pretty far, distance between two areas easily going up to | 2215 | * be scattered pretty far, distance between two areas easily going up |
2216 | * gigabytes. To avoid interacting with regular vmallocs, these areas | 2216 | * to gigabytes. To avoid interacting with regular vmallocs, these |
2217 | * are allocated from top. | 2217 | * areas are allocated from top. |
2218 | * | 2218 | * |
2219 | * Despite its complicated look, this allocator is rather simple. It | 2219 | * Despite its complicated look, this allocator is rather simple. It |
2220 | * does everything top-down and scans areas from the end looking for | 2220 | * does everything top-down and scans areas from the end looking for |
@@ -2225,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, | |||
2225 | */ | 2225 | */ |
2226 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | 2226 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, |
2227 | const size_t *sizes, int nr_vms, | 2227 | const size_t *sizes, int nr_vms, |
2228 | size_t align, gfp_t gfp_mask) | 2228 | size_t align) |
2229 | { | 2229 | { |
2230 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); | 2230 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); |
2231 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); | 2231 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); |
@@ -2235,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2235 | unsigned long base, start, end, last_end; | 2235 | unsigned long base, start, end, last_end; |
2236 | bool purged = false; | 2236 | bool purged = false; |
2237 | 2237 | ||
2238 | gfp_mask &= GFP_RECLAIM_MASK; | ||
2239 | |||
2240 | /* verify parameters and allocate data structures */ | 2238 | /* verify parameters and allocate data structures */ |
2241 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); | 2239 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); |
2242 | for (last_area = 0, area = 0; area < nr_vms; area++) { | 2240 | for (last_area = 0, area = 0; area < nr_vms; area++) { |
@@ -2269,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2269 | return NULL; | 2267 | return NULL; |
2270 | } | 2268 | } |
2271 | 2269 | ||
2272 | vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); | 2270 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); |
2273 | vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); | 2271 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); |
2274 | if (!vas || !vms) | 2272 | if (!vas || !vms) |
2275 | goto err_free; | 2273 | goto err_free; |
2276 | 2274 | ||
2277 | for (area = 0; area < nr_vms; area++) { | 2275 | for (area = 0; area < nr_vms; area++) { |
2278 | vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); | 2276 | vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); |
2279 | vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); | 2277 | vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); |
2280 | if (!vas[area] || !vms[area]) | 2278 | if (!vas[area] || !vms[area]) |
2281 | goto err_free; | 2279 | goto err_free; |
2282 | } | 2280 | } |
@@ -2457,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p) | |||
2457 | seq_printf(m, "0x%p-0x%p %7ld", | 2455 | seq_printf(m, "0x%p-0x%p %7ld", |
2458 | v->addr, v->addr + v->size, v->size); | 2456 | v->addr, v->addr + v->size, v->size); |
2459 | 2457 | ||
2460 | if (v->caller) { | 2458 | if (v->caller) |
2461 | char buff[KSYM_SYMBOL_LEN]; | 2459 | seq_printf(m, " %pS", v->caller); |
2462 | |||
2463 | seq_putc(m, ' '); | ||
2464 | sprint_symbol(buff, (unsigned long)v->caller); | ||
2465 | seq_puts(m, buff); | ||
2466 | } | ||
2467 | 2460 | ||
2468 | if (v->nr_pages) | 2461 | if (v->nr_pages) |
2469 | seq_printf(m, " pages=%d", v->nr_pages); | 2462 | seq_printf(m, " pages=%d", v->nr_pages); |
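For context on the s_show() change above (an illustrative aside, not part of the patch): the kernel's vsprintf already resolves a text address through the %pS extension, printing it as symbol+offset/size, so the open-coded lookup into a KSYM_SYMBOL_LEN stack buffer can go away. A minimal before/after sketch, using the same struct vm_struct *v that s_show() walks:

	/* before: resolve the symbol by hand into a stack buffer */
	char buff[KSYM_SYMBOL_LEN];
	seq_putc(m, ' ');
	sprint_symbol(buff, (unsigned long)v->caller);
	seq_puts(m, buff);

	/* after: let the %pS format extension do the lookup and printing */
	seq_printf(m, " %pS", v->caller);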
diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c69274..99999a9b2b0b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
35 | #include <linux/compaction.h> | ||
35 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
36 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
@@ -40,6 +41,7 @@ | |||
40 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
41 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
42 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
44 | #include <linux/compaction.h> | ||
43 | 45 | ||
44 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
45 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
@@ -51,11 +53,23 @@ | |||
51 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
52 | #include <trace/events/vmscan.h> | 54 | #include <trace/events/vmscan.h> |
53 | 55 | ||
54 | enum lumpy_mode { | 56 | /* |
55 | LUMPY_MODE_NONE, | 57 | * reclaim_mode determines how the inactive list is shrunk |
56 | LUMPY_MODE_ASYNC, | 58 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages |
57 | LUMPY_MODE_SYNC, | 59 | * RECLAIM_MODE_ASYNC: Do not block |
58 | }; | 60 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback |
61 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
62 | * page from the LRU and reclaim all pages within a | ||
63 | * naturally aligned range | ||
64 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
65 | * order-0 pages and then compact the zone | ||
66 | */ | ||
67 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
68 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
69 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
70 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
71 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
72 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
59 | 73 | ||
60 | struct scan_control { | 74 | struct scan_control { |
61 | /* Incremented by the number of inactive pages that were scanned */ | 75 | /* Incremented by the number of inactive pages that were scanned */ |
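A rough sketch of how these __bitwise flags are meant to be combined and tested (illustrative only; the names come from the hunk above and the tests appear later in this patch):

	/* high-order request on a COMPACTION_BUILD kernel, blocking allowed */
	sc->reclaim_mode = RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC;

	if (sc->reclaim_mode & RECLAIM_MODE_SYNC)	/* may sleep on writeback */
		wait_on_page_writeback(page);

	/* back to the default: order-0 pages only, never block */
	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;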
@@ -88,7 +102,7 @@ struct scan_control { | |||
88 | * Intend to reclaim enough continuous memory rather than reclaim | 102 | * Intend to reclaim enough continuous memory rather than reclaim |
89 | * enough amount of memory. i.e, mode for high order allocation. | 103 | * enough amount of memory. i.e, mode for high order allocation. |
90 | */ | 104 | */ |
91 | enum lumpy_mode lumpy_reclaim_mode; | 105 | reclaim_mode_t reclaim_mode; |
92 | 106 | ||
93 | /* Which cgroup do we reclaim from */ | 107 | /* Which cgroup do we reclaim from */ |
94 | struct mem_cgroup *mem_cgroup; | 108 | struct mem_cgroup *mem_cgroup; |
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
271 | return ret; | 285 | return ret; |
272 | } | 286 | } |
273 | 287 | ||
274 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, | 288 | static void set_reclaim_mode(int priority, struct scan_control *sc, |
275 | bool sync) | 289 | bool sync) |
276 | { | 290 | { |
277 | enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; | 291 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; |
278 | 292 | ||
279 | /* | 293 | /* |
280 | * Some reclaim have alredy been failed. No worth to try synchronous | 294 | * Initially assume we are entering either lumpy reclaim or |
281 | * lumpy reclaim. | 295 | * reclaim/compaction. Depending on the order, we will either set the |
296 | * sync mode or just reclaim order-0 pages later. | ||
282 | */ | 297 | */ |
283 | if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 298 | if (COMPACTION_BUILD) |
284 | return; | 299 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; |
300 | else | ||
301 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
285 | 302 | ||
286 | /* | 303 | /* |
287 | * If we need a large contiguous chunk of memory, or have | 304 | * Avoid using lumpy reclaim or reclaim/compaction if possible by |
288 | * trouble getting a small set of contiguous pages, we | 305 | * restricting when it is set to either costly allocations or when |
289 | * will reclaim both active and inactive pages. | 306 | * under memory pressure |
290 | */ | 307 | */ |
291 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 308 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
292 | sc->lumpy_reclaim_mode = mode; | 309 | sc->reclaim_mode |= syncmode; |
293 | else if (sc->order && priority < DEF_PRIORITY - 2) | 310 | else if (sc->order && priority < DEF_PRIORITY - 2) |
294 | sc->lumpy_reclaim_mode = mode; | 311 | sc->reclaim_mode |= syncmode; |
295 | else | 312 | else |
296 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 313 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
297 | } | 314 | } |
298 | 315 | ||
299 | static void disable_lumpy_reclaim_mode(struct scan_control *sc) | 316 | static void reset_reclaim_mode(struct scan_control *sc) |
300 | { | 317 | { |
301 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 318 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
302 | } | 319 | } |
303 | 320 | ||
304 | static inline int is_page_cache_freeable(struct page *page) | 321 | static inline int is_page_cache_freeable(struct page *page) |
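As a worked example of the decision above (hypothetical request, not from the patch): on a kernel built with compaction, a transparent-hugepage fault has sc->order = 9, which is above PAGE_ALLOC_COSTLY_ORDER (3), so set_reclaim_mode() keeps RECLAIM_MODE_COMPACTION and ORs in the requested sync or async bit; an order-0 scan at default priority takes the else branch and ends up with RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC.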
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
429 | * first attempt to free a range of pages fails. | 446 | * first attempt to free a range of pages fails. |
430 | */ | 447 | */ |
431 | if (PageWriteback(page) && | 448 | if (PageWriteback(page) && |
432 | sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) | 449 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) |
433 | wait_on_page_writeback(page); | 450 | wait_on_page_writeback(page); |
434 | 451 | ||
435 | if (!PageWriteback(page)) { | 452 | if (!PageWriteback(page)) { |
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
437 | ClearPageReclaim(page); | 454 | ClearPageReclaim(page); |
438 | } | 455 | } |
439 | trace_mm_vmscan_writepage(page, | 456 | trace_mm_vmscan_writepage(page, |
440 | trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); | 457 | trace_reclaim_flags(page, sc->reclaim_mode)); |
441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 458 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
442 | return PAGE_SUCCESS; | 459 | return PAGE_SUCCESS; |
443 | } | 460 | } |
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page, | |||
622 | referenced_page = TestClearPageReferenced(page); | 639 | referenced_page = TestClearPageReferenced(page); |
623 | 640 | ||
624 | /* Lumpy reclaim - ignore references */ | 641 | /* Lumpy reclaim - ignore references */ |
625 | if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) | 642 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
626 | return PAGEREF_RECLAIM; | 643 | return PAGEREF_RECLAIM; |
627 | 644 | ||
628 | /* | 645 | /* |
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
739 | * for any page for which writeback has already | 756 | * for any page for which writeback has already |
740 | * started. | 757 | * started. |
741 | */ | 758 | */ |
742 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && | 759 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
743 | may_enter_fs) | 760 | may_enter_fs) |
744 | wait_on_page_writeback(page); | 761 | wait_on_page_writeback(page); |
745 | else { | 762 | else { |
@@ -895,7 +912,7 @@ cull_mlocked: | |||
895 | try_to_free_swap(page); | 912 | try_to_free_swap(page); |
896 | unlock_page(page); | 913 | unlock_page(page); |
897 | putback_lru_page(page); | 914 | putback_lru_page(page); |
898 | disable_lumpy_reclaim_mode(sc); | 915 | reset_reclaim_mode(sc); |
899 | continue; | 916 | continue; |
900 | 917 | ||
901 | activate_locked: | 918 | activate_locked: |
@@ -908,7 +925,7 @@ activate_locked: | |||
908 | keep_locked: | 925 | keep_locked: |
909 | unlock_page(page); | 926 | unlock_page(page); |
910 | keep: | 927 | keep: |
911 | disable_lumpy_reclaim_mode(sc); | 928 | reset_reclaim_mode(sc); |
912 | keep_lumpy: | 929 | keep_lumpy: |
913 | list_add(&page->lru, &ret_pages); | 930 | list_add(&page->lru, &ret_pages); |
914 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 931 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1028 | case 0: | 1045 | case 0: |
1029 | list_move(&page->lru, dst); | 1046 | list_move(&page->lru, dst); |
1030 | mem_cgroup_del_lru(page); | 1047 | mem_cgroup_del_lru(page); |
1031 | nr_taken++; | 1048 | nr_taken += hpage_nr_pages(page); |
1032 | break; | 1049 | break; |
1033 | 1050 | ||
1034 | case -EBUSY: | 1051 | case -EBUSY: |
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1086 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1103 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1087 | list_move(&cursor_page->lru, dst); | 1104 | list_move(&cursor_page->lru, dst); |
1088 | mem_cgroup_del_lru(cursor_page); | 1105 | mem_cgroup_del_lru(cursor_page); |
1089 | nr_taken++; | 1106 | nr_taken += hpage_nr_pages(page); |
1090 | nr_lumpy_taken++; | 1107 | nr_lumpy_taken++; |
1091 | if (PageDirty(cursor_page)) | 1108 | if (PageDirty(cursor_page)) |
1092 | nr_lumpy_dirty++; | 1109 | nr_lumpy_dirty++; |
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
1141 | struct page *page; | 1158 | struct page *page; |
1142 | 1159 | ||
1143 | list_for_each_entry(page, page_list, lru) { | 1160 | list_for_each_entry(page, page_list, lru) { |
1161 | int numpages = hpage_nr_pages(page); | ||
1144 | lru = page_lru_base_type(page); | 1162 | lru = page_lru_base_type(page); |
1145 | if (PageActive(page)) { | 1163 | if (PageActive(page)) { |
1146 | lru += LRU_ACTIVE; | 1164 | lru += LRU_ACTIVE; |
1147 | ClearPageActive(page); | 1165 | ClearPageActive(page); |
1148 | nr_active++; | 1166 | nr_active += numpages; |
1149 | } | 1167 | } |
1150 | if (count) | 1168 | if (count) |
1151 | count[lru]++; | 1169 | count[lru] += numpages; |
1152 | } | 1170 | } |
1153 | 1171 | ||
1154 | return nr_active; | 1172 | return nr_active; |
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1253 | spin_lock_irq(&zone->lru_lock); | 1271 | spin_lock_irq(&zone->lru_lock); |
1254 | continue; | 1272 | continue; |
1255 | } | 1273 | } |
1256 | SetPageLRU(page); | ||
1257 | lru = page_lru(page); | 1274 | lru = page_lru(page); |
1258 | add_page_to_lru_list(zone, page, lru); | ||
1259 | if (is_active_lru(lru)) { | 1275 | if (is_active_lru(lru)) { |
1260 | int file = is_file_lru(lru); | 1276 | int file = is_file_lru(lru); |
1261 | reclaim_stat->recent_rotated[file]++; | 1277 | int numpages = hpage_nr_pages(page); |
1278 | reclaim_stat->recent_rotated[file] += numpages; | ||
1279 | if (putback_active_lru_page(zone, page)) | ||
1280 | continue; | ||
1262 | } | 1281 | } |
1282 | SetPageLRU(page); | ||
1283 | add_page_to_lru_list(zone, page, lru); | ||
1263 | if (!pagevec_add(&pvec, page)) { | 1284 | if (!pagevec_add(&pvec, page)) { |
1264 | spin_unlock_irq(&zone->lru_lock); | 1285 | spin_unlock_irq(&zone->lru_lock); |
1265 | __pagevec_release(&pvec); | 1286 | __pagevec_release(&pvec); |
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1324 | return false; | 1345 | return false; |
1325 | 1346 | ||
1326 | /* Only stall on lumpy reclaim */ | 1347 | /* Only stall on lumpy reclaim */ |
1327 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 1348 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1328 | return false; | 1349 | return false; |
1329 | 1350 | ||
1330 | /* If we have relaimed everything on the isolated list, no stall */ | 1351 | /* If we have relaimed everything on the isolated list, no stall */ |
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1368 | return SWAP_CLUSTER_MAX; | 1389 | return SWAP_CLUSTER_MAX; |
1369 | } | 1390 | } |
1370 | 1391 | ||
1371 | set_lumpy_reclaim_mode(priority, sc, false); | 1392 | set_reclaim_mode(priority, sc, false); |
1372 | lru_add_drain(); | 1393 | lru_add_drain(); |
1373 | spin_lock_irq(&zone->lru_lock); | 1394 | spin_lock_irq(&zone->lru_lock); |
1374 | 1395 | ||
1375 | if (scanning_global_lru(sc)) { | 1396 | if (scanning_global_lru(sc)) { |
1376 | nr_taken = isolate_pages_global(nr_to_scan, | 1397 | nr_taken = isolate_pages_global(nr_to_scan, |
1377 | &page_list, &nr_scanned, sc->order, | 1398 | &page_list, &nr_scanned, sc->order, |
1378 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1399 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1379 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1400 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1380 | zone, 0, file); | 1401 | zone, 0, file); |
1381 | zone->pages_scanned += nr_scanned; | 1402 | zone->pages_scanned += nr_scanned; |
1382 | if (current_is_kswapd()) | 1403 | if (current_is_kswapd()) |
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1388 | } else { | 1409 | } else { |
1389 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1410 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, |
1390 | &page_list, &nr_scanned, sc->order, | 1411 | &page_list, &nr_scanned, sc->order, |
1391 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1412 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1392 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1413 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1393 | zone, sc->mem_cgroup, | 1414 | zone, sc->mem_cgroup, |
1394 | 0, file); | 1415 | 0, file); |
1395 | /* | 1416 | /* |
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1411 | 1432 | ||
1412 | /* Check if we should syncronously wait for writeback */ | 1433 | /* Check if we should syncronously wait for writeback */ |
1413 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1434 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1414 | set_lumpy_reclaim_mode(priority, sc, true); | 1435 | set_reclaim_mode(priority, sc, true); |
1415 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1436 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
1416 | } | 1437 | } |
1417 | 1438 | ||
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1426 | zone_idx(zone), | 1447 | zone_idx(zone), |
1427 | nr_scanned, nr_reclaimed, | 1448 | nr_scanned, nr_reclaimed, |
1428 | priority, | 1449 | priority, |
1429 | trace_shrink_flags(file, sc->lumpy_reclaim_mode)); | 1450 | trace_shrink_flags(file, sc->reclaim_mode)); |
1430 | return nr_reclaimed; | 1451 | return nr_reclaimed; |
1431 | } | 1452 | } |
1432 | 1453 | ||
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1466 | 1487 | ||
1467 | list_move(&page->lru, &zone->lru[lru].list); | 1488 | list_move(&page->lru, &zone->lru[lru].list); |
1468 | mem_cgroup_add_lru_list(page, lru); | 1489 | mem_cgroup_add_lru_list(page, lru); |
1469 | pgmoved++; | 1490 | pgmoved += hpage_nr_pages(page); |
1470 | 1491 | ||
1471 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1492 | if (!pagevec_add(&pvec, page) || list_empty(list)) { |
1472 | spin_unlock_irq(&zone->lru_lock); | 1493 | spin_unlock_irq(&zone->lru_lock); |
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1534 | } | 1555 | } |
1535 | 1556 | ||
1536 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1557 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1537 | nr_rotated++; | 1558 | nr_rotated += hpage_nr_pages(page); |
1538 | /* | 1559 | /* |
1539 | * Identify referenced, file-backed active pages and | 1560 | * Identify referenced, file-backed active pages and |
1540 | * give them one more trip around the active list. So | 1561 | * give them one more trip around the active list. So |
@@ -1805,6 +1826,57 @@ out: | |||
1805 | } | 1826 | } |
1806 | 1827 | ||
1807 | /* | 1828 | /* |
1829 | * Reclaim/compaction depends on a number of pages being freed. To avoid | ||
1830 | * disruption to the system, a small number of order-0 pages continue to be | ||
1831 | * rotated and reclaimed in the normal fashion. However, by the time we get | ||
1832 | * back to the allocator and call try_to_compact_zone(), we ensure that | ||
1833 | * there are enough free pages for it to be likely successful | ||
1834 | */ | ||
1835 | static inline bool should_continue_reclaim(struct zone *zone, | ||
1836 | unsigned long nr_reclaimed, | ||
1837 | unsigned long nr_scanned, | ||
1838 | struct scan_control *sc) | ||
1839 | { | ||
1840 | unsigned long pages_for_compaction; | ||
1841 | unsigned long inactive_lru_pages; | ||
1842 | |||
1843 | /* If not in reclaim/compaction mode, stop */ | ||
1844 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | ||
1845 | return false; | ||
1846 | |||
1847 | /* | ||
1848 | * If we failed to reclaim and have scanned the full list, stop. | ||
1849 | * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far | ||
1850 | * faster but obviously would be less likely to succeed | ||
1851 | * allocation. If this is desirable, use GFP_REPEAT to decide | ||
1852 | * if both reclaimed and scanned should be checked or just | ||
1853 | * reclaimed | ||
1854 | */ | ||
1855 | if (!nr_reclaimed && !nr_scanned) | ||
1856 | return false; | ||
1857 | |||
1858 | /* | ||
1859 | * If we have not reclaimed enough pages for compaction and the | ||
1860 | * inactive lists are large enough, continue reclaiming | ||
1861 | */ | ||
1862 | pages_for_compaction = (2UL << sc->order); | ||
1863 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | ||
1864 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1865 | if (sc->nr_reclaimed < pages_for_compaction && | ||
1866 | inactive_lru_pages > pages_for_compaction) | ||
1867 | return true; | ||
1868 | |||
1869 | /* If compaction would go ahead or the allocation would succeed, stop */ | ||
1870 | switch (compaction_suitable(zone, sc->order)) { | ||
1871 | case COMPACT_PARTIAL: | ||
1872 | case COMPACT_CONTINUE: | ||
1873 | return false; | ||
1874 | default: | ||
1875 | return true; | ||
1876 | } | ||
1877 | } | ||
1878 | |||
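To put numbers on the threshold above (assuming 4KB pages; the values are only an example): for a THP-sized request of order 9, pages_for_compaction = 2UL << 9 = 1024 pages, about 4MB. Reclaim keeps looping while fewer than 1024 pages have been reclaimed and the inactive anon plus file lists still hold more than 1024 pages, and it stops early as soon as compaction_suitable() reports COMPACT_PARTIAL or COMPACT_CONTINUE.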
1879 | /* | ||
1808 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1880 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1809 | */ | 1881 | */ |
1810 | static void shrink_zone(int priority, struct zone *zone, | 1882 | static void shrink_zone(int priority, struct zone *zone, |
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1813 | unsigned long nr[NR_LRU_LISTS]; | 1885 | unsigned long nr[NR_LRU_LISTS]; |
1814 | unsigned long nr_to_scan; | 1886 | unsigned long nr_to_scan; |
1815 | enum lru_list l; | 1887 | enum lru_list l; |
1816 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1888 | unsigned long nr_reclaimed; |
1817 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1889 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1890 | unsigned long nr_scanned = sc->nr_scanned; | ||
1818 | 1891 | ||
1892 | restart: | ||
1893 | nr_reclaimed = 0; | ||
1819 | get_scan_count(zone, sc, nr, priority); | 1894 | get_scan_count(zone, sc, nr, priority); |
1820 | 1895 | ||
1821 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1896 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1841 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1916 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1842 | break; | 1917 | break; |
1843 | } | 1918 | } |
1844 | 1919 | sc->nr_reclaimed += nr_reclaimed; | |
1845 | sc->nr_reclaimed = nr_reclaimed; | ||
1846 | 1920 | ||
1847 | /* | 1921 | /* |
1848 | * Even if we did not try to evict anon pages at all, we want to | 1922 | * Even if we did not try to evict anon pages at all, we want to |
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1851 | if (inactive_anon_is_low(zone, sc)) | 1925 | if (inactive_anon_is_low(zone, sc)) |
1852 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1926 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1853 | 1927 | ||
1928 | /* reclaim/compaction might need reclaim to continue */ | ||
1929 | if (should_continue_reclaim(zone, nr_reclaimed, | ||
1930 | sc->nr_scanned - nr_scanned, sc)) | ||
1931 | goto restart; | ||
1932 | |||
1854 | throttle_vm_writeout(sc->gfp_mask); | 1933 | throttle_vm_writeout(sc->gfp_mask); |
1855 | } | 1934 | } |
1856 | 1935 | ||
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2124 | } | 2203 | } |
2125 | #endif | 2204 | #endif |
2126 | 2205 | ||
2206 | /* | ||
2207 | * pgdat_balanced is used when checking if a node is balanced for high-order | ||
2208 | * allocations. Only zones that meet watermarks and are in a zone allowed | ||
2209 | * by the callers classzone_idx are added to balanced_pages. The total of | ||
2210 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | ||
2211 | * for the node to be considered balanced. Forcing all zones to be balanced | ||
2212 | * for high orders can cause excessive reclaim when there are imbalanced zones. | ||
2213 | * The choice of 25% is due to | ||
2214 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
2215 | * reasonable sized machine | ||
2216 | * o On all other machines, the top zone must be at least a reasonable | ||
2217 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
2218 | * would need to be at least 256M for it to balance a whole node. | ||
2219 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
2220 | * to balance a node on its own. These seemed like reasonable ratios. | ||
2221 | */ | ||
2222 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | ||
2223 | int classzone_idx) | ||
2224 | { | ||
2225 | unsigned long present_pages = 0; | ||
2226 | int i; | ||
2227 | |||
2228 | for (i = 0; i <= classzone_idx; i++) | ||
2229 | present_pages += pgdat->node_zones[i].present_pages; | ||
2230 | |||
2231 | return balanced_pages > (present_pages >> 2); | ||
2232 | } | ||
2233 | |||
2127 | /* is kswapd sleeping prematurely? */ | 2234 | /* is kswapd sleeping prematurely? */ |
2128 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | 2235 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
2236 | int classzone_idx) | ||
2129 | { | 2237 | { |
2130 | int i; | 2238 | int i; |
2239 | unsigned long balanced = 0; | ||
2240 | bool all_zones_ok = true; | ||
2131 | 2241 | ||
2132 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2242 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2133 | if (remaining) | 2243 | if (remaining) |
2134 | return 1; | 2244 | return true; |
2135 | 2245 | ||
2136 | /* If after HZ/10, a zone is below the high mark, it's premature */ | 2246 | /* Check the watermark levels */ |
2137 | for (i = 0; i < pgdat->nr_zones; i++) { | 2247 | for (i = 0; i < pgdat->nr_zones; i++) { |
2138 | struct zone *zone = pgdat->node_zones + i; | 2248 | struct zone *zone = pgdat->node_zones + i; |
2139 | 2249 | ||
2140 | if (!populated_zone(zone)) | 2250 | if (!populated_zone(zone)) |
2141 | continue; | 2251 | continue; |
2142 | 2252 | ||
2143 | if (zone->all_unreclaimable) | 2253 | /* |
2254 | * balance_pgdat() skips over all_unreclaimable after | ||
2255 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2256 | * they must be considered balanced here as well if kswapd | ||
2257 | * is to sleep | ||
2258 | */ | ||
2259 | if (zone->all_unreclaimable) { | ||
2260 | balanced += zone->present_pages; | ||
2144 | continue; | 2261 | continue; |
2262 | } | ||
2145 | 2263 | ||
2146 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2264 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2147 | 0, 0)) | 2265 | classzone_idx, 0)) |
2148 | return 1; | 2266 | all_zones_ok = false; |
2267 | else | ||
2268 | balanced += zone->present_pages; | ||
2149 | } | 2269 | } |
2150 | 2270 | ||
2151 | return 0; | 2271 | /* |
2272 | * For high-order requests, the balanced zones must contain at least | ||
2273 | * 25% of the nodes pages for kswapd to sleep. For order-0, all zones | ||
2274 | * must be balanced | ||
2275 | */ | ||
2276 | if (order) | ||
2277 | return pgdat_balanced(pgdat, balanced, classzone_idx); | ||
2278 | else | ||
2279 | return !all_zones_ok; | ||
2152 | } | 2280 | } |
2153 | 2281 | ||
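A standalone sketch of the 25% rule in pgdat_balanced() above, with made-up zone sizes (16M DMA, 880M Normal, 128M HighMem, expressed in 4KB pages); only the shift-by-two comparison mirrors the kernel code:

	#include <stdio.h>

	int main(void)
	{
		unsigned long zones[] = { 4096, 225280, 32768 };  /* pages per zone */
		unsigned long present = 0, balanced = 225280;     /* assume only Normal is balanced */
		int classzone_idx = 2, i;

		for (i = 0; i <= classzone_idx; i++)
			present += zones[i];                      /* 262144 pages = 1G */

		/* the node counts as balanced once balanced > present/4 (65536 pages here) */
		printf("%s\n", balanced > (present >> 2) ? "balanced" : "not balanced");
		return 0;
	}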
2154 | /* | 2282 | /* |
2155 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2283 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2156 | * they are all at high_wmark_pages(zone). | 2284 | * they are all at high_wmark_pages(zone). |
2157 | * | 2285 | * |
2158 | * Returns the number of pages which were actually freed. | 2286 | * Returns the final order kswapd was reclaiming at |
2159 | * | 2287 | * |
2160 | * There is special handling here for zones which are full of pinned pages. | 2288 | * There is special handling here for zones which are full of pinned pages. |
2161 | * This can happen if the pages are all mlocked, or if they are all used by | 2289 | * This can happen if the pages are all mlocked, or if they are all used by |
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2172 | * interoperates with the page allocator fallback scheme to ensure that aging | 2300 | * interoperates with the page allocator fallback scheme to ensure that aging |
2173 | * of pages is balanced across the zones. | 2301 | * of pages is balanced across the zones. |
2174 | */ | 2302 | */ |
2175 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 2303 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2304 | int *classzone_idx) | ||
2176 | { | 2305 | { |
2177 | int all_zones_ok; | 2306 | int all_zones_ok; |
2307 | unsigned long balanced; | ||
2178 | int priority; | 2308 | int priority; |
2179 | int i; | 2309 | int i; |
2310 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2180 | unsigned long total_scanned; | 2311 | unsigned long total_scanned; |
2181 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2312 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2182 | struct scan_control sc = { | 2313 | struct scan_control sc = { |
@@ -2199,7 +2330,6 @@ loop_again: | |||
2199 | count_vm_event(PAGEOUTRUN); | 2330 | count_vm_event(PAGEOUTRUN); |
2200 | 2331 | ||
2201 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2332 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2202 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2203 | unsigned long lru_pages = 0; | 2333 | unsigned long lru_pages = 0; |
2204 | int has_under_min_watermark_zone = 0; | 2334 | int has_under_min_watermark_zone = 0; |
2205 | 2335 | ||
@@ -2208,6 +2338,7 @@ loop_again: | |||
2208 | disable_swap_token(); | 2338 | disable_swap_token(); |
2209 | 2339 | ||
2210 | all_zones_ok = 1; | 2340 | all_zones_ok = 1; |
2341 | balanced = 0; | ||
2211 | 2342 | ||
2212 | /* | 2343 | /* |
2213 | * Scan in the highmem->dma direction for the highest | 2344 | * Scan in the highmem->dma direction for the highest |
@@ -2230,9 +2361,10 @@ loop_again: | |||
2230 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2361 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2231 | &sc, priority, 0); | 2362 | &sc, priority, 0); |
2232 | 2363 | ||
2233 | if (!zone_watermark_ok(zone, order, | 2364 | if (!zone_watermark_ok_safe(zone, order, |
2234 | high_wmark_pages(zone), 0, 0)) { | 2365 | high_wmark_pages(zone), 0, 0)) { |
2235 | end_zone = i; | 2366 | end_zone = i; |
2367 | *classzone_idx = i; | ||
2236 | break; | 2368 | break; |
2237 | } | 2369 | } |
2238 | } | 2370 | } |
@@ -2255,6 +2387,7 @@ loop_again: | |||
2255 | * cause too much scanning of the lower zones. | 2387 | * cause too much scanning of the lower zones. |
2256 | */ | 2388 | */ |
2257 | for (i = 0; i <= end_zone; i++) { | 2389 | for (i = 0; i <= end_zone; i++) { |
2390 | int compaction; | ||
2258 | struct zone *zone = pgdat->node_zones + i; | 2391 | struct zone *zone = pgdat->node_zones + i; |
2259 | int nr_slab; | 2392 | int nr_slab; |
2260 | 2393 | ||
@@ -2276,7 +2409,7 @@ loop_again: | |||
2276 | * We put equal pressure on every zone, unless one | 2409 | * We put equal pressure on every zone, unless one |
2277 | * zone has way too many pages free already. | 2410 | * zone has way too many pages free already. |
2278 | */ | 2411 | */ |
2279 | if (!zone_watermark_ok(zone, order, | 2412 | if (!zone_watermark_ok_safe(zone, order, |
2280 | 8*high_wmark_pages(zone), end_zone, 0)) | 2413 | 8*high_wmark_pages(zone), end_zone, 0)) |
2281 | shrink_zone(priority, zone, &sc); | 2414 | shrink_zone(priority, zone, &sc); |
2282 | reclaim_state->reclaimed_slab = 0; | 2415 | reclaim_state->reclaimed_slab = 0; |
@@ -2284,9 +2417,26 @@ loop_again: | |||
2284 | lru_pages); | 2417 | lru_pages); |
2285 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2418 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2286 | total_scanned += sc.nr_scanned; | 2419 | total_scanned += sc.nr_scanned; |
2420 | |||
2421 | compaction = 0; | ||
2422 | if (order && | ||
2423 | zone_watermark_ok(zone, 0, | ||
2424 | high_wmark_pages(zone), | ||
2425 | end_zone, 0) && | ||
2426 | !zone_watermark_ok(zone, order, | ||
2427 | high_wmark_pages(zone), | ||
2428 | end_zone, 0)) { | ||
2429 | compact_zone_order(zone, | ||
2430 | order, | ||
2431 | sc.gfp_mask, false, | ||
2432 | COMPACT_MODE_KSWAPD); | ||
2433 | compaction = 1; | ||
2434 | } | ||
2435 | |||
2287 | if (zone->all_unreclaimable) | 2436 | if (zone->all_unreclaimable) |
2288 | continue; | 2437 | continue; |
2289 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2438 | if (!compaction && nr_slab == 0 && |
2439 | !zone_reclaimable(zone)) | ||
2290 | zone->all_unreclaimable = 1; | 2440 | zone->all_unreclaimable = 1; |
2291 | /* | 2441 | /* |
2292 | * If we've done a decent amount of scanning and | 2442 | * If we've done a decent amount of scanning and |
@@ -2297,7 +2447,7 @@ loop_again: | |||
2297 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2447 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2298 | sc.may_writepage = 1; | 2448 | sc.may_writepage = 1; |
2299 | 2449 | ||
2300 | if (!zone_watermark_ok(zone, order, | 2450 | if (!zone_watermark_ok_safe(zone, order, |
2301 | high_wmark_pages(zone), end_zone, 0)) { | 2451 | high_wmark_pages(zone), end_zone, 0)) { |
2302 | all_zones_ok = 0; | 2452 | all_zones_ok = 0; |
2303 | /* | 2453 | /* |
@@ -2305,7 +2455,7 @@ loop_again: | |||
2305 | * means that we have a GFP_ATOMIC allocation | 2455 | * means that we have a GFP_ATOMIC allocation |
2306 | * failure risk. Hurry up! | 2456 | * failure risk. Hurry up! |
2307 | */ | 2457 | */ |
2308 | if (!zone_watermark_ok(zone, order, | 2458 | if (!zone_watermark_ok_safe(zone, order, |
2309 | min_wmark_pages(zone), end_zone, 0)) | 2459 | min_wmark_pages(zone), end_zone, 0)) |
2310 | has_under_min_watermark_zone = 1; | 2460 | has_under_min_watermark_zone = 1; |
2311 | } else { | 2461 | } else { |
@@ -2317,10 +2467,12 @@ loop_again: | |||
2317 | * spectulatively avoid congestion waits | 2467 | * spectulatively avoid congestion waits |
2318 | */ | 2468 | */ |
2319 | zone_clear_flag(zone, ZONE_CONGESTED); | 2469 | zone_clear_flag(zone, ZONE_CONGESTED); |
2470 | if (i <= *classzone_idx) | ||
2471 | balanced += zone->present_pages; | ||
2320 | } | 2472 | } |
2321 | 2473 | ||
2322 | } | 2474 | } |
2323 | if (all_zones_ok) | 2475 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2324 | break; /* kswapd: all done */ | 2476 | break; /* kswapd: all done */ |
2325 | /* | 2477 | /* |
2326 | * OK, kswapd is getting into trouble. Take a nap, then take | 2478 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2343,7 +2495,13 @@ loop_again: | |||
2343 | break; | 2495 | break; |
2344 | } | 2496 | } |
2345 | out: | 2497 | out: |
2346 | if (!all_zones_ok) { | 2498 | |
2499 | /* | ||
2500 | * order-0: All zones must meet high watermark for a balanced node | ||
2501 | * high-order: Balanced zones must make up at least 25% of the node | ||
2502 | * for the node to be balanced | ||
2503 | */ | ||
2504 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
2347 | cond_resched(); | 2505 | cond_resched(); |
2348 | 2506 | ||
2349 | try_to_freeze(); | 2507 | try_to_freeze(); |
@@ -2368,7 +2526,88 @@ out: | |||
2368 | goto loop_again; | 2526 | goto loop_again; |
2369 | } | 2527 | } |
2370 | 2528 | ||
2371 | return sc.nr_reclaimed; | 2529 | /* |
2530 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2531 | * sleeping without all zones being balanced. Before it does, it must | ||
2532 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2533 | * that the congestion flags are cleared. The congestion flag must | ||
2534 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2535 | * and it is potentially going to sleep here. | ||
2536 | */ | ||
2537 | if (order) { | ||
2538 | for (i = 0; i <= end_zone; i++) { | ||
2539 | struct zone *zone = pgdat->node_zones + i; | ||
2540 | |||
2541 | if (!populated_zone(zone)) | ||
2542 | continue; | ||
2543 | |||
2544 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
2545 | continue; | ||
2546 | |||
2547 | /* Confirm the zone is balanced for order-0 */ | ||
2548 | if (!zone_watermark_ok(zone, 0, | ||
2549 | high_wmark_pages(zone), 0, 0)) { | ||
2550 | order = sc.order = 0; | ||
2551 | goto loop_again; | ||
2552 | } | ||
2553 | |||
2554 | /* If balanced, clear the congested flag */ | ||
2555 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2556 | } | ||
2557 | } | ||
2558 | |||
2559 | /* | ||
2560 | * Return the order we were reclaiming at so sleeping_prematurely() | ||
2561 | * makes a decision on the order we were last reclaiming at. However, | ||
2562 | * if another caller entered the allocator slow path while kswapd | ||
2563 | * was awake, order will remain at the higher level | ||
2564 | */ | ||
2565 | *classzone_idx = end_zone; | ||
2566 | return order; | ||
2567 | } | ||
2568 | |||
2569 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | ||
2570 | { | ||
2571 | long remaining = 0; | ||
2572 | DEFINE_WAIT(wait); | ||
2573 | |||
2574 | if (freezing(current) || kthread_should_stop()) | ||
2575 | return; | ||
2576 | |||
2577 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2578 | |||
2579 | /* Try to sleep for a short interval */ | ||
2580 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2581 | remaining = schedule_timeout(HZ/10); | ||
2582 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2583 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2584 | } | ||
2585 | |||
2586 | /* | ||
2587 | * After a short sleep, check if it was a premature sleep. If not, then | ||
2588 | * go fully to sleep until explicitly woken up. | ||
2589 | */ | ||
2590 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2591 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2592 | |||
2593 | /* | ||
2594 | * vmstat counters are not perfectly accurate and the estimated | ||
2595 | * value for counters such as NR_FREE_PAGES can deviate from the | ||
2596 | * true value by nr_online_cpus * threshold. To avoid the zone | ||
2597 | * watermarks being breached while under pressure, we reduce the | ||
2598 | * per-cpu vmstat threshold while kswapd is awake and restore | ||
2599 | * them before going back to sleep. | ||
2600 | */ | ||
2601 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | ||
2602 | schedule(); | ||
2603 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | ||
2604 | } else { | ||
2605 | if (remaining) | ||
2606 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2607 | else | ||
2608 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2609 | } | ||
2610 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2372 | } | 2611 | } |
2373 | 2612 | ||
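To make the drift described in the comment above concrete (hypothetical machine, not from the patch): with 16 online CPUs and the normal per-cpu stat_threshold at its 125 maximum, the NR_FREE_PAGES estimate can be off by up to 16 * 125 = 2000 pages, roughly 7.8MB with 4KB pages, easily enough to hide a breached min watermark. Switching to the pressure threshold while kswapd is awake shrinks that window, and the normal threshold is restored before kswapd goes back to sleep.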
2374 | /* | 2613 | /* |
@@ -2387,9 +2626,10 @@ out: | |||
2387 | static int kswapd(void *p) | 2626 | static int kswapd(void *p) |
2388 | { | 2627 | { |
2389 | unsigned long order; | 2628 | unsigned long order; |
2629 | int classzone_idx; | ||
2390 | pg_data_t *pgdat = (pg_data_t*)p; | 2630 | pg_data_t *pgdat = (pg_data_t*)p; |
2391 | struct task_struct *tsk = current; | 2631 | struct task_struct *tsk = current; |
2392 | DEFINE_WAIT(wait); | 2632 | |
2393 | struct reclaim_state reclaim_state = { | 2633 | struct reclaim_state reclaim_state = { |
2394 | .reclaimed_slab = 0, | 2634 | .reclaimed_slab = 0, |
2395 | }; | 2635 | }; |
@@ -2417,49 +2657,30 @@ static int kswapd(void *p) | |||
2417 | set_freezable(); | 2657 | set_freezable(); |
2418 | 2658 | ||
2419 | order = 0; | 2659 | order = 0; |
2660 | classzone_idx = MAX_NR_ZONES - 1; | ||
2420 | for ( ; ; ) { | 2661 | for ( ; ; ) { |
2421 | unsigned long new_order; | 2662 | unsigned long new_order; |
2663 | int new_classzone_idx; | ||
2422 | int ret; | 2664 | int ret; |
2423 | 2665 | ||
2424 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2425 | new_order = pgdat->kswapd_max_order; | 2666 | new_order = pgdat->kswapd_max_order; |
2667 | new_classzone_idx = pgdat->classzone_idx; | ||
2426 | pgdat->kswapd_max_order = 0; | 2668 | pgdat->kswapd_max_order = 0; |
2427 | if (order < new_order) { | 2669 | pgdat->classzone_idx = MAX_NR_ZONES - 1; |
2670 | if (order < new_order || classzone_idx > new_classzone_idx) { | ||
2428 | /* | 2671 | /* |
2429 | * Don't sleep if someone wants a larger 'order' | 2672 | * Don't sleep if someone wants a larger 'order' |
2430 | * allocation | 2673 | * allocation or has tighter zone constraints |
2431 | */ | 2674 | */ |
2432 | order = new_order; | 2675 | order = new_order; |
2676 | classzone_idx = new_classzone_idx; | ||
2433 | } else { | 2677 | } else { |
2434 | if (!freezing(current) && !kthread_should_stop()) { | 2678 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
2435 | long remaining = 0; | ||
2436 | |||
2437 | /* Try to sleep for a short interval */ | ||
2438 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2439 | remaining = schedule_timeout(HZ/10); | ||
2440 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2441 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2442 | } | ||
2443 | |||
2444 | /* | ||
2445 | * After a short sleep, check if it was a | ||
2446 | * premature sleep. If not, then go fully | ||
2447 | * to sleep until explicitly woken up | ||
2448 | */ | ||
2449 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2450 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2451 | schedule(); | ||
2452 | } else { | ||
2453 | if (remaining) | ||
2454 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2455 | else | ||
2456 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2457 | } | ||
2458 | } | ||
2459 | |||
2460 | order = pgdat->kswapd_max_order; | 2679 | order = pgdat->kswapd_max_order; |
2680 | classzone_idx = pgdat->classzone_idx; | ||
2681 | pgdat->kswapd_max_order = 0; | ||
2682 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | ||
2461 | } | 2683 | } |
2462 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2463 | 2684 | ||
2464 | ret = try_to_freeze(); | 2685 | ret = try_to_freeze(); |
2465 | if (kthread_should_stop()) | 2686 | if (kthread_should_stop()) |
@@ -2471,7 +2692,7 @@ static int kswapd(void *p) | |||
2471 | */ | 2692 | */ |
2472 | if (!ret) { | 2693 | if (!ret) { |
2473 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2694 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2474 | balance_pgdat(pgdat, order); | 2695 | order = balance_pgdat(pgdat, order, &classzone_idx); |
2475 | } | 2696 | } |
2476 | } | 2697 | } |
2477 | return 0; | 2698 | return 0; |
@@ -2480,23 +2701,26 @@ static int kswapd(void *p) | |||
2480 | /* | 2701 | /* |
2481 | * A zone is low on free memory, so wake its kswapd task to service it. | 2702 | * A zone is low on free memory, so wake its kswapd task to service it. |
2482 | */ | 2703 | */ |
2483 | void wakeup_kswapd(struct zone *zone, int order) | 2704 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
2484 | { | 2705 | { |
2485 | pg_data_t *pgdat; | 2706 | pg_data_t *pgdat; |
2486 | 2707 | ||
2487 | if (!populated_zone(zone)) | 2708 | if (!populated_zone(zone)) |
2488 | return; | 2709 | return; |
2489 | 2710 | ||
2490 | pgdat = zone->zone_pgdat; | ||
2491 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2492 | return; | ||
2493 | if (pgdat->kswapd_max_order < order) | ||
2494 | pgdat->kswapd_max_order = order; | ||
2495 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2496 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2711 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2497 | return; | 2712 | return; |
2713 | pgdat = zone->zone_pgdat; | ||
2714 | if (pgdat->kswapd_max_order < order) { | ||
2715 | pgdat->kswapd_max_order = order; | ||
2716 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
2717 | } | ||
2498 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2718 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2499 | return; | 2719 | return; |
2720 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2721 | return; | ||
2722 | |||
2723 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2500 | wake_up_interruptible(&pgdat->kswapd_wait); | 2724 | wake_up_interruptible(&pgdat->kswapd_wait); |
2501 | } | 2725 | } |
2502 | 2726 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 8f62f17ee1c7..0c3b5048773e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); | |||
83 | 83 | ||
84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
85 | 85 | ||
86 | static int calculate_threshold(struct zone *zone) | 86 | int calculate_pressure_threshold(struct zone *zone) |
87 | { | ||
88 | int threshold; | ||
89 | int watermark_distance; | ||
90 | |||
91 | /* | ||
92 | * As vmstats are not up to date, there is drift between the estimated | ||
93 | * and real values. For high thresholds and a high number of CPUs, it | ||
94 | * is possible for the min watermark to be breached while the estimated | ||
95 | * value looks fine. The pressure threshold is a reduced value such | ||
96 | * that even the maximum amount of drift will not accidentally breach | ||
97 | * the min watermark | ||
98 | */ | ||
99 | watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); | ||
100 | threshold = max(1, (int)(watermark_distance / num_online_cpus())); | ||
101 | |||
102 | /* | ||
103 | * Maximum threshold is 125 | ||
104 | */ | ||
105 | threshold = min(125, threshold); | ||
106 | |||
107 | return threshold; | ||
108 | } | ||
109 | |||
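A worked example of the formula above (hypothetical zone): with a gap of 768 pages between the low and min watermarks and 8 online CPUs, the pressure threshold is min(125, max(1, 768 / 8)) = 96, so the worst-case per-zone drift is bounded by 8 * 96 = 768 pages, exactly the distance available before the min watermark would be breached.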
110 | int calculate_normal_threshold(struct zone *zone) | ||
87 | { | 111 | { |
88 | int threshold; | 112 | int threshold; |
89 | int mem; /* memory in 128 MB units */ | 113 | int mem; /* memory in 128 MB units */ |
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) | |||
142 | for_each_populated_zone(zone) { | 166 | for_each_populated_zone(zone) { |
143 | unsigned long max_drift, tolerate_drift; | 167 | unsigned long max_drift, tolerate_drift; |
144 | 168 | ||
145 | threshold = calculate_threshold(zone); | 169 | threshold = calculate_normal_threshold(zone); |
146 | 170 | ||
147 | for_each_online_cpu(cpu) | 171 | for_each_online_cpu(cpu) |
148 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 172 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void) | |||
161 | } | 185 | } |
162 | } | 186 | } |
163 | 187 | ||
188 | void set_pgdat_percpu_threshold(pg_data_t *pgdat, | ||
189 | int (*calculate_pressure)(struct zone *)) | ||
190 | { | ||
191 | struct zone *zone; | ||
192 | int cpu; | ||
193 | int threshold; | ||
194 | int i; | ||
195 | |||
196 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
197 | zone = &pgdat->node_zones[i]; | ||
198 | if (!zone->percpu_drift_mark) | ||
199 | continue; | ||
200 | |||
201 | threshold = (*calculate_pressure)(zone); | ||
202 | for_each_possible_cpu(cpu) | ||
203 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
204 | = threshold; | ||
205 | } | ||
206 | } | ||
207 | |||
164 | /* | 208 | /* |
165 | * For use when we know that interrupts are disabled. | 209 | * For use when we know that interrupts are disabled. |
166 | */ | 210 | */ |
167 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 211 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
168 | int delta) | 212 | int delta) |
169 | { | 213 | { |
170 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 214 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
171 | 215 | s8 __percpu *p = pcp->vm_stat_diff + item; | |
172 | s8 *p = pcp->vm_stat_diff + item; | ||
173 | long x; | 216 | long x; |
217 | long t; | ||
218 | |||
219 | x = delta + __this_cpu_read(*p); | ||
174 | 220 | ||
175 | x = delta + *p; | 221 | t = __this_cpu_read(pcp->stat_threshold); |
176 | 222 | ||
177 | if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { | 223 | if (unlikely(x > t || x < -t)) { |
178 | zone_page_state_add(x, zone, item); | 224 | zone_page_state_add(x, zone, item); |
179 | x = 0; | 225 | x = 0; |
180 | } | 226 | } |
181 | *p = x; | 227 | __this_cpu_write(*p, x); |
182 | } | 228 | } |
183 | EXPORT_SYMBOL(__mod_zone_page_state); | 229 | EXPORT_SYMBOL(__mod_zone_page_state); |
184 | 230 | ||
185 | /* | 231 | /* |
186 | * For an unknown interrupt state | ||
187 | */ | ||
188 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
189 | int delta) | ||
190 | { | ||
191 | unsigned long flags; | ||
192 | |||
193 | local_irq_save(flags); | ||
194 | __mod_zone_page_state(zone, item, delta); | ||
195 | local_irq_restore(flags); | ||
196 | } | ||
197 | EXPORT_SYMBOL(mod_zone_page_state); | ||
198 | |||
199 | /* | ||
200 | * Optimized increment and decrement functions. | 232 | * Optimized increment and decrement functions. |
201 | * | 233 | * |
202 | * These are only for a single page and therefore can take a struct page * | 234 | * These are only for a single page and therefore can take a struct page * |
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
221 | */ | 253 | */ |
222 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 254 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
223 | { | 255 | { |
224 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 256 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
225 | s8 *p = pcp->vm_stat_diff + item; | 257 | s8 __percpu *p = pcp->vm_stat_diff + item; |
258 | s8 v, t; | ||
226 | 259 | ||
227 | (*p)++; | 260 | v = __this_cpu_inc_return(*p); |
261 | t = __this_cpu_read(pcp->stat_threshold); | ||
262 | if (unlikely(v > t)) { | ||
263 | s8 overstep = t >> 1; | ||
228 | 264 | ||
229 | if (unlikely(*p > pcp->stat_threshold)) { | 265 | zone_page_state_add(v + overstep, zone, item); |
230 | int overstep = pcp->stat_threshold / 2; | 266 | __this_cpu_write(*p, -overstep); |
231 | |||
232 | zone_page_state_add(*p + overstep, zone, item); | ||
233 | *p = -overstep; | ||
234 | } | 267 | } |
235 | } | 268 | } |
236 | 269 | ||
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
242 | 275 | ||
243 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 276 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
244 | { | 277 | { |
245 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); | 278 | struct per_cpu_pageset __percpu *pcp = zone->pageset; |
246 | s8 *p = pcp->vm_stat_diff + item; | 279 | s8 __percpu *p = pcp->vm_stat_diff + item; |
247 | 280 | s8 v, t; | |
248 | (*p)--; | ||
249 | 281 | ||
250 | if (unlikely(*p < - pcp->stat_threshold)) { | 282 | v = __this_cpu_dec_return(*p); |
251 | int overstep = pcp->stat_threshold / 2; | 283 | t = __this_cpu_read(pcp->stat_threshold); |
284 | if (unlikely(v < - t)) { | ||
285 | s8 overstep = t >> 1; | ||
252 | 286 | ||
253 | zone_page_state_add(*p - overstep, zone, item); | 287 | zone_page_state_add(v - overstep, zone, item); |
254 | *p = overstep; | 288 | __this_cpu_write(*p, overstep); |
255 | } | 289 | } |
256 | } | 290 | } |
257 | 291 | ||
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
261 | } | 295 | } |
262 | EXPORT_SYMBOL(__dec_zone_page_state); | 296 | EXPORT_SYMBOL(__dec_zone_page_state); |
263 | 297 | ||
298 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
299 | /* | ||
300 | * If we have cmpxchg_local support then we do not need to incur the overhead | ||
301 | * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. | ||
302 | * | ||
303 | * mod_state() modifies the zone counter state through atomic per cpu | ||
304 | * operations. | ||
305 | * | ||
306 | * Overstep mode specifies how overstep should be handled: | ||
307 | * 0 No overstepping | ||
308 | * 1 Overstepping half of threshold | ||
309 | * -1 Overstepping minus half of threshold | ||
310 | */ | ||
311 | static inline void mod_state(struct zone *zone, | ||
312 | enum zone_stat_item item, int delta, int overstep_mode) | ||
313 | { | ||
314 | struct per_cpu_pageset __percpu *pcp = zone->pageset; | ||
315 | s8 __percpu *p = pcp->vm_stat_diff + item; | ||
316 | long o, n, t, z; | ||
317 | |||
318 | do { | ||
319 | z = 0; /* overflow to zone counters */ | ||
320 | |||
321 | /* | ||
322 | * The fetching of the stat_threshold is racy. We may apply | ||
323 | * a counter threshold to the wrong cpu if we get | ||
324 | * rescheduled while executing here. However, the following | ||
325 | * will apply the threshold again and therefore bring the | ||
326 | * counter under the threshold. | ||
327 | */ | ||
328 | t = this_cpu_read(pcp->stat_threshold); | ||
329 | |||
330 | o = this_cpu_read(*p); | ||
331 | n = delta + o; | ||
332 | |||
333 | if (n > t || n < -t) { | ||
334 | int os = overstep_mode * (t >> 1) ; | ||
335 | |||
336 | /* Overflow must be added to zone counters */ | ||
337 | z = n + os; | ||
338 | n = -os; | ||
339 | } | ||
340 | } while (this_cpu_cmpxchg(*p, o, n) != o); | ||
341 | |||
342 | if (z) | ||
343 | zone_page_state_add(z, zone, item); | ||
344 | } | ||
345 | |||
346 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
347 | int delta) | ||
348 | { | ||
349 | mod_state(zone, item, delta, 0); | ||
350 | } | ||
351 | EXPORT_SYMBOL(mod_zone_page_state); | ||
352 | |||
353 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
354 | { | ||
355 | mod_state(zone, item, 1, 1); | ||
356 | } | ||
357 | |||
358 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
359 | { | ||
360 | mod_state(page_zone(page), item, 1, 1); | ||
361 | } | ||
362 | EXPORT_SYMBOL(inc_zone_page_state); | ||
363 | |||
364 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
365 | { | ||
366 | mod_state(page_zone(page), item, -1, -1); | ||
367 | } | ||
368 | EXPORT_SYMBOL(dec_zone_page_state); | ||
369 | #else | ||
370 | /* | ||
371 | * Use interrupt disable to serialize counter updates | ||
372 | */ | ||
373 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
374 | int delta) | ||
375 | { | ||
376 | unsigned long flags; | ||
377 | |||
378 | local_irq_save(flags); | ||
379 | __mod_zone_page_state(zone, item, delta); | ||
380 | local_irq_restore(flags); | ||
381 | } | ||
382 | EXPORT_SYMBOL(mod_zone_page_state); | ||
383 | |||
264 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | 384 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) |
265 | { | 385 | { |
266 | unsigned long flags; | 386 | unsigned long flags; |
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
291 | local_irq_restore(flags); | 411 | local_irq_restore(flags); |
292 | } | 412 | } |
293 | EXPORT_SYMBOL(dec_zone_page_state); | 413 | EXPORT_SYMBOL(dec_zone_page_state); |
414 | #endif | ||
294 | 415 | ||
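The CONFIG_CMPXCHG_LOCAL variant above never disables interrupts; instead it computes a proposed new per-cpu diff and retries if the value changed underneath it. A stripped-down sketch of the retry pattern used by mod_state(), with the threshold and overstep folding elided:

	do {
		o = this_cpu_read(*p);			/* current per-cpu diff */
		n = o + delta;				/* proposed new value */
		/* threshold check and overstep folding happen here */
	} while (this_cpu_cmpxchg(*p, o, n) != o);	/* raced on this cpu, retry */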
295 | /* | 416 | /* |
296 | * Update the zone counters for one cpu. | 417 | * Update the zone counters for one cpu. |
@@ -759,6 +880,7 @@ static const char * const vmstat_text[] = { | |||
759 | "numa_local", | 880 | "numa_local", |
760 | "numa_other", | 881 | "numa_other", |
761 | #endif | 882 | #endif |
883 | "nr_anon_transparent_hugepages", | ||
762 | "nr_dirty_threshold", | 884 | "nr_dirty_threshold", |
763 | "nr_dirty_background_threshold", | 885 | "nr_dirty_background_threshold", |
764 | 886 | ||
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
834 | "\n scanned %lu" | 956 | "\n scanned %lu" |
835 | "\n spanned %lu" | 957 | "\n spanned %lu" |
836 | "\n present %lu", | 958 | "\n present %lu", |
837 | zone_nr_free_pages(zone), | 959 | zone_page_state(zone, NR_FREE_PAGES), |
838 | min_wmark_pages(zone), | 960 | min_wmark_pages(zone), |
839 | low_wmark_pages(zone), | 961 | low_wmark_pages(zone), |
840 | high_wmark_pages(zone), | 962 | high_wmark_pages(zone), |
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
1033 | break; | 1155 | break; |
1034 | case CPU_DOWN_PREPARE: | 1156 | case CPU_DOWN_PREPARE: |
1035 | case CPU_DOWN_PREPARE_FROZEN: | 1157 | case CPU_DOWN_PREPARE_FROZEN: |
1036 | cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); | 1158 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); |
1037 | per_cpu(vmstat_work, cpu).work.func = NULL; | 1159 | per_cpu(vmstat_work, cpu).work.func = NULL; |
1038 | break; | 1160 | break; |
1039 | case CPU_DOWN_FAILED: | 1161 | case CPU_DOWN_FAILED: |